diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-19 13:34:10 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2011-12-19 13:34:10 +0000 |
commit | 2576d4686e427f362d435c24606db9e5a76b6339 (patch) | |
tree | bc0b18e9e98a0d093c8aa2eef4b94707c8e53408 | |
parent | 1183e193897ab3e03c2cebec978a1a053fcb179d (diff) | |
download | pcre-2576d4686e427f362d435c24606db9e5a76b6339.tar.gz |
A lot more work on pcretest; now runs many (but not all) tests.
git-svn-id: svn://vcs.exim.org/pcre/code/branches/pcre16@810 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rwxr-xr-x | RunTest | 129 | ||||
-rw-r--r-- | pcre_printint.c | 5 | ||||
-rw-r--r-- | pcretest.c | 172 | ||||
-rw-r--r-- | testdata/testinput1 | 3 | ||||
-rw-r--r-- | testdata/testinput16 | 238 | ||||
-rw-r--r-- | testdata/testinput17 | 282 | ||||
-rw-r--r-- | testdata/testinput2 | 4 | ||||
-rw-r--r-- | testdata/testinput4 | 44 | ||||
-rw-r--r-- | testdata/testinput5 | 246 | ||||
-rw-r--r-- | testdata/testoutput1 | 3 | ||||
-rw-r--r-- | testdata/testoutput10 | 10 | ||||
-rw-r--r-- | testdata/testoutput13 | 22 | ||||
-rw-r--r-- | testdata/testoutput16 | 819 | ||||
-rw-r--r-- | testdata/testoutput17 | 907 | ||||
-rw-r--r-- | testdata/testoutput2 | 6 | ||||
-rw-r--r-- | testdata/testoutput4 | 65 | ||||
-rw-r--r-- | testdata/testoutput5 | 919 | ||||
-rw-r--r-- | testdata/testoutput8 | 8 |
18 files changed, 2545 insertions, 1337 deletions
@@ -18,7 +18,10 @@ # two tests for JIT-specific features, one to be run when JIT support is # available, and one when it is not. -# The arguments for this script can be individual test numbers, or the word +# Whichever of the 8-bit and 16-bit libraries exist are tested. It is also +# possible to select which to test by the arguments -8 or -16. + +# Other arguments for this script can be individual test numbers, or the word # "valgrind", or "sim" followed by an argument to run cross-compiled # executables under a simulator, for example: # @@ -26,6 +29,8 @@ valgrind= sim= +arg8= +arg16= # Select which tests to run; for those that are explicitly requested, check # that the necessary optional facilities are available. @@ -45,6 +50,8 @@ do12=no do13=no do14=no do15=no +do16=no +do17=no while [ $# -gt 0 ] ; do case $1 in @@ -63,6 +70,10 @@ while [ $# -gt 0 ] ; do 13) do13=yes;; 14) do14=yes;; 15) do15=yes;; + 16) do16=yes;; + 17) do17=yes;; + -8) arg8=yes;; + -16) arg16=yes;; valgrind) valgrind="valgrind -q --smc-check=all";; sim) shift; sim=$1;; *) echo "Unknown test number $1"; exit 1;; @@ -107,12 +118,26 @@ $sim ./pcretest -C | $sim ./pcregrep '8-bit and 16-bit support' >/dev/null if [ $? -eq 0 ] ; then test8= test16=-16 + if [ "$arg8" = yes -a "$arg16" != yes ] ; then + test16=skip + fi + if [ "$arg16" = yes -a "$arg8" != yes ] ; then + test8=skip + fi else $sim ./pcretest -C | $sim ./pcregrep '8-bit support' >/dev/null if [ $? -eq 0 ] ; then + if [ "$arg16" = yes ] ; then + echo "Cannot run 16-bit library tests: 16-bit library not compiled" + exit 1 + fi test8= test16=skip else + if [ "$arg8" = yes ] ; then + echo "Cannot run 8-bit library tests: 8-bit library not compiled" + exit 1 + fi test8=skip test16=-16 fi @@ -135,6 +160,20 @@ if [ $jit -ne 0 ] ; then jitopt=-s+ fi +if [ "$test8" = skip ] ; then + if [ $do17 = yes ] ; then + echo "Can't run test 17 because the 8-bit library is not built" + exit 1 + fi +fi + +if [ "$test16" = skip ] ; then + if [ $do16 = yes ] ; then + echo "Can't run test 16 because the 16-bit library is not built" + exit 1 + fi +fi + if [ $utf -eq 0 ] ; then if [ $do4 = yes ] ; then echo "Can't run test 4 because UTF support is not configured" @@ -152,6 +191,12 @@ if [ $utf -eq 0 ] ; then echo "Can't run test 12 because UTF support is not configured" exit 1 fi + if [ $do16 = yes ] ; then + echo "Can't run test 16 because UTF support is not configured" + fi + if [ $do17 = yes ] ; then + echo "Can't run test 17 because UTF support is not configured" + fi fi if [ $ucp -eq 0 ] ; then @@ -198,7 +243,8 @@ fi if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \ $do5 = no -a $do6 = no -a $do7 = no -a $do8 = no -a \ $do9 = no -a $do10 = no -a $do11 = no -a $do12 = no -a \ - $do13 = no -a $do14 = no -a $do15 = no ] ; then + $do13 = no -a $do14 = no -a $do15 = no -a $do16 = no -a \ + $do17 = no ] ; then do1=yes do2=yes do3=yes @@ -214,6 +260,8 @@ if [ $do1 = no -a $do2 = no -a $do3 = no -a $do4 = no -a \ do13=yes do14=yes do15=yes + do16=yes + do17=yes fi # Show which release and which test data @@ -226,8 +274,8 @@ for bmode in "$test8" "$test16"; do case "$bmode" in skip) continue;; -16) if [ "$test8" != "skip" ] ; then echo ""; fi - echo "---- Testing 16-bit library ----"; echo "";; - *) echo "---- Testing 8-bit library ----"; echo "";; + bits=16; echo "---- Testing 16-bit library ----"; echo "";; + *) bits=8; echo "---- Testing 8-bit library ----"; echo "";; esac # Primary test, compatible with JIT and all versions of Perl >= 5.8 @@ -251,7 +299,7 @@ fi # PCRE tests that are not JIT or Perl-compatible: API, errors, internals if [ $do2 = yes ] ; then - echo "Test 2: API, errors, internals, and non-Perl stuff (not UTF-8/16)" + echo "Test 2: API, errors, internals, and non-Perl stuff (not UTF-$bits)" for opt in "" "-s" $jitopt; do $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput2 testtry if [ $? = 0 ] ; then @@ -336,9 +384,9 @@ fi # Additional tests for UTF support if [ $do4 = yes ] ; then - echo "Test 4: UTF-8/16 support (Compatible with Perl >= 5.8)" + echo "Test 4: UTF-$bits support (Compatible with Perl >= 5.8)" if [ $utf -eq 0 ] ; then - echo " Skipped because UTF support is not available" + echo " Skipped because UTF-$bits support is not available" else for opt in "" "-s" $jitopt; do $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput4 testtry @@ -356,9 +404,9 @@ if [ $do4 = yes ] ; then fi if [ $do5 = yes ] ; then - echo "Test 5: API, internals, and non-Perl stuff for UTF-8/16 support" + echo "Test 5: API, internals, and non-Perl stuff for UTF-$bits support" if [ $utf -eq 0 ] ; then - echo " Skipped because UTF support is not available" + echo " Skipped because UTF-$bits support is not available" else for opt in "" "-s" $jitopt; do $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput5 testtry @@ -411,9 +459,9 @@ if [ $do7 = yes ] ; then fi if [ $do8 = yes ] ; then - echo "Test 8: DFA matching with UTF-8 or UTF-16" + echo "Test 8: DFA matching with UTF-$bits" if [ $utf -eq 0 ] ; then - echo " Skipped because UTF support is not available" + echo " Skipped because UTF-$bits support is not available" else for opt in "" "-s"; do $sim $valgrind ./pcretest -q $bmode $opt -dfa $testdata/testinput8 testtry @@ -469,10 +517,10 @@ if [ $do10 = yes ] ; then fi fi -# Test of Perl >= 5.10 features without UTF8 support +# Test of Perl >= 5.10 features without UTF support if [ $do11 = yes ] ; then - echo "Test 11: Features from Perl >= 5.10 without UTF8 support" + echo "Test 11: Features from Perl >= 5.10 without UTF-$bits support" for opt in "" "-s" $jitopt; do $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput11 testtry if [ $? = 0 ] ; then @@ -487,12 +535,12 @@ if [ $do11 = yes ] ; then done fi -# Test of Perl >= 5.10 features with UTF8 support +# Test of Perl >= 5.10 features with UTF support if [ $do12 = yes ] ; then - echo "Test 12: Features from Perl >= 5.10 with UTF-8 or UTF-16 support" + echo "Test 12: Features from Perl >= 5.10 with UTF-$bits support" if [ $utf -eq 0 ] ; then - echo " Skipped because UTF support is not available" + echo " Skipped because UTF-$bits support is not available" else for opt in "" "-s" $jitopt; do $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput12 testtry @@ -565,8 +613,55 @@ if [ $do15 = yes ] ; then fi fi -# End of loop for 8-bit/16-bit tests +# Tests for 16-bit-specific features (needs UTF-8 support) + +if [ $do16 = yes ] ; then + echo "Test 16: specials for the 16-bit library" + if [ "$bits" = "8" ] ; then + echo " Skipped when running 8-bit tests" + elif [ $utf -eq 0 ] ; then + echo " Skipped because UTF-$bits support is not available" + else + for opt in "" "-s" $jitopt; do + $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput16 testtry + if [ $? = 0 ] ; then + $cf $testdata/testoutput16 testtry + if [ $? != 0 ] ; then exit 1; fi + else exit 1 + fi + if [ "$opt" = "-s" ] ; then echo " OK with study" + elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study" + else echo " OK" + fi + done + fi +fi + +# Tests for 8-bit-specific features (needs UTF-8 support) + +if [ $do17 = yes ] ; then + echo "Test 17: specials for the 8-bit library" + if [ "$bits" = "16" ] ; then + echo " Skipped when running 16-bit tests" + elif [ $utf -eq 0 ] ; then + echo " Skipped because UTF-$bits support is not available" + else + for opt in "" "-s" $jitopt; do + $sim $valgrind ./pcretest -q $bmode $opt $testdata/testinput17 testtry + if [ $? = 0 ] ; then + $cf $testdata/testoutput17 testtry + if [ $? != 0 ] ; then exit 1; fi + else exit 1 + fi + if [ "$opt" = "-s" ] ; then echo " OK with study" + elif [ "$opt" = "-s+" ] ; then echo " OK with JIT study" + else echo " OK" + fi + done + fi +fi +# End of loop for 8-bit/16-bit tests done # End diff --git a/pcre_printint.c b/pcre_printint.c index 2fcf985..8d504ce 100644 --- a/pcre_printint.c +++ b/pcre_printint.c @@ -473,7 +473,10 @@ for(;;) case OP_NOT: c = code[1]; if (PRINTABLE(c)) fprintf(f, " %s [^%c]", flag, c); - else fprintf(f, " %s [^\\x%02x]", flag, c); + else if (utf || c > 0xff) + fprintf(f, " %s [^\\x{%02x}]", flag, c); + else + fprintf(f, " %s [^\\x%02x]", flag, c); break; case OP_NOTSTARI: @@ -219,12 +219,12 @@ use these in the definitions of generic macros. */ count = pcre16_exec(re, extra, (PCRE_SPTR16)bptr, len, start_offset, \ options, offsets, size_offsets) -#define PCRE_STUDY16(extra, re, options, error) \ - extra = pcre16_study(re, options, error) - #define PCRE_FREE_STUDY16(extra) \ pcre16_free_study(extra) +#define PCRE_STUDY16(extra, re, options, error) \ + extra = pcre16_study(re, options, error) + #endif /* SUPPORT_PCRE16 */ @@ -259,18 +259,18 @@ use these in the definitions of generic macros. */ PCRE_EXEC8(count, re, extra, bptr, len, start_offset, options, \ offsets, size_offsets) -#define PCRE_STUDY(extra, re, options, error) \ - if (use_pcre16) \ - PCRE_STUDY16(extra, re, options, error); \ - else \ - PCRE_STUDY8(extra, re, options, error) - #define PCRE_FREE_STUDY(extra) \ if (use_pcre16) \ PCRE_FREE_STUDY16(extra); \ else \ PCRE_FREE_STUDY8(extra) +#define PCRE_STUDY(extra, re, options, error) \ + if (use_pcre16) \ + PCRE_STUDY16(extra, re, options, error); \ + else \ + PCRE_STUDY8(extra, re, options, error) + /* ----- Only 8-bit mode is supported ----- */ #elif defined SUPPORT_PCRE8 @@ -278,8 +278,8 @@ use these in the definitions of generic macros. */ #define PCHARSV PCHARSV8 #define PCRE_COMPILE PCRE_COMPILE8 #define PCRE_EXEC PCRE_EXEC8 -#define PCRE_STUDY PCRE_STUDY8 #define PCRE_FREE_STUDY PCRE_FREE_STUDY8 +#define PCRE_STUDY PCRE_STUDY8 /* ----- Only 16-bit mode is supported ----- */ @@ -288,8 +288,8 @@ use these in the definitions of generic macros. */ #define PCHARSV PCHARSV16 #define PCRE_COMPILE PCRE_COMPILE16 #define PCRE_EXEC PCRE_EXEC16 -#define PCRE_STUDY PCRE_STUDY16 #define PCRE_FREE_STUDY PCRE_FREE_STUDY16 +#define PCRE_STUDY PCRE_STUDY16 #endif /* ----- End of mode-specific function call macros ----- */ @@ -321,7 +321,7 @@ static int debug_lengths; static int first_callout; static int locale_set = 0; static int show_malloc; -static int use_utf8; +static int use_utf; static size_t gotten_store; static size_t first_gotten_store = 0; static const unsigned char *last_callout_mark = NULL; @@ -848,7 +848,16 @@ return i + 1; 8-bit size. For a UTF-8 string, the size needed for UTF-16 is no more than double, because up to 0xffff uses no more than 3 bytes in UTF-8 but possibly 4 in UTF-16. Higher values use 4 bytes in UTF-8 and up to 4 bytes in UTF-16. The -result is always left in buffer16. */ +result is always left in buffer16. + +Arguments: + p points to a byte string + utf true if UTF-8 (to be converted to UTF-16) + len number of bytes in the string (excluding trailing zero) + +Returns: number of 16-bit data items used (excluding trailing zero) + OR -1 if a UTF-8 string is malformed +*/ static int to16(pcre_uint8 *p, int utf, int len) @@ -880,6 +889,7 @@ else while (len > 0) { int chlen = utf82ord(p, &c); + if (chlen <= 0) return -1; p += chlen; len -= chlen; if (c < 0x10000) *pp++ = c; else @@ -1030,6 +1040,43 @@ return(result); +/************************************************* +* Print one character * +*************************************************/ + +/* Print a single character either literally, or as a hex escape. */ + +static int pchar(int c, FILE *f) +{ +if (PRINTOK(c)) + { + if (f != NULL) fprintf(f, "%c", c); + return 1; + } + +if (c < 0x100) + { + if (use_utf) + { + if (f != NULL) fprintf(f, "\\x{%02x}", c); + return 6; + } + else + { + if (f != NULL) fprintf(f, "\\x%02x", c); + return 4; + } + } + +if (f != NULL) fprintf(f, "\\x{%02x}", c); +return (c <= 0x000000ff)? 6 : + (c <= 0x00000fff)? 7 : + (c <= 0x0000ffff)? 8 : + (c <= 0x000fffff)? 9 : 10; +} + + + #ifdef SUPPORT_PCRE8 /************************************************* * Print 8-bit character string * @@ -1046,46 +1093,20 @@ int yield = 0; while (length-- > 0) { #if !defined NOUTF8 - if (use_utf8) + if (use_utf) { int rc = utf82ord(p, &c); - if (rc > 0 && rc <= length + 1) /* Mustn't run over the end */ { length -= rc - 1; p += rc; - if (PRINTOK(c)) - { - if (f != NULL) fprintf(f, "%c", c); - yield++; - } - else - { - int n = 4; - if (f != NULL) fprintf(f, "\\x{%02x}", c); - yield += (n <= 0x000000ff)? 2 : - (n <= 0x00000fff)? 3 : - (n <= 0x0000ffff)? 4 : - (n <= 0x000fffff)? 5 : 6; - } - continue; + yield += pchar(c, f); + continue; } } #endif - - /* Not UTF-8, or malformed UTF-8 */ - c = *p++; - if (PRINTOK(c)) - { - if (f != NULL) fprintf(f, "%c", c); - yield++; - } - else - { - if (f != NULL) fprintf(f, "\\x%02x", c); - yield += 4; - } + yield += pchar(c, f); } return yield; @@ -1109,9 +1130,8 @@ int yield = 0; while (length-- > 0) { int c = *p++ & 0xffff; - #if !defined NOUTF8 - if (use_utf8 && c >= 0xD800 && c < 0xDC00 && length > 0) + if (use_utf && c >= 0xD800 && c < 0xDC00 && length > 0) { int d = *p & 0xffff; if (d >= 0xDC00 && d < 0xDFFF) @@ -1122,28 +1142,7 @@ while (length-- > 0) } } #endif - - if (PRINTOK(c)) - { - if (f != NULL) fprintf(f, "%c", c); - yield++; - } - else - { - yield += 4; - if (c < 0x100) - { - if (f != NULL) fprintf(f, "\\x%02x", c); - } - else - { - if (f != NULL) fprintf(f, "\\x{%02x}", c); - yield += (c <= 0x000000ff)? 2 : - (c <= 0x00000fff)? 3 : - (c <= 0x0000ffff)? 4 : - (c <= 0x000fffff)? 5 : 6; - } - } + yield += pchar(c, f); } return yield; @@ -1795,7 +1794,7 @@ while (!done) int do_flip = 0; int erroroffset, len, delimiter, poffset; - use_utf8 = 0; + use_utf = 0; debug_lengths = 1; if (extend_inputline(infile, buffer, " re> ") == NULL) break; @@ -1859,7 +1858,7 @@ while (!done) /* Need to know if UTF-8 for printing data strings */ new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options); - use_utf8 = (get_options & PCRE_UTF8) != 0; + use_utf = (get_options & PCRE_UTF8) != 0; /* Now see if there is any following study data. */ @@ -2004,7 +2003,7 @@ while (!done) case 'X': options |= PCRE_EXTRA; break; case 'Y': options |= PCRE_NO_START_OPTIMISE; break; case 'Z': debug_lengths = 0; break; - case '8': options |= PCRE_UTF8; use_utf8 = 1; break; + case '8': options |= PCRE_UTF8; use_utf = 1; break; case '?': options |= PCRE_NO_UTF8_CHECK; break; case 'T': @@ -2122,7 +2121,12 @@ while (!done) #ifdef SUPPORT_PCRE16 if (use_pcre16) { - (void)to16(p, options & PCRE_UTF8, (int)strlen((char *)p)); + if (to16(p, options & PCRE_UTF8, (int)strlen((char *)p)) < 0) + { + fprintf(outfile, "**Failed: invalid UTF-8 string cannot be " + "converted to UTF-16\n"); + goto SKIP_DATA; + } p = (pcre_uint8 *)buffer16; } #endif @@ -2178,7 +2182,7 @@ while (!done) lines. */ new_info(re, NULL, PCRE_INFO_OPTIONS, &get_options); - if ((get_options & PCRE_UTF8) != 0) use_utf8 = 1; + if ((get_options & PCRE_UTF8) != 0) use_utf = 1; /* Extract the size for possible writing before possibly flipping it, and remember the store that was got. */ @@ -2395,9 +2399,9 @@ while (!done) ((get_options & PCRE_EXTRA) != 0)? " extra" : "", ((get_options & PCRE_UNGREEDY) != 0)? " ungreedy" : "", ((get_options & PCRE_NO_AUTO_CAPTURE) != 0)? " no_auto_capture" : "", - ((get_options & PCRE_UTF8) != 0)? " utf8" : "", + ((get_options & PCRE_UTF8) != 0)? " utf" : "", ((get_options & PCRE_UCP) != 0)? " ucp" : "", - ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf8_check" : "", + ((get_options & PCRE_NO_UTF8_CHECK) != 0)? " no_utf_check" : "", ((get_options & PCRE_NO_START_OPTIMIZE) != 0)? " no_start_optimize" : "", ((get_options & PCRE_DUPNAMES) != 0)? " dupnames" : ""); @@ -2442,11 +2446,15 @@ while (!done) const char *caseless = ((((real_pcre *)re)->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)"; - + if (PRINTOK(first_char)) fprintf(outfile, "First char = \'%c\'%s\n", first_char, caseless); else - fprintf(outfile, "First char = %d%s\n", first_char, caseless); + { + fprintf(outfile, "First char = "); + pchar(first_char, outfile); + fprintf(outfile, "%s\n", caseless); + } } if (need_char < 0) @@ -2690,7 +2698,7 @@ while (!done) c = c * 8 + *p++ - '0'; #if !defined NOUTF8 - if (use_utf8 && c > 255) + if (use_utf && c > 255) { pcre_uint8 buff8[8]; int ii, utn; @@ -2722,7 +2730,7 @@ while (!done) { pcre_uint8 buff8[8]; int ii, utn; - if (use_utf8) + if (use_utf) { utn = ord2utf8(c, buff8); for (ii = 0; ii < utn - 1; ii++) *q++ = buff8[ii]; @@ -3055,6 +3063,12 @@ while (!done) if (use_pcre16) { len = to16(bptr, (((real_pcre *)re)->options) & PCRE_UTF8, len); + if (len < 0) + { + fprintf(outfile, "**Failed: invalid UTF-8 string cannot be " + "converted to UTF-16\n"); + goto NEXT_DATA; + } bptr = (pcre_uint8 *)buffer16; } #endif @@ -3369,7 +3383,7 @@ while (!done) bptr[start_offset] == '\r' && bptr[start_offset+1] == '\n') onechar++; - else if (use_utf8) + else if (use_utf) { while (start_offset + onechar < len) { diff --git a/testdata/testinput1 b/testdata/testinput1 index 36d7028..d9849fe 100644 --- a/testdata/testinput1 +++ b/testdata/testinput1 @@ -1,5 +1,6 @@ /-- This set of tests is for features that are compatible with all versions of - Perl 5, in non-UTF-8 mode. --/ + Perl 5, in non-UTF-8 mode. It should run clean for both the 8-bit and + 16-bit PCRE libraries. --/ /the quick brown fox/ the quick brown fox diff --git a/testdata/testinput16 b/testdata/testinput16 new file mode 100644 index 0000000..83cadbe --- /dev/null +++ b/testdata/testinput16 @@ -0,0 +1,238 @@ +/-- This set of tests is for UTF-16 support, and is relevant only to the 16-bit + library. There are some non-UTF 16-bit tests as well (it doesn't seem + worth setting up another test file just for this case). --/ + +/xxx/8?DZSS + +/abc/8 + ] + +/X(\C{3})/8 + X\x{11234}Y + +/X(\C{4})/8 + X\x{11234}YZ + +/X\C*/8 + XYZabcdce + +/X\C*?/8 + XYZabcde + +/X\C{3,5}/8 + Xabcdefg + X\x{11234}Y + X\x{11234}YZ + X\x{11234}\x{512} + X\x{11234}\x{512}YZ + X\x{11234}\x{512}\x{11234}Z + +/X\C{3,5}?/8 + Xabcdefg + X\x{11234}Y + X\x{11234}YZ + X\x{11234}\x{512}YZ + *** Failers + X\x{11234} + +/a\Cb/ + aXb + a\nb + +/a\Cb/8 + aXb + a\nb + +/a\C\Cb/8 + a\x{12257}b + ** Failers + a\x{100}b + +/ab\Cde/8 + abXde + +/-- Check maximum non-UTF character size --/ + +/\x{ffff}/ + +/\x{10000}/ + +/\x{100}/8DZ + +/\x{1000}/8DZ + +/\x{10000}/8DZ + +/\x{100000}/8DZ + +/\x{10ffff}/8DZ + +/[\x{ff}]/8DZ + +/[\x{100}]/8DZ + +/\x80/8DZ + +/\xff/8DZ + +/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 + \x{D55c}\x{ad6d}\x{C5B4} + +/\x{65e5}\x{672c}\x{8a9e}/DZ8 + \x{65e5}\x{672c}\x{8a9e} + +/\x{80}/DZ8 + +/\x{084}/DZ8 + +/\x{104}/DZ8 + +/\x{861}/DZ8 + +/\x{212ab}/DZ8 + +/-- This one is here not because it's different to Perl, but because the way +the captured single-byte is displayed. (In Perl it becomes a character, and you +can't tell the difference.) --/ + +/X(\C)(.*)/8 + X\x{1234} + X\nabc + +/-- This one is here because Perl gives out a grumbly error message (quite +correctly, but that messes up comparisons). --/ + +/a\Cb/8 + *** Failers + a\x{100}b + +/[^ab\xC0-\xF0]/8SDZ + \x{f1} + \x{bf} + \x{100} + \x{1000} + *** Failers + \x{c0} + \x{f0} + +/Ā{3,4}/8SDZ + \x{100}\x{100}\x{100}\x{100\x{100} + +/(\x{100}+|x)/8SDZ + +/(\x{100}*a|x)/8SDZ + +/(\x{100}{0,2}a|x)/8SDZ + +/(\x{100}{1,2}a|x)/8SDZ + +/\x{100}/8DZ + +/a\x{100}\x{101}*/8DZ + +/a\x{100}\x{101}+/8DZ + +/[^\x{c4}]/DZ + +/[\x{100}]/8DZ + \x{100} + Z\x{100} + \x{100}Z + *** Failers + +/[\xff]/DZ8 + >\x{ff}< + +/[^\xff]/8DZ + +/\x{100}abc(xyz(?1))/8DZ + +/\777/8I + \x{1ff} + \777 + +/\x{100}+\x{200}/8DZ + +/\x{100}+X/8DZ + +/^[\QĀ\E-\QŐ\E/BZ8 + +/-- This tests the stricter UTF-8 check according to RFC 3629. --/ + +/X/8 + \x{0}\x{d7ff}\x{e000}\x{10ffff} + \x{d800} + \x{d800}\? + \x{da00} + \x{da00}\? + \x{dfff} + \x{dfff}\? + \x{110000} + \x{110000}\? + \x{2000000} + \x{2000000}\? + \x{7fffffff} + \x{7fffffff}\? + +/(*UTF16)\x{11234}/ + abcd\x{11234}pqr + +/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I + +/\h/SI8 + ABC\x{09} + ABC\x{20} + ABC\x{a0} + ABC\x{1680} + ABC\x{180e} + ABC\x{2000} + ABC\x{202f} + ABC\x{205f} + ABC\x{3000} + +/\v/SI8 + ABC\x{0a} + ABC\x{0b} + ABC\x{0c} + ABC\x{0d} + ABC\x{85} + ABC\x{2028} + +/\h*A/SI8 + CDBABC + +/\v+A/SI8 + +/\s?xxx\s/8SI + +/\sxxx\s/I8ST1 + AB\x{85}xxx\x{a0}XYZ + AB\x{a0}xxx\x{85}XYZ + +/\S \S/I8ST1 + \x{a2} \x{84} + A Z + +/a+/8 + a\x{123}aa\>1 + a\x{123}aa\>2 + a\x{123}aa\>3 + a\x{123}aa\>4 + a\x{123}aa\>5 + a\x{123}aa\>6 + +/\x{1234}+/iS8I + +/\x{1234}+?/iS8I + +/\x{1234}++/iS8I + +/\x{1234}{2}/iS8I + +/[^\x{c4}]/8DZ + +/X+\x{200}/8DZ + +/\R/SI8 + +/-- End of testinput16 --/ diff --git a/testdata/testinput17 b/testdata/testinput17 new file mode 100644 index 0000000..5b16183 --- /dev/null +++ b/testdata/testinput17 @@ -0,0 +1,282 @@ +/-- This set of tests is for UTF-8 support, and is relevant only to the 8-bit + library. --/ + +/X(\C{3})/8 + X\x{1234} + +/X(\C{4})/8 + X\x{1234}YZ + +/X\C*/8 + XYZabcdce + +/X\C*?/8 + XYZabcde + +/X\C{3,5}/8 + Xabcdefg + X\x{1234} + X\x{1234}YZ + X\x{1234}\x{512} + X\x{1234}\x{512}YZ + +/X\C{3,5}?/8 + Xabcdefg + X\x{1234} + X\x{1234}YZ + X\x{1234}\x{512} + +/a\Cb/ + aXb + a\nb + +/a\Cb/8 + aXb + a\nb + +/a\C\Cb/8 + a\x{100}b + +/ab\Cde/8 + abXde + +/a\C\Cb/8 + a\x{100}b + ** Failers + a\x{12257}b + +/[]/8 + +//8 + +/xxx/8 + +/xxx/8?DZSS + +/abc/8 + ] + + + \? + \xe1\x88 + \P\xe1\x88 + \P\P\xe1\x88 + XX\xea + \O0XX\xea + \O1XX\xea + \O2XX\xea + XX\xf1 + XX\xf8 + XX\xfc + ZZ\xea\xaf\x20YY + ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY + ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY + ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY + ZZ\xffYY + ZZ\xfeYY + +/anything/8 + \xc0\x80 + \xc1\x8f + \xe0\x9f\x80 + \xf0\x8f\x80\x80 + \xf8\x87\x80\x80\x80 + \xfc\x83\x80\x80\x80\x80 + \xfe\x80\x80\x80\x80\x80 + \xff\x80\x80\x80\x80\x80 + \xc3\x8f + \xe0\xaf\x80 + \xe1\x80\x80 + \xf0\x9f\x80\x80 + \xf1\x8f\x80\x80 + \xf8\x88\x80\x80\x80 + \xf9\x87\x80\x80\x80 + \xfc\x84\x80\x80\x80\x80 + \xfd\x83\x80\x80\x80\x80 + \?\xf8\x88\x80\x80\x80 + \?\xf9\x87\x80\x80\x80 + \?\xfc\x84\x80\x80\x80\x80 + \?\xfd\x83\x80\x80\x80\x80 + +/\x{100}/8DZ + +/\x{1000}/8DZ + +/\x{10000}/8DZ + +/\x{100000}/8DZ + +/\x{10ffff}/8DZ + +/[\x{ff}]/8DZ + +/[\x{100}]/8DZ + +/\x80/8DZ + +/\xff/8DZ + +/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 + \x{D55c}\x{ad6d}\x{C5B4} + +/\x{65e5}\x{672c}\x{8a9e}/DZ8 + \x{65e5}\x{672c}\x{8a9e} + +/\x{80}/DZ8 + +/\x{084}/DZ8 + +/\x{104}/DZ8 + +/\x{861}/DZ8 + +/\x{212ab}/DZ8 + +/-- This one is here not because it's different to Perl, but because the way +the captured single-byte is displayed. (In Perl it becomes a character, and you +can't tell the difference.) --/ + +/X(\C)(.*)/8 + X\x{1234} + X\nabc + +/-- This one is here because Perl gives out a grumbly error message (quite +correctly, but that messes up comparisons). --/ + +/a\Cb/8 + *** Failers + a\x{100}b + +/[^ab\xC0-\xF0]/8SDZ + \x{f1} + \x{bf} + \x{100} + \x{1000} + *** Failers + \x{c0} + \x{f0} + +/Ā{3,4}/8SDZ + \x{100}\x{100}\x{100}\x{100\x{100} + +/(\x{100}+|x)/8SDZ + +/(\x{100}*a|x)/8SDZ + +/(\x{100}{0,2}a|x)/8SDZ + +/(\x{100}{1,2}a|x)/8SDZ + +/\x{100}/8DZ + +/a\x{100}\x{101}*/8DZ + +/a\x{100}\x{101}+/8DZ + +/[^\x{c4}]/DZ + +/[\x{100}]/8DZ + \x{100} + Z\x{100} + \x{100}Z + *** Failers + +/[\xff]/DZ8 + >\x{ff}< + +/[^\xff]/8DZ + +/\x{100}abc(xyz(?1))/8DZ + +/a\x{1234}b/P8 + a\x{1234}b + +/\777/8I + \x{1ff} + \777 + +/\x{100}+\x{200}/8DZ + +/\x{100}+X/8DZ + +/^[\QĀ\E-\QŐ\E/BZ8 + +/-- This tests the stricter UTF-8 check according to RFC 3629. --/ + +/X/8 + \x{0}\x{d7ff}\x{e000}\x{10ffff} + \x{d800} + \x{d800}\? + \x{da00} + \x{da00}\? + \x{dfff} + \x{dfff}\? + \x{110000} + \x{110000}\? + \x{2000000} + \x{2000000}\? + \x{7fffffff} + \x{7fffffff}\? + +/(*UTF8)\x{1234}/ + abcd\x{1234}pqr + +/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I + +/\h/SI8 + ABC\x{09} + ABC\x{20} + ABC\x{a0} + ABC\x{1680} + ABC\x{180e} + ABC\x{2000} + ABC\x{202f} + ABC\x{205f} + ABC\x{3000} + +/\v/SI8 + ABC\x{0a} + ABC\x{0b} + ABC\x{0c} + ABC\x{0d} + ABC\x{85} + ABC\x{2028} + +/\h*A/SI8 + CDBABC + +/\v+A/SI8 + +/\s?xxx\s/8SI + +/\sxxx\s/I8ST1 + AB\x{85}xxx\x{a0}XYZ + AB\x{a0}xxx\x{85}XYZ + +/\S \S/I8ST1 + \x{a2} \x{84} + A Z + +/a+/8 + a\x{123}aa\>1 + a\x{123}aa\>2 + a\x{123}aa\>3 + a\x{123}aa\>4 + a\x{123}aa\>5 + a\x{123}aa\>6 + +/\x{1234}+/iS8I + +/\x{1234}+?/iS8I + +/\x{1234}++/iS8I + +/\x{1234}{2}/iS8I + +/[^\x{c4}]/8DZ + +/X+\x{200}/8DZ + +/\R/SI8 + +/-- End of testinput17 --/ diff --git a/testdata/testinput2 b/testdata/testinput2 index 19801ef..7c1e3c5 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -5,8 +5,8 @@ either because PCRE can't be compatible, or there is a possible Perl bug. - NOTE: This is a non-UTF-8 set of tests. When UTF-8 is needed, use test - 5, and if Unicode Property Support is needed, use test 13. --/ + NOTE: This is a non-UTF set of tests. When UTF support is needed, use + test 5, and if Unicode Property Support is needed, use test 13. --/ /-- Originally, the Perl >= 5.10 things were in here too, but now I have separated many (most?) of them out into test 11. However, there may still diff --git a/testdata/testinput4 b/testdata/testinput4 index b339f71..3adaa1a 100644 --- a/testdata/testinput4 +++ b/testdata/testinput4 @@ -1,5 +1,6 @@ -/-- This set of tests is for UTF-8 support, excluding Unicode properties. It is - compatible with all versions of Perl 5. --/ +/-- This set of tests is for UTF support, excluding Unicode properties. It is + compatible with all versions of Perl 5 and both the 8-bit and 16-bit PCRE + libraries. --/ /a.b/8 acb @@ -126,31 +127,6 @@ *** Failers XYZ -/X(\C{3})/8 - X\x{1234} - -/X(\C{4})/8 - X\x{1234}YZ - -/X\C*/8 - XYZabcdce - -/X\C*?/8 - XYZabcde - -/X\C{3,5}/8 - Xabcdefg - X\x{1234} - X\x{1234}YZ - X\x{1234}\x{512} - X\x{1234}\x{512}YZ - -/X\C{3,5}?/8 - Xabcdefg - X\x{1234} - X\x{1234}YZ - X\x{1234}\x{512} - /[^a]+/8g bcd \x{100}aY\x{256}Z @@ -456,17 +432,6 @@ \x{150}X \x{200}X -/a\Cb/ - aXb - a\nb - -/a\Cb/8 - aXb - a\nb - -/a\C\Cb/8 - a\x{100}b - /[z-\x{100}]/8i z Z @@ -650,7 +615,4 @@ /(abc)\1/8 abc -/ab\Cde/8 - abXde - /-- End of testinput4 --/ diff --git a/testdata/testinput5 b/testdata/testinput5 index 9ba5b4b..87f0884 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -1,22 +1,9 @@ -/-- This set of tests checks the API, internals, and non-Perl stuff for UTF-8 - support, excluding Unicode properties. --/ - -/\x{100}/8DZ - -/\x{1000}/8DZ - -/\x{10000}/8DZ - -/\x{100000}/8DZ - -/\x{10ffff}/8DZ +/-- This set of tests checks the API, internals, and non-Perl stuff for UTF + support, excluding Unicode properties. However, tests that give different + results in 8-bit and 16-bit modes are excluded (see tests 16 and 17). --/ /\x{110000}/8DZ -/[\x{ff}]/8DZ - -/[\x{100}]/8DZ - /\x{ffffffff}/8 /\x{100000000}/8 @@ -32,54 +19,18 @@ /^\x{100}a\x{1234}/8 \x{100}a\x{1234}bcd -/\x80/8DZ - -/\xff/8DZ - /\x{0041}\x{2262}\x{0391}\x{002e}/DZ8 \x{0041}\x{2262}\x{0391}\x{002e} -/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 - \x{D55c}\x{ad6d}\x{C5B4} - -/\x{65e5}\x{672c}\x{8a9e}/DZ8 - \x{65e5}\x{672c}\x{8a9e} - -/\x{80}/DZ8 - -/\x{084}/DZ8 - -/\x{104}/DZ8 - -/\x{861}/DZ8 - -/\x{212ab}/DZ8 - /.{3,5}X/DZ8 \x{212ab}\x{212ab}\x{212ab}\x{861}X - /.{3,5}?/DZ8 \x{212ab}\x{212ab}\x{212ab}\x{861} /(?<=\C)X/8 Should produce an error diagnostic -/-- This one is here not because it's different to Perl, but because the way -the captured single-byte is displayed. (In Perl it becomes a character, and you -can't tell the difference.) --/ - -/X(\C)(.*)/8 - X\x{1234} - X\nabc - -/-- This one is here because Perl gives out a grumbly error message (quite -correctly, but that messes up comparisons). --/ - -/a\Cb/8 - *** Failers - a\x{100}b - /^[ab]/8DZ bar *** Failers @@ -94,26 +45,6 @@ correctly, but that messes up comparisons). --/ *** Failers aaa -/[^ab\xC0-\xF0]/8SDZ - \x{f1} - \x{bf} - \x{100} - \x{1000} - *** Failers - \x{c0} - \x{f0} - -/Ā{3,4}/8SDZ - \x{100}\x{100}\x{100}\x{100\x{100} - -/(\x{100}+|x)/8SDZ - -/(\x{100}*a|x)/8SDZ - -/(\x{100}{0,2}a|x)/8SDZ - -/(\x{100}{1,2}a|x)/8SDZ - /\x{100}*(\d+|"(?1)")/8 1234 "1234" @@ -124,33 +55,17 @@ correctly, but that messes up comparisons). --/ *** Failers \x{100}\x{100}abcd -/\x{100}/8DZ - /\x{100}*/8DZ /a\x{100}*/8DZ /ab\x{100}*/8DZ -/a\x{100}\x{101}*/8DZ - -/a\x{100}\x{101}+/8DZ - /\x{100}*A/8DZ A /\x{100}*\d(?R)/8DZ -/[^\x{c4}]/DZ - -/[^\x{c4}]/8DZ - -/[\x{100}]/8DZ - \x{100} - Z\x{100} - \x{100}Z - *** Failers - /[Z\x{100}]/8DZ Z\x{100} \x{100} @@ -175,13 +90,8 @@ correctly, but that messes up comparisons). --/ /[\xFF]/DZ >\xff< -/[\xff]/DZ8 - >\x{ff}< - /[^\xFF]/DZ -/[^\xff]/8DZ - /[Ä-Ü]/8 Ö # Matches without Study \x{d6} @@ -198,61 +108,6 @@ correctly, but that messes up comparisons). --/ Ö <-- Same with Study \x{d6} -/[]/8 - -//8 - -/xxx/8 - -/xxx/8?DZSS - -/abc/8 - ] - - - \? - \xe1\x88 - \P\xe1\x88 - \P\P\xe1\x88 - XX\xea - \O0XX\xea - \O1XX\xea - \O2XX\xea - XX\xf1 - XX\xf8 - XX\xfc - ZZ\xea\xaf\x20YY - ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY - ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY - ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY - ZZ\xffYY - ZZ\xfeYY - -/anything/8 - \xc0\x80 - \xc1\x8f - \xe0\x9f\x80 - \xf0\x8f\x80\x80 - \xf8\x87\x80\x80\x80 - \xfc\x83\x80\x80\x80\x80 - \xfe\x80\x80\x80\x80\x80 - \xff\x80\x80\x80\x80\x80 - \xc3\x8f - \xe0\xaf\x80 - \xe1\x80\x80 - \xf0\x9f\x80\x80 - \xf1\x8f\x80\x80 - \xf8\x88\x80\x80\x80 - \xf9\x87\x80\x80\x80 - \xfc\x84\x80\x80\x80\x80 - \xfd\x83\x80\x80\x80\x80 - \?\xf8\x88\x80\x80\x80 - \?\xf9\x87\x80\x80\x80 - \?\xfc\x84\x80\x80\x80\x80 - \?\xfd\x83\x80\x80\x80\x80 - -/\x{100}abc(xyz(?1))/8DZ - /[^\x{100}]abc(xyz(?1))/8DZ /[ab\x{100}]abc(xyz(?1))/8DZ @@ -272,17 +127,10 @@ correctly, but that messes up comparisons). --/ /\w/8 \x{100}X -/a\x{1234}b/P8 - a\x{1234}b - /^\ሴ/8DZ /\777/I -/\777/8I - \x{1ff} - \777 - /\x{100}*\d/8DZ /\x{100}*\s/8DZ @@ -295,12 +143,6 @@ correctly, but that messes up comparisons). --/ /\x{100}*\W/8DZ -/\x{100}+\x{200}/8DZ - -/\x{100}+X/8DZ - -/X+\x{200}/8DZ - /()()()()()()()()()() ()()()()()()()()()() ()()()()()()()()()() @@ -312,8 +154,6 @@ correctly, but that messes up comparisons). --/ /^[\QĀ\E-\QŐ\E]/BZ8 -/^[\QĀ\E-\QŐ\E/BZ8 - /^abc./mgx8<any> abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK @@ -408,23 +248,6 @@ correctly, but that messes up comparisons). --/ /.*$/8<any> \x{1ec5} -/-- This tests the stricter UTF-8 check according to RFC 3629. --/ - -/X/8 - \x{0}\x{d7ff}\x{e000}\x{10ffff} - \x{d800} - \x{d800}\? - \x{da00} - \x{da00}\? - \x{dfff} - \x{dfff}\? - \x{110000} - \x{110000}\? - \x{2000000} - \x{2000000}\? - \x{7fffffff} - \x{7fffffff}\? - /a\Rb/I8<bsr_anycrlf> a\rb a\nb @@ -488,11 +311,6 @@ correctly, but that messes up comparisons). --/ /X/8f<any> A\x{1ec5}ABCXYZ -/(*UTF8)\x{1234}/ - abcd\x{1234}pqr - -/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I - /Xa{2,4}b/8 X\P Xa\P @@ -776,53 +594,17 @@ correctly, but that messes up comparisons). --/ /\h/SI -/\h/SI8 - ABC\x{09} - ABC\x{20} - ABC\x{a0} - ABC\x{1680} - ABC\x{180e} - ABC\x{2000} - ABC\x{202f} - ABC\x{205f} - ABC\x{3000} - /\v/SI -/\v/SI8 - ABC\x{0a} - ABC\x{0b} - ABC\x{0c} - ABC\x{0d} - ABC\x{85} - ABC\x{2028} - /\R/SI -/\R/SI8 - -/\h*A/SI8 - CDBABC - -/\v+A/SI8 - -/\s?xxx\s/8SI - /\sxxx\s/8T1 AB\x{85}xxx\x{a0}XYZ AB\x{a0}xxx\x{85}XYZ -/\sxxx\s/I8ST1 - AB\x{85}xxx\x{a0}XYZ - AB\x{a0}xxx\x{85}XYZ - /\S \S/8T1 \x{a2} \x{84} -/\S \S/I8ST1 - \x{a2} \x{84} - A Z - 'A#хц'8x<any>BZ 'A#хц @@ -838,14 +620,6 @@ correctly, but that messes up comparisons). --/ /\g{A}xxx#bх(?'A'123)
(?'A'456)/8x<any>BZ -/a+/8 - a\x{123}aa\>1 - a\x{123}aa\>2 - a\x{123}aa\>3 - a\x{123}aa\>4 - a\x{123}aa\>5 - a\x{123}aa\>6 - /^\cģ/8 /(\R*)(.)/s8 @@ -858,14 +632,6 @@ correctly, but that messes up comparisons). --/ \r\r\n\n\r \r\r\n\n\r\n -/\x{1234}+/iS8I - -/\x{1234}+?/iS8I - -/\x{1234}++/iS8I - -/\x{1234}{2}/iS8I - /[^\x{1234}]+/iS8I /[^\x{1234}]+?/iS8I @@ -887,5 +653,11 @@ correctly, but that messes up comparisons). --/ /f.*/8s \P\Pfor + +/\x{d7ff}\x{e000}/8 + +/\x{d800}/8 + +/\x{dfff}/8 /-- End of testinput5 --/ diff --git a/testdata/testoutput1 b/testdata/testoutput1 index bc237ab..13c79c5 100644 --- a/testdata/testoutput1 +++ b/testdata/testoutput1 @@ -1,5 +1,6 @@ /-- This set of tests is for features that are compatible with all versions of - Perl 5, in non-UTF-8 mode. --/ + Perl 5, in non-UTF-8 mode. It should run clean for both the 8-bit and + 16-bit PCRE libraries. --/ /the quick brown fox/ the quick brown fox diff --git a/testdata/testoutput10 b/testdata/testoutput10 index 62d6f3e..55f10a5 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -374,7 +374,7 @@ Memory allocation (code space): 18 17 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf First char = 'A' Need char = '.' @@ -387,8 +387,8 @@ Memory allocation (code space): 19 18 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 -First char = 237 +Options: utf +First char = \x{ed} Need char = 180 /\x{65e5}\x{672c}\x{8a9e}/D8M @@ -400,8 +400,8 @@ Memory allocation (code space): 19 18 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 -First char = 230 +Options: utf +First char = \x{e6} Need char = 158 /[\x{100}]/8BM diff --git a/testdata/testoutput13 b/testdata/testoutput13 index 3151699..20d8c0e 100644 --- a/testdata/testoutput13 +++ b/testdata/testoutput13 @@ -57,7 +57,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char @@ -69,7 +69,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char 1234 @@ -83,7 +83,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char 1234 @@ -105,7 +105,7 @@ No match End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf First char = 'A' (caseless) No need char @@ -117,7 +117,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf First char = 'A' Need char = 176 @@ -129,7 +129,7 @@ Need char = 176 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf First char = 'A' Need char = 176 @@ -141,7 +141,7 @@ Need char = 176 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf First char = 'A' (caseless) Need char = 'B' (caseless) @@ -153,7 +153,7 @@ Need char = 'B' (caseless) End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf No first char No need char \x{104} @@ -177,7 +177,7 @@ No match End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf No first char No need char Z @@ -215,7 +215,7 @@ No match End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf No first char No need char @@ -1049,7 +1049,7 @@ No match: POSIX code 17: match failed /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/8iSI Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf No first char No need char Subject length lower bound = 17 diff --git a/testdata/testoutput16 b/testdata/testoutput16 new file mode 100644 index 0000000..2cc97aa --- /dev/null +++ b/testdata/testoutput16 @@ -0,0 +1,819 @@ +/-- This set of tests is for UTF-16 support, and is relevant only to the 16-bit + library. There are some non-UTF 16-bit tests as well (it doesn't seem + worth setting up another test file just for this case). --/ + +/xxx/8?DZSS +**Failed: invalid UTF-8 string cannot be converted to UTF-16 + +/abc/8 + ] +**Failed: invalid UTF-8 string cannot be converted to UTF-16 + +/X(\C{3})/8 + X\x{11234}Y + 0: X\x{11234}Y + 1: \x{11234}Y + +/X(\C{4})/8 + X\x{11234}YZ + 0: X\x{11234}YZ + 1: \x{11234}YZ + +/X\C*/8 + XYZabcdce + 0: XYZabcdce + +/X\C*?/8 + XYZabcde + 0: X + +/X\C{3,5}/8 + Xabcdefg + 0: Xabcde + X\x{11234}Y + 0: X\x{11234}Y + X\x{11234}YZ + 0: X\x{11234}YZ + X\x{11234}\x{512} + 0: X\x{11234}\x{512} + X\x{11234}\x{512}YZ + 0: X\x{11234}\x{512}YZ + X\x{11234}\x{512}\x{11234}Z + 0: X\x{11234}\x{512}\x{11234} + +/X\C{3,5}?/8 + Xabcdefg + 0: Xabc + X\x{11234}Y + 0: X\x{11234}Y + X\x{11234}YZ + 0: X\x{11234}Y + X\x{11234}\x{512}YZ + 0: X\x{11234}\x{512} + *** Failers +No match + X\x{11234} +No match + +/a\Cb/ + aXb + 0: aXb + a\nb + 0: a\x0ab + +/a\Cb/8 + aXb + 0: aXb + a\nb + 0: a\x{0a}b + +/a\C\Cb/8 + a\x{12257}b + 0: a\x{12257}b + ** Failers +No match + a\x{100}b +No match + +/ab\Cde/8 + abXde + 0: abXde + +/-- Check maximum non-UTF character size --/ + +/\x{ffff}/ + +/\x{10000}/ +Failed: character value in \x{...} sequence is too large at offset 8 + +/\x{100}/8DZ +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{100} +No need char + +/\x{1000}/8DZ +------------------------------------------------------------------ + Bra + \x{1000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{1000} +No need char + +/\x{10000}/8DZ +------------------------------------------------------------------ + Bra + \x{10000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{d800} +Need char = 56320 + +/\x{100000}/8DZ +------------------------------------------------------------------ + Bra + \x{100000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{dbc0} +Need char = 56320 + +/\x{10ffff}/8DZ +------------------------------------------------------------------ + Bra + \x{10ffff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{dbff} +Need char = 57343 + +/[\x{ff}]/8DZ +------------------------------------------------------------------ + Bra + \xff + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{ff} +No need char + +/[\x{100}]/8DZ +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{100} +No need char + +/\x80/8DZ +------------------------------------------------------------------ + Bra + \x80 + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{80} +No need char + +/\xff/8DZ +------------------------------------------------------------------ + Bra + \xff + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{ff} +No need char + +/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 +------------------------------------------------------------------ + Bra + \x{d55c}\x{ad6d}\x{c5b4} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{d55c} +Need char = 50612 + \x{D55c}\x{ad6d}\x{C5B4} + 0: \x{d55c}\x{ad6d}\x{c5b4} + +/\x{65e5}\x{672c}\x{8a9e}/DZ8 +------------------------------------------------------------------ + Bra + \x{65e5}\x{672c}\x{8a9e} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{65e5} +Need char = 35486 + \x{65e5}\x{672c}\x{8a9e} + 0: \x{65e5}\x{672c}\x{8a9e} + +/\x{80}/DZ8 +------------------------------------------------------------------ + Bra + \x80 + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{80} +No need char + +/\x{084}/DZ8 +------------------------------------------------------------------ + Bra + \x84 + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{84} +No need char + +/\x{104}/DZ8 +------------------------------------------------------------------ + Bra + \x{104} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{104} +No need char + +/\x{861}/DZ8 +------------------------------------------------------------------ + Bra + \x{861} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{861} +No need char + +/\x{212ab}/DZ8 +------------------------------------------------------------------ + Bra + \x{212ab} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{d844} +Need char = 57003 + +/-- This one is here not because it's different to Perl, but because the way +the captured single-byte is displayed. (In Perl it becomes a character, and you +can't tell the difference.) --/ + +/X(\C)(.*)/8 + X\x{1234} + 0: X\x{1234} + 1: \x{1234} + 2: + X\nabc + 0: X\x{0a}abc + 1: \x{0a} + 2: abc + +/-- This one is here because Perl gives out a grumbly error message (quite +correctly, but that messes up comparisons). --/ + +/a\Cb/8 + *** Failers +No match + a\x{100}b + 0: a\x{100}b + +/[^ab\xC0-\xF0]/8SDZ +------------------------------------------------------------------ + Bra + [\x00-`c-\xbf\xf1-\xff] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y + Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f + \x80 \x81 \x82 \x83 \x84 \x85 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e + \x8f \x90 \x91 \x92 \x93 \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d + \x9e \x9f \xa0 \xa1 \xa2 \xa3 \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac + \xad \xae \xaf \xb0 \xb1 \xb2 \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb + \xbc \xbd \xbe \xbf \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb + \xfc \xfd \xfe \xff + \x{f1} + 0: \x{f1} + \x{bf} + 0: \x{bf} + \x{100} + 0: \x{100} + \x{1000} + 0: \x{1000} + *** Failers + 0: * + \x{c0} +No match + \x{f0} +No match + +/Ā{3,4}/8SDZ +------------------------------------------------------------------ + Bra + \x{100}{3} + \x{100}? + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{100} +Need char = 256 +Subject length lower bound = 3 +No set of starting bytes + \x{100}\x{100}\x{100}\x{100\x{100} + 0: \x{100}\x{100}\x{100} + +/(\x{100}+|x)/8SDZ +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}+ + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: x \xff + +/(\x{100}*a|x)/8SDZ +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}*+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: a x \xff + +/(\x{100}{0,2}a|x)/8SDZ +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}{0,2} + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: a x \xff + +/(\x{100}{1,2}a|x)/8SDZ +------------------------------------------------------------------ + Bra + CBra 1 + \x{100} + \x{100}{0,1} + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: x \xff + +/\x{100}/8DZ +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{100} +No need char + +/a\x{100}\x{101}*/8DZ +------------------------------------------------------------------ + Bra + a\x{100} + \x{101}* + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = 'a' +Need char = 256 + +/a\x{100}\x{101}+/8DZ +------------------------------------------------------------------ + Bra + a\x{100} + \x{101}+ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = 'a' +Need char = 257 + +/[^\x{c4}]/DZ +------------------------------------------------------------------ + Bra + [^\xc4] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +No first char +No need char + +/[\x{100}]/8DZ +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{100} +No need char + \x{100} + 0: \x{100} + Z\x{100} + 0: \x{100} + \x{100}Z + 0: \x{100} + *** Failers +No match + +/[\xff]/DZ8 +------------------------------------------------------------------ + Bra + \xff + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{ff} +No need char + >\x{ff}< + 0: \x{ff} + +/[^\xff]/8DZ +------------------------------------------------------------------ + Bra + [^\x{ff}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first char +No need char + +/\x{100}abc(xyz(?1))/8DZ +------------------------------------------------------------------ + Bra + \x{100}abc + CBra 1 + xyz + Recurse + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +First char = \x{100} +Need char = 'z' + +/\777/8I +Capturing subpattern count = 0 +Options: utf +First char = \x{1ff} +No need char + \x{1ff} + 0: \x{1ff} + \777 + 0: \x{1ff} + +/\x{100}+\x{200}/8DZ +------------------------------------------------------------------ + Bra + \x{100}++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{100} +Need char = 512 + +/\x{100}+X/8DZ +------------------------------------------------------------------ + Bra + \x{100}++ + X + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{100} +Need char = 'X' + +/^[\QĀ\E-\QŐ\E/BZ8 +Failed: missing terminating ] for character class at offset 13 + +/-- This tests the stricter UTF-8 check according to RFC 3629. --/ + +/X/8 + \x{0}\x{d7ff}\x{e000}\x{10ffff} +No match + \x{d800} +Error -10 (bad UTF-8 string) offset=0 reason=1 + \x{d800}\? +No match + \x{da00} +Error -10 (bad UTF-8 string) offset=0 reason=1 + \x{da00}\? +No match + \x{dfff} +Error -10 (bad UTF-8 string) offset=0 reason=3 + \x{dfff}\? +No match + \x{110000} +Error -10 (bad UTF-8 string) offset=0 reason=3 + \x{110000}\? +No match + \x{2000000} +Error -10 (bad UTF-8 string) offset=1 reason=3 + \x{2000000}\? +No match + \x{7fffffff} +Error -10 (bad UTF-8 string) offset=1 reason=3 + \x{7fffffff}\? +No match + +/(*UTF16)\x{11234}/ + abcd\x{11234}pqr + 0: \x{11234} + +/(*CRLF)(*UTF16)(*BSR_UNICODE)a\Rb/I +Capturing subpattern count = 0 +Options: bsr_unicode utf +Forced newline sequence: CRLF +First char = 'a' +Need char = 'b' + +/\h/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \x09 \x20 \xa0 \xff + ABC\x{09} + 0: \x{09} + ABC\x{20} + 0: + ABC\x{a0} + 0: \x{a0} + ABC\x{1680} + 0: \x{1680} + ABC\x{180e} + 0: \x{180e} + ABC\x{2000} + 0: \x{2000} + ABC\x{202f} + 0: \x{202f} + ABC\x{205f} + 0: \x{205f} + ABC\x{3000} + 0: \x{3000} + +/\v/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff + ABC\x{0a} + 0: \x{0a} + ABC\x{0b} + 0: \x{0b} + ABC\x{0c} + 0: \x{0c} + ABC\x{0d} + 0: \x{0d} + ABC\x{85} + 0: \x{85} + ABC\x{2028} + 0: \x{2028} + +/\h*A/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +Need char = 'A' +Subject length lower bound = 1 +Starting byte set: \x09 \x20 A \xa0 + CDBABC + 0: A + +/\v+A/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +Need char = 'A' +Subject length lower bound = 2 +Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff + +/\s?xxx\s/8SI +Capturing subpattern count = 0 +Options: utf +No first char +Need char = 'x' +Subject length lower bound = 4 +Starting byte set: \x09 \x0a \x0c \x0d \x20 x + +/\sxxx\s/I8ST1 +Capturing subpattern count = 0 +Options: utf +No first char +Need char = 'x' +Subject length lower bound = 5 +Starting byte set: \x09 \x0a \x0c \x0d \x20 \x85 \xa0 + AB\x{85}xxx\x{a0}XYZ + 0: \x{85}xxx\x{a0} + AB\x{a0}xxx\x{85}XYZ + 0: \x{a0}xxx\x{85} + +/\S \S/I8ST1 +Capturing subpattern count = 0 +Options: utf +No first char +Need char = ' ' +Subject length lower bound = 3 +Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e + \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d + \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ + A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e + f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \x80 \x81 \x82 \x83 + \x84 \x86 \x87 \x88 \x89 \x8a \x8b \x8c \x8d \x8e \x8f \x90 \x91 \x92 \x93 + \x94 \x95 \x96 \x97 \x98 \x99 \x9a \x9b \x9c \x9d \x9e \x9f \xa1 \xa2 \xa3 + \xa4 \xa5 \xa6 \xa7 \xa8 \xa9 \xaa \xab \xac \xad \xae \xaf \xb0 \xb1 \xb2 + \xb3 \xb4 \xb5 \xb6 \xb7 \xb8 \xb9 \xba \xbb \xbc \xbd \xbe \xbf \xc0 \xc1 + \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 + \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf + \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee + \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd + \xfe \xff + \x{a2} \x{84} + 0: \x{a2} \x{84} + A Z + 0: A Z + +/a+/8 + a\x{123}aa\>1 + 0: aa + a\x{123}aa\>2 + 0: aa + a\x{123}aa\>3 + 0: a + a\x{123}aa\>4 +No match + a\x{123}aa\>5 +Error -24 (bad offset value) + a\x{123}aa\>6 +Error -24 (bad offset value) + +/\x{1234}+/iS8I +Capturing subpattern count = 0 +Options: caseless utf +First char = \x{1234} +No need char +Subject length lower bound = 1 +No set of starting bytes + +/\x{1234}+?/iS8I +Capturing subpattern count = 0 +Options: caseless utf +First char = \x{1234} +No need char +Subject length lower bound = 1 +No set of starting bytes + +/\x{1234}++/iS8I +Capturing subpattern count = 0 +Options: caseless utf +First char = \x{1234} +No need char +Subject length lower bound = 1 +No set of starting bytes + +/\x{1234}{2}/iS8I +Capturing subpattern count = 0 +Options: caseless utf +First char = \x{1234} +Need char = 4660 +Subject length lower bound = 2 +No set of starting bytes + +/[^\x{c4}]/8DZ +------------------------------------------------------------------ + Bra + [^\x{c4}] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first char +No need char + +/X+\x{200}/8DZ +------------------------------------------------------------------ + Bra + X++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = 'X' +Need char = 512 + +/\R/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \x0a \x0b \x0c \x0d \x85 \xff + +/-- End of testinput16 --/ diff --git a/testdata/testoutput17 b/testdata/testoutput17 new file mode 100644 index 0000000..23fc980 --- /dev/null +++ b/testdata/testoutput17 @@ -0,0 +1,907 @@ +/-- This set of tests is for UTF-8 support, and is relevant only to the 8-bit + library. --/ + +/X(\C{3})/8 + X\x{1234} + 0: X\x{1234} + 1: \x{1234} + +/X(\C{4})/8 + X\x{1234}YZ + 0: X\x{1234}Y + 1: \x{1234}Y + +/X\C*/8 + XYZabcdce + 0: XYZabcdce + +/X\C*?/8 + XYZabcde + 0: X + +/X\C{3,5}/8 + Xabcdefg + 0: Xabcde + X\x{1234} + 0: X\x{1234} + X\x{1234}YZ + 0: X\x{1234}YZ + X\x{1234}\x{512} + 0: X\x{1234}\x{512} + X\x{1234}\x{512}YZ + 0: X\x{1234}\x{512} + +/X\C{3,5}?/8 + Xabcdefg + 0: Xabc + X\x{1234} + 0: X\x{1234} + X\x{1234}YZ + 0: X\x{1234} + X\x{1234}\x{512} + 0: X\x{1234} + +/a\Cb/ + aXb + 0: aXb + a\nb + 0: a\x0ab + +/a\Cb/8 + aXb + 0: aXb + a\nb + 0: a\x{0a}b + +/a\C\Cb/8 + a\x{100}b + 0: a\x{100}b + +/ab\Cde/8 + abXde + 0: abXde + +/a\C\Cb/8 + a\x{100}b + 0: a\x{100}b + ** Failers +No match + a\x{12257}b +No match + +/[]/8 +Failed: invalid UTF-8 string at offset 1 + +//8 +Failed: invalid UTF-8 string at offset 0 + +/xxx/8 +Failed: invalid UTF-8 string at offset 0 + +/xxx/8?DZSS +------------------------------------------------------------------ + Bra + \X{c0}\X{c0}\X{c0}xxx + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf no_utf_check +First char = \x{c3} +Need char = 'x' + +/abc/8 + ] +Error -10 (bad UTF-8 string) offset=0 reason=6 + +Error -10 (bad UTF-8 string) offset=0 reason=1 + +Error -10 (bad UTF-8 string) offset=0 reason=6 + \? +No match + \xe1\x88 +Error -10 (bad UTF-8 string) offset=0 reason=1 + \P\xe1\x88 +Error -10 (bad UTF-8 string) offset=0 reason=1 + \P\P\xe1\x88 +Error -25 (short UTF-8 string) offset=0 reason=1 + XX\xea +Error -10 (bad UTF-8 string) offset=2 reason=2 + \O0XX\xea +Error -10 (bad UTF-8 string) + \O1XX\xea +Error -10 (bad UTF-8 string) + \O2XX\xea +Error -10 (bad UTF-8 string) offset=2 reason=2 + XX\xf1 +Error -10 (bad UTF-8 string) offset=2 reason=3 + XX\xf8 +Error -10 (bad UTF-8 string) offset=2 reason=4 + XX\xfc +Error -10 (bad UTF-8 string) offset=2 reason=5 + ZZ\xea\xaf\x20YY +Error -10 (bad UTF-8 string) offset=2 reason=7 + ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY +Error -10 (bad UTF-8 string) offset=2 reason=8 + ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY +Error -10 (bad UTF-8 string) offset=2 reason=9 + ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY +Error -10 (bad UTF-8 string) offset=2 reason=10 + ZZ\xffYY +Error -10 (bad UTF-8 string) offset=2 reason=21 + ZZ\xfeYY +Error -10 (bad UTF-8 string) offset=2 reason=21 + +/anything/8 + \xc0\x80 +Error -10 (bad UTF-8 string) offset=0 reason=15 + \xc1\x8f +Error -10 (bad UTF-8 string) offset=0 reason=15 + \xe0\x9f\x80 +Error -10 (bad UTF-8 string) offset=0 reason=16 + \xf0\x8f\x80\x80 +Error -10 (bad UTF-8 string) offset=0 reason=17 + \xf8\x87\x80\x80\x80 +Error -10 (bad UTF-8 string) offset=0 reason=18 + \xfc\x83\x80\x80\x80\x80 +Error -10 (bad UTF-8 string) offset=0 reason=19 + \xfe\x80\x80\x80\x80\x80 +Error -10 (bad UTF-8 string) offset=0 reason=21 + \xff\x80\x80\x80\x80\x80 +Error -10 (bad UTF-8 string) offset=0 reason=21 + \xc3\x8f +No match + \xe0\xaf\x80 +No match + \xe1\x80\x80 +No match + \xf0\x9f\x80\x80 +No match + \xf1\x8f\x80\x80 +No match + \xf8\x88\x80\x80\x80 +Error -10 (bad UTF-8 string) offset=0 reason=11 + \xf9\x87\x80\x80\x80 +Error -10 (bad UTF-8 string) offset=0 reason=11 + \xfc\x84\x80\x80\x80\x80 +Error -10 (bad UTF-8 string) offset=0 reason=12 + \xfd\x83\x80\x80\x80\x80 +Error -10 (bad UTF-8 string) offset=0 reason=12 + \?\xf8\x88\x80\x80\x80 +No match + \?\xf9\x87\x80\x80\x80 +No match + \?\xfc\x84\x80\x80\x80\x80 +No match + \?\xfd\x83\x80\x80\x80\x80 +No match + +/\x{100}/8DZ +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c4} +Need char = 128 + +/\x{1000}/8DZ +------------------------------------------------------------------ + Bra + \x{1000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{e1} +Need char = 128 + +/\x{10000}/8DZ +------------------------------------------------------------------ + Bra + \x{10000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{f0} +Need char = 128 + +/\x{100000}/8DZ +------------------------------------------------------------------ + Bra + \x{100000} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{f4} +Need char = 128 + +/\x{10ffff}/8DZ +------------------------------------------------------------------ + Bra + \x{10ffff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{f4} +Need char = 191 + +/[\x{ff}]/8DZ +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c3} +Need char = 191 + +/[\x{100}]/8DZ +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c4} +Need char = 128 + +/\x80/8DZ +------------------------------------------------------------------ + Bra + \x{80} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c2} +Need char = 128 + +/\xff/8DZ +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c3} +Need char = 191 + +/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 +------------------------------------------------------------------ + Bra + \x{d55c}\x{ad6d}\x{c5b4} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{ed} +Need char = 180 + \x{D55c}\x{ad6d}\x{C5B4} + 0: \x{d55c}\x{ad6d}\x{c5b4} + +/\x{65e5}\x{672c}\x{8a9e}/DZ8 +------------------------------------------------------------------ + Bra + \x{65e5}\x{672c}\x{8a9e} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{e6} +Need char = 158 + \x{65e5}\x{672c}\x{8a9e} + 0: \x{65e5}\x{672c}\x{8a9e} + +/\x{80}/DZ8 +------------------------------------------------------------------ + Bra + \x{80} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c2} +Need char = 128 + +/\x{084}/DZ8 +------------------------------------------------------------------ + Bra + \x{84} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c2} +Need char = 132 + +/\x{104}/DZ8 +------------------------------------------------------------------ + Bra + \x{104} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c4} +Need char = 132 + +/\x{861}/DZ8 +------------------------------------------------------------------ + Bra + \x{861} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{e0} +Need char = 161 + +/\x{212ab}/DZ8 +------------------------------------------------------------------ + Bra + \x{212ab} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{f0} +Need char = 171 + +/-- This one is here not because it's different to Perl, but because the way +the captured single-byte is displayed. (In Perl it becomes a character, and you +can't tell the difference.) --/ + +/X(\C)(.*)/8 + X\x{1234} + 0: X\x{1234} + 1: \x{e1} + 2: \x{88}\x{b4} + X\nabc + 0: X\x{0a}abc + 1: \x{0a} + 2: abc + +/-- This one is here because Perl gives out a grumbly error message (quite +correctly, but that messes up comparisons). --/ + +/a\Cb/8 + *** Failers +No match + a\x{100}b +No match + +/[^ab\xC0-\xF0]/8SDZ +------------------------------------------------------------------ + Bra + [\x00-`c-\xbf\xf1-\xff] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a + \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 + \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 + 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y + Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f + \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 + \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf + \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee + \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd + \xfe \xff + \x{f1} + 0: \x{f1} + \x{bf} + 0: \x{bf} + \x{100} + 0: \x{100} + \x{1000} + 0: \x{1000} + *** Failers + 0: * + \x{c0} +No match + \x{f0} +No match + +/Ā{3,4}/8SDZ +------------------------------------------------------------------ + Bra + \x{100}{3} + \x{100}? + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c4} +Need char = 128 +Subject length lower bound = 3 +No set of starting bytes + \x{100}\x{100}\x{100}\x{100\x{100} + 0: \x{100}\x{100}\x{100} + +/(\x{100}+|x)/8SDZ +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}+ + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: x \xc4 + +/(\x{100}*a|x)/8SDZ +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}*+ + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: a x \xc4 + +/(\x{100}{0,2}a|x)/8SDZ +------------------------------------------------------------------ + Bra + CBra 1 + \x{100}{0,2} + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: a x \xc4 + +/(\x{100}{1,2}a|x)/8SDZ +------------------------------------------------------------------ + Bra + CBra 1 + \x{100} + \x{100}{0,1} + a + Alt + x + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: x \xc4 + +/\x{100}/8DZ +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c4} +Need char = 128 + +/a\x{100}\x{101}*/8DZ +------------------------------------------------------------------ + Bra + a\x{100} + \x{101}* + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = 'a' +Need char = 128 + +/a\x{100}\x{101}+/8DZ +------------------------------------------------------------------ + Bra + a\x{100} + \x{101}+ + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = 'a' +Need char = 129 + +/[^\x{c4}]/DZ +------------------------------------------------------------------ + Bra + [^\xc4] + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +No options +No first char +No need char + +/[\x{100}]/8DZ +------------------------------------------------------------------ + Bra + \x{100} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c4} +Need char = 128 + \x{100} + 0: \x{100} + Z\x{100} + 0: \x{100} + \x{100}Z + 0: \x{100} + *** Failers +No match + +/[\xff]/DZ8 +------------------------------------------------------------------ + Bra + \x{ff} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c3} +Need char = 191 + >\x{ff}< + 0: \x{ff} + +/[^\xff]/8DZ +------------------------------------------------------------------ + Bra + [\x00-\xfe] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first char +No need char + +/\x{100}abc(xyz(?1))/8DZ +------------------------------------------------------------------ + Bra + \x{100}abc + CBra 1 + xyz + Recurse + Ket + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 1 +Options: utf +First char = \x{c4} +Need char = 'z' + +/a\x{1234}b/P8 + a\x{1234}b + 0: a\x{1234}b + +/\777/8I +Capturing subpattern count = 0 +Options: utf +First char = \x{c7} +Need char = 191 + \x{1ff} + 0: \x{1ff} + \777 + 0: \x{1ff} + +/\x{100}+\x{200}/8DZ +------------------------------------------------------------------ + Bra + \x{100}++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c4} +Need char = 128 + +/\x{100}+X/8DZ +------------------------------------------------------------------ + Bra + \x{100}++ + X + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = \x{c4} +Need char = 'X' + +/^[\QĀ\E-\QŐ\E/BZ8 +Failed: missing terminating ] for character class at offset 15 + +/-- This tests the stricter UTF-8 check according to RFC 3629. --/ + +/X/8 + \x{0}\x{d7ff}\x{e000}\x{10ffff} +No match + \x{d800} +Error -10 (bad UTF-8 string) offset=0 reason=14 + \x{d800}\? +No match + \x{da00} +Error -10 (bad UTF-8 string) offset=0 reason=14 + \x{da00}\? +No match + \x{dfff} +Error -10 (bad UTF-8 string) offset=0 reason=14 + \x{dfff}\? +No match + \x{110000} +Error -10 (bad UTF-8 string) offset=0 reason=13 + \x{110000}\? +No match + \x{2000000} +Error -10 (bad UTF-8 string) offset=0 reason=11 + \x{2000000}\? +No match + \x{7fffffff} +Error -10 (bad UTF-8 string) offset=0 reason=12 + \x{7fffffff}\? +No match + +/(*UTF8)\x{1234}/ + abcd\x{1234}pqr + 0: \x{1234} + +/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I +Capturing subpattern count = 0 +Options: bsr_unicode utf +Forced newline sequence: CRLF +First char = 'a' +Need char = 'b' + +/\h/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3 + ABC\x{09} + 0: \x{09} + ABC\x{20} + 0: + ABC\x{a0} + 0: \x{a0} + ABC\x{1680} + 0: \x{1680} + ABC\x{180e} + 0: \x{180e} + ABC\x{2000} + 0: \x{2000} + ABC\x{202f} + 0: \x{202f} + ABC\x{205f} + 0: \x{205f} + ABC\x{3000} + 0: \x{3000} + +/\v/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 + ABC\x{0a} + 0: \x{0a} + ABC\x{0b} + 0: \x{0b} + ABC\x{0c} + 0: \x{0c} + ABC\x{0d} + 0: \x{0d} + ABC\x{85} + 0: \x{85} + ABC\x{2028} + 0: \x{2028} + +/\h*A/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +Need char = 'A' +Subject length lower bound = 1 +Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3 + CDBABC + 0: A + +/\v+A/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +Need char = 'A' +Subject length lower bound = 2 +Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 + +/\s?xxx\s/8SI +Capturing subpattern count = 0 +Options: utf +No first char +Need char = 'x' +Subject length lower bound = 4 +Starting byte set: \x09 \x0a \x0c \x0d \x20 x + +/\sxxx\s/I8ST1 +Capturing subpattern count = 0 +Options: utf +No first char +Need char = 'x' +Subject length lower bound = 5 +Starting byte set: \x09 \x0a \x0c \x0d \x20 \xc2 + AB\x{85}xxx\x{a0}XYZ + 0: \x{85}xxx\x{a0} + AB\x{a0}xxx\x{85}XYZ + 0: \x{a0}xxx\x{85} + +/\S \S/I8ST1 +Capturing subpattern count = 0 +Options: utf +No first char +Need char = ' ' +Subject length lower bound = 3 +Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e + \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d + \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ + A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e + f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 + \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 + \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 + \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 + \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff + \x{a2} \x{84} + 0: \x{a2} \x{84} + A Z + 0: A Z + +/a+/8 + a\x{123}aa\>1 + 0: aa + a\x{123}aa\>2 +Error -11 (bad UTF-8 offset) + a\x{123}aa\>3 + 0: aa + a\x{123}aa\>4 + 0: a + a\x{123}aa\>5 +No match + a\x{123}aa\>6 +Error -24 (bad offset value) + +/\x{1234}+/iS8I +Capturing subpattern count = 0 +Options: caseless utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \xe1 + +/\x{1234}+?/iS8I +Capturing subpattern count = 0 +Options: caseless utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \xe1 + +/\x{1234}++/iS8I +Capturing subpattern count = 0 +Options: caseless utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \xe1 + +/\x{1234}{2}/iS8I +Capturing subpattern count = 0 +Options: caseless utf +No first char +No need char +Subject length lower bound = 2 +Starting byte set: \xe1 + +/[^\x{c4}]/8DZ +------------------------------------------------------------------ + Bra + [\x00-\xc3\xc5-\xff] (neg) + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +No first char +No need char + +/X+\x{200}/8DZ +------------------------------------------------------------------ + Bra + X++ + \x{200} + Ket + End +------------------------------------------------------------------ +Capturing subpattern count = 0 +Options: utf +First char = 'X' +Need char = 128 + +/\R/SI8 +Capturing subpattern count = 0 +Options: utf +No first char +No need char +Subject length lower bound = 1 +Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 + +/-- End of testinput17 --/ diff --git a/testdata/testoutput2 b/testdata/testoutput2 index 22a0725..5796f10 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -5,8 +5,8 @@ either because PCRE can't be compatible, or there is a possible Perl bug. - NOTE: This is a non-UTF-8 set of tests. When UTF-8 is needed, use test - 5, and if Unicode Property Support is needed, use test 13. --/ + NOTE: This is a non-UTF set of tests. When UTF support is needed, use + test 5, and if Unicode Property Support is needed, use test 13. --/ /-- Originally, the Perl >= 5.10 things were in here too, but now I have separated many (most?) of them out into test 11. However, there may still @@ -6178,7 +6178,7 @@ Failed: character value in \x{...} sequence is too large at offset 6 /\x{0000ff}/I Capturing subpattern count = 0 No options -First char = 255 +First char = \xff No need char /^((?P<A>a1)|(?P<A>a2)b)/I diff --git a/testdata/testoutput4 b/testdata/testoutput4 index 2f1b4fd..067d6f5 100644 --- a/testdata/testoutput4 +++ b/testdata/testoutput4 @@ -1,5 +1,6 @@ -/-- This set of tests is for UTF-8 support, excluding Unicode properties. It is - compatible with all versions of Perl 5. --/ +/-- This set of tests is for UTF support, excluding Unicode properties. It is + compatible with all versions of Perl 5 and both the 8-bit and 16-bit PCRE + libraries. --/ /a.b/8 acb @@ -255,46 +256,6 @@ No match XYZ No match -/X(\C{3})/8 - X\x{1234} - 0: X\x{1234} - 1: \x{1234} - -/X(\C{4})/8 - X\x{1234}YZ - 0: X\x{1234}Y - 1: \x{1234}Y - -/X\C*/8 - XYZabcdce - 0: XYZabcdce - -/X\C*?/8 - XYZabcde - 0: X - -/X\C{3,5}/8 - Xabcdefg - 0: Xabcde - X\x{1234} - 0: X\x{1234} - X\x{1234}YZ - 0: X\x{1234}YZ - X\x{1234}\x{512} - 0: X\x{1234}\x{512} - X\x{1234}\x{512}YZ - 0: X\x{1234}\x{512} - -/X\C{3,5}?/8 - Xabcdefg - 0: Xabc - X\x{1234} - 0: X\x{1234} - X\x{1234}YZ - 0: X\x{1234} - X\x{1234}\x{512} - 0: X\x{1234} - /[^a]+/8g bcd 0: bcd @@ -791,22 +752,6 @@ No match \x{200}X No match -/a\Cb/ - aXb - 0: aXb - a\nb - 0: a\x0ab - -/a\Cb/8 - aXb - 0: aXb - a\nb - 0: a\x{0a}b - -/a\C\Cb/8 - a\x{100}b - 0: a\x{100}b - /[z-\x{100}]/8i z 0: z @@ -1136,8 +1081,4 @@ No match abc No match -/ab\Cde/8 - abXde - 0: abXde - /-- End of testinput4 --/ diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 603e55f..9b86dad 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -1,93 +1,10 @@ -/-- This set of tests checks the API, internals, and non-Perl stuff for UTF-8 - support, excluding Unicode properties. --/ - -/\x{100}/8DZ ------------------------------------------------------------------- - Bra - \x{100} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 196 -Need char = 128 - -/\x{1000}/8DZ ------------------------------------------------------------------- - Bra - \x{1000} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 225 -Need char = 128 - -/\x{10000}/8DZ ------------------------------------------------------------------- - Bra - \x{10000} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 240 -Need char = 128 - -/\x{100000}/8DZ ------------------------------------------------------------------- - Bra - \x{100000} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 244 -Need char = 128 - -/\x{10ffff}/8DZ ------------------------------------------------------------------- - Bra - \x{10ffff} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 244 -Need char = 191 +/-- This set of tests checks the API, internals, and non-Perl stuff for UTF + support, excluding Unicode properties. However, tests that give different + results in 8-bit and 16-bit modes are excluded (see tests 16 and 17). --/ /\x{110000}/8DZ Failed: character value in \x{...} sequence is too large at offset 9 -/[\x{ff}]/8DZ ------------------------------------------------------------------- - Bra - \x{ff} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 195 -Need char = 191 - -/[\x{100}]/8DZ ------------------------------------------------------------------- - Bra - \x{100} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 196 -Need char = 128 - /\x{ffffffff}/8 Failed: character value in \x{...} sequence is too large at offset 11 @@ -108,30 +25,6 @@ Failed: disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) at offset 7 \x{100}a\x{1234}bcd 0: \x{100}a\x{1234} -/\x80/8DZ ------------------------------------------------------------------- - Bra - \x{80} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 194 -Need char = 128 - -/\xff/8DZ ------------------------------------------------------------------- - Bra - \x{ff} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 195 -Need char = 191 - /\x{0041}\x{2262}\x{0391}\x{002e}/DZ8 ------------------------------------------------------------------ Bra @@ -140,100 +33,12 @@ Need char = 191 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf First char = 'A' Need char = '.' \x{0041}\x{2262}\x{0391}\x{002e} 0: A\x{2262}\x{391}. -/\x{D55c}\x{ad6d}\x{C5B4}/DZ8 ------------------------------------------------------------------- - Bra - \x{d55c}\x{ad6d}\x{c5b4} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 237 -Need char = 180 - \x{D55c}\x{ad6d}\x{C5B4} - 0: \x{d55c}\x{ad6d}\x{c5b4} - -/\x{65e5}\x{672c}\x{8a9e}/DZ8 ------------------------------------------------------------------- - Bra - \x{65e5}\x{672c}\x{8a9e} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 230 -Need char = 158 - \x{65e5}\x{672c}\x{8a9e} - 0: \x{65e5}\x{672c}\x{8a9e} - -/\x{80}/DZ8 ------------------------------------------------------------------- - Bra - \x{80} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 194 -Need char = 128 - -/\x{084}/DZ8 ------------------------------------------------------------------- - Bra - \x{84} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 194 -Need char = 132 - -/\x{104}/DZ8 ------------------------------------------------------------------- - Bra - \x{104} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 196 -Need char = 132 - -/\x{861}/DZ8 ------------------------------------------------------------------- - Bra - \x{861} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 224 -Need char = 161 - -/\x{212ab}/DZ8 ------------------------------------------------------------------- - Bra - \x{212ab} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 240 -Need char = 171 - /.{3,5}X/DZ8 ------------------------------------------------------------------ Bra @@ -244,13 +49,12 @@ Need char = 171 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char Need char = 'X' \x{212ab}\x{212ab}\x{212ab}\x{861}X 0: \x{212ab}\x{212ab}\x{212ab}\x{861}X - /.{3,5}?/DZ8 ------------------------------------------------------------------ Bra @@ -260,7 +64,7 @@ Need char = 'X' End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char \x{212ab}\x{212ab}\x{212ab}\x{861} @@ -269,29 +73,6 @@ No need char /(?<=\C)X/8 Failed: \C not allowed in lookbehind assertion at offset 6 -/-- This one is here not because it's different to Perl, but because the way -the captured single-byte is displayed. (In Perl it becomes a character, and you -can't tell the difference.) --/ - -/X(\C)(.*)/8 - X\x{1234} - 0: X\x{1234} - 1: \xe1 - 2: \x88\xb4 - X\nabc - 0: X\x{0a}abc - 1: \x{0a} - 2: abc - -/-- This one is here because Perl gives out a grumbly error message (quite -correctly, but that messes up comparisons). --/ - -/a\Cb/8 - *** Failers -No match - a\x{100}b -No match - /^[ab]/8DZ ------------------------------------------------------------------ Bra @@ -301,7 +82,7 @@ No match End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: anchored utf8 +Options: anchored utf No first char No need char bar @@ -324,7 +105,7 @@ No match End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: anchored utf8 +Options: anchored utf No first char No need char c @@ -338,136 +119,6 @@ No need char aaa No match -/[^ab\xC0-\xF0]/8SDZ ------------------------------------------------------------------- - Bra - [\x00-`c-\xbf\xf1-\xff] (neg) - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x09 \x0a - \x0b \x0c \x0d \x0e \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 - \x1a \x1b \x1c \x1d \x1e \x1f \x20 ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 - 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y - Z [ \ ] ^ _ ` c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f - \xc2 \xc3 \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 - \xd1 \xd2 \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf - \xe0 \xe1 \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee - \xef \xf0 \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd - \xfe \xff - \x{f1} - 0: \x{f1} - \x{bf} - 0: \x{bf} - \x{100} - 0: \x{100} - \x{1000} - 0: \x{1000} - *** Failers - 0: * - \x{c0} -No match - \x{f0} -No match - -/Ā{3,4}/8SDZ ------------------------------------------------------------------- - Bra - \x{100}{3} - \x{100}? - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 196 -Need char = 128 -Subject length lower bound = 3 -No set of starting bytes - \x{100}\x{100}\x{100}\x{100\x{100} - 0: \x{100}\x{100}\x{100} - -/(\x{100}+|x)/8SDZ ------------------------------------------------------------------- - Bra - CBra 1 - \x{100}+ - Alt - x - Ket - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 1 -Options: utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: x \xc4 - -/(\x{100}*a|x)/8SDZ ------------------------------------------------------------------- - Bra - CBra 1 - \x{100}*+ - a - Alt - x - Ket - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 1 -Options: utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: a x \xc4 - -/(\x{100}{0,2}a|x)/8SDZ ------------------------------------------------------------------- - Bra - CBra 1 - \x{100}{0,2} - a - Alt - x - Ket - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 1 -Options: utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: a x \xc4 - -/(\x{100}{1,2}a|x)/8SDZ ------------------------------------------------------------------- - Bra - CBra 1 - \x{100} - \x{100}{0,1} - a - Alt - x - Ket - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 1 -Options: utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: x \xc4 - /\x{100}*(\d+|"(?1)")/8 1234 0: 1234 @@ -492,18 +143,6 @@ No match \x{100}\x{100}abcd No match -/\x{100}/8DZ ------------------------------------------------------------------- - Bra - \x{100} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 196 -Need char = 128 - /\x{100}*/8DZ ------------------------------------------------------------------ Bra @@ -512,7 +151,7 @@ Need char = 128 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char @@ -525,7 +164,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf First char = 'a' No need char @@ -538,36 +177,10 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf First char = 'a' Need char = 'b' -/a\x{100}\x{101}*/8DZ ------------------------------------------------------------------- - Bra - a\x{100} - \x{101}* - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 'a' -Need char = 128 - -/a\x{100}\x{101}+/8DZ ------------------------------------------------------------------- - Bra - a\x{100} - \x{101}+ - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 'a' -Need char = 129 - /\x{100}*A/8DZ ------------------------------------------------------------------ Bra @@ -577,7 +190,7 @@ Need char = 129 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char Need char = 'A' A @@ -593,54 +206,10 @@ Need char = 'A' End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char -/[^\x{c4}]/DZ ------------------------------------------------------------------- - Bra - [^\xc4] - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -No options -No first char -No need char - -/[^\x{c4}]/8DZ ------------------------------------------------------------------- - Bra - [\x00-\xc3\xc5-\xff] (neg) - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -No first char -No need char - -/[\x{100}]/8DZ ------------------------------------------------------------------- - Bra - \x{100} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 196 -Need char = 128 - \x{100} - 0: \x{100} - Z\x{100} - 0: \x{100} - \x{100}Z - 0: \x{100} - *** Failers -No match - /[Z\x{100}]/8DZ ------------------------------------------------------------------ Bra @@ -649,7 +218,7 @@ No match End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char Z\x{100} @@ -684,7 +253,7 @@ No match End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char @@ -696,7 +265,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char \x{100} @@ -713,25 +282,11 @@ No need char ------------------------------------------------------------------ Capturing subpattern count = 0 No options -First char = 255 +First char = \xff No need char >\xff< 0: \xff -/[\xff]/DZ8 ------------------------------------------------------------------- - Bra - \x{ff} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 195 -Need char = 191 - >\x{ff}< - 0: \x{ff} - /[^\xFF]/DZ ------------------------------------------------------------------ Bra @@ -744,18 +299,6 @@ No options No first char No need char -/[^\xff]/8DZ ------------------------------------------------------------------- - Bra - [\x00-\xfe] (neg) - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -No first char -No need char - /[Ä-Ü]/8 Ö # Matches without Study 0: \x{d6} @@ -780,129 +323,6 @@ No need char \x{d6} 0: \x{d6} -/[]/8 -Failed: invalid UTF-8 string at offset 1 - -//8 -Failed: invalid UTF-8 string at offset 0 - -/xxx/8 -Failed: invalid UTF-8 string at offset 0 - -/xxx/8?DZSS ------------------------------------------------------------------- - Bra - \X{c0}\X{c0}\X{c0}xxx - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 no_utf8_check -First char = 195 -Need char = 'x' - -/abc/8 - ] -Error -10 (bad UTF-8 string) offset=0 reason=6 - -Error -10 (bad UTF-8 string) offset=0 reason=1 - -Error -10 (bad UTF-8 string) offset=0 reason=6 - \? -No match - \xe1\x88 -Error -10 (bad UTF-8 string) offset=0 reason=1 - \P\xe1\x88 -Error -10 (bad UTF-8 string) offset=0 reason=1 - \P\P\xe1\x88 -Error -25 (short UTF-8 string) offset=0 reason=1 - XX\xea -Error -10 (bad UTF-8 string) offset=2 reason=2 - \O0XX\xea -Error -10 (bad UTF-8 string) - \O1XX\xea -Error -10 (bad UTF-8 string) - \O2XX\xea -Error -10 (bad UTF-8 string) offset=2 reason=2 - XX\xf1 -Error -10 (bad UTF-8 string) offset=2 reason=3 - XX\xf8 -Error -10 (bad UTF-8 string) offset=2 reason=4 - XX\xfc -Error -10 (bad UTF-8 string) offset=2 reason=5 - ZZ\xea\xaf\x20YY -Error -10 (bad UTF-8 string) offset=2 reason=7 - ZZ\xfd\xbf\xbf\x2f\xbf\xbfYY -Error -10 (bad UTF-8 string) offset=2 reason=8 - ZZ\xfd\xbf\xbf\xbf\x2f\xbfYY -Error -10 (bad UTF-8 string) offset=2 reason=9 - ZZ\xfd\xbf\xbf\xbf\xbf\x2fYY -Error -10 (bad UTF-8 string) offset=2 reason=10 - ZZ\xffYY -Error -10 (bad UTF-8 string) offset=2 reason=21 - ZZ\xfeYY -Error -10 (bad UTF-8 string) offset=2 reason=21 - -/anything/8 - \xc0\x80 -Error -10 (bad UTF-8 string) offset=0 reason=15 - \xc1\x8f -Error -10 (bad UTF-8 string) offset=0 reason=15 - \xe0\x9f\x80 -Error -10 (bad UTF-8 string) offset=0 reason=16 - \xf0\x8f\x80\x80 -Error -10 (bad UTF-8 string) offset=0 reason=17 - \xf8\x87\x80\x80\x80 -Error -10 (bad UTF-8 string) offset=0 reason=18 - \xfc\x83\x80\x80\x80\x80 -Error -10 (bad UTF-8 string) offset=0 reason=19 - \xfe\x80\x80\x80\x80\x80 -Error -10 (bad UTF-8 string) offset=0 reason=21 - \xff\x80\x80\x80\x80\x80 -Error -10 (bad UTF-8 string) offset=0 reason=21 - \xc3\x8f -No match - \xe0\xaf\x80 -No match - \xe1\x80\x80 -No match - \xf0\x9f\x80\x80 -No match - \xf1\x8f\x80\x80 -No match - \xf8\x88\x80\x80\x80 -Error -10 (bad UTF-8 string) offset=0 reason=11 - \xf9\x87\x80\x80\x80 -Error -10 (bad UTF-8 string) offset=0 reason=11 - \xfc\x84\x80\x80\x80\x80 -Error -10 (bad UTF-8 string) offset=0 reason=12 - \xfd\x83\x80\x80\x80\x80 -Error -10 (bad UTF-8 string) offset=0 reason=12 - \?\xf8\x88\x80\x80\x80 -No match - \?\xf9\x87\x80\x80\x80 -No match - \?\xfc\x84\x80\x80\x80\x80 -No match - \?\xfd\x83\x80\x80\x80\x80 -No match - -/\x{100}abc(xyz(?1))/8DZ ------------------------------------------------------------------- - Bra - \x{100}abc - CBra 1 - xyz - Recurse - Ket - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 1 -Options: utf8 -First char = 196 -Need char = 'z' - /[^\x{100}]abc(xyz(?1))/8DZ ------------------------------------------------------------------ Bra @@ -916,7 +336,7 @@ Need char = 'z' End ------------------------------------------------------------------ Capturing subpattern count = 1 -Options: utf8 +Options: utf No first char Need char = 'z' @@ -933,7 +353,7 @@ Need char = 'z' End ------------------------------------------------------------------ Capturing subpattern count = 1 -Options: utf8 +Options: utf No first char Need char = 'z' @@ -953,7 +373,7 @@ Need char = 'z' End ------------------------------------------------------------------ Capturing subpattern count = 2 -Options: utf8 +Options: utf No first char No need char @@ -984,7 +404,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 2 -Options: utf8 +Options: utf No first char No need char @@ -1004,7 +424,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 2 -Options: utf8 +Options: utf No first char No need char @@ -1035,7 +455,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 2 -Options: utf8 +Options: utf No first char No need char @@ -1049,10 +469,6 @@ No need char \x{100}X 0: X -/a\x{1234}b/P8 - a\x{1234}b - 0: a\x{1234}b - /^\ሴ/8DZ ------------------------------------------------------------------ Bra @@ -1062,23 +478,13 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: anchored utf8 +Options: anchored utf No first char No need char /\777/I Failed: octal value is greater than \377 (not in UTF-8 mode) at offset 3 -/\777/8I -Capturing subpattern count = 0 -Options: utf8 -First char = 199 -Need char = 191 - \x{1ff} - 0: \x{1ff} - \777 - 0: \x{1ff} - /\x{100}*\d/8DZ ------------------------------------------------------------------ Bra @@ -1088,7 +494,7 @@ Need char = 191 End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char @@ -1101,7 +507,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char @@ -1114,7 +520,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char @@ -1127,7 +533,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char @@ -1140,7 +546,7 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char @@ -1153,49 +559,10 @@ No need char End ------------------------------------------------------------------ Capturing subpattern count = 0 -Options: utf8 +Options: utf No first char No need char -/\x{100}+\x{200}/8DZ ------------------------------------------------------------------- - Bra - \x{100}++ - \x{200} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 196 -Need char = 128 - -/\x{100}+X/8DZ ------------------------------------------------------------------- - Bra - \x{100}++ - X - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 196 -Need char = 'X' - -/X+\x{200}/8DZ ------------------------------------------------------------------- - Bra - X++ - \x{200} - Ket - End ------------------------------------------------------------------- -Capturing subpattern count = 0 -Options: utf8 -First char = 'X' -Need char = 128 - /()()()()()()()()()() ()()()()()()()()()() ()()()()()()()()()() @@ -1237,9 +604,6 @@ Matched, but too many substrings End ------------------------------------------------------------------ -/^[\QĀ\E-\QŐ\E/BZ8 -Failed: missing terminating ] for character class at offset 15 - /^abc./mgx8<any> abc1 \x0aabc2 \x0babc3xx \x0cabc4 \x0dabc5xx \x0d\x0aabc6 \x{0085}abc7 \x{2028}abc8 \x{2029}abc9 JUNK 0: abc1 @@ -1442,39 +806,9 @@ No match \x{1ec5} 0: \x{1ec5} -/-- This tests the stricter UTF-8 check according to RFC 3629. --/ - -/X/8 - \x{0}\x{d7ff}\x{e000}\x{10ffff} -No match - \x{d800} -Error -10 (bad UTF-8 string) offset=0 reason=14 - \x{d800}\? -No match - \x{da00} -Error -10 (bad UTF-8 string) offset=0 reason=14 - \x{da00}\? -No match - \x{dfff} -Error -10 (bad UTF-8 string) offset=0 reason=14 - \x{dfff}\? -No match - \x{110000} -Error -10 (bad UTF-8 string) offset=0 reason=13 - \x{110000}\? -No match - \x{2000000} -Error -10 (bad UTF-8 string) offset=0 reason=11 - \x{2000000}\? -No match - \x{7fffffff} -Error -10 (bad UTF-8 string) offset=0 reason=12 - \x{7fffffff}\? -No match - /a\Rb/I8<bsr_anycrlf> Capturing subpattern count = 0 -Options: bsr_anycrlf utf8 +Options: bsr_anycrlf utf First char = 'a' Need char = 'b' a\rb @@ -1492,7 +826,7 @@ No match /a\Rb/I8<bsr_unicode> Capturing subpattern count = 0 -Options: bsr_unicode utf8 +Options: bsr_unicode utf First char = 'a' Need char = 'b' a\rb @@ -1514,7 +848,7 @@ No match /a\R?b/I8<bsr_anycrlf> Capturing subpattern count = 0 -Options: bsr_anycrlf utf8 +Options: bsr_anycrlf utf First char = 'a' Need char = 'b' a\rb @@ -1532,7 +866,7 @@ No match /a\R?b/I8<bsr_unicode> Capturing subpattern count = 0 -Options: bsr_unicode utf8 +Options: bsr_unicode utf First char = 'a' Need char = 'b' a\rb @@ -1598,17 +932,6 @@ No match A\x{1ec5}ABCXYZ 0: X -/(*UTF8)\x{1234}/ - abcd\x{1234}pqr - 0: \x{1234} - -/(*CRLF)(*UTF8)(*BSR_UNICODE)a\Rb/I -Capturing subpattern count = 0 -Options: bsr_unicode utf8 -Forced newline sequence: CRLF -First char = 'a' -Need char = 'b' - /Xa{2,4}b/8 X\P Partial match: X @@ -2094,32 +1417,6 @@ No need char Subject length lower bound = 1 Starting byte set: \x09 \x20 \xa0 -/\h/SI8 -Capturing subpattern count = 0 -Options: utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: \x09 \x20 \xc2 \xe1 \xe2 \xe3 - ABC\x{09} - 0: \x{09} - ABC\x{20} - 0: - ABC\x{a0} - 0: \x{a0} - ABC\x{1680} - 0: \x{1680} - ABC\x{180e} - 0: \x{180e} - ABC\x{2000} - 0: \x{2000} - ABC\x{202f} - 0: \x{202f} - ABC\x{205f} - 0: \x{205f} - ABC\x{3000} - 0: \x{3000} - /\v/SI Capturing subpattern count = 0 No options @@ -2128,26 +1425,6 @@ No need char Subject length lower bound = 1 Starting byte set: \x0a \x0b \x0c \x0d \x85 -/\v/SI8 -Capturing subpattern count = 0 -Options: utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 - ABC\x{0a} - 0: \x{0a} - ABC\x{0b} - 0: \x{0b} - ABC\x{0c} - 0: \x{0c} - ABC\x{0d} - 0: \x{0d} - ABC\x{85} - 0: \x{85} - ABC\x{2028} - 0: \x{2028} - /\R/SI Capturing subpattern count = 0 No options @@ -2156,82 +1433,16 @@ No need char Subject length lower bound = 1 Starting byte set: \x0a \x0b \x0c \x0d \x85 -/\R/SI8 -Capturing subpattern count = 0 -Options: utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 - -/\h*A/SI8 -Capturing subpattern count = 0 -Options: utf8 -No first char -Need char = 'A' -Subject length lower bound = 1 -Starting byte set: \x09 \x20 A \xc2 \xe1 \xe2 \xe3 - CDBABC - 0: A - -/\v+A/SI8 -Capturing subpattern count = 0 -Options: utf8 -No first char -Need char = 'A' -Subject length lower bound = 2 -Starting byte set: \x0a \x0b \x0c \x0d \xc2 \xe2 - -/\s?xxx\s/8SI -Capturing subpattern count = 0 -Options: utf8 -No first char -Need char = 'x' -Subject length lower bound = 4 -Starting byte set: \x09 \x0a \x0c \x0d \x20 x - /\sxxx\s/8T1 AB\x{85}xxx\x{a0}XYZ 0: \x{85}xxx\x{a0} AB\x{a0}xxx\x{85}XYZ 0: \x{a0}xxx\x{85} -/\sxxx\s/I8ST1 -Capturing subpattern count = 0 -Options: utf8 -No first char -Need char = 'x' -Subject length lower bound = 5 -Starting byte set: \x09 \x0a \x0c \x0d \x20 \xc2 - AB\x{85}xxx\x{a0}XYZ - 0: \x{85}xxx\x{a0} - AB\x{a0}xxx\x{85}XYZ - 0: \x{a0}xxx\x{85} - /\S \S/8T1 \x{a2} \x{84} 0: \x{a2} \x{84} -/\S \S/I8ST1 -Capturing subpattern count = 0 -Options: utf8 -No first char -Need char = ' ' -Subject length lower bound = 3 -Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e - \x0f \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1a \x1b \x1c \x1d - \x1e \x1f ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ - A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e - f g h i j k l m n o p q r s t u v w x y z { | } ~ \x7f \xc0 \xc1 \xc2 \xc3 - \xc4 \xc5 \xc6 \xc7 \xc8 \xc9 \xca \xcb \xcc \xcd \xce \xcf \xd0 \xd1 \xd2 - \xd3 \xd4 \xd5 \xd6 \xd7 \xd8 \xd9 \xda \xdb \xdc \xdd \xde \xdf \xe0 \xe1 - \xe2 \xe3 \xe4 \xe5 \xe6 \xe7 \xe8 \xe9 \xea \xeb \xec \xed \xee \xef \xf0 - \xf1 \xf2 \xf3 \xf4 \xf5 \xf6 \xf7 \xf8 \xf9 \xfa \xfb \xfc \xfd \xfe \xff - \x{a2} \x{84} - 0: \x{a2} \x{84} - A Z - 0: A Z - 'A#хц'8x<any>BZ ------------------------------------------------------------------ Bra @@ -2293,20 +1504,6 @@ Starting byte set: \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0b \x0e End ------------------------------------------------------------------ -/a+/8 - a\x{123}aa\>1 - 0: aa - a\x{123}aa\>2 -Error -11 (bad UTF-8 offset) - a\x{123}aa\>3 - 0: aa - a\x{123}aa\>4 - 0: a - a\x{123}aa\>5 -No match - a\x{123}aa\>6 -Error -24 (bad offset value) - /^\cģ/8 Failed: \c must be followed by an ASCII character at offset 3 @@ -2338,41 +1535,9 @@ Failed: \c must be followed by an ASCII character at offset 3 1: \x{0a} 2: \x{0d} -/\x{1234}+/iS8I -Capturing subpattern count = 0 -Options: caseless utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: \xe1 - -/\x{1234}+?/iS8I -Capturing subpattern count = 0 -Options: caseless utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: \xe1 - -/\x{1234}++/iS8I -Capturing subpattern count = 0 -Options: caseless utf8 -No first char -No need char -Subject length lower bound = 1 -Starting byte set: \xe1 - -/\x{1234}{2}/iS8I -Capturing subpattern count = 0 -Options: caseless utf8 -No first char -No need char -Subject length lower bound = 2 -Starting byte set: \xe1 - /[^\x{1234}]+/iS8I Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf No first char No need char Subject length lower bound = 1 @@ -2380,7 +1545,7 @@ No set of starting bytes /[^\x{1234}]+?/iS8I Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf No first char No need char Subject length lower bound = 1 @@ -2388,7 +1553,7 @@ No set of starting bytes /[^\x{1234}]++/iS8I Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf No first char No need char Subject length lower bound = 1 @@ -2396,7 +1561,7 @@ No set of starting bytes /[^\x{1234}]{2}/iS8I Capturing subpattern count = 0 -Options: caseless utf8 +Options: caseless utf No first char No need char Subject length lower bound = 2 @@ -2420,5 +1585,13 @@ Partial match: for /f.*/8s \P\Pfor Partial match: for + +/\x{d7ff}\x{e000}/8 + +/\x{d800}/8 +Failed: disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) at offset 7 + +/\x{dfff}/8 +Failed: disallowed UTF-8/16 code point (>= 0xd800 && <= 0xdfff) at offset 7 /-- End of testinput5 --/ diff --git a/testdata/testoutput8 b/testdata/testoutput8 index 0c569b3..29d51e2 100644 --- a/testdata/testoutput8 +++ b/testdata/testoutput8 @@ -1210,7 +1210,7 @@ No match /a\Rb/I8<bsr_anycrlf> Capturing subpattern count = 0 -Options: bsr_anycrlf utf8 +Options: bsr_anycrlf utf First char = 'a' Need char = 'b' a\rb @@ -1228,7 +1228,7 @@ No match /a\Rb/I8<bsr_unicode> Capturing subpattern count = 0 -Options: bsr_unicode utf8 +Options: bsr_unicode utf First char = 'a' Need char = 'b' a\rb @@ -1250,7 +1250,7 @@ No match /a\R?b/I8<bsr_anycrlf> Capturing subpattern count = 0 -Options: bsr_anycrlf utf8 +Options: bsr_anycrlf utf First char = 'a' Need char = 'b' a\rb @@ -1268,7 +1268,7 @@ No match /a\R?b/I8<bsr_unicode> Capturing subpattern count = 0 -Options: bsr_unicode utf8 +Options: bsr_unicode utf First char = 'a' Need char = 'b' a\rb |