diff options
author | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-01-14 16:20:44 +0000 |
---|---|---|
committer | ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> | 2012-01-14 16:20:44 +0000 |
commit | 3cbf1c2db892334e94f79fbed6f83ff33ba5297c (patch) | |
tree | 943d276f29fdd6132f36f982ebe78f9af4b47f37 | |
parent | 32925a47941f651ab158479c977828875c478348 (diff) | |
download | pcre-3cbf1c2db892334e94f79fbed6f83ff33ba5297c.tar.gz |
Fix issues with UTF-8 in the Perl checking script.
git-svn-id: svn://vcs.exim.org/pcre/code/trunk@871 2f5784b3-3f2a-0410-8824-cb99058d5e15
-rw-r--r-- | doc/perltest.txt | 10 | ||||
-rwxr-xr-x | perltest.pl | 33 |
2 files changed, 28 insertions, 15 deletions
diff --git a/doc/perltest.txt b/doc/perltest.txt
index 3785bdd..37e0012 100644
--- a/doc/perltest.txt
+++ b/doc/perltest.txt
@@ -28,13 +28,15 @@ the initial identifying banner.
 The perltest.pl script can also test UTF-8 features. It recognizes the special
 modifier /8 that pcretest uses to invoke UTF-8 functionality. The testinput4
 and testinput6 files can be fed to perltest to run compatible UTF-8 tests.
-However, it is necessary to add "use utf8;" to the script to make this work
-correctly.
+However, it is necessary to add "use utf8; require Encode" to the script to
+make this work correctly. I have not managed to find a way to handle this
+automatically.
 
 The other testinput files are not suitable for feeding to perltest.pl, since
 they make use of the special upper case modifiers and escapes that pcretest
-uses to test some features of PCRE. Some of these files also contains malformed
-regular expressions, in order to check that PCRE diagnoses them correctly.
+uses to test certain features of PCRE. Some of these files also contain
+malformed regular expressions, in order to check that PCRE diagnoses them
+correctly.
 
 Philip Hazel
 January 2012
diff --git a/perltest.pl b/perltest.pl
index d81a6c4..d44e6c5 100755
--- a/perltest.pl
+++ b/perltest.pl
@@ -1,16 +1,18 @@
 #! /usr/bin/env perl
 
 # Program for testing regular expressions with perl to check that PCRE handles
-# them the same. This is the version that supports /8 for UTF-8 testing. As it
-# stands, it requires at least Perl 5.8 for UTF-8 support. However, it needs to
-# have "use utf8" at the start for running the UTF-8 tests, but *not* for the
-# other tests. The only way I've found for doing this is to cat this line in
-# explicitly in the RunPerlTest script.
+# them the same. This version supports /8 for UTF-8 testing. However, it needs
+# to have "use utf8" at the start for running the UTF-8 tests, but *not* for
+# the other tests. The only way I've found for doing this is to cat this line
+# in explicitly in the RunPerlTest script. I've also used this method to supply
+# "require Encode" for the UTF-8 tests, so that the main test will still run
+# where Encode is not installed.
 
 # use locale; # With this included, \x0b matches \s!
 
-# Function for turning a string into a string of printing chars. There are
-# currently problems with UTF-8 strings; this fudges round them.
+# Function for turning a string into a string of printing chars.
+
+#require Encode;
 
 sub pchars {
 my($t) = "";
@@ -21,10 +23,10 @@ if ($utf8)
 foreach $c (@p)
 {
 if ($c >= 32 && $c < 127) { $t .= chr $c; }
- else { $t .= sprintf("\\x{%02x}", $c); }
+ else { $t .= sprintf("\\x{%02x}", $c);
+ }
 }
 }
-
 else
 {
 foreach $c (split(//, $_[0]))
@@ -192,7 +194,7 @@ for (;;)
 {
 printf $outfile "No match";
 if (defined $REGERROR && $REGERROR != 1)
- { print $outfile (", mark = $REGERROR"); }
+ { printf $outfile (", mark = %s", &pchars($REGERROR)); }
 printf $outfile "\n";
 }
 else
@@ -214,8 +216,17 @@ for (;;)
 }
 splice(@subs, 0, 18);
 }
+
+ # It seems that $REGMARK is not marked as UTF-8 even when use utf8 is
+ # set and the input pattern was a UTF-8 string. We can, however, force
+ # it to be so marked.
+
 if (defined $REGMARK && $REGMARK != 1)
- { print $outfile ("MK: $REGMARK\n"); }
+ {
+ $xx = $REGMARK;
+ $xx = Encode::decode_utf8($xx) if $utf8;
+ printf $outfile ("MK: %s\n", &pchars($xx));
+ }
 }
 }
 }