Fix issues with UTF-8 in the Perl checking script.

git-svn-id: svn://vcs.exim.org/pcre/code/trunk@871 2f5784b3-3f2a-0410-8824-cb99058d5e15
author: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-01-14 16:20:44 +0000
committer: ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15> 2012-01-14 16:20:44 +0000
commit: 3cbf1c2db892334e94f79fbed6f83ff33ba5297c (patch)
tree: 943d276f29fdd6132f36f982ebe78f9af4b47f37
parent: 32925a47941f651ab158479c977828875c478348 (diff)
download: pcre-3cbf1c2db892334e94f79fbed6f83ff33ba5297c.tar.gz
2 files changed, 28 insertions, 15 deletions
diff --git a/doc/perltest.txt b/doc/perltest.txt
index 3785bdd..37e0012 100644
--- a/doc/perltest.txt
+++ b/doc/perltest.txt
@@ -28,13 +28,15 @@ the initial identifying banner.
 The perltest.pl script can also test UTF-8 features. It recognizes the special
 modifier /8 that pcretest uses to invoke UTF-8 functionality. The testinput4
 and testinput6 files can be fed to perltest to run compatible UTF-8 tests.
-However, it is necessary to add "use utf8;" to the script to make this work
-correctly.
+However, it is necessary to add "use utf8; require Encode" at the start of
+the script to make this work correctly. I have not managed to find a way to
+handle this automatically.
 
 The other testinput files are not suitable for feeding to perltest.pl, since
 they make use of the special upper case modifiers and escapes that pcretest
-uses to test some features of PCRE. Some of these files also contains malformed
-regular expressions, in order to check that PCRE diagnoses them correctly.
+uses to test certain features of PCRE. Some of these files also contain
+malformed regular expressions, in order to check that PCRE diagnoses them
+correctly.
 
 Philip Hazel
 January 2012
diff --git a/perltest.pl b/perltest.pl
index d81a6c4..d44e6c5 100755
--- a/perltest.pl
+++ b/perltest.pl
@@ -1,16 +1,18 @@
 #! /usr/bin/env perl
 
 # Program for testing regular expressions with perl to check that PCRE handles
-# them the same. This is the version that supports /8 for UTF-8 testing. As it
-# stands, it requires at least Perl 5.8 for UTF-8 support. However, it needs to
-# have "use utf8" at the start for running the UTF-8 tests, but *not* for the
-# other tests. The only way I've found for doing this is to cat this line in
-# explicitly in the RunPerlTest script.
+# them the same. This version supports /8 for UTF-8 testing. However, it needs
+# to have "use utf8" at the start for running the UTF-8 tests, but *not* for
+# the other tests. The only way I've found for doing this is to have the
+# RunPerlTest script explicitly prepend this line. The same method supplies
+# "require Encode" for the UTF-8 tests, so that the main test will still run
+# where Encode is not installed.
 
 # use locale;  # With this included, \x0b matches \s!
 
-# Function for turning a string into a string of printing chars. There are
-# currently problems with UTF-8 strings; this fudges round them.
+# Function for turning a string into a string of printing chars.
+
+#require Encode;   # Left disabled: RunPerlTest adds it for the UTF-8 tests.
 
 sub pchars {
 my($t) = "";
@@ -21,10 +23,10 @@ if ($utf8)
   foreach $c (@p)
     {
     if ($c >= 32 && $c < 127) { $t .= chr $c; }
-      else { $t .= sprintf("\\x{%02x}", $c); }
+      else { $t .= sprintf("\\x{%02x}", $c); 
+      }
     }
   }
-
 else
   {
   foreach $c (split(//, $_[0]))
@@ -192,7 +194,7 @@ for (;;)
       {
       printf $outfile "No match";
       if (defined $REGERROR && $REGERROR != 1)
-        { print $outfile (", mark = $REGERROR"); }
+        { printf $outfile (", mark = %s", &pchars($REGERROR)); }
       printf $outfile "\n";
       }
     else
@@ -214,8 +216,17 @@ for (;;)
           }
         splice(@subs, 0, 18);
         }
+        
+      # It seems that $REGMARK is not marked as UTF-8 even when "use utf8" is
+      # set and the input pattern was a UTF-8 string. We can, however, force
+      # it to be so marked by decoding it with Encode::decode_utf8.
+       
       if (defined $REGMARK && $REGMARK != 1)
-        { print $outfile ("MK: $REGMARK\n"); }
+        {
+        $xx = $REGMARK;  
+        $xx = Encode::decode_utf8($xx) if $utf8; 
+        printf $outfile ("MK: %s\n", &pchars($xx)); 
+        }
       }
     }
   }
author	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2012-01-14 16:20:44 +0000
committer	ph10 <ph10@2f5784b3-3f2a-0410-8824-cb99058d5e15>	2012-01-14 16:20:44 +0000
commit	3cbf1c2db892334e94f79fbed6f83ff33ba5297c (patch)
tree	943d276f29fdd6132f36f982ebe78f9af4b47f37
parent	32925a47941f651ab158479c977828875c478348 (diff)
download	pcre-3cbf1c2db892334e94f79fbed6f83ff33ba5297c.tar.gz