another go; was RE: [perl #49302] [[:print:]] v \p{Print}

From: "Robin Barker" <Robin.Barker@npl.co.uk> Message-ID: <46A0F33545E63740BC7563DE59CA9C6D093B12@exchsvr2.npl.ad.local> p4raw-id: //depot/perl@33752
author: Robin Barker <RMBarker@cpan.org> 2008-04-25 15:21:06 +0100
committer: Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2008-04-26 22:06:23 +0000
commit: fdf0a293a88d8a14c42b43c2f82c991c50f7dc39 (patch)
tree: 12b28e3d916d039998f7eb9a68a3f26b532b9c85
parent: 216e512e84140c4f1a6711dd28168b2a82c86fd9 (diff)
download: perl-fdf0a293a88d8a14c42b43c2f82c991c50f7dc39.tar.gz
2 files changed, 70 insertions, 4 deletions
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 04c7b8e724..a076d3ad66 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -375,20 +375,60 @@ X<character class> X<\p> X<\p{}>
     digit       IsDigit        \d
     graph       IsGraph
     lower       IsLower
-    print       IsPrint
-    punct       IsPunct
+    print       IsPrint        (but see [2] below)
+    punct       IsPunct        (but see [3] below)
     space       IsSpace
                 IsSpacePerl    \s
     upper       IsUpper
-    word        IsWord
+    word        IsWord         \w
     xdigit      IsXDigit
 
 For example C<[[:lower:]]> and C<\p{IsLower}> are equivalent.
 
+However, the equivalence between C<[[:xxxxx:]]> and C<\p{IsXxxxx}>
+is not exact: [1] applies to every class, [2] and [3] only where marked.
+
+=over 4
+
+=item [1]
+
 If the C<utf8> pragma is not used but the C<locale> pragma is, the
 classes correlate with the usual isalpha(3) interface (except for
 "word" and "blank").
 
+But if neither the C<locale> nor the C<encoding> pragma is in effect and
+the string is not C<utf8>, then C<[[:xxxxx:]]> (and C<\w>, etc.)
+will not match characters 0x80-0xff; whereas C<\p{IsXxxxx}> will
+force the string to C<utf8> and can match these characters
+(as Unicode).
+
+=item [2]
+
+C<\p{IsPrint}> matches characters 0x09-0x0d but C<[[:print:]]> does not.
+
+=item [3]
+
+C<[[:punct:]]> matches the following but C<\p{IsPunct}> does not,
+because they are classed as symbols (not punctuation) in Unicode.
+
+=over 4
+
+=item C<$>
+
+Currency symbol
+
+=item C<+> C<< < >> C<=> C<< > >> C<|> C<~>
+
+Mathematical symbols
+
+=item C<^> C<`>
+
+Modifier symbols (accents)
+
+=back
+
+=back
+
 The other named classes are:
 
 =over 4
diff --git a/t/op/pat.t b/t/op/pat.t
index 82cf498b99..5ff4b9218d 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -4604,6 +4604,32 @@ sub kt
     iseq($te[0], '../');
 }
 
+SKIP: {
+    if (ordA == 193) { skip("Assumes ASCII", 4) }
+
+    my @notIsPunct = grep {/[[:punct:]]/ and not /\p{IsPunct}/}
+			map {chr} 0x20..0x7f;
+    iseq( join('', @notIsPunct), '$+<=>^`|~',
+	'[:punct:] disagress with IsPunct on Symbols');
+
+    my @isPrint = grep {not/[[:print:]]/ and /\p{IsPrint}/}
+			map {chr} 0..0x1f, 0x7f..0x9f;
+    iseq( join('', @isPrint), "\x09\x0a\x0b\x0c\x0d\x85",
+	'IsPrint disagrees with [:print:] on control characters');
+
+    my @isPunct = grep {/[[:punct:]]/ != /\p{IsPunct}/}
+			map {chr} 0x80..0xff;
+    iseq( join('', @isPunct), "\xa1\xab\xb7\xbb\xbf",		# ¡ « · » ¿
+	'IsPunct disagrees with [:punct:] outside ASCII');
+
+    my @isPunctLatin1 = eval q{
+	use encoding 'latin1';
+	grep {/[[:punct:]]/ != /\p{IsPunct}/} map {chr} 0x80..0xff;
+    };
+    if( $@ ){ skip( $@, 1); }
+    iseq( join('', @isPunctLatin1), '', 
+	'IsPunct agrees with [:punct:] with explicit Latin1');
+} 
 
 
 # Test counter is at bottom of file. Put new tests above here.
@@ -4667,7 +4693,7 @@ iseq(0+$::test,$::TestCount,"Got the right number of tests!");
 
 # Don't forget to update this!
 BEGIN {
-    $::TestCount = 4031;
+    $::TestCount = 4035;
     print "1..$::TestCount\n";
 }
author	Robin Barker <RMBarker@cpan.org>	2008-04-25 15:21:06 +0100
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>	2008-04-26 22:06:23 +0000
commit	fdf0a293a88d8a14c42b43c2f82c991c50f7dc39 (patch)
tree	12b28e3d916d039998f7eb9a68a3f26b532b9c85
parent	216e512e84140c4f1a6711dd28168b2a82c86fd9 (diff)
download	perl-fdf0a293a88d8a14c42b43c2f82c991c50f7dc39.tar.gz