fix unicode split /\s+/

Message-ID: <9b18b3110701181714r4f3bc9ebq9ba462eba8338734@mail.gmail.com> p4raw-id: //depot/perl@29880
author: Yves Orton <demerphq@gmail.com> 2007-01-19 03:14:06 +0100
committer: Rafael Garcia-Suarez <rgarciasuarez@gmail.com> 2007-01-19 09:33:00 +0000
commit: 8727f688bf9bab57862da9dd9073020b13c82940 (patch)
tree: 5a3990bb74f985536ce5a1f7e19b51329f5419b8
parent: 89086707de1a3d9012ea80f0c19441d3352a2f73 (diff)
download: perl-8727f688bf9bab57862da9dd9073020b13c82940.tar.gz
2 files changed, 69 insertions, 10 deletions
diff --git a/pp.c b/pp.c
index 4523584068..4b021c0d6d 100644
--- a/pp.c
+++ b/pp.c
@@ -4606,12 +4606,29 @@ PP(pp_split)
     if (!limit)
 	limit = maxiters + 2;
     if (pm->op_pmflags & PMf_WHITE) {
+        if (do_utf8 && !PL_utf8_space) { 
+            /* force PL_utf8_space to be loaded */
+            bool ok; 
+            ENTER; 
+            ok = is_utf8_space((const U8*)" "); 
+            assert(ok); 
+            LEAVE; 
+        } 
 	while (--limit) {
 	    m = s;
-	    while (m < strend &&
-		   !((pm->op_pmflags & PMf_LOCALE)
-		     ? isSPACE_LC(*m) : isSPACE(*m)))
-		++m;
+	    /* this one uses 'm' and is a negative test */
+	    if (do_utf8) {
+	        STRLEN uskip;
+                while (m < strend &&
+                       !( *m == ' ' || swash_fetch(PL_utf8_space,(U8*)m, do_utf8) ))
+	            m +=  UTF8SKIP(m);
+            } else if (pm->op_pmflags & PMf_LOCALE) {
+	        while (m < strend && !isSPACE_LC(*m))
+		    ++m;
+            } else {
+                while (m < strend && !isSPACE(*m))
+                    ++m;
+            }  
 	    if (m >= strend)
 		break;
 
@@ -4623,10 +4640,18 @@ PP(pp_split)
 	    XPUSHs(dstr);
 
 	    s = m + 1;
-	    while (s < strend &&
-		   ((pm->op_pmflags & PMf_LOCALE)
-		    ? isSPACE_LC(*s) : isSPACE(*s)))
-		++s;
+	    /* this one uses 's' and is a positive test */
+	    if (do_utf8) {
+                while (s < strend &&
+                       ( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8) ))
+	            s +=  UTF8SKIP(s);
+            } else if (pm->op_pmflags & PMf_LOCALE) {
+	        while (s < strend && isSPACE_LC(*s))
+		    ++s;
+            } else {
+                while (s < strend && isSPACE(*s))
+                    ++s;
+            } 	    
 	}
     }
     else if (rx->extflags & RXf_START_ONLY) {
diff --git a/t/op/split.t b/t/op/split.t
index d2deff38d3..f5d0c41989 100755
--- a/t/op/split.t
+++ b/t/op/split.t
@@ -6,7 +6,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 55;
+plan tests => 80;
 
 $FS = ':';
 
@@ -297,4 +297,38 @@ ok(@ary == 3 &&
     $x = \$a[2];
     is (ref $x, 'SCALAR', '#28938 - garbage after extend');
 }
-
+{
+    # check the special casing of split /\s/ and unicode
+    use charnames qw(:full);
+    # below test data is extracted from
+    # PropList-5.0.0.txt
+    # Date: 2006-06-07, 23:22:52 GMT [MD]
+    #
+    # Unicode Character Database
+    # Copyright (c) 1991-2006 Unicode, Inc.
+    # For terms of use, see http://www.unicode.org/terms_of_use.html
+    # For documentation, see UCD.html
+    my @spaces=(
+        0x0009..0x000A, # Cc   [5] <control-0009>..<control-000D>
+        0x000C..0x000D, # EXCLUDING \v aka ctl-000B aka vert-tab
+        0x0020,         # Zs       SPACE
+        0x0085,         # Cc       <control-0085>
+        0x00A0,         # Zs       NO-BREAK SPACE
+        0x1680,         # Zs       OGHAM SPACE MARK
+        0x180E,         # Zs       MONGOLIAN VOWEL SEPARATOR
+        0x2000..0x200A, # Zs  [11] EN QUAD..HAIR SPACE
+        0x2028,         # Zl       LINE SEPARATOR
+        0x2029,         # Zp       PARAGRAPH SEPARATOR
+        0x202F,         # Zs       NARROW NO-BREAK SPACE
+        0x205F,         # Zs       MEDIUM MATHEMATICAL SPACE
+        0x3000          # Zs       IDEOGRAPHIC SPACE
+    );
+    #diag "Have @{[0+@spaces]} to test\n";
+    foreach my $cp (@spaces) {
+        my $space = chr($cp);
+        my $str="A:$space:B\x{FFFF}";
+        chop $str;
+        my @res=split(/\s+/,$str);
+        is(0+@res,2) or do { diag sprintf "Char failed: 0x%x",$cp }
+    }
+}
author	Yves Orton <demerphq@gmail.com>	2007-01-19 03:14:06 +0100
committer	Rafael Garcia-Suarez <rgarciasuarez@gmail.com>	2007-01-19 09:33:00 +0000
commit	8727f688bf9bab57862da9dd9073020b13c82940 (patch)
tree	5a3990bb74f985536ce5a1f7e19b51329f5419b8
parent	89086707de1a3d9012ea80f0c19441d3352a2f73 (diff)
download	perl-8727f688bf9bab57862da9dd9073020b13c82940.tar.gz