diff options
author | SADAHIRO Tomoyuki <BQW10602@nifty.com> | 2007-01-20 09:52:42 +0900 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2007-01-19 21:11:40 +0000 |
commit | 613f191e047db38572099edee06ae7ede4343b5b (patch) | |
tree | 10c80a637c6af8e44e27d83efcbf6f6f15cd7d97 /pp.c | |
parent | 36dfb0728c176423708a8d381586c8e380e47696 (diff) | |
download | perl-613f191e047db38572099edee06ae7ede4343b5b.tar.gz |
Re: [PATCH] fix unicode split /\s+/
Message-Id: <20070120005232.D9CC.BQW10602@nifty.com>
Date: Sat, 20 Jan 2007 00:52:42 +0900
p4raw-id: //depot/perl@29887
Diffstat (limited to 'pp.c')
-rw-r--r-- | pp.c | 36 |
1 files changed, 20 insertions, 16 deletions
```diff
@@ -4590,7 +4590,11 @@ PP(pp_split)
     base = SP - PL_stack_base;
     orig = s;
     if (pm->op_pmflags & PMf_SKIPWHITE) {
-        if (pm->op_pmflags & PMf_LOCALE) {
+        if (do_utf8) {
+            while (*s == ' ' || is_utf8_space((U8*)s))
+                s += UTF8SKIP(s);
+        }
+        else if (pm->op_pmflags & PMf_LOCALE) {
             while (isSPACE_LC(*s))
                 s++;
         }
@@ -4606,22 +4610,18 @@ PP(pp_split)
     if (!limit)
         limit = maxiters + 2;
     if (pm->op_pmflags & PMf_WHITE) {
-        if (do_utf8 && !PL_utf8_space) {
-            /* force PL_utf8_space to be loaded */
-            bool ok;
-            ENTER;
-            ok = is_utf8_space((const U8*)" ");
-            assert(ok);
-            LEAVE;
-        }
         while (--limit) {
             m = s;
             /* this one uses 'm' and is a negative test */
             if (do_utf8) {
-                STRLEN uskip;
-                while (m < strend &&
-                       !( *m == ' ' || swash_fetch(PL_utf8_space,(U8*)m, do_utf8) ))
-                    m += UTF8SKIP(m);
+                while (m < strend && !( *m == ' ' || is_utf8_space((U8*)m) )) {
+                    const int t = UTF8SKIP(m);
+                    /* is_utf8_space returns FALSE for malform utf8 */
+                    if (strend - m < t)
+                        m = strend;
+                    else
+                        m += t;
+                }
             } else if (pm->op_pmflags & PMf_LOCALE) {
                 while (m < strend && !isSPACE_LC(*m))
                     ++m;
@@ -4639,11 +4639,15 @@ PP(pp_split)
                 (void)SvUTF8_on(dstr);
             XPUSHs(dstr);
 
-            s = m + 1;
+            /* skip the whitespace found last */
+            if (do_utf8)
+                s = m + UTF8SKIP(m);
+            else
+                s = m + 1;
+
             /* this one uses 's' and is a positive test */
             if (do_utf8) {
-                while (s < strend &&
-                       ( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, do_utf8) ))
+                while (s < strend && ( *s == ' ' || is_utf8_space((U8*)s) ))
                     s += UTF8SKIP(s);
             } else if (pm->op_pmflags & PMf_LOCALE) {
                 while (s < strend && isSPACE_LC(*s))
```