diff options
-rw-r--r-- | doop.c | 158 | ||||
-rw-r--r-- | embed.h | 60 | ||||
-rwxr-xr-x | embed.pl | 14 | ||||
-rw-r--r-- | proto.h | 14 | ||||
-rwxr-xr-x | t/op/tr.t | 49 |
5 files changed, 200 insertions, 95 deletions
@@ -22,12 +22,13 @@ #endif STATIC I32 -S_do_trans_CC_simple(pTHX_ SV *sv) +S_do_trans_simple(pTHX_ SV *sv) /* SPC - OK */ { dTHR; U8 *s; U8 *send; I32 matches = 0; + I32 hasutf = SvUTF8(sv); STRLEN len; short *tbl; I32 ch; @@ -40,11 +41,15 @@ S_do_trans_CC_simple(pTHX_ SV *sv) send = s + len; while (s < send) { - if ((ch = tbl[*s]) >= 0) { - matches++; - *s = ch; - } + if (hasutf && *s & 0x80) + s+=UTF8SKIP(s); /* Given that we're here because tbl is !UTF8...*/ + else { + if ((ch = tbl[*s]) >= 0) { + matches++; + *s = ch; + } s++; + } } SvSETMAGIC(sv); @@ -52,12 +57,13 @@ S_do_trans_CC_simple(pTHX_ SV *sv) } STATIC I32 -S_do_trans_CC_count(pTHX_ SV *sv) +S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */ { dTHR; U8 *s; U8 *send; I32 matches = 0; + I32 hasutf = SvUTF8(sv); STRLEN len; short *tbl; @@ -69,21 +75,26 @@ S_do_trans_CC_count(pTHX_ SV *sv) send = s + len; while (s < send) { - if (tbl[*s] >= 0) - matches++; - s++; + if (hasutf && *s & 0x80) + s+=UTF8SKIP(s); + else { + if (tbl[*s] >= 0) + matches++; + s++; + } } return matches; } STATIC I32 -S_do_trans_CC_complex(pTHX_ SV *sv) +S_do_trans_complex(pTHX_ SV *sv)/* SPC - OK */ { dTHR; U8 *s; U8 *send; U8 *d; + I32 hasutf = SvUTF8(sv); I32 matches = 0; STRLEN len; short *tbl; @@ -101,29 +112,37 @@ S_do_trans_CC_complex(pTHX_ SV *sv) U8* p = send; while (s < send) { - if ((ch = tbl[*s]) >= 0) { - *d = ch; - matches++; - if (p == d - 1 && *p == *d) - matches--; - else - p = d++; - } - else if (ch == -1) /* -1 is unmapped character */ - *d++ = *s; /* -2 is delete character */ - s++; + if (hasutf && *s & 0x80) + s+=UTF8SKIP(s); + else { + if ((ch = tbl[*s]) >= 0) { + *d = ch; + matches++; + if (p == d - 1 && *p == *d) + matches--; + else + p = d++; + } + else if (ch == -1) /* -1 is unmapped character */ + *d++ = *s; /* -2 is delete character */ + s++; + } } } else { while (s < send) { - if ((ch = tbl[*s]) >= 0) { - *d = ch; - matches++; - d++; - } - else if (ch == -1) /* -1 is unmapped character */ - *d++ = *s; /* -2 is delete character */ - s++; + if (hasutf && *s & 0x80) + s+=UTF8SKIP(s); + else { + if ((ch = tbl[*s]) >= 0) { + *d = ch; + matches++; + d++; + } + else if (ch == -1) /* -1 is unmapped character */ + *d++ = *s; /* -2 is delete character */ + s++; + } } } matches += send - d; /* account for disappeared chars */ @@ -135,12 +154,14 @@ S_do_trans_CC_complex(pTHX_ SV *sv) } STATIC I32 -S_do_trans_UU_simple(pTHX_ SV *sv) +S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */ { dTHR; U8 *s; U8 *send; U8 *d; + U8 *start; + U8 *dstart; I32 matches = 0; STRLEN len; @@ -151,43 +172,83 @@ S_do_trans_UU_simple(pTHX_ SV *sv) UV extra = none + 1; UV final; UV uv; + I32 isutf; + I32 howmany; + isutf = SvUTF8(sv); s = (U8*)SvPV(sv, len); send = s + len; + start = s; svp = hv_fetch(hv, "FINAL", 5, FALSE); if (svp) final = SvUV(*svp); - d = s; + /* d needs to be bigger than s, in case e.g. upgrading is required */ + Newz(0, d, len*2+1, U8); + dstart = d; while (s < send) { if ((uv = swash_fetch(rv, s)) < none) { s += UTF8SKIP(s); matches++; + if (uv & 0x80 && !isutf) { + /* Sneaky-upgrade dstart...d */ + U8* new; + STRLEN len; + len = dstart - d; + new = bytes_to_utf8(dstart, &len); + Copy(new,dstart,len,U8*); + d = dstart + len; + isutf++; + } d = uv_to_utf8(d, uv); } else if (uv == none) { int i; - for (i = UTF8SKIP(s); i; i--) - *d++ = *s++; + i = UTF8SKIP(s); + if (i > 1 && !isutf) { + U8* new; + STRLEN len; + len = dstart - d; + new = bytes_to_utf8(dstart, &len); + Copy(new,dstart,len,U8*); + d = dstart + len; + isutf++; + } + while(i--) + *d++ = *s++; } else if (uv == extra) { - s += UTF8SKIP(s); + int i; + i = UTF8SKIP(s); + s += i; matches++; + if (i > 1 && !isutf) { + U8* new; + STRLEN len; + len = dstart - d; + new = bytes_to_utf8(dstart, &len); + Copy(new,dstart,len,U8*); + d = dstart + len; + isutf++; + } d = uv_to_utf8(d, final); } else s += UTF8SKIP(s); } *d = '\0'; - SvCUR_set(sv, d - (U8*)SvPVX(sv)); + SvPV_set(sv, dstart); + SvCUR_set(sv, d - dstart); SvSETMAGIC(sv); + if (isutf) + SvUTF8_on(sv); return matches; } STATIC I32 -S_do_trans_UU_count(pTHX_ SV *sv) +S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */ { dTHR; U8 *s; @@ -202,6 +263,8 @@ S_do_trans_UU_count(pTHX_ SV *sv) UV uv; s = (U8*)SvPV(sv, len); + if (!SvUTF8(sv)) + s = bytes_to_utf8(s, &len); send = s + len; while (s < send) { @@ -214,7 +277,7 @@ S_do_trans_UU_count(pTHX_ SV *sv) } STATIC I32 -S_do_trans_UU_complex(pTHX_ SV *sv) +S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */ { dTHR; U8 *s; @@ -403,7 +466,10 @@ Perl_do_trans(pTHX_ SV *sv) { dTHR; STRLEN len; + I32 hasutf = (PL_op->op_private & + (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)); + PL_op->op_private &= ~hasutf; if (SvREADONLY(sv) && !(PL_op->op_private & OPpTRANS_IDENTICAL)) Perl_croak(aTHX_ PL_no_modify); @@ -419,22 +485,22 @@ Perl_do_trans(pTHX_ SV *sv) switch (PL_op->op_private & 63) { case 0: - if (SvUTF8(sv)) - return do_trans_UU_simple(sv); + if (hasutf) + return do_trans_simple_utf8(sv); else - return do_trans_CC_simple(sv); + return do_trans_simple(sv); case OPpTRANS_IDENTICAL: - if (SvUTF8(sv)) - return do_trans_UU_count(sv); + if (hasutf) + return do_trans_count_utf8(sv); else - return do_trans_CC_count(sv); + return do_trans_count(sv); default: - if (SvUTF8(sv)) - return do_trans_UU_complex(sv); /* could be UC or CU too */ + if (hasutf) + return do_trans_complex_utf8(sv); else - return do_trans_CC_complex(sv); + return do_trans_complex(sv); } } @@ -846,14 +846,12 @@ #define avhv_index S_avhv_index #endif #if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT) -#define do_trans_CC_simple S_do_trans_CC_simple -#define do_trans_CC_count S_do_trans_CC_count -#define do_trans_CC_complex S_do_trans_CC_complex -#define do_trans_UU_simple S_do_trans_UU_simple -#define do_trans_UU_count S_do_trans_UU_count -#define do_trans_UU_complex S_do_trans_UU_complex -#define do_trans_UC_trivial S_do_trans_UC_trivial -#define do_trans_CU_trivial S_do_trans_CU_trivial +#define do_trans_simple S_do_trans_simple +#define do_trans_count S_do_trans_count +#define do_trans_complex S_do_trans_complex +#define do_trans_simple_utf8 S_do_trans_simple_utf8 +#define do_trans_count_utf8 S_do_trans_count_utf8 +#define do_trans_complex_utf8 S_do_trans_complex_utf8 #endif #if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT) #define gv_init_sv S_gv_init_sv @@ -1105,6 +1103,7 @@ #define sublex_push S_sublex_push #define sublex_start S_sublex_start #define filter_gets S_filter_gets +#define find_in_my_stash S_find_in_my_stash #define new_constant S_new_constant #define ao S_ao #define depcom S_depcom @@ -2289,14 +2288,12 @@ #define avhv_index(a,b,c) S_avhv_index(aTHX_ a,b,c) #endif #if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT) -#define do_trans_CC_simple(a) S_do_trans_CC_simple(aTHX_ a) -#define do_trans_CC_count(a) S_do_trans_CC_count(aTHX_ a) -#define do_trans_CC_complex(a) S_do_trans_CC_complex(aTHX_ a) -#define do_trans_UU_simple(a) S_do_trans_UU_simple(aTHX_ a) -#define do_trans_UU_count(a) S_do_trans_UU_count(aTHX_ a) -#define do_trans_UU_complex(a) S_do_trans_UU_complex(aTHX_ a) -#define do_trans_UC_trivial(a) S_do_trans_UC_trivial(aTHX_ a) -#define do_trans_CU_trivial(a) S_do_trans_CU_trivial(aTHX_ a) +#define do_trans_simple(a) S_do_trans_simple(aTHX_ a) +#define do_trans_count(a) S_do_trans_count(aTHX_ a) +#define do_trans_complex(a) S_do_trans_complex(aTHX_ a) +#define do_trans_simple_utf8(a) S_do_trans_simple_utf8(aTHX_ a) +#define do_trans_count_utf8(a) S_do_trans_count_utf8(aTHX_ a) +#define do_trans_complex_utf8(a) S_do_trans_complex_utf8(aTHX_ a) #endif #if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT) #define gv_init_sv(a,b) S_gv_init_sv(aTHX_ a,b) @@ -2547,6 +2544,7 @@ #define sublex_push() S_sublex_push(aTHX) #define sublex_start() S_sublex_start(aTHX) #define filter_gets(a,b,c) S_filter_gets(aTHX_ a,b,c) +#define find_in_my_stash(a,b) S_find_in_my_stash(aTHX_ a,b) #define new_constant(a,b,c,d,e,f) S_new_constant(aTHX_ a,b,c,d,e,f) #define ao(a) S_ao(aTHX_ a) #define depcom() S_depcom(aTHX) @@ -4479,22 +4477,18 @@ #define avhv_index S_avhv_index #endif #if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT) -#define S_do_trans_CC_simple CPerlObj::S_do_trans_CC_simple -#define do_trans_CC_simple S_do_trans_CC_simple -#define S_do_trans_CC_count CPerlObj::S_do_trans_CC_count -#define do_trans_CC_count S_do_trans_CC_count -#define S_do_trans_CC_complex CPerlObj::S_do_trans_CC_complex -#define do_trans_CC_complex S_do_trans_CC_complex -#define S_do_trans_UU_simple CPerlObj::S_do_trans_UU_simple -#define do_trans_UU_simple S_do_trans_UU_simple -#define S_do_trans_UU_count CPerlObj::S_do_trans_UU_count -#define do_trans_UU_count S_do_trans_UU_count -#define S_do_trans_UU_complex CPerlObj::S_do_trans_UU_complex -#define do_trans_UU_complex S_do_trans_UU_complex -#define S_do_trans_UC_trivial CPerlObj::S_do_trans_UC_trivial -#define do_trans_UC_trivial S_do_trans_UC_trivial -#define S_do_trans_CU_trivial CPerlObj::S_do_trans_CU_trivial -#define do_trans_CU_trivial S_do_trans_CU_trivial +#define S_do_trans_simple CPerlObj::S_do_trans_simple +#define do_trans_simple S_do_trans_simple +#define S_do_trans_count CPerlObj::S_do_trans_count +#define do_trans_count S_do_trans_count +#define S_do_trans_complex CPerlObj::S_do_trans_complex +#define do_trans_complex S_do_trans_complex +#define S_do_trans_simple_utf8 CPerlObj::S_do_trans_simple_utf8 +#define do_trans_simple_utf8 S_do_trans_simple_utf8 +#define S_do_trans_count_utf8 CPerlObj::S_do_trans_count_utf8 +#define do_trans_count_utf8 S_do_trans_count_utf8 +#define S_do_trans_complex_utf8 CPerlObj::S_do_trans_complex_utf8 +#define do_trans_complex_utf8 S_do_trans_complex_utf8 #endif #if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT) #define S_gv_init_sv CPerlObj::S_gv_init_sv @@ -4953,6 +4947,8 @@ #define sublex_start S_sublex_start #define S_filter_gets CPerlObj::S_filter_gets #define filter_gets S_filter_gets +#define S_find_in_my_stash CPerlObj::S_find_in_my_stash +#define find_in_my_stash S_find_in_my_stash #define S_new_constant CPerlObj::S_new_constant #define new_constant S_new_constant #define S_ao CPerlObj::S_ao @@ -2199,14 +2199,12 @@ s |I32 |avhv_index |AV* av|SV* sv|U32 hash #endif #if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT) -s |I32 |do_trans_CC_simple |SV *sv -s |I32 |do_trans_CC_count |SV *sv -s |I32 |do_trans_CC_complex |SV *sv -s |I32 |do_trans_UU_simple |SV *sv -s |I32 |do_trans_UU_count |SV *sv -s |I32 |do_trans_UU_complex |SV *sv -s |I32 |do_trans_UC_trivial |SV *sv -s |I32 |do_trans_CU_trivial |SV *sv +s |I32 |do_trans_simple |SV *sv +s |I32 |do_trans_count |SV *sv +s |I32 |do_trans_complex |SV *sv +s |I32 |do_trans_simple_utf8 |SV *sv +s |I32 |do_trans_count_utf8 |SV *sv +s |I32 |do_trans_complex_utf8 |SV *sv #endif #if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT) @@ -959,14 +959,12 @@ STATIC I32 S_avhv_index(pTHX_ AV* av, SV* sv, U32 hash); #endif #if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT) -STATIC I32 S_do_trans_CC_simple(pTHX_ SV *sv); -STATIC I32 S_do_trans_CC_count(pTHX_ SV *sv); -STATIC I32 S_do_trans_CC_complex(pTHX_ SV *sv); -STATIC I32 S_do_trans_UU_simple(pTHX_ SV *sv); -STATIC I32 S_do_trans_UU_count(pTHX_ SV *sv); -STATIC I32 S_do_trans_UU_complex(pTHX_ SV *sv); -STATIC I32 S_do_trans_UC_trivial(pTHX_ SV *sv); -STATIC I32 S_do_trans_CU_trivial(pTHX_ SV *sv); +STATIC I32 S_do_trans_simple(pTHX_ SV *sv); +STATIC I32 S_do_trans_count(pTHX_ SV *sv); +STATIC I32 S_do_trans_complex(pTHX_ SV *sv); +STATIC I32 S_do_trans_simple_utf8(pTHX_ SV *sv); +STATIC I32 S_do_trans_count_utf8(pTHX_ SV *sv); +STATIC I32 S_do_trans_complex_utf8(pTHX_ SV *sv); #endif #if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT) @@ -5,7 +5,7 @@ BEGIN { unshift @INC, "../lib"; } -print "1..8\n"; +print "1..15\n"; $_ = "abcdefghijklmnopqrstuvwxyz"; @@ -61,3 +61,50 @@ print "ok 7\n"; $x =~ tr/A/B/; print "not " if $x ne 256.66.258 or length $x != 3; print "ok 8\n"; + +{ +use utf8; + +# 9 - changing UTF8 characters in a UTF8 string, same length. +$l = chr(300); $r = chr(400); +$x = 200.300.400; +$x =~ tr/\x{12c}/\x{190}/; +printf "not (%vd) ", $x if $x ne 200.400.400 or length $x != 3; +print "ok 9\n"; + +# 10 - changing UTF8 characters in UTF8 string, more bytes. +$x = 200.300.400; +$x =~ tr/\x{12c}/\x{be8}/; +printf "not (%vd) ", $x if $x ne 200.3048.400 or length $x != 3; +print "ok 10\n"; + +# 11 - introducing UTF8 characters to non-UTF8 string. +$x = 100.125.60; +$x =~ tr/\x{64}/\x{190}/; +printf "not (%vd) ", $x if $x ne 400.125.60 or length $x != 3; +print "ok 11\n"; + +# 12 - removing UTF8 characters from UTF8 string +$x = 400.125.60; +$x =~ tr/\x{190}/\x{64}/; +printf "not (%vd) ", $x if $x ne 100.125.60 or length $x != 3; +print "ok 12\n"; + +# 13 - counting UTF8 chars in UTF8 string +$x = 400.125.60.400; +$y = $x =~ tr/\x{190}/\x{190}/; +print "not " if $y != 2; +print "ok 13\n"; + +# 14 - counting non-UTF8 chars in UTF8 string +$x = 60.400.125.60.400; +$y = $x =~ tr/\x{3c}/\x{3c}/; +print "not " if $y != 2; +print "ok 14\n"; + +# 15 - counting UTF8 chars in non-UTF8 string +$x = 200.125.60; +$y = $x =~ tr/\x{190}/\x{190}/; +print "not " if $y != 0; +print "ok 15\n"; +} |