summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doop.c158
-rw-r--r--embed.h60
-rwxr-xr-xembed.pl14
-rw-r--r--proto.h14
-rwxr-xr-xt/op/tr.t49
5 files changed, 200 insertions, 95 deletions
diff --git a/doop.c b/doop.c
index fe2df464f5..a7634c4f3c 100644
--- a/doop.c
+++ b/doop.c
@@ -22,12 +22,13 @@
#endif
STATIC I32
-S_do_trans_CC_simple(pTHX_ SV *sv)
+S_do_trans_simple(pTHX_ SV *sv) /* SPC - OK */
{
dTHR;
U8 *s;
U8 *send;
I32 matches = 0;
+ I32 hasutf = SvUTF8(sv);
STRLEN len;
short *tbl;
I32 ch;
@@ -40,11 +41,15 @@ S_do_trans_CC_simple(pTHX_ SV *sv)
send = s + len;
while (s < send) {
- if ((ch = tbl[*s]) >= 0) {
- matches++;
- *s = ch;
- }
+ if (hasutf && *s & 0x80)
+ s+=UTF8SKIP(s); /* Given that we're here because tbl is !UTF8...*/
+ else {
+ if ((ch = tbl[*s]) >= 0) {
+ matches++;
+ *s = ch;
+ }
s++;
+ }
}
SvSETMAGIC(sv);
@@ -52,12 +57,13 @@ S_do_trans_CC_simple(pTHX_ SV *sv)
}
STATIC I32
-S_do_trans_CC_count(pTHX_ SV *sv)
+S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
{
dTHR;
U8 *s;
U8 *send;
I32 matches = 0;
+ I32 hasutf = SvUTF8(sv);
STRLEN len;
short *tbl;
@@ -69,21 +75,26 @@ S_do_trans_CC_count(pTHX_ SV *sv)
send = s + len;
while (s < send) {
- if (tbl[*s] >= 0)
- matches++;
- s++;
+ if (hasutf && *s & 0x80)
+ s+=UTF8SKIP(s);
+ else {
+ if (tbl[*s] >= 0)
+ matches++;
+ s++;
+ }
}
return matches;
}
STATIC I32
-S_do_trans_CC_complex(pTHX_ SV *sv)
+S_do_trans_complex(pTHX_ SV *sv)/* SPC - OK */
{
dTHR;
U8 *s;
U8 *send;
U8 *d;
+ I32 hasutf = SvUTF8(sv);
I32 matches = 0;
STRLEN len;
short *tbl;
@@ -101,29 +112,37 @@ S_do_trans_CC_complex(pTHX_ SV *sv)
U8* p = send;
while (s < send) {
- if ((ch = tbl[*s]) >= 0) {
- *d = ch;
- matches++;
- if (p == d - 1 && *p == *d)
- matches--;
- else
- p = d++;
- }
- else if (ch == -1) /* -1 is unmapped character */
- *d++ = *s; /* -2 is delete character */
- s++;
+ if (hasutf && *s & 0x80)
+ s+=UTF8SKIP(s);
+ else {
+ if ((ch = tbl[*s]) >= 0) {
+ *d = ch;
+ matches++;
+ if (p == d - 1 && *p == *d)
+ matches--;
+ else
+ p = d++;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ *d++ = *s; /* -2 is delete character */
+ s++;
+ }
}
}
else {
while (s < send) {
- if ((ch = tbl[*s]) >= 0) {
- *d = ch;
- matches++;
- d++;
- }
- else if (ch == -1) /* -1 is unmapped character */
- *d++ = *s; /* -2 is delete character */
- s++;
+ if (hasutf && *s & 0x80)
+ s+=UTF8SKIP(s);
+ else {
+ if ((ch = tbl[*s]) >= 0) {
+ *d = ch;
+ matches++;
+ d++;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ *d++ = *s; /* -2 is delete character */
+ s++;
+ }
}
}
matches += send - d; /* account for disappeared chars */
@@ -135,12 +154,14 @@ S_do_trans_CC_complex(pTHX_ SV *sv)
}
STATIC I32
-S_do_trans_UU_simple(pTHX_ SV *sv)
+S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
{
dTHR;
U8 *s;
U8 *send;
U8 *d;
+ U8 *start;
+ U8 *dstart;
I32 matches = 0;
STRLEN len;
@@ -151,43 +172,83 @@ S_do_trans_UU_simple(pTHX_ SV *sv)
UV extra = none + 1;
UV final;
UV uv;
+ I32 isutf;
+ I32 howmany;
+ isutf = SvUTF8(sv);
s = (U8*)SvPV(sv, len);
send = s + len;
+ start = s;
svp = hv_fetch(hv, "FINAL", 5, FALSE);
if (svp)
final = SvUV(*svp);
- d = s;
+ /* d needs to be bigger than s, in case e.g. upgrading is required */
+ Newz(0, d, len*2+1, U8);
+ dstart = d;
while (s < send) {
if ((uv = swash_fetch(rv, s)) < none) {
s += UTF8SKIP(s);
matches++;
+ if (uv & 0x80 && !isutf) {
+ /* Sneaky-upgrade dstart...d */
+ U8* new;
+ STRLEN len;
+ len = dstart - d;
+ new = bytes_to_utf8(dstart, &len);
+ Copy(new,dstart,len,U8*);
+ d = dstart + len;
+ isutf++;
+ }
d = uv_to_utf8(d, uv);
}
else if (uv == none) {
int i;
- for (i = UTF8SKIP(s); i; i--)
- *d++ = *s++;
+ i = UTF8SKIP(s);
+ if (i > 1 && !isutf) {
+ U8* new;
+ STRLEN len;
+ len = dstart - d;
+ new = bytes_to_utf8(dstart, &len);
+ Copy(new,dstart,len,U8*);
+ d = dstart + len;
+ isutf++;
+ }
+ while(i--)
+ *d++ = *s++;
}
else if (uv == extra) {
- s += UTF8SKIP(s);
+ int i;
+ i = UTF8SKIP(s);
+ s += i;
matches++;
+ if (i > 1 && !isutf) {
+ U8* new;
+ STRLEN len;
+ len = dstart - d;
+ new = bytes_to_utf8(dstart, &len);
+ Copy(new,dstart,len,U8*);
+ d = dstart + len;
+ isutf++;
+ }
d = uv_to_utf8(d, final);
}
else
s += UTF8SKIP(s);
}
*d = '\0';
- SvCUR_set(sv, d - (U8*)SvPVX(sv));
+ SvPV_set(sv, dstart);
+ SvCUR_set(sv, d - dstart);
SvSETMAGIC(sv);
+ if (isutf)
+ SvUTF8_on(sv);
return matches;
}
STATIC I32
-S_do_trans_UU_count(pTHX_ SV *sv)
+S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
{
dTHR;
U8 *s;
@@ -202,6 +263,8 @@ S_do_trans_UU_count(pTHX_ SV *sv)
UV uv;
s = (U8*)SvPV(sv, len);
+ if (!SvUTF8(sv))
+ s = bytes_to_utf8(s, &len);
send = s + len;
while (s < send) {
@@ -214,7 +277,7 @@ S_do_trans_UU_count(pTHX_ SV *sv)
}
STATIC I32
-S_do_trans_UU_complex(pTHX_ SV *sv)
+S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
{
dTHR;
U8 *s;
@@ -403,7 +466,10 @@ Perl_do_trans(pTHX_ SV *sv)
{
dTHR;
STRLEN len;
+ I32 hasutf = (PL_op->op_private &
+ (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF));
+ PL_op->op_private &= ~hasutf;
if (SvREADONLY(sv) && !(PL_op->op_private & OPpTRANS_IDENTICAL))
Perl_croak(aTHX_ PL_no_modify);
@@ -419,22 +485,22 @@ Perl_do_trans(pTHX_ SV *sv)
switch (PL_op->op_private & 63) {
case 0:
- if (SvUTF8(sv))
- return do_trans_UU_simple(sv);
+ if (hasutf)
+ return do_trans_simple_utf8(sv);
else
- return do_trans_CC_simple(sv);
+ return do_trans_simple(sv);
case OPpTRANS_IDENTICAL:
- if (SvUTF8(sv))
- return do_trans_UU_count(sv);
+ if (hasutf)
+ return do_trans_count_utf8(sv);
else
- return do_trans_CC_count(sv);
+ return do_trans_count(sv);
default:
- if (SvUTF8(sv))
- return do_trans_UU_complex(sv); /* could be UC or CU too */
+ if (hasutf)
+ return do_trans_complex_utf8(sv);
else
- return do_trans_CC_complex(sv);
+ return do_trans_complex(sv);
}
}
diff --git a/embed.h b/embed.h
index fa199fbe15..f419792721 100644
--- a/embed.h
+++ b/embed.h
@@ -846,14 +846,12 @@
#define avhv_index S_avhv_index
#endif
#if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT)
-#define do_trans_CC_simple S_do_trans_CC_simple
-#define do_trans_CC_count S_do_trans_CC_count
-#define do_trans_CC_complex S_do_trans_CC_complex
-#define do_trans_UU_simple S_do_trans_UU_simple
-#define do_trans_UU_count S_do_trans_UU_count
-#define do_trans_UU_complex S_do_trans_UU_complex
-#define do_trans_UC_trivial S_do_trans_UC_trivial
-#define do_trans_CU_trivial S_do_trans_CU_trivial
+#define do_trans_simple S_do_trans_simple
+#define do_trans_count S_do_trans_count
+#define do_trans_complex S_do_trans_complex
+#define do_trans_simple_utf8 S_do_trans_simple_utf8
+#define do_trans_count_utf8 S_do_trans_count_utf8
+#define do_trans_complex_utf8 S_do_trans_complex_utf8
#endif
#if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT)
#define gv_init_sv S_gv_init_sv
@@ -1105,6 +1103,7 @@
#define sublex_push S_sublex_push
#define sublex_start S_sublex_start
#define filter_gets S_filter_gets
+#define find_in_my_stash S_find_in_my_stash
#define new_constant S_new_constant
#define ao S_ao
#define depcom S_depcom
@@ -2289,14 +2288,12 @@
#define avhv_index(a,b,c) S_avhv_index(aTHX_ a,b,c)
#endif
#if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT)
-#define do_trans_CC_simple(a) S_do_trans_CC_simple(aTHX_ a)
-#define do_trans_CC_count(a) S_do_trans_CC_count(aTHX_ a)
-#define do_trans_CC_complex(a) S_do_trans_CC_complex(aTHX_ a)
-#define do_trans_UU_simple(a) S_do_trans_UU_simple(aTHX_ a)
-#define do_trans_UU_count(a) S_do_trans_UU_count(aTHX_ a)
-#define do_trans_UU_complex(a) S_do_trans_UU_complex(aTHX_ a)
-#define do_trans_UC_trivial(a) S_do_trans_UC_trivial(aTHX_ a)
-#define do_trans_CU_trivial(a) S_do_trans_CU_trivial(aTHX_ a)
+#define do_trans_simple(a) S_do_trans_simple(aTHX_ a)
+#define do_trans_count(a) S_do_trans_count(aTHX_ a)
+#define do_trans_complex(a) S_do_trans_complex(aTHX_ a)
+#define do_trans_simple_utf8(a) S_do_trans_simple_utf8(aTHX_ a)
+#define do_trans_count_utf8(a) S_do_trans_count_utf8(aTHX_ a)
+#define do_trans_complex_utf8(a) S_do_trans_complex_utf8(aTHX_ a)
#endif
#if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT)
#define gv_init_sv(a,b) S_gv_init_sv(aTHX_ a,b)
@@ -2547,6 +2544,7 @@
#define sublex_push() S_sublex_push(aTHX)
#define sublex_start() S_sublex_start(aTHX)
#define filter_gets(a,b,c) S_filter_gets(aTHX_ a,b,c)
+#define find_in_my_stash(a,b) S_find_in_my_stash(aTHX_ a,b)
#define new_constant(a,b,c,d,e,f) S_new_constant(aTHX_ a,b,c,d,e,f)
#define ao(a) S_ao(aTHX_ a)
#define depcom() S_depcom(aTHX)
@@ -4479,22 +4477,18 @@
#define avhv_index S_avhv_index
#endif
#if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT)
-#define S_do_trans_CC_simple CPerlObj::S_do_trans_CC_simple
-#define do_trans_CC_simple S_do_trans_CC_simple
-#define S_do_trans_CC_count CPerlObj::S_do_trans_CC_count
-#define do_trans_CC_count S_do_trans_CC_count
-#define S_do_trans_CC_complex CPerlObj::S_do_trans_CC_complex
-#define do_trans_CC_complex S_do_trans_CC_complex
-#define S_do_trans_UU_simple CPerlObj::S_do_trans_UU_simple
-#define do_trans_UU_simple S_do_trans_UU_simple
-#define S_do_trans_UU_count CPerlObj::S_do_trans_UU_count
-#define do_trans_UU_count S_do_trans_UU_count
-#define S_do_trans_UU_complex CPerlObj::S_do_trans_UU_complex
-#define do_trans_UU_complex S_do_trans_UU_complex
-#define S_do_trans_UC_trivial CPerlObj::S_do_trans_UC_trivial
-#define do_trans_UC_trivial S_do_trans_UC_trivial
-#define S_do_trans_CU_trivial CPerlObj::S_do_trans_CU_trivial
-#define do_trans_CU_trivial S_do_trans_CU_trivial
+#define S_do_trans_simple CPerlObj::S_do_trans_simple
+#define do_trans_simple S_do_trans_simple
+#define S_do_trans_count CPerlObj::S_do_trans_count
+#define do_trans_count S_do_trans_count
+#define S_do_trans_complex CPerlObj::S_do_trans_complex
+#define do_trans_complex S_do_trans_complex
+#define S_do_trans_simple_utf8 CPerlObj::S_do_trans_simple_utf8
+#define do_trans_simple_utf8 S_do_trans_simple_utf8
+#define S_do_trans_count_utf8 CPerlObj::S_do_trans_count_utf8
+#define do_trans_count_utf8 S_do_trans_count_utf8
+#define S_do_trans_complex_utf8 CPerlObj::S_do_trans_complex_utf8
+#define do_trans_complex_utf8 S_do_trans_complex_utf8
#endif
#if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT)
#define S_gv_init_sv CPerlObj::S_gv_init_sv
@@ -4953,6 +4947,8 @@
#define sublex_start S_sublex_start
#define S_filter_gets CPerlObj::S_filter_gets
#define filter_gets S_filter_gets
+#define S_find_in_my_stash CPerlObj::S_find_in_my_stash
+#define find_in_my_stash S_find_in_my_stash
#define S_new_constant CPerlObj::S_new_constant
#define new_constant S_new_constant
#define S_ao CPerlObj::S_ao
diff --git a/embed.pl b/embed.pl
index c4cb7051aa..bf41a0afc0 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2199,14 +2199,12 @@ s |I32 |avhv_index |AV* av|SV* sv|U32 hash
#endif
#if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT)
-s |I32 |do_trans_CC_simple |SV *sv
-s |I32 |do_trans_CC_count |SV *sv
-s |I32 |do_trans_CC_complex |SV *sv
-s |I32 |do_trans_UU_simple |SV *sv
-s |I32 |do_trans_UU_count |SV *sv
-s |I32 |do_trans_UU_complex |SV *sv
-s |I32 |do_trans_UC_trivial |SV *sv
-s |I32 |do_trans_CU_trivial |SV *sv
+s |I32 |do_trans_simple |SV *sv
+s |I32 |do_trans_count |SV *sv
+s |I32 |do_trans_complex |SV *sv
+s |I32 |do_trans_simple_utf8 |SV *sv
+s |I32 |do_trans_count_utf8 |SV *sv
+s |I32 |do_trans_complex_utf8 |SV *sv
#endif
#if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT)
diff --git a/proto.h b/proto.h
index d46179a376..0d7033213a 100644
--- a/proto.h
+++ b/proto.h
@@ -959,14 +959,12 @@ STATIC I32 S_avhv_index(pTHX_ AV* av, SV* sv, U32 hash);
#endif
#if defined(PERL_IN_DOOP_C) || defined(PERL_DECL_PROT)
-STATIC I32 S_do_trans_CC_simple(pTHX_ SV *sv);
-STATIC I32 S_do_trans_CC_count(pTHX_ SV *sv);
-STATIC I32 S_do_trans_CC_complex(pTHX_ SV *sv);
-STATIC I32 S_do_trans_UU_simple(pTHX_ SV *sv);
-STATIC I32 S_do_trans_UU_count(pTHX_ SV *sv);
-STATIC I32 S_do_trans_UU_complex(pTHX_ SV *sv);
-STATIC I32 S_do_trans_UC_trivial(pTHX_ SV *sv);
-STATIC I32 S_do_trans_CU_trivial(pTHX_ SV *sv);
+STATIC I32 S_do_trans_simple(pTHX_ SV *sv);
+STATIC I32 S_do_trans_count(pTHX_ SV *sv);
+STATIC I32 S_do_trans_complex(pTHX_ SV *sv);
+STATIC I32 S_do_trans_simple_utf8(pTHX_ SV *sv);
+STATIC I32 S_do_trans_count_utf8(pTHX_ SV *sv);
+STATIC I32 S_do_trans_complex_utf8(pTHX_ SV *sv);
#endif
#if defined(PERL_IN_GV_C) || defined(PERL_DECL_PROT)
diff --git a/t/op/tr.t b/t/op/tr.t
index e9a1b4c42d..100dcfe98a 100755
--- a/t/op/tr.t
+++ b/t/op/tr.t
@@ -5,7 +5,7 @@ BEGIN {
unshift @INC, "../lib";
}
-print "1..8\n";
+print "1..15\n";
$_ = "abcdefghijklmnopqrstuvwxyz";
@@ -61,3 +61,50 @@ print "ok 7\n";
$x =~ tr/A/B/;
print "not " if $x ne 256.66.258 or length $x != 3;
print "ok 8\n";
+
+{
+use utf8;
+
+# 9 - changing UTF8 characters in a UTF8 string, same length.
+$l = chr(300); $r = chr(400);
+$x = 200.300.400;
+$x =~ tr/\x{12c}/\x{190}/;
+printf "not (%vd) ", $x if $x ne 200.400.400 or length $x != 3;
+print "ok 9\n";
+
+# 10 - changing UTF8 characters in UTF8 string, more bytes.
+$x = 200.300.400;
+$x =~ tr/\x{12c}/\x{be8}/;
+printf "not (%vd) ", $x if $x ne 200.3048.400 or length $x != 3;
+print "ok 10\n";
+
+# 11 - introducing UTF8 characters to non-UTF8 string.
+$x = 100.125.60;
+$x =~ tr/\x{64}/\x{190}/;
+printf "not (%vd) ", $x if $x ne 400.125.60 or length $x != 3;
+print "ok 11\n";
+
+# 12 - removing UTF8 characters from UTF8 string
+$x = 400.125.60;
+$x =~ tr/\x{190}/\x{64}/;
+printf "not (%vd) ", $x if $x ne 100.125.60 or length $x != 3;
+print "ok 12\n";
+
+# 13 - counting UTF8 chars in UTF8 string
+$x = 400.125.60.400;
+$y = $x =~ tr/\x{190}/\x{190}/;
+print "not " if $y != 2;
+print "ok 13\n";
+
+# 14 - counting non-UTF8 chars in UTF8 string
+$x = 60.400.125.60.400;
+$y = $x =~ tr/\x{3c}/\x{3c}/;
+print "not " if $y != 2;
+print "ok 14\n";
+
+# 15 - counting UTF8 chars in non-UTF8 string
+$x = 200.125.60;
+$y = $x =~ tr/\x{190}/\x{190}/;
+print "not " if $y != 0;
+print "ok 15\n";
+}