summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doop.c212
-rwxr-xr-xembed.pl4
-rw-r--r--pod/perlop.pod13
-rw-r--r--toke.c29
-rw-r--r--utf8.c66
5 files changed, 81 insertions, 243 deletions
diff --git a/doop.c b/doop.c
index ebac52f3b9..fe2df464f5 100644
--- a/doop.c
+++ b/doop.c
@@ -214,188 +214,6 @@ S_do_trans_UU_count(pTHX_ SV *sv)
}
STATIC I32
-S_do_trans_UC_simple(pTHX_ SV *sv)
-{
- dTHR;
- U8 *s;
- U8 *send;
- U8 *d;
- I32 matches = 0;
- STRLEN len;
-
- SV* rv = (SV*)cSVOP->op_sv;
- HV* hv = (HV*)SvRV(rv);
- SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
- UV none = svp ? SvUV(*svp) : 0x7fffffff;
- UV extra = none + 1;
- UV final;
- UV uv;
-
- s = (U8*)SvPV(sv, len);
- send = s + len;
-
- svp = hv_fetch(hv, "FINAL", 5, FALSE);
- if (svp)
- final = SvUV(*svp);
-
- d = s;
- while (s < send) {
- if ((uv = swash_fetch(rv, s)) < none) {
- s += UTF8SKIP(s);
- matches++;
- *d++ = (U8)uv;
- }
- else if (uv == none) {
- I32 ulen;
- uv = utf8_to_uv(s, &ulen);
- s += ulen;
- *d++ = (U8)uv;
- }
- else if (uv == extra) {
- s += UTF8SKIP(s);
- matches++;
- *d++ = (U8)final;
- }
- else
- s += UTF8SKIP(s);
- }
- *d = '\0';
- SvCUR_set(sv, d - (U8*)SvPVX(sv));
- SvSETMAGIC(sv);
-
- return matches;
-}
-
-STATIC I32
-S_do_trans_CU_simple(pTHX_ SV *sv)
-{
- dTHR;
- U8 *s;
- U8 *send;
- U8 *d;
- U8 *dst;
- I32 matches = 0;
- STRLEN len;
-
- SV* rv = (SV*)cSVOP->op_sv;
- HV* hv = (HV*)SvRV(rv);
- SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
- UV none = svp ? SvUV(*svp) : 0x7fffffff;
- UV extra = none + 1;
- UV final;
- UV uv;
- U8 tmpbuf[UTF8_MAXLEN];
- I32 bits = 16;
-
- s = (U8*)SvPV(sv, len);
- send = s + len;
-
- svp = hv_fetch(hv, "BITS", 4, FALSE);
- if (svp)
- bits = (I32)SvIV(*svp);
-
- svp = hv_fetch(hv, "FINAL", 5, FALSE);
- if (svp)
- final = SvUV(*svp);
-
- Newz(801, d, len * (bits >> 3) + 1, U8);
- dst = d;
-
- while (s < send) {
- uv = *s++;
- if (uv < 0x80)
- tmpbuf[0] = uv;
- else {
- tmpbuf[0] = (( uv >> 6) | 0xc0);
- tmpbuf[1] = (( uv & 0x3f) | 0x80);
- }
-
- if ((uv = swash_fetch(rv, tmpbuf)) < none) {
- matches++;
- d = uv_to_utf8(d, uv);
- }
- else if (uv == none)
- d = uv_to_utf8(d, s[-1]);
- else if (uv == extra) {
- matches++;
- d = uv_to_utf8(d, final);
- }
- }
- *d = '\0';
- sv_usepvn_mg(sv, (char*)dst, d - dst);
-
- return matches;
-}
-
-/* utf-8 to latin-1 */
-
-STATIC I32
-S_do_trans_UC_trivial(pTHX_ SV *sv)
-{
- dTHR;
- U8 *s;
- U8 *send;
- U8 *d;
- STRLEN len;
-
- s = (U8*)SvPV(sv, len);
- send = s + len;
-
- d = s;
- while (s < send) {
- if (*s < 0x80)
- *d++ = *s++;
- else {
- I32 ulen;
- UV uv = utf8_to_uv(s, &ulen);
- s += ulen;
- *d++ = (U8)uv;
- }
- }
- *d = '\0';
- SvCUR_set(sv, d - (U8*)SvPVX(sv));
- SvSETMAGIC(sv);
-
- return SvCUR(sv);
-}
-
-/* latin-1 to utf-8 */
-
-STATIC I32
-S_do_trans_CU_trivial(pTHX_ SV *sv)
-{
- dTHR;
- U8 *s;
- U8 *send;
- U8 *d;
- U8 *dst;
- I32 matches;
- STRLEN len;
-
- s = (U8*)SvPV(sv, len);
- send = s + len;
-
- Newz(801, d, len * 2 + 1, U8);
- dst = d;
-
- matches = send - s;
-
- while (s < send) {
- if (*s < 0x80)
- *d++ = *s++;
- else {
- UV uv = *s++;
- *d++ = (( uv >> 6) | 0xc0);
- *d++ = (( uv & 0x3f) | 0x80);
- }
- }
- *d = '\0';
- sv_usepvn_mg(sv, (char*)dst, d - dst);
-
- return matches;
-}
-
-STATIC I32
S_do_trans_UU_complex(pTHX_ SV *sv)
{
dTHR;
@@ -601,31 +419,19 @@ Perl_do_trans(pTHX_ SV *sv)
switch (PL_op->op_private & 63) {
case 0:
- return do_trans_CC_simple(sv);
-
- case OPpTRANS_FROM_UTF:
- return do_trans_UC_simple(sv);
-
- case OPpTRANS_TO_UTF:
- return do_trans_CU_simple(sv);
-
- case OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF:
- return do_trans_UU_simple(sv);
+ if (SvUTF8(sv))
+ return do_trans_UU_simple(sv);
+ else
+ return do_trans_CC_simple(sv);
case OPpTRANS_IDENTICAL:
- return do_trans_CC_count(sv);
-
- case OPpTRANS_FROM_UTF|OPpTRANS_IDENTICAL:
- return do_trans_UC_trivial(sv);
-
- case OPpTRANS_TO_UTF|OPpTRANS_IDENTICAL:
- return do_trans_CU_trivial(sv);
-
- case OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF|OPpTRANS_IDENTICAL:
- return do_trans_UU_count(sv);
+ if (SvUTF8(sv))
+ return do_trans_UU_count(sv);
+ else
+ return do_trans_CC_count(sv);
default:
- if (PL_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
+ if (SvUTF8(sv))
return do_trans_UU_complex(sv); /* could be UC or CU too */
else
return do_trans_CC_complex(sv);
diff --git a/embed.pl b/embed.pl
index f807d96ff6..377491d121 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2048,6 +2048,8 @@ Ap |U8* |utf16_to_utf8 |U16* p|U8 *d|I32 bytelen
Ap |U8* |utf16_to_utf8_reversed|U16* p|U8 *d|I32 bytelen
Ap |I32 |utf8_distance |U8 *a|U8 *b
Ap |U8* |utf8_hop |U8 *s|I32 off
+Ap |U8* |utf8_to_bytes |U8 *s|STRLEN len
+Ap |U8* |bytes_to_utf8 |U8 *s|STRLEN len
Ap |UV |utf8_to_uv |U8 *s|I32* retlen
Ap |U8* |uv_to_utf8 |U8 *d|UV uv
p |void |vivify_defelem |SV* sv
@@ -2188,8 +2190,6 @@ s |I32 |do_trans_CC_complex |SV *sv
s |I32 |do_trans_UU_simple |SV *sv
s |I32 |do_trans_UU_count |SV *sv
s |I32 |do_trans_UU_complex |SV *sv
-s |I32 |do_trans_UC_simple |SV *sv
-s |I32 |do_trans_CU_simple |SV *sv
s |I32 |do_trans_UC_trivial |SV *sv
s |I32 |do_trans_CU_trivial |SV *sv
#endif
diff --git a/pod/perlop.pod b/pod/perlop.pod
index b4caed9155..3c84e60801 100644
--- a/pod/perlop.pod
+++ b/pod/perlop.pod
@@ -1207,9 +1207,9 @@ to occur that you might want. Here are two common cases:
# expand tabs to 8-column spacing
1 while s/\t+/' ' x (length($&)*8 - length($`)%8)/e;
-=item tr/SEARCHLIST/REPLACEMENTLIST/cdsUC
+=item tr/SEARCHLIST/REPLACEMENTLIST/cds
-=item y/SEARCHLIST/REPLACEMENTLIST/cdsUC
+=item y/SEARCHLIST/REPLACEMENTLIST/cds
Transliterates all occurrences of the characters found in the search list
with the corresponding character in the replacement list. It returns
@@ -1243,8 +1243,6 @@ Options:
c Complement the SEARCHLIST.
d Delete found but unreplaced characters.
s Squash duplicate replaced characters.
- U Translate to/from UTF-8.
- C Translate to/from 8-bit char (octet).
If the C</c> modifier is specified, the SEARCHLIST character set
is complemented. If the C</d> modifier is specified, any characters
@@ -1262,10 +1260,6 @@ enough. If the REPLACEMENTLIST is empty, the SEARCHLIST is replicated.
This latter is useful for counting characters in a class or for
squashing character sequences in a class.
-The first C</U> or C</C> modifier applies to the left side of the translation.
-The second one applies to the right side. If present, these modifiers override
-the current utf8 state.
-
Examples:
$ARGV[1] =~ tr/A-Z/a-z/; # canonicalize to lower case
@@ -1285,9 +1279,6 @@ Examples:
tr [\200-\377]
[\000-\177]; # delete 8th bit
- tr/\0-\xFF//CU; # change Latin-1 to Unicode
- tr/\0-\x{FF}//UC; # change Unicode to Latin-1
-
If multiple transliterations are given for a character, only the
first one is used:
diff --git a/toke.c b/toke.c
index 05822e1e87..fe1435805c 100644
--- a/toke.c
+++ b/toke.c
@@ -6130,45 +6130,20 @@ S_scan_trans(pTHX_ char *start)
Perl_croak(aTHX_ "Transliteration replacement not terminated");
}
- if (UTF) {
- o = newSVOP(OP_TRANS, 0, 0);
- utf8 = OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF;
- }
- else {
New(803,tbl,256,short);
o = newPVOP(OP_TRANS, 0, (char*)tbl);
- utf8 = 0;
- }
complement = del = squash = 0;
- while (strchr("cdsCU", *s)) {
+ while (strchr("cds", *s)) {
if (*s == 'c')
complement = OPpTRANS_COMPLEMENT;
else if (*s == 'd')
del = OPpTRANS_DELETE;
else if (*s == 's')
squash = OPpTRANS_SQUASH;
- else {
- switch (count++) {
- case 0:
- if (*s == 'C')
- utf8 &= ~OPpTRANS_FROM_UTF;
- else
- utf8 |= OPpTRANS_FROM_UTF;
- break;
- case 1:
- if (*s == 'C')
- utf8 &= ~OPpTRANS_TO_UTF;
- else
- utf8 |= OPpTRANS_TO_UTF;
- break;
- default:
- Perl_croak(aTHX_ "Too many /C and /U options");
- }
- }
s++;
}
- o->op_private = del|squash|complement|utf8;
+ o->op_private = del|squash|complement;
PL_lex_op = o;
yylval.ival = OP_TRANS;
diff --git a/utf8.c b/utf8.c
index 76eb932f2d..b570b12ae3 100644
--- a/utf8.c
+++ b/utf8.c
@@ -222,6 +222,72 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off)
return s;
}
+/*
+=for apidoc utf8_to_bytes
+
+Converts a string C<s> of length C<len> from UTF8 into ASCII encoding.
+Unlike C<bytes_to_utf8>, this over-writes the original string.
+
+=cut
+*/
+
+U8 *
+Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN len)
+{
+ dTHR;
+ U8 *send;
+ U8 *d;
+ U8 *save;
+
+ send = s + len;
+ d = save = s;
+ while (s < send) {
+ if (*s < 0x80)
+ *d++ = *s++;
+ else {
+ I32 ulen;
+ UV uv = utf8_to_uv(s, &ulen);
+ s += ulen;
+ *d++ = (U8)uv;
+ }
+ }
+ *d = '\0';
+ return save;
+}
+
+/*
+=for apidoc bytes_to_utf8
+
+Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
+Returns a pointer to the newly-created string.
+
+*/
+
+U8*
+Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN len)
+{
+ dTHR;
+ U8 *send;
+ U8 *d;
+ U8 *dst;
+ send = s + len;
+
+ Newz(801, d, len * 2 + 1, U8);
+ dst = d;
+
+ while (s < send) {
+ if (*s < 0x80)
+ *d++ = *s++;
+ else {
+ UV uv = *s++;
+ *d++ = (( uv >> 6) | 0xc0);
+ *d++ = (( uv & 0x3f) | 0x80);
+ }
+ }
+ *d = '\0';
+ return dst;
+}
+
/* XXX NOTHING CALLS THE FOLLOWING TWO ROUTINES YET!!! */
/*
* Convert native or reversed UTF-16 to UTF-8.