summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--doio.c11
-rw-r--r--embed.h4
-rwxr-xr-xembed.pl1
-rw-r--r--global.sym1
-rw-r--r--hv.c95
-rw-r--r--objXSUB.h4
-rw-r--r--perlapi.c7
-rw-r--r--pod/perlapi.pod31
-rw-r--r--proto.h1
-rw-r--r--sv.c32
-rwxr-xr-xt/io/utf8.t10
-rw-r--r--t/lib/charnames.t12
-rwxr-xr-xt/op/each.t15
-rwxr-xr-xt/op/tr.t11
-rwxr-xr-xt/pragma/utf8.t15
-rw-r--r--toke.c25
-rw-r--r--utf8.c57
17 files changed, 275 insertions, 57 deletions
diff --git a/doio.c b/doio.c
index 6056ea704c..a1d0e466ea 100644
--- a/doio.c
+++ b/doio.c
@@ -1169,13 +1169,12 @@ Perl_do_print(pTHX_ register SV *sv, PerlIO *fp)
/* FALL THROUGH */
default:
if (PerlIO_isutf8(fp)) {
- tmps = SvPVutf8(sv, len);
- }
- else {
- if (DO_UTF8(sv))
- sv_utf8_downgrade(sv, FALSE);
- tmps = SvPV(sv, len);
+ if (!SvUTF8(sv))
+ sv_utf8_upgrade(sv = sv_mortalcopy(sv));
}
+ else if (DO_UTF8(sv))
+ sv_utf8_downgrade((sv = sv_mortalcopy(sv)), FALSE);
+ tmps = SvPV(sv, len);
break;
}
/* To detect whether the process is about to overstep its
diff --git a/embed.h b/embed.h
index f2a05c7423..fe0b6b3198 100644
--- a/embed.h
+++ b/embed.h
@@ -737,6 +737,7 @@
#define utf8_distance Perl_utf8_distance
#define utf8_hop Perl_utf8_hop
#define utf8_to_bytes Perl_utf8_to_bytes
+#define bytes_from_utf8 Perl_bytes_from_utf8
#define bytes_to_utf8 Perl_bytes_to_utf8
#define utf8_to_uv_simple Perl_utf8_to_uv_simple
#define utf8_to_uv Perl_utf8_to_uv
@@ -2214,6 +2215,7 @@
#define utf8_distance(a,b) Perl_utf8_distance(aTHX_ a,b)
#define utf8_hop(a,b) Perl_utf8_hop(aTHX_ a,b)
#define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b)
+#define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c)
#define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
#define utf8_to_uv_simple(a,b) Perl_utf8_to_uv_simple(aTHX_ a,b)
#define utf8_to_uv(a,b,c,d) Perl_utf8_to_uv(aTHX_ a,b,c,d)
@@ -4342,6 +4344,8 @@
#define utf8_hop Perl_utf8_hop
#define Perl_utf8_to_bytes CPerlObj::Perl_utf8_to_bytes
#define utf8_to_bytes Perl_utf8_to_bytes
+#define Perl_bytes_from_utf8 CPerlObj::Perl_bytes_from_utf8
+#define bytes_from_utf8 Perl_bytes_from_utf8
#define Perl_bytes_to_utf8 CPerlObj::Perl_bytes_to_utf8
#define bytes_to_utf8 Perl_bytes_to_utf8
#define Perl_utf8_to_uv_simple CPerlObj::Perl_utf8_to_uv_simple
diff --git a/embed.pl b/embed.pl
index e71337bfd0..97327730b2 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2085,6 +2085,7 @@ Adp |STRLEN |utf8_length |U8* s|U8 *e
Apd |IV |utf8_distance |U8 *a|U8 *b
Apd |U8* |utf8_hop |U8 *s|I32 off
ApMd |U8* |utf8_to_bytes |U8 *s|STRLEN *len
+ApMd |U8* |bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
ApMd |U8* |bytes_to_utf8 |U8 *s|STRLEN *len
Apd |UV |utf8_to_uv_simple|U8 *s|STRLEN* retlen
Adp |UV |utf8_to_uv |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
diff --git a/global.sym b/global.sym
index 48128c911d..dab2a7cd1f 100644
--- a/global.sym
+++ b/global.sym
@@ -470,6 +470,7 @@ Perl_utf8_length
Perl_utf8_distance
Perl_utf8_hop
Perl_utf8_to_bytes
+Perl_bytes_from_utf8
Perl_bytes_to_utf8
Perl_utf8_to_uv_simple
Perl_utf8_to_uv
diff --git a/hv.c b/hv.c
index 0e50523bea..c999488bf3 100644
--- a/hv.c
+++ b/hv.c
@@ -152,6 +152,7 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval)
register HE *entry;
SV *sv;
bool is_utf8 = FALSE;
+ const char *keysave = key;
if (!hv)
return 0;
@@ -196,6 +197,9 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval)
return 0;
}
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8);
+
PERL_HASH(hash, key, klen);
entry = ((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max];
@@ -208,6 +212,8 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval)
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
return &HeVAL(entry);
}
#ifdef DYNAMIC_ENV_FETCH /* %ENV lookup? If so, try to fetch the value now */
@@ -217,14 +223,24 @@ Perl_hv_fetch(pTHX_ HV *hv, const char *key, I32 klen, I32 lval)
if (env) {
sv = newSVpvn(env,len);
SvTAINTED_on(sv);
+ if (key != keysave)
+ Safefree(key);
return hv_store(hv,key,klen,sv,hash);
}
}
#endif
if (lval) { /* gonna assign to this, so it better be there */
sv = NEWSV(61,0);
- return hv_store(hv,key,is_utf8?-klen:klen,sv,hash);
+ if (key != keysave) { /* must be is_utf8 == 0 */
+ SV **ret = hv_store(hv,key,klen,sv,hash);
+ Safefree(key);
+ return ret;
+ }
+ else
+ return hv_store(hv,key,is_utf8?-klen:klen,sv,hash);
}
+ if (key != keysave)
+ Safefree(key);
return 0;
}
@@ -256,6 +272,7 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash)
register HE *entry;
SV *sv;
bool is_utf8;
+ char *keysave;
if (!hv)
return 0;
@@ -304,9 +321,12 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash)
return 0;
}
- key = SvPV(keysv, klen);
+ keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv)!=0);
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+
if (!hash)
PERL_HASH(hash, key, klen);
@@ -320,6 +340,8 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash)
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
return entry;
}
#ifdef DYNAMIC_ENV_FETCH /* %ENV lookup? If so, try to fetch the value now */
@@ -333,6 +355,8 @@ Perl_hv_fetch_ent(pTHX_ HV *hv, SV *keysv, I32 lval, register U32 hash)
}
}
#endif
+ if (key != keysave)
+ Safefree(key);
if (lval) { /* gonna assign to this, so it better be there */
sv = NEWSV(61,0);
return hv_store_ent(hv,keysv,sv,hash);
@@ -385,6 +409,7 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has
register HE *entry;
register HE **oentry;
bool is_utf8 = FALSE;
+ const char *keysave = key;
if (!hv)
return 0;
@@ -412,6 +437,9 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has
#endif
}
}
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8);
+
if (!hash)
PERL_HASH(hash, key, klen);
@@ -433,6 +461,8 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has
continue;
SvREFCNT_dec(HeVAL(entry));
HeVAL(entry) = val;
+ if (key != keysave)
+ Safefree(key);
return &HeVAL(entry);
}
@@ -441,6 +471,8 @@ Perl_hv_store(pTHX_ HV *hv, const char *key, I32 klen, SV *val, register U32 has
HeKEY_hek(entry) = share_hek(key, is_utf8?-klen:klen, hash);
else /* gotta do the real thing */
HeKEY_hek(entry) = save_hek(key, is_utf8?-klen:klen, hash);
+ if (key != keysave)
+ Safefree(key);
HeVAL(entry) = val;
HeNEXT(entry) = *oentry;
*oentry = entry;
@@ -484,6 +516,7 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash)
register HE *entry;
register HE **oentry;
bool is_utf8;
+ char *keysave;
if (!hv)
return 0;
@@ -513,9 +546,12 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash)
}
}
- key = SvPV(keysv, klen);
+ keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv) != 0);
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+
if (!hash)
PERL_HASH(hash, key, klen);
@@ -537,6 +573,8 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash)
continue;
SvREFCNT_dec(HeVAL(entry));
HeVAL(entry) = val;
+ if (key != keysave)
+ Safefree(key);
return entry;
}
@@ -545,6 +583,8 @@ Perl_hv_store_ent(pTHX_ HV *hv, SV *keysv, SV *val, register U32 hash)
HeKEY_hek(entry) = share_hek(key, is_utf8?-klen:klen, hash);
else /* gotta do the real thing */
HeKEY_hek(entry) = save_hek(key, is_utf8?-klen:klen, hash);
+ if (key != keysave)
+ Safefree(key);
HeVAL(entry) = val;
HeNEXT(entry) = *oentry;
*oentry = entry;
@@ -581,6 +621,7 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
SV **svp;
SV *sv;
bool is_utf8 = FALSE;
+ const char *keysave = key;
if (!hv)
return Nullsv;
@@ -615,6 +656,9 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
if (!xhv->xhv_array)
return Nullsv;
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8);
+
PERL_HASH(hash, key, klen);
oentry = &((HE**)xhv->xhv_array)[hash & (I32) xhv->xhv_max];
@@ -629,6 +673,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
*oentry = HeNEXT(entry);
if (i && !*oentry)
xhv->xhv_fill--;
@@ -645,6 +691,8 @@ Perl_hv_delete(pTHX_ HV *hv, const char *key, I32 klen, I32 flags)
--xhv->xhv_keys;
return sv;
}
+ if (key != keysave)
+ Safefree(key);
return Nullsv;
}
@@ -670,6 +718,7 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
register HE **oentry;
SV *sv;
bool is_utf8;
+ char *keysave;
if (!hv)
return Nullsv;
@@ -702,9 +751,12 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
if (!xhv->xhv_array)
return Nullsv;
- key = SvPV(keysv, klen);
+ keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv) != 0);
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
+
if (!hash)
PERL_HASH(hash, key, klen);
@@ -720,6 +772,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
*oentry = HeNEXT(entry);
if (i && !*oentry)
xhv->xhv_fill--;
@@ -736,6 +790,8 @@ Perl_hv_delete_ent(pTHX_ HV *hv, SV *keysv, I32 flags, U32 hash)
--xhv->xhv_keys;
return sv;
}
+ if (key != keysave)
+ Safefree(key);
return Nullsv;
}
@@ -756,6 +812,7 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen)
register HE *entry;
SV *sv;
bool is_utf8 = FALSE;
+ const char *keysave = key;
if (!hv)
return 0;
@@ -786,6 +843,9 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen)
return 0;
#endif
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, (STRLEN*)&klen, &is_utf8);
+
PERL_HASH(hash, key, klen);
#ifdef DYNAMIC_ENV_FETCH
@@ -802,6 +862,8 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen)
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
return TRUE;
}
#ifdef DYNAMIC_ENV_FETCH /* is it out there? */
@@ -816,6 +878,8 @@ Perl_hv_exists(pTHX_ HV *hv, const char *key, I32 klen)
}
}
#endif
+ if (key != keysave)
+ Safefree(key);
return FALSE;
}
@@ -839,6 +903,7 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash)
register HE *entry;
SV *sv;
bool is_utf8;
+ char *keysave;
if (!hv)
return 0;
@@ -867,8 +932,10 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash)
return 0;
#endif
- key = SvPV(keysv, klen);
+ keysave = key = SvPV(keysv, klen);
is_utf8 = (SvUTF8(keysv) != 0);
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ key = (char*)bytes_from_utf8((U8*)key, &klen, &is_utf8);
if (!hash)
PERL_HASH(hash, key, klen);
@@ -886,6 +953,8 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash)
continue;
if (HeKUTF8(entry) != (char)is_utf8)
continue;
+ if (key != keysave)
+ Safefree(key);
return TRUE;
}
#ifdef DYNAMIC_ENV_FETCH /* is it out there? */
@@ -900,6 +969,8 @@ Perl_hv_exists_ent(pTHX_ HV *hv, SV *keysv, U32 hash)
}
}
#endif
+ if (key != keysave)
+ Safefree(key);
return FALSE;
}
@@ -1471,10 +1542,13 @@ Perl_unsharepvn(pTHX_ const char *str, I32 len, U32 hash)
register I32 i = 1;
I32 found = 0;
bool is_utf8 = FALSE;
+ const char *save = str;
if (len < 0) {
len = -len;
is_utf8 = TRUE;
+ if (!(PL_hints & HINT_UTF8_DISTINCT))
+ str = (char*)bytes_from_utf8((U8*)str, (STRLEN*)&len, &is_utf8);
}
/* what follows is the moral equivalent of:
@@ -1507,7 +1581,8 @@ Perl_unsharepvn(pTHX_ const char *str, I32 len, U32 hash)
break;
}
UNLOCK_STRTAB_MUTEX;
-
+ if (str != save)
+ Safefree(str);
if (!found && ckWARN_d(WARN_INTERNAL))
Perl_warner(aTHX_ WARN_INTERNAL, "Attempt to free non-existent shared string '%s'",str);
}
@@ -1525,10 +1600,13 @@ Perl_share_hek(pTHX_ const char *str, I32 len, register U32 hash)
register I32 i = 1;
I32 found = 0;
bool is_utf8 = FALSE;
+ const char *save = str;
if (len < 0) {
len = -len;
is_utf8 = TRUE;
+ if (!(PL_hints & HINT_UTF8_DISTINCT))
+ str = (char*)bytes_from_utf8((U8*)str, (STRLEN*)&len, &is_utf8);
}
/* what follows is the moral equivalent of:
@@ -1568,8 +1646,7 @@ Perl_share_hek(pTHX_ const char *str, I32 len, register U32 hash)
++HeVAL(entry); /* use value slot as REFCNT */
UNLOCK_STRTAB_MUTEX;
+ if (str != save)
+ Safefree(str);
return HeKEY_hek(entry);
}
-
-
-
diff --git a/objXSUB.h b/objXSUB.h
index 8cdfec059e..51e4eb2e9b 100644
--- a/objXSUB.h
+++ b/objXSUB.h
@@ -1889,6 +1889,10 @@
#define Perl_utf8_to_bytes pPerl->Perl_utf8_to_bytes
#undef utf8_to_bytes
#define utf8_to_bytes Perl_utf8_to_bytes
+#undef Perl_bytes_from_utf8
+#define Perl_bytes_from_utf8 pPerl->Perl_bytes_from_utf8
+#undef bytes_from_utf8
+#define bytes_from_utf8 Perl_bytes_from_utf8
#undef Perl_bytes_to_utf8
#define Perl_bytes_to_utf8 pPerl->Perl_bytes_to_utf8
#undef bytes_to_utf8
diff --git a/perlapi.c b/perlapi.c
index f0016d516e..04d48b92b5 100644
--- a/perlapi.c
+++ b/perlapi.c
@@ -3413,6 +3413,13 @@ Perl_utf8_to_bytes(pTHXo_ U8 *s, STRLEN *len)
return ((CPerlObj*)pPerl)->Perl_utf8_to_bytes(s, len);
}
+#undef Perl_bytes_from_utf8
+U8*
+Perl_bytes_from_utf8(pTHXo_ U8 *s, STRLEN *len, bool *is_utf8)
+{
+ return ((CPerlObj*)pPerl)->Perl_bytes_from_utf8(s, len, is_utf8);
+}
+
#undef Perl_bytes_to_utf8
U8*
Perl_bytes_to_utf8(pTHXo_ U8 *s, STRLEN *len)
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 25fe18a0b6..1828b55e29 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -182,6 +182,23 @@ must then use C<av_store> to assign values to these new elements.
=for hackers
Found in file av.c
+=item bytes_from_utf8
+
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
+the newly-created string, and updates C<len> to contain the new length.
+Returns the original string if no conversion occurs, C<len> and
+C<is_utf8> are unchanged. Do nothing if C<is_utf8> points to 0. Sets
+C<is_utf8> to 0 if C<s> is converted or malformed .
+
+NOTE: this function is experimental and may change or be
+removed without notice.
+
+ U8* bytes_from_utf8(U8 *s, STRLEN *len, bool *is_utf8)
+
+=for hackers
+Found in file utf8.c
+
=item bytes_to_utf8
Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
@@ -2420,19 +2437,19 @@ false, defined or undefined. Does not handle 'get' magic.
=for hackers
Found in file sv.h
-=item svtype
+=item SvTYPE
-An enum of flags for Perl types. These are found in the file B<sv.h>
-in the C<svtype> enum. Test these flags with the C<SvTYPE> macro.
+Returns the type of the SV. See C<svtype>.
+
+ svtype SvTYPE(SV* sv)
=for hackers
Found in file sv.h
-=item SvTYPE
-
-Returns the type of the SV. See C<svtype>.
+=item svtype
- svtype SvTYPE(SV* sv)
+An enum of flags for Perl types. These are found in the file B<sv.h>
+in the C<svtype> enum. Test these flags with the C<SvTYPE> macro.
=for hackers
Found in file sv.h
diff --git a/proto.h b/proto.h
index a1f0fee8a6..e39d33e9fb 100644
--- a/proto.h
+++ b/proto.h
@@ -818,6 +818,7 @@ PERL_CALLCONV STRLEN Perl_utf8_length(pTHX_ U8* s, U8 *e);
PERL_CALLCONV IV Perl_utf8_distance(pTHX_ U8 *a, U8 *b);
PERL_CALLCONV U8* Perl_utf8_hop(pTHX_ U8 *s, I32 off);
PERL_CALLCONV U8* Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len);
+PERL_CALLCONV U8* Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8);
PERL_CALLCONV U8* Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len);
PERL_CALLCONV UV Perl_utf8_to_uv_simple(pTHX_ U8 *s, STRLEN* retlen);
PERL_CALLCONV UV Perl_utf8_to_uv(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
diff --git a/sv.c b/sv.c
index ed7ebdcbe1..c53486ac8c 100644
--- a/sv.c
+++ b/sv.c
@@ -4690,30 +4690,24 @@ Perl_sv_eq(pTHX_ register SV *sv1, register SV *sv2)
/* do not utf8ize the comparands as a side-effect */
if (cur1 && cur2 && SvUTF8(sv1) != SvUTF8(sv2) && !IN_BYTE) {
+ bool is_utf8 = TRUE;
+
if (PL_hints & HINT_UTF8_DISTINCT)
return FALSE;
if (SvUTF8(sv1)) {
- (void)utf8_to_bytes((U8*)(pv1 = savepvn(pv1, cur1)), &cur1);
- {
- IV scur1 = cur1;
- if (scur1 < 0) {
- Safefree(pv1);
- return 0;
- }
- }
- pv1tmp = TRUE;
+ char *pv = bytes_from_utf8((U8*)pv1, &cur1, &is_utf8);
+ if (is_utf8)
+ return 0;
+ pv1tmp = (pv != pv1);
+ pv1 = pv;
}
else {
- (void)utf8_to_bytes((U8*)(pv2 = savepvn(pv2, cur2)), &cur2);
- {
- IV scur2 = cur2;
- if (scur2 < 0) {
- Safefree(pv2);
- return 0;
- }
- }
- pv2tmp = TRUE;
+ char *pv = bytes_from_utf8((U8*)pv2, &cur2, &is_utf8);
+ if (is_utf8)
+ return 0;
+ pv2tmp = (pv != pv2);
+ pv2 = pv;
}
}
@@ -5601,6 +5595,8 @@ Perl_newSVpvn_share(pTHX_ const char *src, I32 len, U32 hash)
len = -len;
is_utf8 = TRUE;
}
+ if (is_utf8 && !(PL_hints & HINT_UTF8_DISTINCT))
+ src = (char*)bytes_from_utf8((U8*)src, (STRLEN*)&len, &is_utf8);
if (!hash)
PERL_HASH(hash, src, len);
new_SV(sv);
diff --git a/t/io/utf8.t b/t/io/utf8.t
index 04554e7797..d0201aaffb 100755
--- a/t/io/utf8.t
+++ b/t/io/utf8.t
@@ -91,17 +91,17 @@ print "not " unless $y == 1;
print "ok 17\n";
}
-print F $b,"\n"; # This upgrades $b!
+print F $b,"\n"; # Don't upgrades $b
{ # Check byte length of $b
use bytes; my $y = length($b);
-print "not " unless $y == 2;
+print "not ($y) " unless $y == 1;
print "ok 18\n";
}
{ my $x = tell(F);
{ use bytes; $y += 3;}
- print "not " unless $x == $y;
+ print "not ($x,$y) " unless $x == $y;
print "ok 19\n";
}
@@ -110,14 +110,14 @@ close F;
open F, "a" or die $!; # Not UTF
$x = <F>;
chomp($x);
-print "not " unless $x eq v196.172.194.130;
+printf "not (%vd) ", $x unless $x eq v196.172.194.130;
print "ok 20\n";
open F, "<:utf8", "a" or die $!;
$x = <F>;
chomp($x);
close F;
-print "not " unless $x eq chr(300).chr(130);
+printf "not (%vd) ", $x unless $x eq chr(300).chr(130);
print "ok 21\n";
# Now let's make it suffer.
diff --git a/t/lib/charnames.t b/t/lib/charnames.t
index 14da2e0f7b..d00396fb4a 100644
--- a/t/lib/charnames.t
+++ b/t/lib/charnames.t
@@ -8,7 +8,7 @@ BEGIN {
}
$| = 1;
-print "1..13\n";
+print "1..14\n";
use charnames ':full';
@@ -44,8 +44,7 @@ $encoded_alpha = "\316\261";
$encoded_bet = "\327\221";
sub to_bytes {
- use bytes;
- "".shift;
+ pack"a*", shift;
}
{
@@ -90,3 +89,10 @@ sub to_bytes {
print "ok 13\n";
}
+{
+ use charnames qw(:full);
+ use utf8;
+ print "not " unless "\x{100}\N{CENT SIGN}" eq "\x{100}"."\N{CENT SIGN}";
+ print "ok 14\n";
+}
+
diff --git a/t/op/each.t b/t/op/each.t
index 35792ab9c3..f1012c6402 100755
--- a/t/op/each.t
+++ b/t/op/each.t
@@ -1,6 +1,12 @@
#!./perl
-print "1..24\n";
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '.';
+ push @INC, '../lib';
+}
+
+print "1..25\n";
$h{'abc'} = 'ABC';
$h{'def'} = 'DEF';
@@ -156,3 +162,10 @@ print "#$b{$_}\n" for keys %b; # Used to core dump before change #8056.
print "ok 23\n";
print "#$u{$_}\n" for keys %u; # Used to core dump before change #8056.
print "ok 24\n";
+
+%u = (qu"\xe3\x81\x82" => "downglade");
+for (keys %u) {
+ use bytes;
+ print "not " if length ne 3 or $_ ne "\xe3\x81\x82";
+ print "ok 25\n";
+}
diff --git a/t/op/tr.t b/t/op/tr.t
index 6a4e1aa5f5..75887ab31c 100755
--- a/t/op/tr.t
+++ b/t/op/tr.t
@@ -5,7 +5,7 @@ BEGIN {
@INC = '../lib';
}
-print "1..49\n";
+print "1..51\n";
$_ = "abcdefghijklmnopqrstuvwxyz";
@@ -287,3 +287,12 @@ print "ok 48\n";
print "not " unless sprintf("%vd", $a) eq '196.172.200';
print "ok 49\n";
+# UTF8 range
+
+($a = v300.196.172.302.197.172) =~ tr/\x{12c}-\x{130}/\xc0-\xc4/;
+print "not " unless $a eq v192.196.172.194.197.172;
+print "ok 50\n";
+
+($a = v300.196.172.302.197.172) =~ tr/\xc4-\xc8/\x{12c}-\x{130}/;
+print "not " unless $a eq v300.300.172.302.301.172;
+print "ok 51\n";
diff --git a/t/pragma/utf8.t b/t/pragma/utf8.t
index 546b217f27..577e6b4e2b 100755
--- a/t/pragma/utf8.t
+++ b/t/pragma/utf8.t
@@ -10,7 +10,7 @@ BEGIN {
}
}
-print "1..106\n";
+print "1..107\n";
my $test = 1;
@@ -564,3 +564,16 @@ sub nok_bytes {
print "ok $test\n";
$test++; # 106
}
+
+{
+ use utf8;
+
+ my $w = 0;
+ local $SIG{__WARN__} = sub { print "#($_[0])\n"; $w++ };
+ my $x = eval q/"\\/ . "\x{100}" . q/"/;;
+
+ print "not " unless $w == 0 && $x eq "\x{100}";
+ print "ok $test\n";
+ $test++; # 107
+}
+
diff --git a/toke.c b/toke.c
index 1b41dccb4b..a85a7252a9 100644
--- a/toke.c
+++ b/toke.c
@@ -1388,8 +1388,7 @@ S_scan_const(pTHX_ char *start)
"Unrecognized escape \\%c passed through",
*s);
/* default action is to copy the quoted character */
- *d++ = *s++;
- continue;
+ goto default_action;
}
/* \132 indicates an octal constant */
@@ -1479,6 +1478,13 @@ S_scan_const(pTHX_ char *start)
if (has_utf8 || uv > 255) {
d = (char*)uv_to_utf8((U8*)d, uv);
has_utf8 = TRUE;
+ if (PL_lex_inwhat == OP_TRANS &&
+ PL_sublex_info.sub_op) {
+ PL_sublex_info.sub_op->op_private |=
+ (PL_lex_repl ? OPpTRANS_FROM_UTF
+ : OPpTRANS_TO_UTF);
+ utf = TRUE;
+ }
}
else {
*d++ = (char)uv;
@@ -1506,6 +1512,8 @@ S_scan_const(pTHX_ char *start)
res = newSVpvn(s + 1, e - s - 1);
res = new_constant( Nullch, 0, "charnames",
res, Nullsv, "\\N{...}" );
+ if (has_utf8)
+ sv_utf8_upgrade(res);
str = SvPV(res,len);
if (!has_utf8 && SvUTF8(res)) {
char *ostart = SvPVX(sv);
@@ -1588,8 +1596,7 @@ S_scan_const(pTHX_ char *start)
continue;
} /* end if (backslash) */
- /* (now in tr/// code again) */
-
+ default_action:
if (UTF8_IS_CONTINUED(*s) && (this_utf8 || has_utf8)) {
STRLEN len = (STRLEN) -1;
UV uv;
@@ -1608,10 +1615,15 @@ S_scan_const(pTHX_ char *start)
*d++ = *s++;
}
has_utf8 = TRUE;
+ if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
+ PL_sublex_info.sub_op->op_private |=
+ (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF);
+ utf = TRUE;
+ }
continue;
}
- *d++ = *s++;
+ *d++ = *s++;
} /* while loop to process each character */
/* terminate the string and set up the sv */
@@ -4742,7 +4754,8 @@ Perl_yylex(pTHX)
case KEY_qq:
case KEY_qu:
s = scan_str(s,FALSE,FALSE);
- if (tmp == KEY_qu && is_utf8_string((U8*)s, SvCUR(PL_lex_stuff)))
+ if (tmp == KEY_qu &&
+ is_utf8_string((U8*)SvPVX(PL_lex_stuff), SvCUR(PL_lex_stuff)))
SvUTF8_on(PL_lex_stuff);
if (!s)
missingterm((char*)0);
diff --git a/utf8.c b/utf8.c
index 156e63f717..046df74d9c 100644
--- a/utf8.c
+++ b/utf8.c
@@ -583,6 +583,63 @@ Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len)
}
/*
+=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
+
+Converts a string C<s> of length C<len> from UTF8 into byte encoding.
+Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
+the newly-created string, and updates C<len> to contain the new length.
+Returns the original string if no conversion occurs, C<len> and
+C<is_utf8> are unchanged. Do nothing if C<is_utf8> points to 0. Sets
+C<is_utf8> to 0 if C<s> is converted or malformed .
+
+=cut */
+
+U8 *
+Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8)
+{
+ U8 *send;
+ U8 *d;
+ U8 *start = s;
+ I32 count = 0;
+
+ if (!*is_utf8)
+ return start;
+
+ /* ensure valid UTF8 and chars < 256 before updating string */
+ for (send = s + *len; s < send;) {
+ U8 c = *s++;
+ if (!UTF8_IS_ASCII(c)) {
+ if (UTF8_IS_CONTINUATION(c) || s >= send ||
+ !UTF8_IS_CONTINUATION(*s)) {
+ *is_utf8 = 0;
+ return start;
+ }
+ if ((c & 0xfc) != 0xc0)
+ return start;
+ s++, count++;
+ }
+ }
+
+ *is_utf8 = 0;
+
+ if (!count)
+ return start;
+
+ Newz(801, d, (*len) - count + 1, U8);
+ d = s = start;
+ while (s < send) {
+ U8 c = *s++;
+ if (UTF8_IS_ASCII(c))
+ *d++ = c;
+ else
+ *d++ = UTF8_ACCUMULATE(c&3, *s++);
+ }
+ *d = '\0';
+ *len = d - start;
+ return start;
+}
+
+/*
=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.