summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Todo-5.63
-rw-r--r--embed.h16
-rwxr-xr-xembed.pl4
-rw-r--r--global.sym4
-rw-r--r--objXSUB.h16
-rw-r--r--perlapi.c28
-rw-r--r--pp_hot.c9
-rw-r--r--proto.h4
-rw-r--r--sv.c191
-rw-r--r--toke.c8
10 files changed, 269 insertions, 14 deletions
diff --git a/Todo-5.6 b/Todo-5.6
index 52fbc50a9a..28b146da4c 100644
--- a/Todo-5.6
+++ b/Todo-5.6
@@ -4,7 +4,7 @@ Bugs
Unicode support
finish byte <-> utf8 and localencoding <-> utf8 conversions
- make "$bytestr$charstr" do the right conversion
+ make substr($bytestr,0,0, $charstr) do the right conversion
add Unicode::Map equivivalent to core
add support for I/O disciplines
- open(F, "<!crlf!utf16", $file)
@@ -15,6 +15,7 @@ Unicode support
support C<print v1.2.3>
make C<v123> mean C<chr(123)> (if !exists(&v123))
autoload utf8_heavy.pl's swash routines in swash_init()
+ check uv_to_utf8() calls for buffer overflow
Multi-threading
support "use Thread;" under useithreads
diff --git a/embed.h b/embed.h
index b16eb3d00f..be6a68527f 100644
--- a/embed.h
+++ b/embed.h
@@ -783,6 +783,10 @@
#define sv_pv Perl_sv_pv
#define sv_pvutf8 Perl_sv_pvutf8
#define sv_pvbyte Perl_sv_pvbyte
+#define sv_utf8_upgrade Perl_sv_utf8_upgrade
+#define sv_utf8_downgrade Perl_sv_utf8_downgrade
+#define sv_utf8_encode Perl_sv_utf8_encode
+#define sv_utf8_decode Perl_sv_utf8_decode
#define sv_force_normal Perl_sv_force_normal
#define tmps_grow Perl_tmps_grow
#define sv_rvweaken Perl_sv_rvweaken
@@ -2191,6 +2195,10 @@
#define sv_pv(a) Perl_sv_pv(aTHX_ a)
#define sv_pvutf8(a) Perl_sv_pvutf8(aTHX_ a)
#define sv_pvbyte(a) Perl_sv_pvbyte(aTHX_ a)
+#define sv_utf8_upgrade(a) Perl_sv_utf8_upgrade(aTHX_ a)
+#define sv_utf8_downgrade(a,b) Perl_sv_utf8_downgrade(aTHX_ a,b)
+#define sv_utf8_encode(a) Perl_sv_utf8_encode(aTHX_ a)
+#define sv_utf8_decode(a) Perl_sv_utf8_decode(aTHX_ a)
#define sv_force_normal(a) Perl_sv_force_normal(aTHX_ a)
#define tmps_grow(a) Perl_tmps_grow(aTHX_ a)
#define sv_rvweaken(a) Perl_sv_rvweaken(aTHX_ a)
@@ -4299,6 +4307,14 @@
#define sv_pvutf8 Perl_sv_pvutf8
#define Perl_sv_pvbyte CPerlObj::Perl_sv_pvbyte
#define sv_pvbyte Perl_sv_pvbyte
+#define Perl_sv_utf8_upgrade CPerlObj::Perl_sv_utf8_upgrade
+#define sv_utf8_upgrade Perl_sv_utf8_upgrade
+#define Perl_sv_utf8_downgrade CPerlObj::Perl_sv_utf8_downgrade
+#define sv_utf8_downgrade Perl_sv_utf8_downgrade
+#define Perl_sv_utf8_encode CPerlObj::Perl_sv_utf8_encode
+#define sv_utf8_encode Perl_sv_utf8_encode
+#define Perl_sv_utf8_decode CPerlObj::Perl_sv_utf8_decode
+#define sv_utf8_decode Perl_sv_utf8_decode
#define Perl_sv_force_normal CPerlObj::Perl_sv_force_normal
#define sv_force_normal Perl_sv_force_normal
#define Perl_tmps_grow CPerlObj::Perl_tmps_grow
diff --git a/embed.pl b/embed.pl
index 952e673529..3366a24b81 100755
--- a/embed.pl
+++ b/embed.pl
@@ -2108,6 +2108,10 @@ Ap |char* |sv_2pvbyte_nolen|SV* sv
Ap |char* |sv_pv |SV *sv
Ap |char* |sv_pvutf8 |SV *sv
Ap |char* |sv_pvbyte |SV *sv
+Ap |void |sv_utf8_upgrade|SV *sv
+Ap |bool |sv_utf8_downgrade|SV *sv|bool fail_ok
+Ap |void |sv_utf8_encode |SV *sv
+Ap |bool |sv_utf8_decode |SV *sv
Ap |void |sv_force_normal|SV *sv
Ap |void |tmps_grow |I32 n
Ap |SV* |sv_rvweaken |SV *sv
diff --git a/global.sym b/global.sym
index 1451d85db8..fee76148b9 100644
--- a/global.sym
+++ b/global.sym
@@ -499,6 +499,10 @@ Perl_sv_2pvbyte_nolen
Perl_sv_pv
Perl_sv_pvutf8
Perl_sv_pvbyte
+Perl_sv_utf8_upgrade
+Perl_sv_utf8_downgrade
+Perl_sv_utf8_encode
+Perl_sv_utf8_decode
Perl_sv_force_normal
Perl_tmps_grow
Perl_sv_rvweaken
diff --git a/objXSUB.h b/objXSUB.h
index 1243e9e668..2897a6ae5e 100644
--- a/objXSUB.h
+++ b/objXSUB.h
@@ -2023,6 +2023,22 @@
#define Perl_sv_pvbyte pPerl->Perl_sv_pvbyte
#undef sv_pvbyte
#define sv_pvbyte Perl_sv_pvbyte
+#undef Perl_sv_utf8_upgrade
+#define Perl_sv_utf8_upgrade pPerl->Perl_sv_utf8_upgrade
+#undef sv_utf8_upgrade
+#define sv_utf8_upgrade Perl_sv_utf8_upgrade
+#undef Perl_sv_utf8_downgrade
+#define Perl_sv_utf8_downgrade pPerl->Perl_sv_utf8_downgrade
+#undef sv_utf8_downgrade
+#define sv_utf8_downgrade Perl_sv_utf8_downgrade
+#undef Perl_sv_utf8_encode
+#define Perl_sv_utf8_encode pPerl->Perl_sv_utf8_encode
+#undef sv_utf8_encode
+#define sv_utf8_encode Perl_sv_utf8_encode
+#undef Perl_sv_utf8_decode
+#define Perl_sv_utf8_decode pPerl->Perl_sv_utf8_decode
+#undef sv_utf8_decode
+#define sv_utf8_decode Perl_sv_utf8_decode
#undef Perl_sv_force_normal
#define Perl_sv_force_normal pPerl->Perl_sv_force_normal
#undef sv_force_normal
diff --git a/perlapi.c b/perlapi.c
index f897146b6f..f0824984b3 100644
--- a/perlapi.c
+++ b/perlapi.c
@@ -3659,6 +3659,34 @@ Perl_sv_pvbyte(pTHXo_ SV *sv)
return ((CPerlObj*)pPerl)->Perl_sv_pvbyte(sv);
}
+#undef Perl_sv_utf8_upgrade
+void
+Perl_sv_utf8_upgrade(pTHXo_ SV *sv)
+{
+ ((CPerlObj*)pPerl)->Perl_sv_utf8_upgrade(sv);
+}
+
+#undef Perl_sv_utf8_downgrade
+bool
+Perl_sv_utf8_downgrade(pTHXo_ SV *sv, bool fail_ok)
+{
+ return ((CPerlObj*)pPerl)->Perl_sv_utf8_downgrade(sv, fail_ok);
+}
+
+#undef Perl_sv_utf8_encode
+void
+Perl_sv_utf8_encode(pTHXo_ SV *sv)
+{
+ ((CPerlObj*)pPerl)->Perl_sv_utf8_encode(sv);
+}
+
+#undef Perl_sv_utf8_decode
+bool
+Perl_sv_utf8_decode(pTHXo_ SV *sv)
+{
+ return ((CPerlObj*)pPerl)->Perl_sv_utf8_decode(sv);
+}
+
#undef Perl_sv_force_normal
void
Perl_sv_force_normal(pTHXo_ SV *sv)
diff --git a/pp_hot.c b/pp_hot.c
index 8dab65150c..ddb900f261 100644
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -184,7 +184,16 @@ PP(pp_concat)
}
}
#endif
+ if (DO_UTF8(right))
+ sv_utf8_upgrade(TARG);
sv_catpvn(TARG,s,len);
+ if (!IN_BYTE) {
+ if (SvUTF8(right))
+ SvUTF8_on(TARG);
+ }
+ else if (!SvUTF8(right)) {
+ SvUTF8_off(TARG);
+ }
}
else
sv_setpvn(TARG,s,len); /* suppress warning */
diff --git a/proto.h b/proto.h
index df2ddb4430..31b8f45649 100644
--- a/proto.h
+++ b/proto.h
@@ -885,6 +885,10 @@ PERL_CALLCONV char* Perl_sv_2pvbyte_nolen(pTHX_ SV* sv);
PERL_CALLCONV char* Perl_sv_pv(pTHX_ SV *sv);
PERL_CALLCONV char* Perl_sv_pvutf8(pTHX_ SV *sv);
PERL_CALLCONV char* Perl_sv_pvbyte(pTHX_ SV *sv);
+PERL_CALLCONV void Perl_sv_utf8_upgrade(pTHX_ SV *sv);
+PERL_CALLCONV bool Perl_sv_utf8_downgrade(pTHX_ SV *sv, bool fail_ok);
+PERL_CALLCONV void Perl_sv_utf8_encode(pTHX_ SV *sv);
+PERL_CALLCONV bool Perl_sv_utf8_decode(pTHX_ SV *sv);
PERL_CALLCONV void Perl_sv_force_normal(pTHX_ SV *sv);
PERL_CALLCONV void Perl_tmps_grow(pTHX_ I32 n);
PERL_CALLCONV SV* Perl_sv_rvweaken(pTHX_ SV *sv);
diff --git a/sv.c b/sv.c
index 616344b33d..e22dbc27d0 100644
--- a/sv.c
+++ b/sv.c
@@ -2214,7 +2214,8 @@ Perl_sv_2pv(pTHX_ register SV *sv, STRLEN *lp)
char *
Perl_sv_2pvbyte_nolen(pTHX_ register SV *sv)
{
- return sv_2pv_nolen(sv);
+ STRLEN n_a;
+ return sv_2pvbyte(sv, &n_a);
}
char *
@@ -2226,12 +2227,14 @@ Perl_sv_2pvbyte(pTHX_ register SV *sv, STRLEN *lp)
char *
Perl_sv_2pvutf8_nolen(pTHX_ register SV *sv)
{
- return sv_2pv_nolen(sv);
+ STRLEN n_a;
+ return sv_2pvutf8(sv, &n_a);
}
char *
Perl_sv_2pvutf8(pTHX_ register SV *sv, STRLEN *lp)
{
+ sv_utf8_upgrade(sv);
return sv_2pv(sv,lp);
}
@@ -2273,6 +2276,139 @@ Perl_sv_2bool(pTHX_ register SV *sv)
}
}
+void
+Perl_sv_utf8_upgrade(pTHX_ register SV *sv)
+{
+ int hicount;
+ char *c;
+
+ if (!sv || !SvPOK(sv) || SvUTF8(sv))
+ return;
+
+ /* This function could be much more efficient if we had a FLAG
+ * to signal if there are any hibit chars in the string
+ */
+ hicount = 0;
+ for (c = SvPVX(sv); c < SvEND(sv); c++) {
+ if (*c & 0x80)
+ hicount++;
+ }
+
+ if (hicount) {
+ char *src, *dst;
+ SvGROW(sv, SvCUR(sv) + hicount + 1);
+
+ src = SvEND(sv) - 1;
+ SvCUR_set(sv, SvCUR(sv) + hicount);
+ dst = SvEND(sv) - 1;
+
+ while (src < dst) {
+ if (*src & 0x80) {
+ dst--;
+ uv_to_utf8((U8*)dst, (U8)*src--);
+ dst--;
+ }
+ else {
+ *dst-- = *src--;
+ }
+ }
+
+ SvUTF8_on(sv);
+ }
+}
+
+bool
+Perl_sv_utf8_downgrade(pTHX_ register SV* sv, bool fail_ok)
+{
+ if (SvPOK(sv) && SvUTF8(sv)) {
+ char *c = SvPVX(sv);
+ char *first_hi = 0;
+ /* need to figure out if this is possible at all first */
+ while (c < SvEND(sv)) {
+ if (*c & 0x80) {
+ I32 len;
+ UV uv = utf8_to_uv(c, &len);
+ if (uv >= 256) {
+ if (fail_ok)
+ return FALSE;
+ else {
+ /* XXX might want to make a callback here instead */
+ croak("Big byte");
+ }
+ }
+ if (!first_hi)
+ first_hi = c;
+ c += len;
+ }
+ else {
+ c++;
+ }
+ }
+
+ if (first_hi) {
+ char *src = first_hi;
+ char *dst = first_hi;
+ while (src < SvEND(sv)) {
+ if (*src & 0x80) {
+ I32 len;
+ U8 u = (U8)utf8_to_uv(src, &len);
+ *dst++ = u;
+ src += len;
+ }
+ else {
+ *dst++ = *src++;
+ }
+ }
+ SvCUR_set(sv, dst - SvPVX(sv));
+ }
+ SvUTF8_off(sv);
+ }
+ return TRUE;
+}
+
+void
+Perl_sv_utf8_encode(pTHX_ register SV *sv)
+{
+ sv_utf8_upgrade(sv);
+ SvUTF8_off(sv);
+}
+
+bool
+Perl_sv_utf8_decode(pTHX_ register SV *sv)
+{
+ if (SvPOK(sv)) {
+ char *c;
+ bool has_utf = FALSE;
+ if (!sv_utf8_downgrade(sv, TRUE))
+ return FALSE;
+
+ /* it is actually just a matter of turning the utf8 flag on, but
+ * we want to make sure everything inside is valid utf8 first.
+ */
+ c = SvPVX(sv);
+ while (c < SvEND(sv)) {
+ if (*c & 0x80) {
+ I32 len;
+ (void)utf8_to_uv((U8*)c, &len);
+ if (len == 1) {
+ /* bad utf8 */
+ return FALSE;
+ }
+ c += len;
+ has_utf = TRUE;
+ }
+ else {
+ c++;
+ }
+ }
+
+ if (has_utf)
+ SvUTF8_on(sv);
+ }
+ return TRUE;
+}
+
+
/* Note: sv_setsv() should not be called with a source string that needs
* to be reused, since it may destroy the source string if it is marked
* as temporary.
@@ -2955,10 +3091,13 @@ Perl_sv_catsv(pTHX_ SV *dstr, register SV *sstr)
STRLEN len;
if (!sstr)
return;
- if (s = SvPV(sstr, len))
+ if (s = SvPV(sstr, len)) {
+ if (SvUTF8(sstr))
+ sv_utf8_upgrade(dstr);
sv_catpvn(dstr,s,len);
- if (SvUTF8(sstr))
- SvUTF8_on(dstr);
+ if (SvUTF8(sstr))
+ SvUTF8_on(dstr);
+ }
}
/*
@@ -3807,11 +3946,42 @@ C<sv2>.
I32
Perl_sv_cmp(pTHX_ register SV *str1, register SV *str2)
{
- STRLEN cur1 = 0;
- char *pv1 = str1 ? SvPV(str1, cur1) : (char *) NULL;
- STRLEN cur2 = 0;
- char *pv2 = str2 ? SvPV(str2, cur2) : (char *) NULL;
+ STRLEN cur1, cur2;
+ char *pv1, *pv2;
I32 retval;
+ bool utf1;
+
+ if (str1) {
+ pv1 = SvPV(str1, cur1);
+ }
+ else {
+ cur1 = 0;
+ }
+
+ if (str2) {
+ if (SvPOK(str2)) {
+ if (SvPOK(str1) && SvUTF8(str1) != SvUTF8(str2) && !IN_BYTE) {
+ /* must upgrade other to UTF8 first */
+ if (SvUTF8(str1)) {
+ sv_utf8_upgrade(str2);
+ }
+ else {
+ sv_utf8_upgrade(str1);
+ /* refresh pointer and length */
+ pv1 = SvPVX(str1);
+ cur1 = SvCUR(str1);
+ }
+ }
+ pv2 = SvPVX(str2);
+ cur2 = SvCUR(str2);
+ }
+ else {
+ pv2 = sv_2pv(str2, &cur2);
+ }
+ }
+ else {
+ cur2 = 0;
+ }
if (!cur1)
return cur2 ? -1 : 0;
@@ -4957,18 +5127,21 @@ Perl_sv_pvbyten_force(pTHX_ SV *sv, STRLEN *lp)
char *
Perl_sv_pvutf8(pTHX_ SV *sv)
{
+ sv_utf8_upgrade(sv);
return sv_pv(sv);
}
char *
Perl_sv_pvutf8n(pTHX_ SV *sv, STRLEN *lp)
{
+ sv_utf8_upgrade(sv);
return sv_pvn(sv,lp);
}
char *
Perl_sv_pvutf8n_force(pTHX_ SV *sv, STRLEN *lp)
{
+ sv_utf8_upgrade(sv);
return sv_pvn_force(sv,lp);
}
diff --git a/toke.c b/toke.c
index b6ffc2bdce..d978140f4a 100644
--- a/toke.c
+++ b/toke.c
@@ -6904,7 +6904,7 @@ Perl_scan_num(pTHX_ char *start)
pos++;
if (*pos == '.' && isDIGIT(pos[1])) {
UV rev;
- U8 tmpbuf[10];
+ U8 tmpbuf[UTF8_MAXLEN];
U8 *tmpend;
NV nshift = 1.0;
bool utf8 = FALSE;
@@ -6930,7 +6930,6 @@ Perl_scan_num(pTHX_ char *start)
tmpbuf[0] = (U8)rev;
tmpend = &tmpbuf[1];
}
- *tmpend = '\0';
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
if (rev > 0)
SvNVX(sv) += (NV)rev/nshift;
@@ -6943,7 +6942,6 @@ Perl_scan_num(pTHX_ char *start)
s = pos;
tmpend = uv_to_utf8(tmpbuf, rev);
utf8 = utf8 || rev > 127;
- *tmpend = '\0';
sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
if (rev > 0)
SvNVX(sv) += (NV)rev/nshift;
@@ -6951,8 +6949,10 @@ Perl_scan_num(pTHX_ char *start)
SvPOK_on(sv);
SvNOK_on(sv);
SvREADONLY_on(sv);
- if (utf8)
+ if (utf8) {
SvUTF8_on(sv);
+ sv_utf8_downgrade(sv, TRUE);
+ }
}
}
break;