summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2005-01-22 02:20:12 +0200
committerDave Mitchell <davem@fdisolutions.com>2005-01-22 00:17:40 +0000
commit89ebb4a3f2a55825eeed13aaf58db5c73d2140ef (patch)
treea66444144493fa61d6befce0c9bf1358973f9872
parent80a13697042a4d823de61ba24b77aa9d893765d6 (diff)
downloadperl-89ebb4a3f2a55825eeed13aaf58db5c73d2140ef.tar.gz
Re: uc($long_utf8_string) exhausts memory
Message-Id: <41F1801C.3080201@iki.fi> Make buffer size estimates for utf8 case conversion less maximally pessimistic p4raw-id: //depot/perl@23857
-rw-r--r--doop.c16
-rw-r--r--op.c2
-rw-r--r--pp.c52
-rw-r--r--pp_pack.c2
-rw-r--r--regcomp.c6
-rw-r--r--regexec.c48
-rw-r--r--sv.c2
-rw-r--r--t/op/lc.t5
-rw-r--r--toke.c6
-rw-r--r--utf8.c67
-rw-r--r--utf8.h28
11 files changed, 132 insertions, 102 deletions
diff --git a/doop.c b/doop.c
index b1de08dfb9..9e5a60d484 100644
--- a/doop.c
+++ b/doop.c
@@ -338,7 +338,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)
if (grows) {
/* d needs to be bigger than s, in case e.g. upgrading is required */
- New(0, d, len*3+UTF8_MAXLEN, U8);
+ New(0, d, len * 3 + UTF8_MAXBYTES, U8);
dend = d + len * 3;
dstart = d;
}
@@ -370,10 +370,10 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)
if (d > dend) {
STRLEN clen = d - dstart;
- STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+ STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
if (!grows)
Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__);
- Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ Renew(dstart, nlen + UTF8_MAXBYTES, U8);
d = dstart + clen;
dend = dstart + nlen;
}
@@ -480,7 +480,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
if (grows) {
/* d needs to be bigger than s, in case e.g. upgrading is required */
- New(0, d, len*3+UTF8_MAXLEN, U8);
+ New(0, d, len * 3 + UTF8_MAXBYTES, U8);
dend = d + len * 3;
dstart = d;
}
@@ -496,10 +496,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
if (d > dend) {
STRLEN clen = d - dstart;
- STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+ STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
if (!grows)
Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
- Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ Renew(dstart, nlen + UTF8_MAXBYTES, U8);
d = dstart + clen;
dend = dstart + nlen;
}
@@ -550,10 +550,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv)
uv = swash_fetch(rv, s, TRUE);
if (d > dend) {
STRLEN clen = d - dstart;
- STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+ STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES;
if (!grows)
Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__);
- Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ Renew(dstart, nlen + UTF8_MAXBYTES, U8);
d = dstart + clen;
dend = dstart + nlen;
}
diff --git a/op.c b/op.c
index ab9f374e21..77a213ed5f 100644
--- a/op.c
+++ b/op.c
@@ -2423,7 +2423,7 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
*/
if (complement) {
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
UV *cp;
UV nextmin = 0;
New(1109, cp, 2*tlen, UV);
diff --git a/pp.c b/pp.c
index a90d9ee24e..12f5bfbde2 100644
--- a/pp.c
+++ b/pp.c
@@ -3344,7 +3344,7 @@ PP(pp_ord)
}
XPUSHu(DO_UTF8(argsv) ?
- utf8n_to_uvchr(s, UTF8_MAXLEN, 0, UTF8_ALLOW_ANYUV) :
+ utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, UTF8_ALLOW_ANYUV) :
(*s & 0xff));
RETURN;
@@ -3454,7 +3454,7 @@ PP(pp_ucfirst)
if (DO_UTF8(sv) &&
(s = (U8*)SvPV_nomg(sv, slen)) && slen &&
UTF8_IS_START(*s)) {
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
STRLEN ulen;
STRLEN tculen;
@@ -3517,7 +3517,7 @@ PP(pp_lcfirst)
(s = (U8*)SvPV_nomg(sv, slen)) && slen &&
UTF8_IS_START(*s)) {
STRLEN ulen;
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
U8 *tend;
UV uv;
@@ -3574,7 +3574,7 @@ PP(pp_uc)
STRLEN ulen;
register U8 *d;
U8 *send;
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
s = (U8*)SvPV_nomg(sv,len);
if (!len) {
@@ -3583,18 +3583,28 @@ PP(pp_uc)
SETs(TARG);
}
else {
- STRLEN nchar = utf8_length(s, s + len);
-
(void)SvUPGRADE(TARG, SVt_PV);
- SvGROW(TARG, (nchar * UTF8_MAXLEN_UCLC) + 1);
+ SvGROW(TARG, len + 1);
(void)SvPOK_only(TARG);
d = (U8*)SvPVX(TARG);
send = s + len;
while (s < send) {
+ STRLEN u = UTF8SKIP(s);
+
toUPPER_utf8(s, tmpbuf, &ulen);
+ if (ulen > u) {
+ UV o = d - (U8*)SvPVX(TARG);
+
+ /* If someone uppercases one million U+03B0s we
+ * SvGROW() one million times. Or we could try
+ * guess how much to allocate without overdoing.
+ * Such is life. */
+ SvGROW(TARG, SvCUR(TARG) + ulen - u);
+ d = (U8*)SvPVX(TARG) + o;
+ }
Copy(tmpbuf, d, ulen, U8);
d += ulen;
- s += UTF8SKIP(s);
+ s += u;
}
*d = '\0';
SvUTF8_on(TARG);
@@ -3643,7 +3653,7 @@ PP(pp_lc)
STRLEN ulen;
register U8 *d;
U8 *send;
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
s = (U8*)SvPV_nomg(sv,len);
if (!len) {
@@ -3652,16 +3662,16 @@ PP(pp_lc)
SETs(TARG);
}
else {
- STRLEN nchar = utf8_length(s, s + len);
-
(void)SvUPGRADE(TARG, SVt_PV);
- SvGROW(TARG, (nchar * UTF8_MAXLEN_UCLC) + 1);
+ SvGROW(TARG, len + 1);
(void)SvPOK_only(TARG);
d = (U8*)SvPVX(TARG);
send = s + len;
while (s < send) {
+ STRLEN u = UTF8SKIP(s);
UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
-#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode */
+
+#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */
if (uv == GREEK_CAPITAL_LETTER_SIGMA) {
/*
* Now if the sigma is NOT followed by
@@ -3675,12 +3685,24 @@ PP(pp_lc)
* then it should be mapped to 0x03C2,
* (GREEK SMALL LETTER FINAL SIGMA),
* instead of staying 0x03A3.
- * See lib/unicore/SpecCase.txt.
+ * "should be": in other words,
+ * this is not implemented yet.
+ * See lib/unicore/SpecialCasing.txt.
*/
}
+ if (ulen > u) {
+ UV o = d - (U8*)SvPVX(TARG);
+
+ /* If someone lowercases one million U+0130s we
+ * SvGROW() one million times. Or we could try
+ * guess how much to allocate without overdoing.
+ Such is life. */
+ SvGROW(TARG, SvCUR(TARG) + ulen - u);
+ d = (U8*)SvPVX(TARG) + o;
+ }
Copy(tmpbuf, d, ulen, U8);
d += ulen;
- s += UTF8SKIP(s);
+ s += u;
}
*d = '\0';
SvUTF8_on(TARG);
diff --git a/pp_pack.c b/pp_pack.c
index 815c326ffa..2e830f40f3 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -2183,7 +2183,7 @@ S_pack_rec(pTHX_ SV *cat, register tempsym_t* symptr, register SV **beglist, SV
while (len-- > 0) {
fromstr = NEXTFROM;
auint = UNI_TO_NATIVE(SvUV(fromstr));
- SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1);
+ SvGROW(cat, SvCUR(cat) + UTF8_MAXBYTES + 1);
SvCUR_set(cat,
(char*)uvchr_to_utf8_flags((U8*)SvEND(cat),
auint,
diff --git a/regcomp.c b/regcomp.c
index 2a942d566d..ff939b2436 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3088,7 +3088,7 @@ tryagain:
char *oldp, *s;
STRLEN numlen;
STRLEN foldlen;
- U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
parse_start = RExC_parse - 1;
@@ -4199,7 +4199,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state)
else if (prevnatvalue == natvalue) {
Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
if (FOLD) {
- U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+ U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
@@ -4869,7 +4869,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
if (lv) {
if (sw) {
- U8 s[UTF8_MAXLEN+1];
+ U8 s[UTF8_MAXBYTES_CASE+1];
for (i = 0; i <= 256; i++) { /* just the first 256 */
U8 *e = uvchr_to_utf8(s, i);
diff --git a/regexec.c b/regexec.c
index 6ed6d951ed..f254713d61 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1028,15 +1028,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (UTF) {
STRLEN ulen1, ulen2;
U8 *sm = (U8 *) m;
- U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
- U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+ U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
- c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN_UCLC,
+ c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
- c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN_UCLC,
+ c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
0, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
lnc = 0;
while (sm < ((U8 *) m + ln)) {
@@ -1074,15 +1074,15 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
if (do_utf8) {
UV c, f;
- U8 tmpbuf [UTF8_MAXLEN+1];
- U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+ U8 tmpbuf [UTF8_MAXBYTES+1];
+ U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN len, foldlen;
if (c1 == c2) {
/* Upper and lower of 1st char are equal -
* probably not a "letter". */
while (s <= e) {
- c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
if ( c == c1
@@ -1109,7 +1109,7 @@ S_find_byclass(pTHX_ regexp * prog, regnode *c, char *s, char *strend, char *sta
}
else {
while (s <= e) {
- c = utf8n_to_uvchr((U8*)s, UTF8_MAXLEN, &len,
+ c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
@@ -2459,7 +2459,7 @@ S_regmatch(pTHX_ regnode *prog)
if (l >= PL_regeol)
sayNO;
if (NATIVE_TO_UNI(*(U8*)s) !=
- utf8n_to_uvuni((U8*)l, UTF8_MAXLEN, &ulen,
+ utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY))
sayNO;
@@ -2473,7 +2473,7 @@ S_regmatch(pTHX_ regnode *prog)
if (l >= PL_regeol)
sayNO;
if (NATIVE_TO_UNI(*((U8*)l)) !=
- utf8n_to_uvuni((U8*)s, UTF8_MAXLEN, &ulen,
+ utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY))
sayNO;
@@ -2806,8 +2806,8 @@ S_regmatch(pTHX_ regnode *prog)
*/
if (OP(scan) == REFF) {
STRLEN ulen1, ulen2;
- U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
- U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+ U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
while (s < e) {
if (l >= PL_regeol)
sayNO;
@@ -3580,21 +3580,21 @@ S_regmatch(pTHX_ regnode *prog)
else { /* UTF */
if (OP(text_node) == EXACTF || OP(text_node) == REFF) {
STRLEN ulen1, ulen2;
- U8 tmpbuf1[UTF8_MAXLEN_UCLC+1];
- U8 tmpbuf2[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
+ U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
- c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXLEN, 0,
+ c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
- c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXLEN, 0,
+ c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
}
else {
- c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXLEN, 0,
+ c2 = c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
}
@@ -3656,7 +3656,7 @@ S_regmatch(pTHX_ regnode *prog)
* utf8_distance(old, locinput) */
while (locinput <= e &&
utf8n_to_uvchr((U8*)locinput,
- UTF8_MAXLEN, &len,
+ UTF8_MAXBYTES, &len,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY) != (UV)c1) {
locinput += len;
@@ -3667,7 +3667,7 @@ S_regmatch(pTHX_ regnode *prog)
* utf8_distance(old, locinput) */
while (locinput <= e) {
UV c = utf8n_to_uvchr((U8*)locinput,
- UTF8_MAXLEN, &len,
+ UTF8_MAXBYTES, &len,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
if (c == (UV)c1 || c == (UV)c2)
@@ -3704,7 +3704,7 @@ S_regmatch(pTHX_ regnode *prog)
if (c1 != -1000) {
if (do_utf8)
c = utf8n_to_uvchr((U8*)PL_reginput,
- UTF8_MAXLEN, 0,
+ UTF8_MAXBYTES, 0,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
else
@@ -3754,7 +3754,7 @@ S_regmatch(pTHX_ regnode *prog)
if (c1 != -1000) {
if (do_utf8)
c = utf8n_to_uvchr((U8*)PL_reginput,
- UTF8_MAXLEN, 0,
+ UTF8_MAXBYTES, 0,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
else
@@ -3777,7 +3777,7 @@ S_regmatch(pTHX_ regnode *prog)
if (c1 != -1000) {
if (do_utf8)
c = utf8n_to_uvchr((U8*)PL_reginput,
- UTF8_MAXLEN, 0,
+ UTF8_MAXBYTES, 0,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
else
@@ -4370,7 +4370,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
STRLEN plen;
if (do_utf8 && !UTF8_IS_INVARIANT(c))
- c = utf8n_to_uvchr(p, UTF8_MAXLEN, &len,
+ c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
@@ -4407,7 +4407,7 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register b
}
}
if (!match) {
- U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
STRLEN tmplen;
to_utf8_fold(p, tmpbuf, &tmplen);
diff --git a/sv.c b/sv.c
index c7211b7b80..343c5f0c8a 100644
--- a/sv.c
+++ b/sv.c
@@ -9271,7 +9271,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV
#endif
char esignbuf[4];
- U8 utf8buf[UTF8_MAXLEN+1];
+ U8 utf8buf[UTF8_MAXBYTES+1];
STRLEN esignlen = 0;
char *eptr = Nullch;
diff --git a/t/op/lc.t b/t/op/lc.t
index cee6661c9c..d75b64e6dc 100644
--- a/t/op/lc.t
+++ b/t/op/lc.t
@@ -111,8 +111,8 @@ is("\u\x{587}" , "\x{535}\x{582}", "ligature titlecase");
is("\U\x{587}" , "\x{535}\x{552}", "ligature uppercase");
# mktables had problems where many-to-one case mappings didn't work right.
-# The lib/unifold.t should give the fourth folding, "casefolding", a good
-# workout.
+# The lib/uni/fold.t should give the fourth folding, "casefolding", a good
+# workout (one cannot directly get that from Perl).
# \x{01C4} is LATIN CAPITAL LETTER DZ WITH CARON
# \x{01C5} is LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
# \x{01C6} is LATIN SMALL LETTER DZ WITH CARON
@@ -161,6 +161,5 @@ for my $a (0,1) {
chop $a;
$a =~ s/^(\s*)(\w*)/$1\u$2/;
is($a, v10, "[perl #18857]");
- $test++;
}
}
diff --git a/toke.c b/toke.c
index b4222f6ecf..5d5abf4573 100644
--- a/toke.c
+++ b/toke.c
@@ -1704,7 +1704,7 @@ S_scan_const(pTHX_ char *start)
UV uv = utf8_to_uvchr((U8*)str, 0);
if (uv < 0x100) {
- U8 tmpbuf[UTF8_MAXLEN+1], *d;
+ U8 tmpbuf[UTF8_MAXBYTES+1], *d;
d = uvchr_to_utf8(tmpbuf, UNI_TO_NATIVE(uv));
sv_setpvn(res, (char *)tmpbuf, d - tmpbuf);
@@ -7137,7 +7137,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
I32 brackets = 1; /* bracket nesting level */
bool has_utf8 = FALSE; /* is there any utf8 content? */
I32 termcode; /* terminating char. code */
- U8 termstr[UTF8_MAXLEN]; /* terminating string */
+ U8 termstr[UTF8_MAXBYTES]; /* terminating string */
STRLEN termlen; /* length of terminating string */
char *last = NULL; /* last position for nesting bracket */
@@ -8230,7 +8230,7 @@ Perl_scan_vstring(pTHX_ char *s, SV *sv)
if (!isALPHA(*pos)) {
UV rev;
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
U8 *tmpend;
if (*s == 'v') s++; /* get past 'v' */
diff --git a/utf8.c b/utf8.c
index 5c6f2715f3..5fa2aab686 100644
--- a/utf8.c
+++ b/utf8.c
@@ -38,7 +38,7 @@ within non-zero characters.
=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
@@ -551,7 +551,7 @@ returned and retlen is set, if possible, to -1.
UV
Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
{
- return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen,
+ return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
@@ -575,7 +575,7 @@ UV
Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
{
/* Call the low level routine asking for checks */
- return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen,
+ return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
@@ -937,7 +937,7 @@ Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
bool
Perl_is_uni_alnum(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_alnum(tmpbuf);
}
@@ -945,7 +945,7 @@ Perl_is_uni_alnum(pTHX_ UV c)
bool
Perl_is_uni_alnumc(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_alnumc(tmpbuf);
}
@@ -953,7 +953,7 @@ Perl_is_uni_alnumc(pTHX_ UV c)
bool
Perl_is_uni_idfirst(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_idfirst(tmpbuf);
}
@@ -961,7 +961,7 @@ Perl_is_uni_idfirst(pTHX_ UV c)
bool
Perl_is_uni_alpha(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_alpha(tmpbuf);
}
@@ -969,7 +969,7 @@ Perl_is_uni_alpha(pTHX_ UV c)
bool
Perl_is_uni_ascii(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_ascii(tmpbuf);
}
@@ -977,7 +977,7 @@ Perl_is_uni_ascii(pTHX_ UV c)
bool
Perl_is_uni_space(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_space(tmpbuf);
}
@@ -985,7 +985,7 @@ Perl_is_uni_space(pTHX_ UV c)
bool
Perl_is_uni_digit(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_digit(tmpbuf);
}
@@ -993,7 +993,7 @@ Perl_is_uni_digit(pTHX_ UV c)
bool
Perl_is_uni_upper(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_upper(tmpbuf);
}
@@ -1001,7 +1001,7 @@ Perl_is_uni_upper(pTHX_ UV c)
bool
Perl_is_uni_lower(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_lower(tmpbuf);
}
@@ -1009,7 +1009,7 @@ Perl_is_uni_lower(pTHX_ UV c)
bool
Perl_is_uni_cntrl(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_cntrl(tmpbuf);
}
@@ -1017,7 +1017,7 @@ Perl_is_uni_cntrl(pTHX_ UV c)
bool
Perl_is_uni_graph(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_graph(tmpbuf);
}
@@ -1025,7 +1025,7 @@ Perl_is_uni_graph(pTHX_ UV c)
bool
Perl_is_uni_print(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_print(tmpbuf);
}
@@ -1033,7 +1033,7 @@ Perl_is_uni_print(pTHX_ UV c)
bool
Perl_is_uni_punct(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN+1];
+ U8 tmpbuf[UTF8_MAXBYTES+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_punct(tmpbuf);
}
@@ -1041,7 +1041,7 @@ Perl_is_uni_punct(pTHX_ UV c)
bool
Perl_is_uni_xdigit(pTHX_ UV c)
{
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
uvchr_to_utf8(tmpbuf, c);
return is_utf8_xdigit(tmpbuf);
}
@@ -1166,7 +1166,7 @@ Perl_to_uni_upper_lc(pTHX_ U32 c)
/* XXX returns only the first character -- do not use XXX */
/* XXX no locale support yet */
STRLEN len;
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
return (U32)to_uni_upper(c, tmpbuf, &len);
}
@@ -1176,7 +1176,7 @@ Perl_to_uni_title_lc(pTHX_ U32 c)
/* XXX returns only the first character XXX -- do not use XXX */
/* XXX no locale support yet */
STRLEN len;
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
return (U32)to_uni_title(c, tmpbuf, &len);
}
@@ -1186,7 +1186,7 @@ Perl_to_uni_lower_lc(pTHX_ U32 c)
/* XXX returns only the first character -- do not use XXX */
/* XXX no locale support yet */
STRLEN len;
- U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
return (U32)to_uni_lower(c, tmpbuf, &len);
}
@@ -1400,7 +1400,7 @@ UV
Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
{
UV uv0, uv1;
- U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
+ U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
STRLEN len = 0;
uv0 = utf8_to_uvchr(p, 0);
@@ -1489,9 +1489,8 @@ Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *norma
Convert the UTF-8 encoded character at p to its uppercase version and
store that in UTF-8 in ustrp and its length in bytes in lenp. Note
-that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
-uppercase version may be longer than the original character (up to two
-characters).
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
+the uppercase version may be longer than the original character.
The first character of the uppercased version is returned
(but note, as explained above, that there may be more.)
@@ -1510,9 +1509,8 @@ Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
Convert the UTF-8 encoded character at p to its titlecase version and
store that in UTF-8 in ustrp and its length in bytes in lenp. Note
-that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
-titlecase version may be longer than the original character (up to two
-characters).
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
+titlecase version may be longer than the original character.
The first character of the titlecased version is returned
(but note, as explained above, that there may be more.)
@@ -1531,9 +1529,8 @@ Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
Convert the UTF-8 encoded character at p to its lowercase version and
store that in UTF-8 in ustrp and its length in bytes in lenp. Note
-that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
-lowercase version may be longer than the original character (up to two
-characters).
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
+lowercase version may be longer than the original character.
The first character of the lowercased version is returned
(but note, as explained above, that there may be more.)
@@ -1552,7 +1549,7 @@ Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
Convert the UTF-8 encoded character at p to its foldcase version and
store that in UTF-8 in ustrp and its length in bytes in lenp. Note
-that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
+that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
foldcase version may be longer than the original character (up to
three characters).
@@ -1711,7 +1708,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
/* We use utf8n_to_uvuni() as we want an index into
Unicode tables, not a native character number.
*/
- UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, 0,
+ UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
ckWARN(WARN_UTF8) ?
0 : UTF8_ALLOW_ANY);
SV *errsv_save;
@@ -1778,7 +1775,7 @@ Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
Adds the UTF-8 representation of the Native codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
+of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
@@ -1954,8 +1951,8 @@ Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const
register U8 *e1 = 0, *f1 = 0, *q1 = 0;
register U8 *e2 = 0, *f2 = 0, *q2 = 0;
STRLEN n1 = 0, n2 = 0;
- U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
- U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
+ U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
+ U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
U8 natbuf[1+1];
STRLEN foldlen1, foldlen2;
bool match;
diff --git a/utf8.h b/utf8.h
index 376280d3e4..c206b3da39 100644
--- a/utf8.h
+++ b/utf8.h
@@ -162,14 +162,26 @@ encoded character.
#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1)
#define isALNUM_lazy(p) isALNUM_lazy_if(p,1)
-/* how wide can a single UTF-8 encoded character become */
-#define UTF8_MAXLEN 13
-/* how wide a character can become when upper/lowercased */
-#define UTF8_MAXLEN_UCLC_MULT 3
-#define UTF8_MAXLEN_UCLC (UTF8_MAXLEN*UTF8_MAXLEN_UCLC_MULT)
-/* how wide a character can become when casefolded */
-#define UTF8_MAXLEN_FOLD_MULT 3
-#define UTF8_MAXLEN_FOLD (UTF8_MAXLEN*UTF8_MAXLEN_FOLD_MULT)
+#define UTF8_MAXBYTES 13
+/* How wide can a single UTF-8 encoded character become in bytes.
+ * NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8
+ * since UTF-8 is an encoding of Unicode and given Unicode's current
+ * upper limit only four bytes is possible. Perl thinks of UTF-8
+ * as a way to encode non-negative integers in a binary format. */
+#define UTF8_MAXLEN UTF8_MAXBYTES
+
+#define UTF8_MAXLEN_UCLC 3 /* Obsolete, do not use. */
+#define UTF8_MAXLEN_UCLC_MULT 39 /* Obsolete, do not use. */
+#define UTF8_MAXLEN_FOLD 3 /* Obsolete, do not use. */
+#define UTF8_MAXLEN_FOLD_MULT 39 /* Obsolete, do not use. */
+
+/* The maximum number of UTF-8 bytes a single Unicode character can
+ * uppercase/lowercase/fold into; this number depends on the Unicode
+ * version. An example of maximal expansion is the U+03B0 which
+ * uppercases to U+03C5 U+0308 U+0301. The Unicode databases that
+ * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and
+ * SpecialCasing.txt. */
+#define UTF8_MAXBYTES_CASE 6
#define IN_BYTES (PL_curcop->op_private & HINT_BYTES)
#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)