summaryrefslogtreecommitdiff
path: root/utf8.c
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2018-11-07 20:11:50 -0700
committerKarl Williamson <khw@cpan.org>2018-11-16 09:29:42 -0700
commitd466271924da8273f92ecba0e715287a7d7582ac (patch)
treeb3ab8accfe5d50f05cb13516b15d32ec40c13460 /utf8.c
parent46167d76640b8ae760665fc0fd2f94ac4760438c (diff)
downloadperl-d466271924da8273f92ecba0e715287a7d7582ac.tar.gz
XXX utf8.c: calculate variants instead of assuming worst case
When converting a byte string to UTF-8, the needed size may increase because some bytes (the UTF-8 variants) occupy two bytes instead of one under UTF-8. Prior to this commit, the string was assumed to contain only variants, so enough memory was allocated for the worst case and the excess was returned at the end. This commit instead calculates how much space is actually needed and allocates only that, so there is no need to trim afterwards. There is extra work involved in doing this calculation, but the string is parsed a word at a time. For short strings, it doesn't much matter either way; for very long strings, it seems to me that the consequences of potentially allocating far too much memory outweigh the cost of this extra work.
Diffstat (limited to 'utf8.c')
-rw-r--r--utf8.c6
1 files changed, 2 insertions, 4 deletions
diff --git a/utf8.c b/utf8.c
index 86586982e1..859b13a788 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2647,7 +2647,8 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
PERL_ARGS_ASSERT_BYTES_TO_UTF8;
PERL_UNUSED_CONTEXT;
- Newx(d, (*lenp) * 2 + 1, U8);
+ /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+ Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
dst = d;
while (s < send) {
@@ -2658,9 +2659,6 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
*d = '\0';
*lenp = d-dst;
- /* Trim unused space */
- Renew(dst, *lenp + 1, U8);
-
return dst;
}