XXX utf8.c: calculate variants instead of assuming worst case

When converting a byte string to UTF-8, the needed size may increase due to some bytes (the UTF-8 variants) occupying two bytes instead of one under UTF-8. Prior to this commit, the string was assumed to contain only variants, and enough memory was allocated for the worst case, then the excess was returned at the end. This commit actually calculates how much space is needed and allocates only that, so there is no need to trim afterwards. There is extra work involved in doing this calculation, but the string is parsed per-word. For short strings, it doesn't much matter either way. But for very long strings, it seems to me that the consequences of potentially allocating far too much memory outweigh the cost of this extra work.
author: Karl Williamson <khw@cpan.org> 2018-11-07 20:11:50 -0700
committer: Karl Williamson <khw@cpan.org> 2018-11-16 09:29:42 -0700
commit: d466271924da8273f92ecba0e715287a7d7582ac (patch)
tree: b3ab8accfe5d50f05cb13516b15d32ec40c13460 /utf8.c
parent: 46167d76640b8ae760665fc0fd2f94ac4760438c (diff)
download: perl-d466271924da8273f92ecba0e715287a7d7582ac.tar.gz
1 files changed, 2 insertions, 4 deletions
diff --git a/utf8.c b/utf8.c
index 86586982e1..859b13a788 100644
--- a/utf8.c
+++ b/utf8.c
@@ -2647,7 +2647,8 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
     PERL_ARGS_ASSERT_BYTES_TO_UTF8;
     PERL_UNUSED_CONTEXT;
 
-    Newx(d, (*lenp) * 2 + 1, U8);
+    /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+    Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
     dst = d;
 
     while (s < send) {
@@ -2658,9 +2659,6 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
     *d = '\0';
     *lenp = d-dst;
 
-    /* Trim unused space */
-    Renew(dst, *lenp + 1, U8);
-
     return dst;
 }
author	Karl Williamson <khw@cpan.org>	2018-11-07 20:11:50 -0700
committer	Karl Williamson <khw@cpan.org>	2018-11-16 09:29:42 -0700
commit	d466271924da8273f92ecba0e715287a7d7582ac (patch)
tree	b3ab8accfe5d50f05cb13516b15d32ec40c13460 /utf8.c
parent	46167d76640b8ae760665fc0fd2f94ac4760438c (diff)
download	perl-d466271924da8273f92ecba0e715287a7d7582ac.tar.gz