summaryrefslogtreecommitdiff
path: root/utf8.h
diff options
context:
space:
mode:
authorKarl Williamson <khw@cpan.org>2019-10-09 10:02:31 -0600
committerKarl Williamson <khw@cpan.org>2019-10-09 10:19:21 -0600
commitee0ff0f58536ba7975a4b8f1d21309ae9f451df7 (patch)
tree1fada1b5c31569d52adac373dbc55834de43c52a /utf8.h
parenta281f16cacceabade4e75fbbbeb567285d462ba0 (diff)
downloadperl-ee0ff0f58536ba7975a4b8f1d21309ae9f451df7.tar.gz
Add UTF8_CHK_SKIP() macro
This is a safer version of UTF8SKIP for use when the input could possibly be malformed. It uses strnlen() to not read past a NUL in the input. Since Perl adds NULs to the end of SV's, this will likely prevent reading beyond the end of a buffer. A still safer version could be written that doesn't look for just a NUL, but for any unexpected byte, and stops just before that. I suspect that is overkill, and since strnlen() can be very fast, I went with this approach instead. Nothing precludes adding another version that does this full checking.
Diffstat (limited to 'utf8.h')
-rw-r--r--utf8.h42
1 files changed, 40 insertions, 2 deletions
diff --git a/utf8.h b/utf8.h
index 38bf83cfc5..83cccf16c3 100644
--- a/utf8.h
+++ b/utf8.h
@@ -530,8 +530,24 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than
/*
=for apidoc Am|STRLEN|UTF8SKIP|char* s
-returns the number of bytes in the UTF-8 encoded character whose first (perhaps
-only) byte is pointed to by C<s>.
+returns the number of bytes in a non-malformed UTF-8 encoded character whose
+first (perhaps only) byte is pointed to by C<s>.
+
+If there is a possibility of malformed input, use instead:
+
+=over
+
+=item L</C<UTF8_SAFE_SKIP>> if you know the maximum ending pointer in the
+buffer pointed to by C<s>; or
+
+=item L</C<UTF8_CHK_SKIP>> if you don't know it.
+
+=back
+
+It is better to restructure your code so the end pointer is passed down so that
+you know what it actually is at the point of this call, but if that isn't
+possible, L</C<UTF8_CHK_SKIP>> can minimize the chance of accessing beyond the end
+of the input buffer.
=cut
*/
@@ -547,6 +563,28 @@ This is a synonym for L</C<UTF8SKIP>>
#define UTF8_SKIP(s) UTF8SKIP(s) /* alias: expands directly to UTF8SKIP(s) */
/*
+=for apidoc Am|STRLEN|UTF8_CHK_SKIP|char* s
+
+This is a safer version of L</C<UTF8SKIP>>, but still not as safe as
+L</C<UTF8_SAFE_SKIP>>. This version doesn't blindly assume that the input
+string pointed to by C<s> is well-formed, but verifies that there isn't a NUL
+terminating character before the expected end of the next character in C<s>.
+The length C<UTF8_CHK_SKIP> returns stops just before any such NUL, except
+that it returns 1 when C<s> itself points to a NUL.
+
+Perl tends to add NULs, as an insurance policy, after the end of strings in
+SV's, so it is likely that using this macro will prevent inadvertent reading
+beyond the end of the input buffer, even if it is malformed UTF-8.
+
+This macro is intended to be used by XS modules where the inputs could be
+malformed, and it isn't feasible to restructure to use the safer
+L</C<UTF8_SAFE_SKIP>>, for example when interfacing with a C library.
+
+=cut
+*/
+
+/* Yields 1 when s points at a NUL byte; otherwise the count of bytes before
+ * the first NUL in the character starting at s, never more than UTF8SKIP(s).
+ * MIN() takes two operands, so the UTF8SKIP(s) bound is passed explicitly.
+ * The argument is expanded several times: it must have no side effects. */
+#define UTF8_CHK_SKIP(s) \
+ ((s)[0] == '\0' ? 1 : MIN(UTF8SKIP(s), \
+ my_strnlen((char *) (s), UTF8SKIP(s))))
+/*
=for apidoc Am|STRLEN|UTF8_SAFE_SKIP|char* s|char* e
returns 0 if S<C<s E<gt>= e>>; otherwise returns the number of bytes in the