summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc1
-rw-r--r--embed.h1
-rw-r--r--ext/XS-APItest/t/utf8.t24
-rw-r--r--proto.h6
-rw-r--r--utf8.c44
5 files changed, 57 insertions, 19 deletions
diff --git a/embed.fnc b/embed.fnc
index 5cc73b7978..d6312dcb1e 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -1683,6 +1683,7 @@ ApdD |UV |to_utf8_case |NN const U8 *p \
#if defined(PERL_IN_UTF8_C)
inRP |bool |does_utf8_overflow|NN const U8 * const s|NN const U8 * e
inRP |bool |is_utf8_overlong_given_start_byte_ok|NN const U8 * const s|const STRLEN len
+inRP |bool |isFF_OVERLONG |NN const U8 * const s|const STRLEN len
sMR |char * |unexpected_non_continuation_text \
|NN const U8 * const s \
|STRLEN print_len \
diff --git a/embed.h b/embed.h
index 1af29175cb..b8ee773505 100644
--- a/embed.h
+++ b/embed.h
@@ -1832,6 +1832,7 @@
#define _to_utf8_case(a,b,c,d,e,f,g) S__to_utf8_case(aTHX_ a,b,c,d,e,f,g)
#define check_locale_boundary_crossing(a,b,c,d) S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
#define does_utf8_overflow S_does_utf8_overflow
+#define isFF_OVERLONG S_isFF_OVERLONG
#define is_utf8_common(a,b,c,d) S_is_utf8_common(aTHX_ a,b,c,d)
#define is_utf8_cp_above_31_bits S_is_utf8_cp_above_31_bits
#define is_utf8_overlong_given_start_byte_ok S_is_utf8_overlong_given_start_byte_ok
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t
index 37a1e32185..121c6efac5 100644
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -1731,6 +1731,15 @@ my @tests = (
'utf8', 0x80000000, (isASCII) ? 7 : $max_bytes,
nonportable_regex(0x80000000)
],
+ [ "highest 32 bit code point",
+ (isASCII)
+ ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
+ : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
+ $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
+ $UTF8_GOT_ABOVE_31_BIT,
+ 'utf8', 0xFFFFFFFF, (isASCII) ? 7 : $max_bytes,
+ nonportable_regex(0xffffffff)
+ ],
[ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
(isASCII)
? "\xfe\x82\x80\x80\x80\x80\x80"
@@ -1764,7 +1773,20 @@ my @tests = (
],
);
-if ($is64bit) {
+if (! $is64bit) {
+ if (isASCII) {
+ no warnings qw{portable overflow};
+ push @tests,
+ [ "Lowest 33 bit code point: overflow",
+ "\xFE\x84\x80\x80\x80\x80\x80",
+ $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
+ $UTF8_GOT_ABOVE_31_BIT,
+ 'utf8', 0x100000000, 7,
+ qr/and( is)? not portable/
+ ];
+ }
+}
+else {
no warnings qw{portable overflow};
push @tests,
[ "More than 32 bits",
diff --git a/proto.h b/proto.h
index 1d79c46e2e..670801698d 100644
--- a/proto.h
+++ b/proto.h
@@ -5605,6 +5605,12 @@ PERL_STATIC_INLINE bool S_does_utf8_overflow(const U8 * const s, const U8 * e)
#define PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW \
assert(s); assert(e)
+PERL_STATIC_INLINE bool S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
+ __attribute__warn_unused_result__
+ __attribute__pure__;
+#define PERL_ARGS_ASSERT_ISFF_OVERLONG \
+ assert(s)
+
PERL_STATIC_INLINE bool S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, const char * const swashname, SV* const invlist)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_IS_UTF8_COMMON \
diff --git a/utf8.c b/utf8.c
index 80bafadea5..8b301b26f8 100644
--- a/utf8.c
+++ b/utf8.c
@@ -442,7 +442,20 @@ S_does_utf8_overflow(const U8 * const s, const U8 * e)
* that could result in a non-overflowing code point */
PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
- assert(s + UTF8SKIP(s) >= e);
+ assert(s <= e && s + UTF8SKIP(s) >= e);
+
+#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
+
+ /* On 32 bit ASCII machines, many overlongs that start with FF don't
+ * overflow */
+
+ if (isFF_OVERLONG(s, e - s)) {
+ const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
+ return memGE(s, max_32_bit_overlong,
+ MIN(e - s, sizeof(max_32_bit_overlong)));
+ }
+
+#endif
for (x = s; x < e; x++, y++) {
@@ -521,27 +534,22 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
return TRUE;
}
-# if defined(UV_IS_QUAD) || defined(EBCDIC)
+ /* Check for the FF overlong */
+ return isFF_OVERLONG(s, len);
+}
+
+PERL_STATIC_INLINE bool
+S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
+{
+ PERL_ARGS_ASSERT_ISFF_OVERLONG;
/* Check for the FF overlong. This happens only if all these bytes match;
* what comes after them doesn't matter. See tables in utf8.h,
- * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows
- * instead.) */
-
- if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1
- && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
- sizeof(FF_OVERLONG_PREFIX) - 1)))
- {
- return TRUE;
- }
-
-#else
-
- PERL_UNUSED_ARG(len);
-
-#endif
+ * utfebcdic.h. */
- return FALSE;
+ return len >= sizeof(FF_OVERLONG_PREFIX) - 1
+ && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
+ sizeof(FF_OVERLONG_PREFIX) - 1));
}
#undef F0_ABOVE_OVERLONG