summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Karl Williamson <khw@cpan.org> 2021-06-27 02:19:19 -0600
committer Karl Williamson <khw@cpan.org> 2021-08-07 05:14:43 -0600
commit 8b5f2733d203f1d4c77d2a7e2250f5b63c17c9e9 (patch)
tree 4d780adf97c32e8aca95b22269706a4d19ab62a8
parent d49e4ce1792863bdc3e344a52aef19110508e1ac (diff)
download perl-8b5f2733d203f1d4c77d2a7e2250f5b63c17c9e9.tar.gz
utf8.c: Generalize static fcn
I've always been uncomfortable with the input constraints this function had. Now that it has been refactored into using a switch(), new cases for full generality can be added without affecting performance, and some conditionals can be removed before calling it. The function is renamed to reflect its greater generality.
-rw-r--r-- embed.fnc 4
-rw-r--r-- embed.h 2
-rw-r--r-- proto.h 4
-rw-r--r-- utf8.c 65
4 files changed, 45 insertions, 30 deletions
diff --git a/embed.fnc b/embed.fnc
index 4a53b25d58..7d851bae01 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -2482,8 +2482,8 @@ EXp |char * |_byte_dump_string \
iTR |int |does_utf8_overflow|NN const U8 * const s \
|NN const U8 * e \
|const bool consider_overlongs
-iTR |int |is_utf8_overlong_given_start_byte_ok|NN const U8 * const s \
- |const STRLEN len
+iTR |int |is_utf8_overlong|NN const U8 * const s \
+ |const STRLEN len
iTR |int |isFF_overlong |NN const U8 * const s|const STRLEN len
SR |char * |unexpected_non_continuation_text \
|NN const U8 * const s \
diff --git a/embed.h b/embed.h
index ee85c75abd..e26c96149c 100644
--- a/embed.h
+++ b/embed.h
@@ -1991,7 +1991,7 @@
#define does_utf8_overflow S_does_utf8_overflow
#define isFF_overlong S_isFF_overlong
#define is_utf8_common(a,b,c) S_is_utf8_common(aTHX_ a,b,c)
-#define is_utf8_overlong_given_start_byte_ok S_is_utf8_overlong_given_start_byte_ok
+#define is_utf8_overlong S_is_utf8_overlong
#define new_msg_hv(a,b,c) S_new_msg_hv(aTHX_ a,b,c)
#define to_lower_latin1 S_to_lower_latin1
#define turkic_fc(a,b,c,d) S_turkic_fc(aTHX_ a,b,c,d)
diff --git a/proto.h b/proto.h
index bb77d8307b..94fdfecbb3 100644
--- a/proto.h
+++ b/proto.h
@@ -6647,9 +6647,9 @@ PERL_STATIC_INLINE bool S_is_utf8_common(pTHX_ const U8 *const p, const U8 *cons
#endif
#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE int S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+PERL_STATIC_INLINE int S_is_utf8_overlong(const U8 * const s, const STRLEN len)
__attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK \
+#define PERL_ARGS_ASSERT_IS_UTF8_OVERLONG \
assert(s)
#endif
diff --git a/utf8.c b/utf8.c
index cd8e84833e..1be95015cb 100644
--- a/utf8.c
+++ b/utf8.c
@@ -608,7 +608,7 @@ S_is_utf8_cp_above_31_bits(const U8 * const s,
* but for these huge code points, speed shouldn't be a consideration, and
* the compiler does have enough information, since it's static to this
* file, to optimize to just the needed parts.) */
- is_overlong = is_utf8_overlong_given_start_byte_ok(s, len);
+ is_overlong = is_utf8_overlong(s, len);
/* If it isn't overlong, more than 31 bits are required. */
if (is_overlong == 0) {
@@ -692,7 +692,7 @@ S_is_utf8_cp_above_31_bits(const U8 * const s,
#endif
PERL_STATIC_INLINE int
-S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+S_is_utf8_overlong(const U8 * const s, const STRLEN len)
{
/* Returns an int indicating whether or not the UTF-8 sequence from 's' to
* 's' + 'len' - 1 is an overlong. It returns 1 if it is an overlong; 0 if
@@ -700,23 +700,15 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
* return value can happen if the sequence is incomplete, missing some
* trailing bytes that would form a complete character. If there are
* enough bytes to make a definitive decision, this function does so.
- * Usually 2 bytes sufficient.
+ * Usually 2 bytes are sufficient.
*
* Overlongs can occur whenever the number of continuation bytes changes.
* That means whenever the number of leading 1 bits in a start byte
* increases from the next lower start byte. That happens for start bytes
- * C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following illegal
- * start bytes have already been excluded, so don't need to be tested here;
- * ASCII platforms: C0, C1
- * EBCDIC platforms C0, C1, C2, C3, C4, E0
+ * C0, E0, F0, F8, FC, FE, and FF.
*/
- U8 s1;
-
- PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
- assert(len > 1 && UTF8_IS_START(*s));
-
- s1 = NATIVE_UTF8_TO_I8(s[1]);
+ PERL_ARGS_ASSERT_IS_UTF8_OVERLONG;
/* Each platform has overlongs after the start bytes given above (expressed
* in I8 for EBCDIC). The values below were found by manually inspecting
@@ -724,18 +716,43 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
switch (NATIVE_UTF8_TO_I8(s[0])) {
default:
+ assert(UTF8_IS_START(s[0]));
return 0;
-#ifndef EBCDIC /* the E0 overlong has already been excluded on EBCDIC
- platforms. */
- case 0xE0: return s1 < 0xA0;
+ case 0xC0:
+ case 0xC1:
+ return 1;
+
+#ifdef EBCDIC
+
+ case 0xC2:
+ case 0xC3:
+ case 0xC4:
+ case 0xE0:
+ return 1;
+#else
+ case 0xE0:
+ return (len < 2) ? -1 : s[1] < 0xA0;
#endif
- case 0xF0: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x10;
- case 0xF8: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x08;
- case 0xFC: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x04;
- case 0xFE: return s1 < UTF_MIN_CONTINUATION_BYTE + 0x02;
- case 0xFF: return isFF_overlong(s, len);
+ case 0xF0:
+ return (len < 2)
+ ? -1
+ : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x10;
+ case 0xF8:
+ return (len < 2)
+ ? -1
+ : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x08;
+ case 0xFC:
+ return (len < 2)
+ ? -1
+ : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x04;
+ case 0xFE:
+ return (len < 2)
+ ? -1
+ : NATIVE_UTF8_TO_I8(s[1]) < UTF_MIN_CONTINUATION_BYTE + 0x02;
+ case 0xFF:
+ return isFF_overlong(s, len);
}
}
@@ -1055,7 +1072,7 @@ Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
/* Here is syntactically valid. Next, make sure this isn't the start of an
* overlong. */
- if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
+ if (is_utf8_overlong(s, len) > 0) {
return 0;
}
@@ -1733,9 +1750,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
&& UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
|| ( UNLIKELY(possible_problems)
&& ( UNLIKELY(! UTF8_IS_START(*s0))
- || ( curlen > 1
- && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
- s - s0))))))
+ || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0))))))
{
possible_problems |= UTF8_GOT_LONG;