summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.fnc2
-rw-r--r--embed.h6
-rw-r--r--ext/XS-APItest/t/utf8_warn_base.pl69
-rw-r--r--proto.h18
-rw-r--r--utf8.c201
-rw-r--r--utf8.h6
-rw-r--r--utfebcdic.h2
7 files changed, 177 insertions, 127 deletions
diff --git a/embed.fnc b/embed.fnc
index 1e661e4f3f..20f2987bfe 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -744,7 +744,9 @@ ADMpR |bool |isALNUM_lazy |NN const char* p
#ifdef PERL_IN_UTF8_C
snR |U8 |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp \
|const char dummy
+# ifndef UV_IS_QUAD
inR |bool |is_utf8_cp_above_31_bits|NN const U8 * const s|NN const U8 * const e
+# endif
#endif
#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C)
EXp |UV |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const unsigned int flags
diff --git a/embed.h b/embed.h
index a74458d7fd..608d252e54 100644
--- a/embed.h
+++ b/embed.h
@@ -1472,6 +1472,11 @@
#define mulexp10 S_mulexp10
# endif
# endif
+# if !defined(UV_IS_QUAD)
+# if defined(PERL_IN_UTF8_C)
+#define is_utf8_cp_above_31_bits S_is_utf8_cp_above_31_bits
+# endif
+# endif
# if !defined(WIN32)
#define do_exec3(a,b,c) Perl_do_exec3(aTHX_ a,b,c)
# endif
@@ -1844,7 +1849,6 @@
#define isFF_OVERLONG S_isFF_OVERLONG
#define is_utf8_common(a,b,c,d) S_is_utf8_common(aTHX_ a,b,c,d)
#define is_utf8_common_with_len(a,b,c,d,e) S_is_utf8_common_with_len(aTHX_ a,b,c,d,e)
-#define is_utf8_cp_above_31_bits S_is_utf8_cp_above_31_bits
#define is_utf8_overlong_given_start_byte_ok S_is_utf8_overlong_given_start_byte_ok
#define swash_scan_list_line(a,b,c,d,e,f,g) S_swash_scan_list_line(aTHX_ a,b,c,d,e,f,g)
#define swatch_get(a,b,c) S_swatch_get(aTHX_ a,b,c)
diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl
index 94df88e813..6c88f5c308 100644
--- a/ext/XS-APItest/t/utf8_warn_base.pl
+++ b/ext/XS-APItest/t/utf8_warn_base.pl
@@ -28,6 +28,7 @@ local $SIG{__WARN__} = sub { my @copy = @_;
push @warnings_gotten, map { chomp; $_ } @copy;
};
+my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
sub requires_extended_utf8($) {
@@ -36,8 +37,7 @@ sub requires_extended_utf8($) {
# into 31 bits, subject to the convention that a negative code point
# stands for one that overflows the word size, so won't fit in 31 bits.
- my $cp = shift;
- return $cp > 0x7FFFFFFF;
+ return shift > $highest_non_extended_utf8_cp;
}
my @tests;
@@ -286,7 +286,6 @@ my @tests;
: I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
0x80000000,
- (isASCII) ? 1 : 8,
],
[ "highest 32 bit code point",
(isASCII)
@@ -294,7 +293,6 @@ my @tests;
: I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
0xFFFFFFFF,
- (isASCII) ? 1 : 8,
],
[ "Lowest 33 bit code point",
(isASCII)
@@ -340,7 +338,6 @@ my @tests;
[ "Lowest code point requiring 13 bytes to represent",
"\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
0x1000000000,
- 1,
],
[ "overflow that old algorithm failed to detect",
"\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
@@ -355,37 +352,31 @@ my @tests;
I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x800000000,
- 7,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x10000000000,
- 6,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x200000000000,
- 5,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x4000000000000,
- 4,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x80000000000000,
- 3,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x1000000000000000,
- 2,
];
}
}
@@ -587,6 +578,11 @@ foreach my $test (@tests) {
# contain a code point. (This is a result of
# some sort of malformation that means we
# can't get an exact code poin
+ my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
+ \Q requires a Perl extension, and so is not\E
+ \Q portable\E/x;
+ my $extended_non_cp_trailing_text
+ = "is a Perl extension, and so is not portable";
# Is this test malformed from the beginning? If so, we know to generally
# expect that the tests will show it isn't valid.
@@ -619,9 +615,9 @@ foreach my $test (@tests) {
$initially_malformed = 1;
}
elsif (requires_extended_utf8($allowed_uv)) {
- $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
- \Q and not portable\E/x;
- $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable";
+ $cp_message_qr = $extended_cp_message_qr;
+ $non_cp_trailing_text = $extended_non_cp_trailing_text;
+ $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
}
else {
$cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
@@ -870,6 +866,9 @@ foreach my $test (@tests) {
# maximum length, so skip if we're already at that length.
next if $overlong && $length >= $::max_bytes;
+ my $this_cp_message_qr = $cp_message_qr;
+ my $this_non_cp_trailing_text = $non_cp_trailing_text;
+
foreach my $malformed_allow_type (0..2) {
# 0 don't allow this malformation; ignored if no malformation
# 1 allow, with REPLACEMENT CHARACTER returned
@@ -899,6 +898,8 @@ foreach my $test (@tests) {
# combinations of on/off are tested for. It's either all are
# allowed, or none are.
my $allow_flags = 0;
+ my $overlong_is_in_perl_extended_utf8 = 0;
+ my $dont_use_overlong_cp = 0;
if ($overlong) {
my $new_expected_len;
@@ -929,8 +930,20 @@ foreach my $test (@tests) {
else { # Must use extended UTF-8. On ASCII platforms, we
# could express some overlongs here starting with
# \xFE, but there's no real reason to do so.
+ $overlong_is_in_perl_extended_utf8 = 1;
$start_byte = I8_to_native("\xFF");
$new_expected_len = $::max_bytes;
+ $this_cp_message_qr = $extended_cp_message_qr;
+
+ # The warning that gets raised doesn't include the code
+ # point in the message if the code point can be expressed
+ # without using extended UTF-8, but the particular
+ # overlong sequence used is in extended UTF-8. To do
+ # otherwise would be confusing to the user, as it would
+ # claim the code point requires extended, when it doesn't.
+ $dont_use_overlong_cp = 1
+ unless requires_extended_utf8($allowed_uv);
+ $this_non_cp_trailing_text = $extended_non_cp_trailing_text;
}
# Splice in the revise continuation byte, preceded by the
@@ -1152,12 +1165,20 @@ foreach my $test (@tests) {
# on all the other flags. That makes sure that they all
# are independent of this flag, and so we don't need to
# test them individually.
- my $this_warning_flags = ($use_warn_flag)
- ? $this_utf8n_flag_to_warn
- : $utf8n_flag_to_warn_complement;
- my $this_disallow_flags = ($do_disallow)
- ? $this_utf8n_flag_to_disallow
- : $utf8n_flag_to_disallow_complement;
+ my $this_warning_flags
+ = ($use_warn_flag)
+ ? $this_utf8n_flag_to_warn
+ : ($overlong_is_in_perl_extended_utf8
+ ? ($utf8n_flag_to_warn_complement
+ & ~$::UTF8_WARN_PERL_EXTENDED)
+ : $utf8n_flag_to_warn_complement);
+ my $this_disallow_flags
+ = ($do_disallow)
+ ? $this_utf8n_flag_to_disallow
+ : ($overlong_is_in_perl_extended_utf8
+ ? ($utf8n_flag_to_disallow_complement
+ & ~$::UTF8_DISALLOW_PERL_EXTENDED)
+ : $utf8n_flag_to_disallow_complement);
my $expected_uv = $allowed_uv;
my $this_uv_string = $uv_string;
@@ -1216,19 +1237,21 @@ foreach my $test (@tests) {
# So far the array contains warnings generated by
# malformations. Add the expected regular one.
- unshift @expected_warnings, $cp_message_qr;
+ unshift @expected_warnings, $this_cp_message_qr;
# But it may need to be modified, because either of
# these malformations means we can't determine the
# expected code point.
- if ($short || $unexpected_noncont) {
+ if ( $short || $unexpected_noncont
+ || $dont_use_overlong_cp)
+ {
my $first_byte = substr($this_bytes, 0, 1);
$expected_warnings[0] = display_bytes(
substr($this_bytes, 0, $this_expected_len));
$expected_warnings[0]
= qr/[Aa]\Qny UTF-8 sequence that starts with\E
\Q $expected_warnings[0]\E
- \Q $non_cp_trailing_text\E/x;
+ \Q $this_non_cp_trailing_text\E/x;
}
}
diff --git a/proto.h b/proto.h
index d9ef91cd57..a8f6de868c 100644
--- a/proto.h
+++ b/proto.h
@@ -4078,6 +4078,17 @@ PERL_CALLCONV int Perl_my_sprintf(char *buffer, const char *pat, ...);
STATIC NV S_mulexp10(NV value, I32 exponent);
# endif
#endif
+#if !defined(UV_IS_QUAD)
+# if defined(PERL_IN_UTF8_C)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE bool S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS \
+ assert(s); assert(e)
+#endif
+
+# endif
+#endif
#if !defined(WIN32)
PERL_CALLCONV bool Perl_do_exec3(pTHX_ const char *incmd, int fd, int do_report);
#define PERL_ARGS_ASSERT_DO_EXEC3 \
@@ -5835,13 +5846,6 @@ PERL_STATIC_INLINE bool S_is_utf8_common_with_len(pTHX_ const U8 *const p, const
#endif
#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
- __attribute__warn_unused_result__;
-#define PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS \
- assert(s); assert(e)
-#endif
-
-#ifndef PERL_NO_INLINE_FUNCTIONS
PERL_STATIC_INLINE bool S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK \
diff --git a/utf8.c b/utf8.c
index 88c2b32211..4be3bb734b 100644
--- a/utf8.c
+++ b/utf8.c
@@ -122,8 +122,9 @@ const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
" is not recommended for open interchange";
const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
" may not be portable";
-const char above_31_bit_cp_format[] = "Code point 0x%" UVXf " is not"
- " Unicode, and not portable";
+const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
+ " Unicode, requires a Perl extension," \
+ " and so is not portable";
#define HANDLE_UNICODE_SURROGATE(uv, flags) \
STMT_START { \
@@ -220,7 +221,7 @@ Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
/* Choose the more dire applicable warning */
(UNICODE_IS_PERL_EXTENDED(uv))
- ? above_31_bit_cp_format
+ ? perl_extended_cp_format
: super_cp_format,
uv);
}
@@ -362,30 +363,27 @@ defined in
L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
See L<perlunicode/Noncharacter code points>.
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points. Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these that written by the perl interpreter; nor would Perl understand files
-written by something that uses a different extension. For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted. These are the
-C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent. When that happens,
-effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UNICODE_WARN_SUPER> warns on these.)
-
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII. Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
-and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does. It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension. For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
+and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
+treat all above-Unicode code points, including these, as malformations. (Note
+that the Unicode standard considers anything above 0x10FFFF to be illegal, but
+there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
+
+A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
+C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because these
+flags can apply to code points that actually do fit in 31 bits. This happens
+on EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
+describe the situation in all cases.
=cut
*/
@@ -399,6 +397,8 @@ Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
return uvchr_to_utf8_flags(d, uv, flags);
}
+#ifndef UV_IS_QUAD
+
PERL_STATIC_INLINE bool
S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
{
@@ -480,6 +480,8 @@ S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
}
+#endif
+
/* Anything larger than this will overflow the word if it were converted into a UV */
#if defined(UV_IS_QUAD)
# ifdef EBCDIC /* Actually is I8 */
@@ -730,10 +732,12 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
/* B6 and B7 */ \
&& ((s1) & 0xFE ) == 0xB6)
+# define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
#else
# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
+# define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
#endif
if ( (flags & UTF8_DISALLOW_SUPER)
@@ -743,9 +747,9 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
}
if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
- && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
+ && UNLIKELY(isUTF8_PERL_EXTENDED(s)))
{
- return 0; /* Above 31 bits */
+ return 0;
}
if (len > 1) {
@@ -954,36 +958,34 @@ a malformation and raise a warning, specify both the WARN and DISALLOW flags.
(But note that warnings are not raised if lexically disabled nor if
C<UTF8_CHECK_ONLY> is also specified.)
+Extremely high code points were never specified in any standard, and require an
+extension to UTF-8 to express, which Perl does. It is likely that programs
+written in something other than Perl would not be able to read files that
+contain these; nor would Perl understand files written by something that uses a
+different extension. For these reasons, there is a separate set of flags that
+can warn and/or disallow these extremely high code points, even if other
+above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
+C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
+L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
+above-Unicode code points, including these, as malformations.
+(Note that the Unicode standard considers anything above 0x10FFFF to be
+illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
+(2**31 -1))
+
+A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
+retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
+C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
+C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
+can apply to code points that actually do fit in 31 bits. This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
+describe the situation in all cases.
+
It is now deprecated to have very high code points (above C<IV_MAX> on the
platforms) and this function will raise a deprecation warning for these (unless
such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
in a 32-bit word.
-Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
-so using them is more problematic than other above-Unicode code points. Perl
-invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
-likely that non-Perl languages will not be able to read files that contain
-these; nor would Perl understand files
-written by something that uses a different extension. For these reasons, there
-is a separate set of flags that can warn and/or disallow these extremely high
-code points, even if other above-Unicode ones are accepted. These are the
-C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
-are entirely independent from the deprecation warning for code points above
-C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
-code point that needs more than 31 bits to represent. When that happens,
-effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
-32-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
-above-Unicode code points, including these, as malformations; and
-C<UTF8_WARN_SUPER> warns on these.)
-
-On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
-extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
-than on ASCII. Prior to that, code points 2**31 and higher were simply
-unrepresentable, and a different, incompatible method was used to represent
-code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
-and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
-platforms, warning and disallowing 2**31 and higher.
-
All other code points corresponding to Unicode characters, including private
use and those yet to be assigned, are never considered malformed and never
warn.
@@ -1026,12 +1028,36 @@ exceptions are noted:
=over 4
-=item C<UTF8_GOT_ABOVE_31_BIT>
+=item C<UTF8_GOT_PERL_EXTENDED>
-The code point represented by the input UTF-8 sequence occupies more than 31
-bits.
-This bit is set only if the input C<flags> parameter contains either the
-C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
+The input sequence is not standard UTF-8, but a Perl extension. This bit is
+set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
+
+Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
+and so some extension must be used to express them. Perl uses a natural
+extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
+extension to represent even higher ones, so that any code point that fits in a
+64-bit word can be represented. Text using these extensions is not likely to
+be portable to non-Perl code. We lump both of these extensions together and
+refer to them as Perl extended UTF-8. There exist other extensions that people
+have invented, incompatible with Perl's.
+
+On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
+extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
+than on ASCII. Prior to that, code points 2**31 and higher were simply
+unrepresentable, and a different, incompatible method was used to represent
+code points between 2**30 and 2**31 - 1.
+
+On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
+Perl extended UTF-8 is used.
+
+In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
+may use for backward compatibility. That name is misleading, as this flag may
+be set when the code point actually does fit in 31 bits. This happens on
+EBCDIC platforms, and sometimes when the L<overlong
+malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
+describes the situation in all cases.
=item C<UTF8_GOT_CONTINUATION>
@@ -1119,8 +1145,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
* too short one. Otherwise the first two are set to 's0' and 'send', and
* the third not used at all */
U8 * adjusted_s0 = (U8 *) s0;
- U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
- warning) */
U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
routine; see [perl #130921] */
UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
@@ -1212,7 +1236,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
else {
send += expectlen;
}
- adjusted_send = send;
/* Now, loop through the remaining bytes in the character's sequence,
* accumulating each into the working value as we go. */
@@ -1297,7 +1320,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
}
adjusted_s0 = temp_char_buf;
- adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+ (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
}
}
@@ -1316,7 +1339,8 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
* and we deal with those in the overflow handling
* code */
&& LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
- && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
+ && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
+ || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
&& ((flags & ( UTF8_DISALLOW_NONCHAR
|UTF8_DISALLOW_SURROGATE
|UTF8_DISALLOW_SUPER
@@ -1396,13 +1420,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
* end, based on how many bytes the start byte tells
* us should be in it, but no further than s0 +
* avail_len
- * adjusted_s0 normally is the same as s0, but in case of an
- * overlong for which the UTF-8 matters below, it is
- * the first byte of the shortest form representation
- * of the input.
- * adjusted_send normally is the same as 'send', but if adjusted_s0
- * is set to something other than s0, this points one
- * beyond its end
*/
if (UNLIKELY(possible_problems)) {
@@ -1603,39 +1620,35 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
}
}
- /* The maximum code point ever specified by a standard was
- * 2**31 - 1. Anything larger than that is a Perl extension
- * that very well may not be understood by other applications
- * (including earlier perl versions on EBCDIC platforms). We
- * test for these after the regular SUPER ones, and before
- * possibly bailing out, so that the slightly more dire warning
- * will override the regular one. */
- if ( (flags & (UTF8_WARN_PERL_EXTENDED
- |UTF8_WARN_SUPER
- |UTF8_DISALLOW_PERL_EXTENDED))
- && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
- && UNLIKELY(is_utf8_cp_above_31_bits(
- adjusted_s0,
- adjusted_send)))
- || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
- && UNLIKELY(UNICODE_IS_PERL_EXTENDED(uv)))))
- {
+ /* Test for Perl's extended UTF-8 after the regular SUPER ones,
+ * and before possibly bailing out, so that the more dire
+ * warning will override the regular one. */
+ if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
if ( ! (flags & UTF8_CHECK_ONLY)
&& (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
&& ckWARN_d(WARN_NON_UNICODE))
{
pack_warn = packWARN(WARN_NON_UNICODE);
- if (orig_problems & UTF8_GOT_TOO_SHORT) {
+ /* If it is an overlong that evaluates to a code point
+ * that doesn't have to use the Perl extended UTF-8, it
+ * still used it, and so we output a message that
+ * doesn't refer to the code point. The same is true
+ * if there was a SHORT malformation where the code
+ * point is not valid. In that case, 'uv' will have
+ * been set to the REPLACEMENT CHAR, and the message
+ * below without the code point in it will be selected
+ * */
+ if (UNICODE_IS_PERL_EXTENDED(uv)) {
message = Perl_form(aTHX_
- "Any UTF-8 sequence that starts with"
- " \"%s\" is for a non-Unicode code"
- " point, and is not portable",
- _byte_dump_string(s0, curlen, 0));
+ perl_extended_cp_format, uv);
}
else {
message = Perl_form(aTHX_
- above_31_bit_cp_format, uv);
+ "Any UTF-8 sequence that starts with"
+ " \"%s\" is a Perl extension, and"
+ " so is not portable",
+ _byte_dump_string(s0, curlen, 0));
}
}
diff --git a/utf8.h b/utf8.h
index c880375e1d..0f29817757 100644
--- a/utf8.h
+++ b/utf8.h
@@ -407,6 +407,8 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
( LIKELY( ( ( ( ((const U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
: LIKELY( ( ( ( 0xF4 == ((const U8*)s)[0] ) && ( ( ((const U8*)s)[1] & 0xF0 ) == 0x80 ) ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((const U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
+#define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x7FFFFFFF)
+
#endif /* EBCDIC vs ASCII */
/* 2**UTF_ACCUMULATION_SHIFT - 1 */
@@ -781,7 +783,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
#define UTF8_GOT_PERL_EXTENDED UTF8_DISALLOW_PERL_EXTENDED
#define UTF8_WARN_PERL_EXTENDED 0x8000
-/* For back compat, these old names are misleading for UTF_EBCDIC */
+/* For back compat, these old names are misleading for overlongs and
+ * UTF_EBCDIC. */
#define UTF8_DISALLOW_ABOVE_31_BIT UTF8_DISALLOW_PERL_EXTENDED
#define UTF8_GOT_ABOVE_31_BIT UTF8_GOT_PERL_EXTENDED
#define UTF8_WARN_ABOVE_31_BIT UTF8_WARN_PERL_EXTENDED
@@ -958,7 +961,6 @@ point's representation.
&& UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
#define UNICODE_IS_SUPER(uv) ((UV) (uv) > PERL_UNICODE_MAX)
-#define UNICODE_IS_PERL_EXTENDED(uv) ((UV) (uv) > 0x7FFFFFFF)
#define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE
#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS \
diff --git a/utfebcdic.h b/utfebcdic.h
index 0f81d1ffee..c2f0788cc4 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -511,6 +511,8 @@ explicitly forbidden, and the shortest possible encoding should always be used
* has this start byte (expressed in I8) as the maximum */
#define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF9
+#define UNICODE_IS_PERL_EXTENDED(uv) UNLIKELY((UV) (uv) > 0x3FFFFFFF)
+
/*
* ex: set ts=8 sts=4 sw=4 et:
*/