summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.h4
-rwxr-xr-xembed.pl4
-rw-r--r--global.sym6
-rw-r--r--op.c3
-rw-r--r--pod/perlapi.pod16
-rw-r--r--pp.c5
-rw-r--r--proto.h2
-rwxr-xr-xt/op/each.t2
-rwxr-xr-xt/op/pat.t4
-rw-r--r--t/op/qq.t2
-rw-r--r--utf8.c45
-rw-r--r--utf8.h11
12 files changed, 85 insertions, 19 deletions
diff --git a/embed.h b/embed.h
index a748737f4f..fd65d07c39 100644
--- a/embed.h
+++ b/embed.h
@@ -755,6 +755,8 @@
#define utf8n_to_uvuni Perl_utf8n_to_uvuni
#define uvchr_to_utf8 Perl_uvchr_to_utf8
#define uvuni_to_utf8 Perl_uvuni_to_utf8
+#define uvchr_to_utf8_flags Perl_uvchr_to_utf8_flags
+#define uvuni_to_utf8_flags Perl_uvuni_to_utf8_flags
#define pv_uni_display Perl_pv_uni_display
#define sv_uni_display Perl_sv_uni_display
#define vivify_defelem Perl_vivify_defelem
@@ -2274,6 +2276,8 @@
#define utf8n_to_uvuni(a,b,c,d) Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
#define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX_ a,b)
#define uvuni_to_utf8(a,b) Perl_uvuni_to_utf8(aTHX_ a,b)
+#define uvchr_to_utf8_flags(a,b,c) Perl_uvchr_to_utf8_flags(aTHX_ a,b,c)
+#define uvuni_to_utf8_flags(a,b,c) Perl_uvuni_to_utf8_flags(aTHX_ a,b,c)
#define pv_uni_display(a,b,c,d,e) Perl_pv_uni_display(aTHX_ a,b,c,d,e)
#define sv_uni_display(a,b,c,d) Perl_sv_uni_display(aTHX_ a,b,c,d)
#define vivify_defelem(a) Perl_vivify_defelem(aTHX_ a)
diff --git a/embed.pl b/embed.pl
index 74fd9a537c..adbfcc38ed 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1853,7 +1853,9 @@ Apd |UV |utf8_to_uvuni |U8 *s|STRLEN* retlen
Adp |UV |utf8n_to_uvchr |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
Adp |UV |utf8n_to_uvuni |U8 *s|STRLEN curlen|STRLEN* retlen|U32 flags
Apd |U8* |uvchr_to_utf8 |U8 *d|UV uv
-Apd |U8* |uvuni_to_utf8 |U8 *d|UV uv
+Ap |U8* |uvuni_to_utf8 |U8 *d|UV uv
+Ap |U8* |uvchr_to_utf8_flags |U8 *d|UV uv|UV flags
+Apd |U8* |uvuni_to_utf8_flags |U8 *d|UV uv|UV flags
Apd |char* |pv_uni_display |SV *dsv|U8 *spv|STRLEN len \
|STRLEN pvlim|UV flags
Apd |char* |sv_uni_display |SV *dsv|SV *ssv|STRLEN pvlim|UV flags
diff --git a/global.sym b/global.sym
index b2a9225100..c19e004d66 100644
--- a/global.sym
+++ b/global.sym
@@ -157,6 +157,10 @@ Perl_ibcmp_utf8
Perl_init_stacks
Perl_init_tm
Perl_instr
+Perl_is_lvalue_sub
+Perl_to_uni_upper_lc
+Perl_to_uni_title_lc
+Perl_to_uni_lower_lc
Perl_is_uni_alnum
Perl_is_uni_alnumc
Perl_is_uni_idfirst
@@ -496,6 +500,8 @@ Perl_utf8n_to_uvchr
Perl_utf8n_to_uvuni
Perl_uvchr_to_utf8
Perl_uvuni_to_utf8
+Perl_uvchr_to_utf8_flags
+Perl_uvuni_to_utf8_flags
Perl_pv_uni_display
Perl_sv_uni_display
Perl_warn
diff --git a/op.c b/op.c
index a35c91966c..9b1556e2b0 100644
--- a/op.c
+++ b/op.c
@@ -2866,7 +2866,8 @@ Perl_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
U8 range_mark = UTF_TO_NATIVE(0xff);
sv_catpvn(transv, (char *)&range_mark, 1);
}
- t = uvuni_to_utf8(tmpbuf, 0x7fffffff);
+ t = uvuni_to_utf8_flags(tmpbuf, 0x7fffffff,
+ UNICODE_ALLOW_SUPER);
sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf);
t = (U8*)SvPVX(transv);
tlen = SvCUR(transv);
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 7bdf75c8c9..397f52b029 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -1573,8 +1573,8 @@ Found in file handy.h
Returns a pointer to the next character after the parsed
vstring, as well as updating the passed in sv.
- *
-Function must be called like
+ *
+Function must be called like
sv = NEWSV(92,5);
s = new_vstring(s,sv);
@@ -4453,20 +4453,28 @@ is the recommended wide native character-aware way of saying
=for hackers
Found in file utf8.c
-=item uvuni_to_utf8
+=item uvuni_to_utf8_flags
Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
+ d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
d = uvuni_to_utf8(d, uv);
+(which is equivalent to)
+
+ d = uvuni_to_utf8_flags(d, uv, 0);
+
is the recommended Unicode-aware way of saying
*(d++) = uv;
- U8* uvuni_to_utf8(U8 *d, UV uv)
+ U8* uvuni_to_utf8_flags(U8 *d, UV uv, UV flags)
=for hackers
Found in file utf8.c
diff --git a/pp.c b/pp.c
index 0ddfefed6b..eb386eede1 100644
--- a/pp.c
+++ b/pp.c
@@ -2258,7 +2258,7 @@ PP(pp_complement)
while (tmps < send) {
UV c = utf8n_to_uvchr(tmps, send-tmps, &l, UTF8_ALLOW_ANYUV);
tmps += UTF8SKIP(tmps);
- result = uvchr_to_utf8(result, ~c);
+ result = uvchr_to_utf8_flags(result, ~c, UNICODE_ALLOW_ANY);
}
*result = '\0';
result -= targlen;
@@ -3148,7 +3148,8 @@ PP(pp_chr)
if (value > 255 && !IN_BYTES) {
SvGROW(TARG, UNISKIP(value)+1);
- tmps = (char*)uvchr_to_utf8((U8*)SvPVX(TARG), value);
+ tmps = (char*)uvchr_to_utf8_flags((U8*)SvPVX(TARG), value,
+ UNICODE_ALLOW_SUPER);
SvCUR_set(TARG, tmps - SvPVX(TARG));
*tmps = '\0';
(void)SvPOK_only(TARG);
diff --git a/proto.h b/proto.h
index 33e8b826ba..b6ed2872aa 100644
--- a/proto.h
+++ b/proto.h
@@ -832,6 +832,8 @@ PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen,
PERL_CALLCONV UV Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN* retlen, U32 flags);
PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
PERL_CALLCONV U8* Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv);
+PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
+PERL_CALLCONV U8* Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
PERL_CALLCONV char* Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags);
PERL_CALLCONV char* Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags);
PERL_CALLCONV void Perl_vivify_defelem(pTHX_ SV* sv);
diff --git a/t/op/each.t b/t/op/each.t
index 556479ef70..8212264d55 100755
--- a/t/op/each.t
+++ b/t/op/each.t
@@ -135,7 +135,7 @@ ok ($i == 5);
# Check for Unicode hash keys.
%u = ("\x{12}", "f", "\x{123}", "fo", "\x{1234}", "foo");
$u{"\x{12345}"} = "bar";
-@u{"\x{123456}"} = "zap";
+@u{"\x{10FFFD}"} = "zap";
my %u2;
foreach (keys %u) {
diff --git a/t/op/pat.t b/t/op/pat.t
index 6b4b0619bf..077b9579e5 100755
--- a/t/op/pat.t
+++ b/t/op/pat.t
@@ -1618,9 +1618,9 @@ EOT
{
# from Robin Houston
- my $x = "\x{12345678}";
+ my $x = "\x{10FFFD}";
$x =~ s/(.)/$1/g;
- print "not " unless ord($x) == 0x12345678 && length($x) == 1;
+ print "not " unless ord($x) == 0x10FFFD && length($x) == 1;
print "ok 587\n";
}
diff --git a/t/op/qq.t b/t/op/qq.t
index 651cf18a2e..d8831696a7 100644
--- a/t/op/qq.t
+++ b/t/op/qq.t
@@ -60,4 +60,4 @@ is ("\x{000000000000000000000000000000000000000000000000000000000000000072}",
chr 114);
is ("\x{0_06_5}", chr 101);
is ("\x{1234}", chr 4660);
-is ("\x{98765432}", chr 2557891634);
+is ("\x{10FFFD}", chr 1114109);
diff --git a/utf8.c b/utf8.c
index 81af39735a..debfb9ceac 100644
--- a/utf8.c
+++ b/utf8.c
@@ -27,15 +27,23 @@
/* Unicode support */
/*
-=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
+=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
+ d = uvuni_to_utf8_flags(d, uv, flags);
+
+or, in most cases,
+
d = uvuni_to_utf8(d, uv);
+(which is equivalent to)
+
+ d = uvuni_to_utf8_flags(d, uv, 0);
+
is the recommended Unicode-aware way of saying
*(d++) = uv;
@@ -44,13 +52,26 @@ is the recommended Unicode-aware way of saying
*/
U8 *
-Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
if (ckWARN_d(WARN_UTF8)) {
- if (UNICODE_IS_SURROGATE(uv))
+ if (UNICODE_IS_SURROGATE(uv) &&
+ !(flags & UNICODE_ALLOW_SURROGATE))
Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
- else if ((uv >= 0xFDD0 && uv <= 0xFDEF) ||
- (uv == 0xFFFE || uv == 0xFFFF))
+ else if (
+ ((uv >= 0xFDD0 && uv <= 0xFDEF &&
+ !(flags & UNICODE_ALLOW_FDD0))
+ ||
+ ((uv & 0xFFFF) == 0xFFFE &&
+ !(flags & UNICODE_ALLOW_FFFE))
+ ||
+ ((uv & 0xFFFF) == 0xFFFF &&
+ !(flags & UNICODE_ALLOW_FFFF))) &&
+ /* UNICODE_ALLOW_SUPER includes
+ * FFFEs and FFFFs beyond 0x10FFFF. */
+ ((uv <= PERL_UNICODE_MAX) ||
+ !(flags & UNICODE_ALLOW_SUPER))
+ )
Perl_warner(aTHX_ WARN_UTF8,
"Unicode character 0x%04"UVxf" is illegal", uv);
}
@@ -138,7 +159,12 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
#endif
#endif /* Loop style */
}
-
+
+U8 *
+Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
+{
+ return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
+}
/*
@@ -1544,9 +1570,14 @@ is the recommended wide native character-aware way of saying
U8 *
Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
{
- return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
+ return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
}
+U8 *
+Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
+{
+ return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
+}
/*
=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
diff --git a/utf8.h b/utf8.h
index 1c2243e2da..b35cfebb5e 100644
--- a/utf8.h
+++ b/utf8.h
@@ -166,6 +166,17 @@ END_EXTERN_C
#define UNICODE_BYTER_ORDER_MARK 0xfffe
#define UNICODE_ILLEGAL 0xffff
+/* Though our UTF-8 encoding can go beyond this,
+ * let's be conservative. */
+#define PERL_UNICODE_MAX 0x10FFFF
+
+#define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */
+#define UNICODE_ALLOW_FDD0 0x0002 /* Allow the U+FDD0...U+FDEF */
+#define UNICODE_ALLOW_FFFE 0x0004 /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_FFFF 0x0008 /* Allow 0xFFFE, 0x1FFFE, ... */
+#define UNICODE_ALLOW_SUPER 0x0010 /* Allow past 10xFFFF */
+#define UNICODE_ALLOW_ANY 0xFFFF
+
#define UNICODE_IS_SURROGATE(c) ((c) >= UNICODE_SURROGATE_FIRST && \
(c) <= UNICODE_SURROGATE_LAST)
#define UNICODE_IS_REPLACEMENT(c) ((c) == UNICODE_REPLACEMENT)