summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--embed.h2
-rwxr-xr-xembed.pl1
-rw-r--r--global.sym1
-rw-r--r--lib/encoding.pm8
-rw-r--r--lib/encoding.t25
-rw-r--r--pod/perlapi.pod96
-rw-r--r--proto.h1
-rw-r--r--sv.c76
-rw-r--r--toke.c5
9 files changed, 134 insertions, 81 deletions
diff --git a/embed.h b/embed.h
index 4ac3878da5..71fb041770 100644
--- a/embed.h
+++ b/embed.h
@@ -688,6 +688,7 @@
#define sv_pos_b2u Perl_sv_pos_b2u
#define sv_pvutf8n_force Perl_sv_pvutf8n_force
#define sv_pvbyten_force Perl_sv_pvbyten_force
+#define sv_recode_to_utf8 Perl_sv_recode_to_utf8
#define sv_reftype Perl_sv_reftype
#define sv_replace Perl_sv_replace
#define sv_report_used Perl_sv_report_used
@@ -2201,6 +2202,7 @@
#define sv_pos_b2u(a,b) Perl_sv_pos_b2u(aTHX_ a,b)
#define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b)
#define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b)
+#define sv_recode_to_utf8(a) Perl_sv_recode_to_utf8(aTHX_ a)
#define sv_reftype(a,b) Perl_sv_reftype(aTHX_ a,b)
#define sv_replace(a,b) Perl_sv_replace(aTHX_ a,b)
#define sv_report_used() Perl_sv_report_used(aTHX)
diff --git a/embed.pl b/embed.pl
index 345a299334..392e16cc29 100755
--- a/embed.pl
+++ b/embed.pl
@@ -1779,6 +1779,7 @@ Apd |void |sv_pos_b2u |SV* sv|I32* offsetp
Amd |char* |sv_pvn_force |SV* sv|STRLEN* lp
Apd |char* |sv_pvutf8n_force|SV* sv|STRLEN* lp
Apd |char* |sv_pvbyten_force|SV* sv|STRLEN* lp
+Apd |void |sv_recode_to_utf8 |SV*
Apd |char* |sv_reftype |SV* sv|int ob
Apd |void |sv_replace |SV* sv|SV* nsv
Apd |void |sv_report_used
diff --git a/global.sym b/global.sym
index c2535a978a..cf8ec98adf 100644
--- a/global.sym
+++ b/global.sym
@@ -438,6 +438,7 @@ Perl_sv_pos_u2b
Perl_sv_pos_b2u
Perl_sv_pvutf8n_force
Perl_sv_pvbyten_force
+Perl_sv_recode_to_utf8
Perl_sv_reftype
Perl_sv_replace
Perl_sv_report_used
diff --git a/lib/encoding.pm b/lib/encoding.pm
index 7dacd7a5bd..1addeb469b 100644
--- a/lib/encoding.pm
+++ b/lib/encoding.pm
@@ -38,12 +38,14 @@ expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the
encoding pragma you can change this default.
The pragma is a per script, not a per block lexical. Only the last
-'use encoding' seen matters.
+C<use encoding> matters, and it affects B<the whole script>.
=head1 FUTURE POSSIBILITIES
-The C<\x..> and C<\0...> in literals and regular expressions are not
-affected by this pragma. They probably should. Ditto C<\N{...}>.
+The C<\x..> and C<\0...> in regular expressions are not
+affected by this pragma. They probably should.
+
+Also C<\N{...}> might become affected.
=head1 SEE ALSO
diff --git a/lib/encoding.t b/lib/encoding.t
index 40d97a2b94..2be0312303 100644
--- a/lib/encoding.t
+++ b/lib/encoding.t
@@ -1,24 +1,31 @@
-print "1..3\n";
+print "1..5\n";
use encoding "latin1"; # ignored (overwritten by the next line)
use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...)
-$a = "\xDF";
-$b = "\x{100}";
-
-my $c = $a . $b;
-
# "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is
# \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS),
# instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S)
-print "not " unless ord($c) == 0x3af;
+$a = "\xDF";
+$b = "\x{100}";
+
+print "not " unless ord($a) == 0x3af;
print "ok 1\n";
-print "not " unless length($c) == 2;
+print "not " unless ord($b) == 0x100;
print "ok 2\n";
-print "not " unless ord(substr($c, 1, 1)) == 0x100;
+my $c;
+
+$c = $a . $b;
+
+print "not " unless ord($c) == 0x3af;
print "ok 3\n";
+print "not " unless length($c) == 2;
+print "ok 4\n";
+
+print "not " unless ord(substr($c, 1, 1)) == 0x100;
+print "ok 5\n";
diff --git a/pod/perlapi.pod b/pod/perlapi.pod
index 0435058d35..41d2373fa7 100644
--- a/pod/perlapi.pod
+++ b/pod/perlapi.pod
@@ -1404,17 +1404,6 @@ SV is B<not> incremented.
=for hackers
Found in file sv.c
-=item newSV
-
-Create a new null SV, or if len > 0, create a new empty SVt_PV type SV
-with an initial PV allocation of len+1. Normally accessed via the C<NEWSV>
-macro.
-
- SV* newSV(STRLEN len)
-
-=for hackers
-Found in file sv.c
-
=item NEWSV
Creates a new SV. A non-zero C<len> parameter indicates the number of
@@ -1428,6 +1417,17 @@ C<id> is an integer id between 0 and 1299 (used to identify leaks).
=for hackers
Found in file handy.h
+=item newSV
+
+Create a new null SV, or if len > 0, create a new empty SVt_PV type SV
+with an initial PV allocation of len+1. Normally accessed via the C<NEWSV>
+macro.
+
+ SV* newSV(STRLEN len)
+
+=for hackers
+Found in file sv.c
+
=item newSViv
Creates a new SV and copies an integer into it. The reference count for the
@@ -2282,22 +2282,22 @@ version which guarantees to evaluate sv only once.
=for hackers
Found in file sv.h
-=item SvIVX
+=item SvIVx
-Returns the raw value in the SV's IV slot, without checks or conversions.
-Only use when you are sure SvIOK is true. See also C<SvIV()>.
+Coerces the given SV to an integer and returns it. Guarantees to evaluate
+sv only once. Use the more efficent C<SvIV> otherwise.
- IV SvIVX(SV* sv)
+ IV SvIVx(SV* sv)
=for hackers
Found in file sv.h
-=item SvIVx
+=item SvIVX
-Coerces the given SV to an integer and returns it. Guarantees to evaluate
-sv only once. Use the more efficent C<SvIV> otherwise.
+Returns the raw value in the SV's IV slot, without checks or conversions.
+Only use when you are sure SvIOK is true. See also C<SvIV()>.
- IV SvIVx(SV* sv)
+ IV SvIVX(SV* sv)
=for hackers
Found in file sv.h
@@ -2606,21 +2606,21 @@ Like C<SvPV_nolen>, but converts sv to utf8 first if necessary.
=for hackers
Found in file sv.h
-=item SvPVx
+=item SvPVX
-A version of C<SvPV> which guarantees to evaluate sv only once.
+Returns a pointer to the physical string in the SV. The SV must contain a
+string.
- char* SvPVx(SV* sv, STRLEN len)
+ char* SvPVX(SV* sv)
=for hackers
Found in file sv.h
-=item SvPVX
+=item SvPVx
-Returns a pointer to the physical string in the SV. The SV must contain a
-string.
+A version of C<SvPV> which guarantees to evaluate sv only once.
- char* SvPVX(SV* sv)
+ char* SvPVx(SV* sv, STRLEN len)
=for hackers
Found in file sv.h
@@ -2827,19 +2827,19 @@ false, defined or undefined. Does not handle 'get' magic.
=for hackers
Found in file sv.h
-=item SvTYPE
-
-Returns the type of the SV. See C<svtype>.
+=item svtype
- svtype SvTYPE(SV* sv)
+An enum of flags for Perl types. These are found in the file B<sv.h>
+in the C<svtype> enum. Test these flags with the C<SvTYPE> macro.
=for hackers
Found in file sv.h
-=item svtype
+=item SvTYPE
-An enum of flags for Perl types. These are found in the file B<sv.h>
-in the C<svtype> enum. Test these flags with the C<SvTYPE> macro.
+Returns the type of the SV. See C<svtype>.
+
+ svtype SvTYPE(SV* sv)
=for hackers
Found in file sv.h
@@ -2950,22 +2950,22 @@ for a version which guarantees to evaluate sv only once.
=for hackers
Found in file sv.h
-=item SvUVX
+=item SvUVx
-Returns the raw value in the SV's UV slot, without checks or conversions.
-Only use when you are sure SvIOK is true. See also C<SvUV()>.
+Coerces the given SV to an unsigned integer and returns it. Guarantees to
+evaluate sv only once. Use the more efficent C<SvUV> otherwise.
- UV SvUVX(SV* sv)
+ UV SvUVx(SV* sv)
=for hackers
Found in file sv.h
-=item SvUVx
+=item SvUVX
-Coerces the given SV to an unsigned integer and returns it. Guarantees to
-evaluate sv only once. Use the more efficent C<SvUV> otherwise.
+Returns the raw value in the SV's UV slot, without checks or conversions.
+Only use when you are sure SvIOK is true. See also C<SvUV()>.
- UV SvUVx(SV* sv)
+ UV SvUVX(SV* sv)
=for hackers
Found in file sv.h
@@ -3662,6 +3662,20 @@ instead.
=for hackers
Found in file sv.c
+=item sv_recode_to_utf8
+
+If PL_encoding is set you can call this to recode the pv of the sv.
+The PL_encoding is assumed to be an Encode object, on entry the pv is assumed
+to be octets in that encoding, and the sv will be converted into Unicode
+(and UTF-8).
+
+If PL_encoding is not an Encode object, things will go boom.
+
+ void sv_recode_to_utf8(SV*)
+
+=for hackers
+Found in file sv.c
+
=item sv_reftype
Returns a string describing what the SV is a reference to.
diff --git a/proto.h b/proto.h
index dc96ebe45f..7d9bc021f8 100644
--- a/proto.h
+++ b/proto.h
@@ -760,6 +760,7 @@ PERL_CALLCONV void Perl_sv_pos_b2u(pTHX_ SV* sv, I32* offsetp);
/* PERL_CALLCONV char* sv_pvn_force(pTHX_ SV* sv, STRLEN* lp); */
PERL_CALLCONV char* Perl_sv_pvutf8n_force(pTHX_ SV* sv, STRLEN* lp);
PERL_CALLCONV char* Perl_sv_pvbyten_force(pTHX_ SV* sv, STRLEN* lp);
+PERL_CALLCONV void Perl_sv_recode_to_utf8(pTHX_ SV*);
PERL_CALLCONV char* Perl_sv_reftype(pTHX_ SV* sv, int ob);
PERL_CALLCONV void Perl_sv_replace(pTHX_ SV* sv, SV* nsv);
PERL_CALLCONV void Perl_sv_report_used(pTHX);
diff --git a/sv.c b/sv.c
index 520734cf55..a447517b96 100644
--- a/sv.c
+++ b/sv.c
@@ -3302,32 +3302,9 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags)
sv_force_normal(sv);
}
- if (PL_encoding) {
- SV *uni;
- STRLEN len;
- char *s;
- dSP;
- ENTER;
- SAVETMPS;
- PUSHMARK(sp);
- EXTEND(SP, 3);
- XPUSHs(PL_encoding);
- XPUSHs(sv);
- XPUSHs(&PL_sv_yes);
- PUTBACK;
- call_method("decode", G_SCALAR);
- SPAGAIN;
- uni = POPs;
- PUTBACK;
- s = SvPVutf8(uni, len);
- if (s != SvPVX(sv)) {
- SvGROW(sv, len);
- Move(s, SvPVX(sv), len, char);
- SvCUR_set(sv, len);
- }
- FREETMPS;
- LEAVE;
- } else { /* Assume Latin-1/EBCDIC */
+ if (PL_encoding)
+ Perl_sv_recode_to_utf8(aTHX_ sv);
+ else { /* Assume Latin-1/EBCDIC */
/* This function could be much more efficient if we
* had a FLAG in SVs to signal if there are any hibit
* chars in the PV. Given that there isn't such a flag
@@ -3350,9 +3327,9 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags)
Safefree(s); /* No longer using what was there before. */
SvLEN(sv) = len; /* No longer know the real size. */
}
+ /* Mark as UTF-8 even if no hibit - saves scanning loop */
+ SvUTF8_on(sv);
}
- /* Mark as UTF-8 even if no hibit - saves scanning loop */
- SvUTF8_on(sv);
return SvCUR(sv);
}
@@ -10382,3 +10359,46 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
#endif /* USE_ITHREADS */
+/*
+=for apidoc sv_recode_to_utf8
+
+If PL_encoding is set you can call this to recode the pv of the sv.
+The PL_encoding is assumed to be an Encode object, on entry the pv is assumed
+to be octets in that encoding, and the sv will be converted into Unicode
+(and UTF-8).
+
+If PL_encoding is not an Encode object, things will go boom.
+
+=cut
+*/
+
+void
+Perl_sv_recode_to_utf8(pTHX_ SV *sv)
+{
+ SV *uni;
+ STRLEN len;
+ char *s;
+ dSP;
+ ENTER;
+ SAVETMPS;
+ PUSHMARK(sp);
+ EXTEND(SP, 3);
+ XPUSHs(PL_encoding);
+ XPUSHs(sv);
+ XPUSHs(&PL_sv_yes);
+ PUTBACK;
+ call_method("decode", G_SCALAR);
+ SPAGAIN;
+ uni = POPs;
+ PUTBACK;
+ s = SvPVutf8(uni, len);
+ if (s != SvPVX(sv)) {
+ SvGROW(sv, len);
+ Move(s, SvPVX(sv), len, char);
+ SvCUR_set(sv, len);
+ }
+ FREETMPS;
+ LEAVE;
+ SvUTF8_on(sv);
+}
+
diff --git a/toke.c b/toke.c
index e6d7abc372..90f8305b6c 100644
--- a/toke.c
+++ b/toke.c
@@ -1653,6 +1653,10 @@ S_scan_const(pTHX_ char *start)
Perl_croak(aTHX_ "panic: constant overflowed allocated space");
SvPOK_on(sv);
+ if (PL_encoding && !has_utf8) {
+ Perl_sv_recode_to_utf8(aTHX_ sv);
+ has_utf8 = TRUE;
+ }
if (has_utf8) {
SvUTF8_on(sv);
if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) {
@@ -7734,3 +7738,4 @@ utf16rev_textfilter(pTHX_ int idx, SV *sv, int maxlen)
return count;
}
#endif
+