diff options
-rw-r--r-- | embed.h | 2 | ||||
-rwxr-xr-x | embed.pl | 1 | ||||
-rw-r--r-- | global.sym | 1 | ||||
-rw-r--r-- | lib/encoding.pm | 8 | ||||
-rw-r--r-- | lib/encoding.t | 25 | ||||
-rw-r--r-- | pod/perlapi.pod | 96 | ||||
-rw-r--r-- | proto.h | 1 | ||||
-rw-r--r-- | sv.c | 76 | ||||
-rw-r--r-- | toke.c | 5 |
9 files changed, 134 insertions, 81 deletions
@@ -688,6 +688,7 @@ #define sv_pos_b2u Perl_sv_pos_b2u #define sv_pvutf8n_force Perl_sv_pvutf8n_force #define sv_pvbyten_force Perl_sv_pvbyten_force +#define sv_recode_to_utf8 Perl_sv_recode_to_utf8 #define sv_reftype Perl_sv_reftype #define sv_replace Perl_sv_replace #define sv_report_used Perl_sv_report_used @@ -2201,6 +2202,7 @@ #define sv_pos_b2u(a,b) Perl_sv_pos_b2u(aTHX_ a,b) #define sv_pvutf8n_force(a,b) Perl_sv_pvutf8n_force(aTHX_ a,b) #define sv_pvbyten_force(a,b) Perl_sv_pvbyten_force(aTHX_ a,b) +#define sv_recode_to_utf8(a) Perl_sv_recode_to_utf8(aTHX_ a) #define sv_reftype(a,b) Perl_sv_reftype(aTHX_ a,b) #define sv_replace(a,b) Perl_sv_replace(aTHX_ a,b) #define sv_report_used() Perl_sv_report_used(aTHX) @@ -1779,6 +1779,7 @@ Apd |void |sv_pos_b2u |SV* sv|I32* offsetp Amd |char* |sv_pvn_force |SV* sv|STRLEN* lp Apd |char* |sv_pvutf8n_force|SV* sv|STRLEN* lp Apd |char* |sv_pvbyten_force|SV* sv|STRLEN* lp +Apd |void |sv_recode_to_utf8 |SV* Apd |char* |sv_reftype |SV* sv|int ob Apd |void |sv_replace |SV* sv|SV* nsv Apd |void |sv_report_used diff --git a/global.sym b/global.sym index c2535a978a..cf8ec98adf 100644 --- a/global.sym +++ b/global.sym @@ -438,6 +438,7 @@ Perl_sv_pos_u2b Perl_sv_pos_b2u Perl_sv_pvutf8n_force Perl_sv_pvbyten_force +Perl_sv_recode_to_utf8 Perl_sv_reftype Perl_sv_replace Perl_sv_report_used diff --git a/lib/encoding.pm b/lib/encoding.pm index 7dacd7a5bd..1addeb469b 100644 --- a/lib/encoding.pm +++ b/lib/encoding.pm @@ -38,12 +38,14 @@ expected to be Latin-1 (or EBCDIC in EBCDIC platforms). With the encoding pragma you can change this default. The pragma is a per script, not a per block lexical. Only the last -'use encoding' seen matters. +C<use encoding> matters, and it affects B<the whole script>. =head1 FUTURE POSSIBILITIES -The C<\x..> and C<\0...> in literals and regular expressions are not -affected by this pragma. They probably should. Ditto C<\N{...}>. +The C<\x..> and C<\0...> in regular expressions are not +affected by this pragma. They probably should. + +Also C<\N{...}> might become affected. =head1 SEE ALSO diff --git a/lib/encoding.t b/lib/encoding.t index 40d97a2b94..2be0312303 100644 --- a/lib/encoding.t +++ b/lib/encoding.t @@ -1,24 +1,31 @@ -print "1..3\n"; +print "1..5\n"; use encoding "latin1"; # ignored (overwritten by the next line) use encoding "greek"; # iso 8859-7 (no "latin" alias, surprise...) -$a = "\xDF"; -$b = "\x{100}"; - -my $c = $a . $b; - # "greek" is "ISO 8859-7", and \xDF in ISO 8859-7 is # \x{3AF} in Unicode (GREEK SMALL LETTER IOTA WITH TONOS), # instead of \xDF in Unicode (LATIN SMALL LETTER SHARP S) -print "not " unless ord($c) == 0x3af; +$a = "\xDF"; +$b = "\x{100}"; + +print "not " unless ord($a) == 0x3af; print "ok 1\n"; -print "not " unless length($c) == 2; +print "not " unless ord($b) == 0x100; print "ok 2\n"; -print "not " unless ord(substr($c, 1, 1)) == 0x100; +my $c; + +$c = $a . $b; + +print "not " unless ord($c) == 0x3af; print "ok 3\n"; +print "not " unless length($c) == 2; +print "ok 4\n"; + +print "not " unless ord(substr($c, 1, 1)) == 0x100; +print "ok 5\n"; diff --git a/pod/perlapi.pod b/pod/perlapi.pod index 0435058d35..41d2373fa7 100644 --- a/pod/perlapi.pod +++ b/pod/perlapi.pod @@ -1404,17 +1404,6 @@ SV is B<not> incremented. =for hackers Found in file sv.c -=item newSV - -Create a new null SV, or if len > 0, create a new empty SVt_PV type SV -with an initial PV allocation of len+1. Normally accessed via the C<NEWSV> -macro. - - SV* newSV(STRLEN len) - -=for hackers -Found in file sv.c - =item NEWSV Creates a new SV. A non-zero C<len> parameter indicates the number of @@ -1428,6 +1417,17 @@ C<id> is an integer id between 0 and 1299 (used to identify leaks). =for hackers Found in file handy.h +=item newSV + +Create a new null SV, or if len > 0, create a new empty SVt_PV type SV +with an initial PV allocation of len+1. Normally accessed via the C<NEWSV> +macro. + + SV* newSV(STRLEN len) + +=for hackers +Found in file sv.c + =item newSViv Creates a new SV and copies an integer into it. The reference count for the @@ -2282,22 +2282,22 @@ version which guarantees to evaluate sv only once. =for hackers Found in file sv.h -=item SvIVX +=item SvIVx -Returns the raw value in the SV's IV slot, without checks or conversions. -Only use when you are sure SvIOK is true. See also C<SvIV()>. +Coerces the given SV to an integer and returns it. Guarantees to evaluate +sv only once. Use the more efficent C<SvIV> otherwise. - IV SvIVX(SV* sv) + IV SvIVx(SV* sv) =for hackers Found in file sv.h -=item SvIVx +=item SvIVX -Coerces the given SV to an integer and returns it. Guarantees to evaluate -sv only once. Use the more efficent C<SvIV> otherwise. +Returns the raw value in the SV's IV slot, without checks or conversions. +Only use when you are sure SvIOK is true. See also C<SvIV()>. - IV SvIVx(SV* sv) + IV SvIVX(SV* sv) =for hackers Found in file sv.h @@ -2606,21 +2606,21 @@ Like C<SvPV_nolen>, but converts sv to utf8 first if necessary. =for hackers Found in file sv.h -=item SvPVx +=item SvPVX -A version of C<SvPV> which guarantees to evaluate sv only once. +Returns a pointer to the physical string in the SV. The SV must contain a +string. - char* SvPVx(SV* sv, STRLEN len) + char* SvPVX(SV* sv) =for hackers Found in file sv.h -=item SvPVX +=item SvPVx -Returns a pointer to the physical string in the SV. The SV must contain a -string. +A version of C<SvPV> which guarantees to evaluate sv only once. - char* SvPVX(SV* sv) + char* SvPVx(SV* sv, STRLEN len) =for hackers Found in file sv.h @@ -2827,19 +2827,19 @@ false, defined or undefined. Does not handle 'get' magic. =for hackers Found in file sv.h -=item SvTYPE - -Returns the type of the SV. See C<svtype>. +=item svtype - svtype SvTYPE(SV* sv) +An enum of flags for Perl types. These are found in the file B<sv.h> +in the C<svtype> enum. Test these flags with the C<SvTYPE> macro. =for hackers Found in file sv.h -=item svtype +=item SvTYPE -An enum of flags for Perl types. These are found in the file B<sv.h> -in the C<svtype> enum. Test these flags with the C<SvTYPE> macro. +Returns the type of the SV. See C<svtype>. + + svtype SvTYPE(SV* sv) =for hackers Found in file sv.h @@ -2950,22 +2950,22 @@ for a version which guarantees to evaluate sv only once. =for hackers Found in file sv.h -=item SvUVX +=item SvUVx -Returns the raw value in the SV's UV slot, without checks or conversions. -Only use when you are sure SvIOK is true. See also C<SvUV()>. +Coerces the given SV to an unsigned integer and returns it. Guarantees to +evaluate sv only once. Use the more efficent C<SvUV> otherwise. - UV SvUVX(SV* sv) + UV SvUVx(SV* sv) =for hackers Found in file sv.h -=item SvUVx +=item SvUVX -Coerces the given SV to an unsigned integer and returns it. Guarantees to -evaluate sv only once. Use the more efficent C<SvUV> otherwise. +Returns the raw value in the SV's UV slot, without checks or conversions. +Only use when you are sure SvIOK is true. See also C<SvUV()>. - UV SvUVx(SV* sv) + UV SvUVX(SV* sv) =for hackers Found in file sv.h @@ -3662,6 +3662,20 @@ instead. =for hackers Found in file sv.c +=item sv_recode_to_utf8 + +If PL_encoding is set you can call this to recode the pv of the sv. +The PL_encoding is assumed to be an Encode object, on entry the pv is assumed +to be octets in that encoding, and the sv will be converted into Unicode +(and UTF-8). + +If PL_encoding is not an Encode object, things will go boom. + + void sv_recode_to_utf8(SV*) + +=for hackers +Found in file sv.c + =item sv_reftype Returns a string describing what the SV is a reference to. @@ -760,6 +760,7 @@ PERL_CALLCONV void Perl_sv_pos_b2u(pTHX_ SV* sv, I32* offsetp); /* PERL_CALLCONV char* sv_pvn_force(pTHX_ SV* sv, STRLEN* lp); */ PERL_CALLCONV char* Perl_sv_pvutf8n_force(pTHX_ SV* sv, STRLEN* lp); PERL_CALLCONV char* Perl_sv_pvbyten_force(pTHX_ SV* sv, STRLEN* lp); +PERL_CALLCONV void Perl_sv_recode_to_utf8(pTHX_ SV*); PERL_CALLCONV char* Perl_sv_reftype(pTHX_ SV* sv, int ob); PERL_CALLCONV void Perl_sv_replace(pTHX_ SV* sv, SV* nsv); PERL_CALLCONV void Perl_sv_report_used(pTHX); @@ -3302,32 +3302,9 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) sv_force_normal(sv); } - if (PL_encoding) { - SV *uni; - STRLEN len; - char *s; - dSP; - ENTER; - SAVETMPS; - PUSHMARK(sp); - EXTEND(SP, 3); - XPUSHs(PL_encoding); - XPUSHs(sv); - XPUSHs(&PL_sv_yes); - PUTBACK; - call_method("decode", G_SCALAR); - SPAGAIN; - uni = POPs; - PUTBACK; - s = SvPVutf8(uni, len); - if (s != SvPVX(sv)) { - SvGROW(sv, len); - Move(s, SvPVX(sv), len, char); - SvCUR_set(sv, len); - } - FREETMPS; - LEAVE; - } else { /* Assume Latin-1/EBCDIC */ + if (PL_encoding) + Perl_sv_recode_to_utf8(aTHX_ sv); + else { /* Assume Latin-1/EBCDIC */ /* This function could be much more efficient if we * had a FLAG in SVs to signal if there are any hibit * chars in the PV. Given that there isn't such a flag @@ -3350,9 +3327,9 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *sv, I32 flags) Safefree(s); /* No longer using what was there before. */ SvLEN(sv) = len; /* No longer know the real size. */ } + /* Mark as UTF-8 even if no hibit - saves scanning loop */ + SvUTF8_on(sv); } - /* Mark as UTF-8 even if no hibit - saves scanning loop */ - SvUTF8_on(sv); return SvCUR(sv); } @@ -10382,3 +10359,46 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, #endif /* USE_ITHREADS */ +/* +=for apidoc sv_recode_to_utf8 + +If PL_encoding is set you can call this to recode the pv of the sv. +The PL_encoding is assumed to be an Encode object, on entry the pv is assumed +to be octets in that encoding, and the sv will be converted into Unicode +(and UTF-8). + +If PL_encoding is not an Encode object, things will go boom. + +=cut +*/ + +void +Perl_sv_recode_to_utf8(pTHX_ SV *sv) +{ + SV *uni; + STRLEN len; + char *s; + dSP; + ENTER; + SAVETMPS; + PUSHMARK(sp); + EXTEND(SP, 3); + XPUSHs(PL_encoding); + XPUSHs(sv); + XPUSHs(&PL_sv_yes); + PUTBACK; + call_method("decode", G_SCALAR); + SPAGAIN; + uni = POPs; + PUTBACK; + s = SvPVutf8(uni, len); + if (s != SvPVX(sv)) { + SvGROW(sv, len); + Move(s, SvPVX(sv), len, char); + SvCUR_set(sv, len); + } + FREETMPS; + LEAVE; + SvUTF8_on(sv); +} + @@ -1653,6 +1653,10 @@ S_scan_const(pTHX_ char *start) Perl_croak(aTHX_ "panic: constant overflowed allocated space"); SvPOK_on(sv); + if (PL_encoding && !has_utf8) { + Perl_sv_recode_to_utf8(aTHX_ sv); + has_utf8 = TRUE; + } if (has_utf8) { SvUTF8_on(sv); if (PL_lex_inwhat == OP_TRANS && PL_sublex_info.sub_op) { @@ -7734,3 +7738,4 @@ utf16rev_textfilter(pTHX_ int idx, SV *sv, int maxlen) return count; } #endif + |