diff options
-rw-r--r-- | lib/byte.pm | 4 | ||||
-rw-r--r-- | lib/utf8.pm | 15 | ||||
-rw-r--r-- | pod/perlunicode.pod | 9 | ||||
-rw-r--r-- | pod/perlvar.pod | 38 | ||||
-rw-r--r-- | pp.c | 4 | ||||
-rw-r--r-- | sv.c | 2 |
6 files changed, 15 insertions, 57 deletions
diff --git a/lib/byte.pm b/lib/byte.pm index 569fa660e0..0424e1778d 100644 --- a/lib/byte.pm +++ b/lib/byte.pm @@ -38,9 +38,7 @@ the effect of C<use byte> within the current lexical scope. Perl normally assumes character semantics in the presence of character data (i.e. data that has come from a source that has -been marked as being of a particular character encoding) or when -the global $^U flag is enabled. [XXX: implement -C command line -switch and mention that instead of $^U?] +been marked as being of a particular character encoding). To understand the implications and differences between character semantics and byte semantics, see L<perlunicode>. diff --git a/lib/utf8.pm b/lib/utf8.pm index be7cc0bf0c..d9e9becdda 100644 --- a/lib/utf8.pm +++ b/lib/utf8.pm @@ -1,8 +1,5 @@ package utf8; -$^U = 1 if caller and caller eq 'main'; # they are unicode aware - # XXX split this out? - sub import { $^H |= 0x00800000; $enc{caller()} = $_[1] if $_[1]; @@ -60,15 +57,6 @@ and package names. =item * -As a side effect, when this pragma is used within the main package, -it also enables Unicode character semantics for the entire program. -See L<perlunicode> for more on that. - -[XXX: split this out into separate "pragma" and/or -C command-line -switch?] - -=item * - In the absence of inputs marked as UTF-8, regular expressions within the scope of this pragma will default to using character semantics instead of byte semantics. @@ -80,9 +68,6 @@ of byte semantics. @chars = split //, $data; # splits characters } -[XXX: Should this should be enabled like chr()/sprintf("%c") by looking -at $^U instead?] - =head1 SEE ALSO L<perlunicode>, L<byte> diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 5a73d4e959..bebf7aadec 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -31,12 +31,9 @@ or from literals and constants in the source text. Later, in L</Character encodings for input and output>, we'll see how such inputs may be marked as being Unicode character data sources. -One particular condition will enable character semantics on the entire -program, bypassing the compatibility mode: if the C<$^U> global flag is -set to C<1>, nearly all operations will use character semantics by -default. As an added convenience, if the C<utf8> pragma is used in the -C<main> package, C<$^U> is enabled automatically. [XXX: Should there -be a -C switch to enable $^U?] +If the C<$^U> global flag is set to C<1>, all system calls will use the +corresponding wide character APIs. This is currently only implemented +on Windows. [XXX: Should there be a -C switch to enable $^U?] Regardless of the above, the C<byte> pragma can always be used to force byte semantics in a particular lexical scope. See L<byte>. diff --git a/pod/perlvar.pod b/pod/perlvar.pod index dca9cc092f..79ec7f9ff0 100644 --- a/pod/perlvar.pod +++ b/pod/perlvar.pod @@ -875,37 +875,15 @@ and B<-C> filetests are based on this value. =item $^U -Global flag that switches on Unicode character support in the Perl -interpreter. The initial value is usually C<0> for compatibility -with Perl versions earlier than 5.6, but may be automatically set -to C<1> by Perl if the system provides a user-settable default -(e.g., C<$ENV{LC_CTYPE}>). It is also implicitly set to C<1> -whenever the utf8 pragma is loaded. +Global flag that enables system calls made by Perl to use wide character +APIs native to the system, if available. This is currently only implemented +on the Windows platform. -Setting it to C<1> has the following effects: +The initial value is typically C<0> for compatibility with Perl versions +earlier than 5.6, but may be automatically set to C<1> by Perl if the system +provides a user-settable default (e.g., C<$ENV{LC_CTYPE}>). -=over - -=item * - -C<chr> produces UTF-8 encoded Unicode characters. These are the same -as the corresponding ASCII characters if the argument is less than 128. - -=item * - -The C<%c> format in C<sprintf> generates a UTF-8 encoded Unicode -character. This is the same as the corresponding ASCII character -if the argument is less than 128. - -=item * - -Any system calls made by Perl will use wide character APIs native to -the system, if available. This is currently only implemented on the -Windows platform. - -=back - -The C<byte> pragma overrides the value of this flag in the current +The C<byte> pragma always overrides the effect of this flag in the current lexical scope. See L<byte>. =item $^V @@ -914,7 +892,7 @@ The revision, version, and subversion of the Perl interpreter, represented as a "version tuple". Version tuples have both a numeric value and a string value. The numeric value is a floating point number that amounts to revision + version/1000 + subversion/1000000, and the string value -is made of utf8 characters: +is made of characters possibly in the UTF-8 range: C<chr($revision) . chr($version) . chr($subversion)>. This can be used to determine whether the Perl interpreter executing a @@ -2199,10 +2199,9 @@ PP(pp_chr) char *tmps; U32 value = POPu; - SvUTF8_off(TARG); /* decontaminate */ (void)SvUPGRADE(TARG,SVt_PV); - if (value >= 128 && PL_bigchar && !IN_BYTE) { + if (value > 255 && !IN_BYTE) { SvGROW(TARG,8); tmps = SvPVX(TARG); tmps = (char*)uv_to_utf8((U8*)tmps, (UV)value); @@ -2219,6 +2218,7 @@ PP(pp_chr) tmps = SvPVX(TARG); *tmps++ = value; *tmps = '\0'; + SvUTF8_off(TARG); /* decontaminate */ (void)SvPOK_only(TARG); XPUSHs(TARG); RETURN; @@ -5830,7 +5830,7 @@ Perl_sv_vcatpvfn(pTHX_ SV *sv, const char *pat, STRLEN patlen, va_list *args, SV uv = va_arg(*args, int); else uv = (svix < svmax) ? SvIVx(svargs[svix++]) : 0; - if (uv >= 128 && PL_bigchar && !IN_BYTE) { + if ((uv > 255 || (uv > 127 && SvUTF8(sv))) && !IN_BYTE) { eptr = (char*)utf8buf; elen = uv_to_utf8((U8*)eptr, uv) - utf8buf; is_utf = TRUE; |