diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2002-04-16 03:59:00 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2002-04-16 03:59:00 +0000 |
commit | 82686b017bb20f55e16f84c47f7ac0bf8d0c714b (patch) | |
tree | e7ad28a90ea768b323c2fb37103841ceb7b8dd93 | |
parent | 58858581d2d18dc2bff021fb2c755408c36929c4 (diff) | |
download | perl-82686b017bb20f55e16f84c47f7ac0bf8d0c714b.tar.gz |
my $utf8here, our $utf8here, and package variable $utf8here.
The actual minimal fix is in utf8.c and from NI-S,
the rest are the tests (in fresh_perl since I couldn't get
them easily to work elsewhere) and a slight behaviour change:
previously UTF-8 identifiers had to start with an alphabetic
character. No more so, now they can start with an (Unicode)
ID_Continue character (which however is not a (Unicode) digit).
(Limiting the first character to ID_Start would be rather
restrictive, since ID_Start allows only alphabetic letters.)
TODO: use vars qw($utf8here). This I don't find to be
a showstopper.
p4raw-id: //depot/perl@15943
-rw-r--r-- | embed.fnc | 1 | ||||
-rw-r--r-- | embed.h | 2 | ||||
-rw-r--r-- | embedvar.h | 6 | ||||
-rw-r--r-- | handy.h | 5 | ||||
-rw-r--r-- | intrpvar.h | 3 | ||||
-rw-r--r-- | lib/vars.pm | 11 | ||||
-rw-r--r-- | perl.c | 4 | ||||
-rw-r--r-- | perlapi.h | 4 | ||||
-rw-r--r-- | proto.h | 1 | ||||
-rw-r--r-- | sv.c | 2 | ||||
-rw-r--r-- | t/run/fresh_perl.t | 23 | ||||
-rw-r--r-- | utf8.c | 35 |
12 files changed, 84 insertions, 13 deletions
@@ -342,6 +342,7 @@ Apd |bool |is_utf8_string |U8 *s|STRLEN len Ap |bool |is_utf8_alnum |U8 *p Ap |bool |is_utf8_alnumc |U8 *p Ap |bool |is_utf8_idfirst|U8 *p +Ap |bool |is_utf8_idcont |U8 *p Ap |bool |is_utf8_alpha |U8 *p Ap |bool |is_utf8_ascii |U8 *p Ap |bool |is_utf8_space |U8 *p @@ -325,6 +325,7 @@ #define is_utf8_alnum Perl_is_utf8_alnum #define is_utf8_alnumc Perl_is_utf8_alnumc #define is_utf8_idfirst Perl_is_utf8_idfirst +#define is_utf8_idcont Perl_is_utf8_idcont #define is_utf8_alpha Perl_is_utf8_alpha #define is_utf8_ascii Perl_is_utf8_ascii #define is_utf8_space Perl_is_utf8_space @@ -1892,6 +1893,7 @@ #define is_utf8_alnum(a) Perl_is_utf8_alnum(aTHX_ a) #define is_utf8_alnumc(a) Perl_is_utf8_alnumc(aTHX_ a) #define is_utf8_idfirst(a) Perl_is_utf8_idfirst(aTHX_ a) +#define is_utf8_idcont(a) Perl_is_utf8_idcont(aTHX_ a) #define is_utf8_alpha(a) Perl_is_utf8_alpha(aTHX_ a) #define is_utf8_ascii(a) Perl_is_utf8_ascii(aTHX_ a) #define is_utf8_space(a) Perl_is_utf8_space(aTHX_ a) diff --git a/embedvar.h b/embedvar.h index 379917832a..3c12a6af65 100644 --- a/embedvar.h +++ b/embedvar.h @@ -427,6 +427,8 @@ #define PL_utf8_cntrl (PERL_GET_INTERP->Iutf8_cntrl) #define PL_utf8_digit (PERL_GET_INTERP->Iutf8_digit) #define PL_utf8_graph (PERL_GET_INTERP->Iutf8_graph) +#define PL_utf8_idcont (PERL_GET_INTERP->Iutf8_idcont) +#define PL_utf8_idstart (PERL_GET_INTERP->Iutf8_idstart) #define PL_utf8_lower (PERL_GET_INTERP->Iutf8_lower) #define PL_utf8_mark (PERL_GET_INTERP->Iutf8_mark) #define PL_utf8_print (PERL_GET_INTERP->Iutf8_print) @@ -726,6 +728,8 @@ #define PL_utf8_cntrl (vTHX->Iutf8_cntrl) #define PL_utf8_digit (vTHX->Iutf8_digit) #define PL_utf8_graph (vTHX->Iutf8_graph) +#define PL_utf8_idcont (vTHX->Iutf8_idcont) +#define PL_utf8_idstart (vTHX->Iutf8_idstart) #define PL_utf8_lower (vTHX->Iutf8_lower) #define PL_utf8_mark (vTHX->Iutf8_mark) #define PL_utf8_print (vTHX->Iutf8_print) @@ -1028,6 +1032,8 @@ #define PL_Iutf8_cntrl PL_utf8_cntrl #define PL_Iutf8_digit PL_utf8_digit #define PL_Iutf8_graph PL_utf8_graph +#define PL_Iutf8_idcont PL_utf8_idcont +#define PL_Iutf8_idstart PL_utf8_idstart #define PL_Iutf8_lower PL_utf8_lower #define PL_Iutf8_mark PL_utf8_mark #define PL_Iutf8_print PL_utf8_print @@ -460,7 +460,10 @@ Converts the specified character to lowercase. #define isBLANK_LC_uni(c) isBLANK(c) /* could be wrong */ #define isALNUM_utf8(p) is_utf8_alnum(p) -#define isIDFIRST_utf8(p) is_utf8_idfirst(p) +/* The ID_Start of Unicode is quite limiting: it assumes a L-class + * character (meaning that you cannot have, say, a CJK character). + * Instead, let's allow ID_Continue but not digits. */ +#define isIDFIRST_utf8(p) (is_utf8_idcont(p) && !is_utf8_digit(p)) #define isALPHA_utf8(p) is_utf8_alpha(p) #define isSPACE_utf8(p) is_utf8_space(p) #define isDIGIT_utf8(p) is_utf8_digit(p) diff --git a/intrpvar.h b/intrpvar.h index 94125c5719..e940163651 100644 --- a/intrpvar.h +++ b/intrpvar.h @@ -517,6 +517,9 @@ PERLVAR(IOpSlab,I32 *) PERLVAR(Iwantutf8, bool) /* want utf8 as the default discipline */ +PERLVAR(Iutf8_idstart, SV *) +PERLVAR(Iutf8_idcont, SV *) + /* New variables must be added to the very end for binary compatibility. * XSUB.h provides wrapper functions via perlapi.h that make this * irrelevant, but not all code may be expected to #include XSUB.h. */ diff --git a/lib/vars.pm b/lib/vars.pm index c3a02234e9..233979d264 100644 --- a/lib/vars.pm +++ b/lib/vars.pm @@ -12,6 +12,10 @@ sub import { my ($pack, @imports) = @_; my ($sym, $ch); foreach (@imports) { + # TODO: UTF-8 names: (the unpack is quite wrong, + # /^(.)(.*)/ would probably be better.) While you + # are at it, until declaring empty package is made + # to work the * is too lenient. ($ch, $sym) = unpack('a1a*', $_); if ($sym =~ tr/A-Za-z_0-9//c) { # time for a more-detailed check-up @@ -20,10 +24,9 @@ sub import { Carp::croak("Can't declare individual elements of hash or array"); } elsif (warnings::enabled() and length($sym) == 1 and $sym !~ tr/a-zA-Z//) { warnings::warn("No need to declare built-in vars"); - } elsif (($^H &= strict::bits('vars')) && - # Either no 'use utf8' or if utf8, no non-word - ($^H & 0x00800000 == 0 || # matches $utf8::hint_bits - $sym =~ /\W/) ) { + } elsif (($^H &= strict::bits('vars'))) { + # TODO: UTF-8 names: be careful to load the UTF-8 + # machinery only if the symbol requires it. require Carp; Carp::croak("'$_' is not a valid variable name under strict vars"); } @@ -686,6 +686,8 @@ perl_destruct(pTHXx) SvREFCNT_dec(PL_utf8_totitle); SvREFCNT_dec(PL_utf8_tolower); SvREFCNT_dec(PL_utf8_tofold); + SvREFCNT_dec(PL_utf8_idstart); + SvREFCNT_dec(PL_utf8_idcont); PL_utf8_alnum = Nullsv; PL_utf8_alnumc = Nullsv; PL_utf8_ascii = Nullsv; @@ -704,6 +706,8 @@ perl_destruct(pTHXx) PL_utf8_totitle = Nullsv; PL_utf8_tolower = Nullsv; PL_utf8_tofold = Nullsv; + PL_utf8_idstart = Nullsv; + PL_utf8_idcont = Nullsv; if (!specialWARN(PL_compiling.cop_warnings)) SvREFCNT_dec(PL_compiling.cop_warnings); @@ -582,6 +582,10 @@ END_EXTERN_C #define PL_utf8_digit (*Perl_Iutf8_digit_ptr(aTHX)) #undef PL_utf8_graph #define PL_utf8_graph (*Perl_Iutf8_graph_ptr(aTHX)) +#undef PL_utf8_idcont +#define PL_utf8_idcont (*Perl_Iutf8_idcont_ptr(aTHX)) +#undef PL_utf8_idstart +#define PL_utf8_idstart (*Perl_Iutf8_idstart_ptr(aTHX)) #undef PL_utf8_lower #define PL_utf8_lower (*Perl_Iutf8_lower_ptr(aTHX)) #undef PL_utf8_mark @@ -377,6 +377,7 @@ PERL_CALLCONV bool Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len); PERL_CALLCONV bool Perl_is_utf8_alnum(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_alnumc(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_idfirst(pTHX_ U8 *p); +PERL_CALLCONV bool Perl_is_utf8_idcont(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_alpha(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_ascii(pTHX_ U8 *p); PERL_CALLCONV bool Perl_is_utf8_space(pTHX_ U8 *p); @@ -10312,6 +10312,8 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags, PL_utf8_totitle = sv_dup_inc(proto_perl->Iutf8_totitle, param); PL_utf8_tolower = sv_dup_inc(proto_perl->Iutf8_tolower, param); PL_utf8_tofold = sv_dup_inc(proto_perl->Iutf8_tofold, param); + PL_utf8_idstart = sv_dup_inc(proto_perl->Iutf8_idstart, param); + PL_utf8_idcont = sv_dup_inc(proto_perl->Iutf8_idcont, param); /* swatch cache */ PL_last_swash_hv = Nullhv; /* reinits on demand */ diff --git a/t/run/fresh_perl.t b/t/run/fresh_perl.t index 79aae7a04a..8a334a58b1 100644 --- a/t/run/fresh_perl.t +++ b/t/run/fresh_perl.t @@ -788,3 +788,26 @@ package main; $test = Foo->new(); # must be package var EXPECT ok +######## example from Camel 5, ch. 15, pp.406 (with my) +use strict; +use utf8; +my $人 = 2; # 0xe4 0xba 0xba: U+4eba, "human" in CJK ideograph +$人++; # a child is born +print $人, "\n"; +EXPECT +3 +######## example from Camel 5, ch. 15, pp.406 (with our) +use strict; +use utf8; +our $人 = 2; # 0xe4 0xba 0xba: U+4eba, "human" in CJK ideograph +$人++; # a child is born +print $人, "\n"; +EXPECT +3 +######## example from Camel 5, ch. 15, pp.406 (with package vars) +use utf8; +$人 = 2; # 0xe4 0xba 0xba: U+4eba, "human" in CJK ideograph +$人++; # a child is born +print $人, "\n"; +EXPECT +3 @@ -170,12 +170,11 @@ Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv) =for apidoc A|STRLEN|is_utf8_char|U8 *s Tests if some arbitrary number of bytes begins in a valid UTF-8 -character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character. -The actual number of bytes in the UTF-8 character will be returned if -it is valid, otherwise 0. +character. Note that an INVARIANT (i.e. ASCII) character is a valid +UTF-8 character. The actual number of bytes in the UTF-8 character +will be returned if it is valid, otherwise 0. -=cut -*/ +=cut */ STRLEN Perl_is_utf8_char(pTHX_ U8 *s) { @@ -1156,9 +1155,27 @@ Perl_is_utf8_alnumc(pTHX_ U8 *p) } bool -Perl_is_utf8_idfirst(pTHX_ U8 *p) +Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */ { - return *p == '_' || is_utf8_alpha(p); + if (*p == '_') + return TRUE; + if (!is_utf8_char(p)) + return FALSE; + if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */ + PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0); + return swash_fetch(PL_utf8_idstart, p, TRUE); +} + +bool +Perl_is_utf8_idcont(pTHX_ U8 *p) +{ + if (*p == '_') + return TRUE; + if (!is_utf8_char(p)) + return FALSE; + if (!PL_utf8_idcont) + PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0); + return swash_fetch(PL_utf8_idcont, p, TRUE); } bool @@ -1514,9 +1531,11 @@ Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none) SAVEI32(PL_hints); PL_hints = 0; save_re_context(); - if (PL_curcop == &PL_compiling) + if (PL_curcop == &PL_compiling) { /* XXX ought to be handled by lex_start */ + SAVEI32(PL_in_my); sv_setpv(tokenbufsv, PL_tokenbuf); + } errsv_save = newSVsv(ERRSV); if (call_method("SWASHNEW", G_SCALAR)) retval = newSVsv(*PL_stack_sp--); |