diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 1999-08-04 07:59:05 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 1999-08-04 07:59:05 +0000 |
commit | 423cee853811c26846bd1948939b85f9866dfb4a (patch) | |
tree | a1adb3cff6fd86474503248d2eba6747f0adeb02 | |
parent | fd1e013efb606b51dc27fba846b1bedb38910a76 (diff) | |
download | perl-423cee853811c26846bd1948939b85f9866dfb4a.tar.gz |
Introduce the charnames pragma.
Subject: [PATCH 5.005_58] Free \C (for named chars), move to \O
From: Ilya Zakharevich <[9]ilya@math.ohio-state.edu>
To: Chip Salzenberg <[11]chip@perlsupport.com>
Cc: Mailing list Perl5 <[12]perl5-porters@perl.org>
Date: Sat, 31 Jul 1999 05:44:05 -0400
Message-Id: <[13]199907311407.IAA25042@localhost.frii.com>
From: Ilya Zakharevich <ilya@math.ohio-state.edu>
To: Mailing list Perl5 <perl5-porters@perl.org>
Subject: [PATCH 5.005_58] Named characters in Perl
Date: Mon, 2 Aug 1999 19:25:40 -0400
Message-ID: <19990802192540.B24407@monk.mps.ohio-state.edu>
p4raw-id: //depot/cfgperl@3916
-rw-r--r-- | MAINTAIN | 2 | ||||
-rw-r--r-- | MANIFEST | 2 | ||||
-rw-r--r-- | lib/charnames.pm | 134 | ||||
-rw-r--r-- | lib/utf8.pm | 5 | ||||
-rw-r--r-- | pod/perldiag.pod | 15 | ||||
-rw-r--r-- | pod/perlop.pod | 4 | ||||
-rw-r--r-- | pod/perlre.pod | 6 | ||||
-rw-r--r-- | regcomp.c | 2 | ||||
-rw-r--r-- | t/lib/charnames.t | 53 | ||||
-rw-r--r-- | toke.c | 156 |
10 files changed, 325 insertions, 54 deletions
@@ -477,6 +477,7 @@ lib/bigint.pl lib/bigrat.pl lib/blib.pm lib/cacheout.pl +lib/charnames.pm ilya lib/chat2.pl lib/complete.pl lib/constant.pm @@ -683,6 +684,7 @@ t/lib/cgi-form.t t/lib/cgi-function.t t/lib/cgi-html.t t/lib/cgi-request.t +t/lib/charnames.t ilya t/lib/checktree.t t/lib/complex.t complex t/lib/db-btree.t pmarquess @@ -648,6 +648,7 @@ lib/bigrat.pl An arbitrary precision rational arithmetic package lib/blib.pm For "use blib" lib/cacheout.pl Manages output filehandles when you need too many lib/caller.pm Inherit pragmatic attributes from caller's context +lib/charnames.pm Character names lib/chat2.pl Obsolete ipc library (use Comm.pm etc instead) lib/complete.pl A command completion subroutine lib/constant.pm For "use constant" @@ -1112,6 +1113,7 @@ t/lib/cgi-form.t See if CGI.pm works t/lib/cgi-function.t See if CGI.pm works t/lib/cgi-html.t See if CGI.pm works t/lib/cgi-request.t See if CGI.pm works +t/lib/charnames.t See if character names work t/lib/checktree.t See if File::CheckTree works t/lib/complex.t See if Math::Complex works t/lib/db-btree.t See if DB_File works diff --git a/lib/charnames.pm b/lib/charnames.pm new file mode 100644 index 0000000000..e407ff7c8a --- /dev/null +++ b/lib/charnames.pm @@ -0,0 +1,134 @@ +package charnames; + +my $fname = 'unicode/UnicodeData-Latest.txt'; +my $txt; + +# This is not optimized in any way yet +sub charnames { + $name = shift; + $txt = do "unicode/Name.pl" unless $txt; + my @off; + if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) { + @off = ($-[0], $+[0]); + } + unless (@off) { + if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) { + my ($script, $cname) = ($1,$2); + my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); + if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) { + @off = ($-[0], $+[0]); + } + } + } + unless (@off) { + my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL"); + for ( @{$^H{charnames_scripts}} ) { + (@off = ($-[0], $+[0])), last + if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m; + } + } + die "Unknown charname '$name'" unless @off; + + # use caller 'encoding'; # Does not work at compile time? + + my $ord = hex substr $txt, $off[0] - 4, 4; + if ($^H & 0x8) { + use utf8; + return chr $ord; + } + return chr $ord if $ord <= 255; + my $hex = sprintf '%X=0%o', $ord, $ord; + my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2; + die "Character 0x$hex with name '$fname' is above 0xFF"; +} + +sub import { + shift; + die "No scripts for `use charnames'" unless @_; + $^H |= 0x20000; + $^H{charnames} = \&charnames ; + my %h; + @h{@_} = (1) x @_; + $^H{charnames_full} = delete $h{':full'}; + $^H{charnames_short} = delete $h{':short'}; + $^H{charnames_scripts} = [map uc, keys %h]; +} + + +1; +__END__ + +=head1 NAME + +charnames - define character names for C<\C{named}> string literal escape. + +=head1 SYNOPSIS + + use charnames ':full'; + print "\C{GREEK SMALL LETTER SIGMA} is called sigma.\n"; + + use charnames ':short'; + print "\C{greek:Sigma} is an upper-case sigma.\n"; + + use charnames qw(cyrillic greek); + print "\C{sigma} is Greek sigma, and \C{be} is Cyrillic b.\n"; + +=head1 DESCRIPTION + +Pragma C<use charnames> supports arguments C<:full>, C<:short> and +script names. If C<:full> is present, for expansion of +C<\C{CHARNAME}}> string C<CHARNAME> is first looked in the list of +standard Unicode names of chars. If C<:short> is present, and +C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up +as a letter in script C<SCRIPT>. If pragma C<use charnames> is used +with script name arguments, then for C<\C{CHARNAME}}> the name +C<CHARNAME> is looked up as a letter in the given scripts (in the +specified order). + +For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME> +F<charcodes.pm> looks for the names + + SCRIPTNAME CAPITAL LETTER CHARNAME + SCRIPTNAME SMALL LETTER CHARNAME + SCRIPTNAME LETTER CHARNAME + +in the table of standard Unicode names. If C<CHARNAME> is lowercase, +then the C<CAPITAL> variant is ignored, otherwise C<SMALL> variant is +ignored. + +=head1 CUSTOM TRANSLATORS + +The mechanism of translation is C<\C{...}> escapes is general and not +hardwired into F<charnames.pm>. A module can install custom +translations (inside the scope which C<use>s the module) by the +following magic incantation: + + sub import { + shift; + $^H |= 0x20000; + $^H{charnames} = \&translator; + } + +Here translator() is a subroutine which takes C<CHARNAME> as an +argument, and returns text to insert into the string instead of the +C<\C{CHARNAME}> escape. Since the text to insert should be different +in C<utf8> mode and out of it, the function should check the current +state of C<utf8>-flag as in + + sub translator { + if ($^H & 0x8) { + return utf_translator(@_); + } else { + return no_utf_translator(@_); + } + } + +=head1 BUGS + +Since evaluation of the translation function happens in a middle of +compilation (of a string literal), the translation function should not +do any C<eval>s or C<require>s. This restriction should be lifted in +a future version of Perl. + +=cut + diff --git a/lib/utf8.pm b/lib/utf8.pm index beb4568e2a..269a1c2177 100644 --- a/lib/utf8.pm +++ b/lib/utf8.pm @@ -71,9 +71,8 @@ attempt to canonicalize variable names for you.) =item * Regular expressions match characters instead of bytes. For instance, -"." matches a character instead of a byte. (However, the C<\C> pattern -is provided to force a match a single byte ("C<char>" in C, hence -C<\C>).) +"." matches a character instead of a byte. (However, the C<\O> pattern +is provided to force a match a single byte ("octet", hence C<\O>).) =item * diff --git a/pod/perldiag.pod b/pod/perldiag.pod index bffd191174..2542838d83 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -1130,6 +1130,16 @@ workarounds. inlining. See L<perlsub/"Constant Functions"> for commentary and workarounds. +=item constant(%s): %%^H is not localized + +(F) When setting compile-time-lexicalized hash %^H one should set the +corresponding bit of $^H as well. + +=item constant(%s): %s + +(F) Compile-time-substitutions (such as overloaded constants and +character names) were not correctly set up. + =item Copy method did not return a reference (F) The method which overloads "=" is buggy. See L<overload/Copy Constructor>. @@ -1662,6 +1672,11 @@ ended earlier on the current line. mentioned with the $ in Perl, unlike in the shells, where it can vary from one line to the next. +=item Missing %sbrace%s on \C{} + +(F) Wrong syntax of character name literal C<\C{charname}> within +double-quotish context. + =item Missing comma after first argument to %s function (F) While certain functions allow you to specify a filehandle or an diff --git a/pod/perlop.pod b/pod/perlop.pod index 3234131f90..bd4ca1df7d 100644 --- a/pod/perlop.pod +++ b/pod/perlop.pod @@ -673,6 +673,7 @@ a transliteration, the first eleven of these sequences may be used. \x1b hex char (ESC) \x{263a} wide hex char (SMILEY) \c[ control char (ESC) + \C{name} named char \l lowercase next char \u uppercase next char @@ -682,7 +683,8 @@ a transliteration, the first eleven of these sequences may be used. \Q quote non-word characters till \E If C<use locale> is in effect, the case map used by C<\l>, C<\L>, C<\u> -and C<\U> is taken from the current locale. See L<perllocale>. +and C<\U> is taken from the current locale. See L<perllocale>. For +documentation of C<\C{name}>, see L<charnames>. All systems use the virtual C<"\n"> to represent a line terminator, called a "newline". There is no such thing as an unvarying, physical diff --git a/pod/perlre.pod b/pod/perlre.pod index 6c05efc66f..85b2a949c2 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -149,6 +149,7 @@ also work: \x1B hex char \x{263a} wide hex char (Unicode SMILEY) \c[ control char + \C{name} named char \l lowercase next char (think vi) \u uppercase next char (think vi) \L lowercase till \E (think vi) @@ -157,7 +158,8 @@ also work: \Q quote (disable) pattern metacharacters till \E If C<use locale> is in effect, the case map used by C<\l>, C<\L>, C<\u> -and C<\U> is taken from the current locale. See L<perllocale>. +and C<\U> is taken from the current locale. See L<perllocale>. For +documentation of C<\C{name}>, see L<charnames>. You cannot include a literal C<$> or C<@> within a C<\Q> sequence. An unescaped C<$> or C<@> interpolates the corresponding variable, @@ -176,7 +178,7 @@ In addition, Perl defines the following: \PP Match non-P \X Match eXtended Unicode "combining character sequence", equivalent to C<(?:\PM\pM*)> - \C Match a single C char (octet) even under utf8. + \O Match a single C char (octet) even under utf8. A C<\w> matches a single alphanumeric character, not a whole word. Use C<\w+> to match a string of Perl-identifier characters (which isn't @@ -1779,7 +1779,7 @@ tryagain: PL_seen_zerolen++; /* Do not optimize RE away */ nextchar(); break; - case 'C': + case 'O': ret = reg_node(SANY); *flagp |= HASWIDTH|SIMPLE; nextchar(); diff --git a/t/lib/charnames.t b/t/lib/charnames.t new file mode 100644 index 0000000000..860cc03c75 --- /dev/null +++ b/t/lib/charnames.t @@ -0,0 +1,53 @@ +#!./perl + +BEGIN { + unless(grep /blib/, @INC) { + chdir 't' if -d 't'; + unshift @INC, '../lib' if -d '../lib'; + } +} + +$| = 1; +print "1..5\n"; + +use charnames ':full'; + +print "not " unless "Here\C{EXCLAMATION MARK}?" eq 'Here!?'; +print "ok 1\n"; + +print "# \$res=$res \$\@='$@'\nnot " + if $res = eval <<'EOE' +use charnames ":full"; +"Here: \C{CYRILLIC SMALL LETTER BE}!"; +1 +EOE + or $@ !~ /above 0xFF/; +print "ok 2\n"; +# print "# \$res=$res \$\@='$@'\n"; + +print "# \$res=$res \$\@='$@'\nnot " + if $res = eval <<'EOE' +use charnames 'cyrillic'; +"Here: \C{Be}!"; +1 +EOE + or $@ !~ /CYRILLIC CAPITAL LETTER BE.*above 0xFF/; +print "ok 3\n"; + +# If octal representation of unicode char is \0xyzt, then the utf8 is \3xy\2zt +$encoded_be = "\320\261"; +$encoded_alpha = "\316\261"; +$encoded_bet = "\327\221"; +{ + use charnames ':full'; + use utf8; + + print "not " unless "\C{CYRILLIC SMALL LETTER BE}" eq $encoded_be; + print "ok 4\n"; + + use charnames qw(cyrillic greek :short); + + print "not " unless "\C{be},\C{alpha},\C{hebrew:bet}" + eq "$encoded_be,$encoded_alpha,$encoded_bet"; + print "ok 5\n"; +} @@ -1132,7 +1132,7 @@ S_scan_const(pTHX_ char *start) : UTF; char *leaveit = /* set of acceptably-backslashed characters */ PL_lex_inpat - ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" + ? "\\.^$@AGZdDwWsSbBpPXO+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#" : ""; while (s < send || dorange) { @@ -1353,6 +1353,43 @@ S_scan_const(pTHX_ char *start) } continue; + /* \C{latin small letter a} is a named character */ + case 'C': + ++s; + if (*s == '{') { + char* e = strchr(s, '}'); + HV *hv; + SV **svp; + SV *res, *cv; + STRLEN len; + char *str; + char *why = Nullch; + + if (!e) { + yyerror("Missing right brace on \\C{}"); + e = s - 1; + goto cont_scan; + } + res = newSVpvn(s + 1, e - s - 1); + res = new_constant( Nullch, 0, "charnames", + res, Nullsv, "\\C{...}" ); + str = SvPV(res,len); + if (len > e - s + 4) { + char *odest = SvPVX(sv); + + SvGROW(sv, (SvCUR(sv) + len - (e - s + 4))); + d = SvPVX(sv) + (d - odest); + } + Copy(str, d, len, char); + d += len; + SvREFCNT_dec(res); + cont_scan: + s = e + 1; + } + else + yyerror("Missing braces on \\C{}"); + continue; + /* \c is a control character */ case 'c': s++; @@ -5251,76 +5288,101 @@ S_checkcomma(pTHX_ register char *s, char *name, char *what) } } +/* Either returns sv, or mortalizes sv and returns a new SV*. + Best used as sv=new_constant(..., sv, ...). + If s, pv are NULL, calls subroutine with one argument, + and type is used with error messages only. */ + STATIC SV * S_new_constant(pTHX_ char *s, STRLEN len, char *key, SV *sv, SV *pv, char *type) { dSP; HV *table = GvHV(PL_hintgv); /* ^H */ - BINOP myop; SV *res; - bool oldcatch = CATCH_GET; SV **cvp; SV *cv, *typesv; - + char *why, *why1, *why2; + + if (!(PL_hints & HINT_LOCALIZE_HH)) { + SV *msg; + + why = "%^H is not localized"; + report_short: + why1 = why2 = ""; + report: + msg = Perl_newSVpvf(aTHX_ "constant(%s): %s%s%s", + (type ? type: "undef"), why1, why2, why); + yyerror(SvPVX(msg)); + SvREFCNT_dec(msg); + return sv; + } if (!table) { - yyerror("%^H is not defined"); - return sv; + why = "%^H is not defined"; + goto report_short; } cvp = hv_fetch(table, key, strlen(key), FALSE); if (!cvp || !SvOK(*cvp)) { - char buf[128]; - sprintf(buf,"$^H{%s} is not defined", key); - yyerror(buf); - return sv; + why = "} is not defined"; + why1 = "$^H{"; + why2 = key; + goto report; } sv_2mortal(sv); /* Parent created it permanently */ cv = *cvp; - if (!pv) - pv = sv_2mortal(newSVpvn(s, len)); - if (type) - typesv = sv_2mortal(newSVpv(type, 0)); + if (!pv && s) + pv = sv_2mortal(newSVpvn(s, len)); + if (type && pv) + typesv = sv_2mortal(newSVpv(type, 0)); else - typesv = &PL_sv_undef; - CATCH_SET(TRUE); - Zero(&myop, 1, BINOP); - myop.op_last = (OP *) &myop; - myop.op_next = Nullop; - myop.op_flags = OPf_WANT_SCALAR | OPf_STACKED; - + typesv = &PL_sv_undef; + PUSHSTACKi(PERLSI_OVERLOAD); - ENTER; - SAVEOP(); - PL_op = (OP *) &myop; - if (PERLDB_SUB && PL_curstash != PL_debstash) - PL_op->op_private |= OPpENTERSUB_DB; - PUTBACK; - Perl_pp_pushmark(aTHX); - + ENTER ; + SAVETMPS; + + PUSHMARK(SP) ; EXTEND(sp, 4); - PUSHs(pv); + if (pv) + PUSHs(pv); PUSHs(sv); - PUSHs(typesv); + if (pv) + PUSHs(typesv); PUSHs(cv); PUTBACK; - - if (PL_op = Perl_pp_entersub(aTHX)) - CALLRUNOPS(aTHX); - LEAVE; - SPAGAIN; - - res = POPs; - PUTBACK; - CATCH_SET(oldcatch); + call_sv(cv, G_SCALAR | ( PL_in_eval ? 0 : G_EVAL)); + + SPAGAIN ; + + /* Check the eval first */ + if (!PL_in_eval && SvTRUE(ERRSV)) + { + STRLEN n_a; + sv_catpv(ERRSV, "Propagated"); + yyerror(SvPV(ERRSV, n_a)); /* Duplicates the message inside eval */ + POPs ; + res = SvREFCNT_inc(sv); + } + else { + res = POPs; + SvREFCNT_inc(res); + } + + PUTBACK ; + FREETMPS ; + LEAVE ; POPSTACK; - + if (!SvOK(res)) { - char buf[128]; - sprintf(buf,"Call to &{$^H{%s}} did not return a defined value", key); - yyerror(buf); - } - return SvREFCNT_inc(res); + why = "}} did not return a defined value"; + why1 = "Call to &{$^H{"; + why2 = key; + sv = res; + goto report; + } + + return res; } - + STATIC char * S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp) { |