summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--MAINTAIN2
-rw-r--r--MANIFEST2
-rw-r--r--lib/charnames.pm134
-rw-r--r--lib/utf8.pm5
-rw-r--r--pod/perldiag.pod15
-rw-r--r--pod/perlop.pod4
-rw-r--r--pod/perlre.pod6
-rw-r--r--regcomp.c2
-rw-r--r--t/lib/charnames.t53
-rw-r--r--toke.c156
10 files changed, 325 insertions, 54 deletions
diff --git a/MAINTAIN b/MAINTAIN
index 12f987d5bc..4507ca9e6e 100644
--- a/MAINTAIN
+++ b/MAINTAIN
@@ -477,6 +477,7 @@ lib/bigint.pl
lib/bigrat.pl
lib/blib.pm
lib/cacheout.pl
+lib/charnames.pm ilya
lib/chat2.pl
lib/complete.pl
lib/constant.pm
@@ -683,6 +684,7 @@ t/lib/cgi-form.t
t/lib/cgi-function.t
t/lib/cgi-html.t
t/lib/cgi-request.t
+t/lib/charnames.t ilya
t/lib/checktree.t
t/lib/complex.t complex
t/lib/db-btree.t pmarquess
diff --git a/MANIFEST b/MANIFEST
index 0db9a3f0e9..0c0c0771f8 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -648,6 +648,7 @@ lib/bigrat.pl An arbitrary precision rational arithmetic package
lib/blib.pm For "use blib"
lib/cacheout.pl Manages output filehandles when you need too many
lib/caller.pm Inherit pragmatic attributes from caller's context
+lib/charnames.pm Character names
lib/chat2.pl Obsolete ipc library (use Comm.pm etc instead)
lib/complete.pl A command completion subroutine
lib/constant.pm For "use constant"
@@ -1112,6 +1113,7 @@ t/lib/cgi-form.t See if CGI.pm works
t/lib/cgi-function.t See if CGI.pm works
t/lib/cgi-html.t See if CGI.pm works
t/lib/cgi-request.t See if CGI.pm works
+t/lib/charnames.t See if character names work
t/lib/checktree.t See if File::CheckTree works
t/lib/complex.t See if Math::Complex works
t/lib/db-btree.t See if DB_File works
diff --git a/lib/charnames.pm b/lib/charnames.pm
new file mode 100644
index 0000000000..e407ff7c8a
--- /dev/null
+++ b/lib/charnames.pm
@@ -0,0 +1,134 @@
+package charnames;
+
+my $fname = 'unicode/UnicodeData-Latest.txt';
+my $txt;
+
+# This is not optimized in any way yet
+sub charnames {
+ $name = shift;
+ $txt = do "unicode/Name.pl" unless $txt;
+ my @off;
+ if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
+ @off = ($-[0], $+[0]);
+ }
+ unless (@off) {
+ if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
+ my ($script, $cname) = ($1,$2);
+ my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
+ if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
+ @off = ($-[0], $+[0]);
+ }
+ }
+ }
+ unless (@off) {
+ my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
+ for ( @{$^H{charnames_scripts}} ) {
+ (@off = ($-[0], $+[0])), last
+ if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m;
+ }
+ }
+ die "Unknown charname '$name'" unless @off;
+
+ # use caller 'encoding'; # Does not work at compile time?
+
+ my $ord = hex substr $txt, $off[0] - 4, 4;
+ if ($^H & 0x8) {
+ use utf8;
+ return chr $ord;
+ }
+ return chr $ord if $ord <= 255;
+ my $hex = sprintf '%X=0%o', $ord, $ord;
+ my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
+ die "Character 0x$hex with name '$fname' is above 0xFF";
+}
+
+sub import {
+ shift;
+ die "No scripts for `use charnames'" unless @_;
+ $^H |= 0x20000;
+ $^H{charnames} = \&charnames ;
+ my %h;
+ @h{@_} = (1) x @_;
+ $^H{charnames_full} = delete $h{':full'};
+ $^H{charnames_short} = delete $h{':short'};
+ $^H{charnames_scripts} = [map uc, keys %h];
+}
+
+
+1;
+__END__
+
+=head1 NAME
+
+charnames - define character names for C<\C{named}> string literal escape.
+
+=head1 SYNOPSIS
+
+ use charnames ':full';
+ print "\C{GREEK SMALL LETTER SIGMA} is called sigma.\n";
+
+ use charnames ':short';
+ print "\C{greek:Sigma} is an upper-case sigma.\n";
+
+ use charnames qw(cyrillic greek);
+ print "\C{sigma} is Greek sigma, and \C{be} is Cyrillic b.\n";
+
+=head1 DESCRIPTION
+
+Pragma C<use charnames> supports arguments C<:full>, C<:short> and
+script names. If C<:full> is present, for expansion of
+C<\C{CHARNAME}}> string C<CHARNAME> is first looked in the list of
+standard Unicode names of chars. If C<:short> is present, and
+C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
+as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
+with script name arguments, then for C<\C{CHARNAME}}> the name
+C<CHARNAME> is looked up as a letter in the given scripts (in the
+specified order).
+
+For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
+F<charcodes.pm> looks for the names
+
+ SCRIPTNAME CAPITAL LETTER CHARNAME
+ SCRIPTNAME SMALL LETTER CHARNAME
+ SCRIPTNAME LETTER CHARNAME
+
+in the table of standard Unicode names. If C<CHARNAME> is lowercase,
+then the C<CAPITAL> variant is ignored, otherwise C<SMALL> variant is
+ignored.
+
+=head1 CUSTOM TRANSLATORS
+
+The mechanism of translation is C<\C{...}> escapes is general and not
+hardwired into F<charnames.pm>. A module can install custom
+translations (inside the scope which C<use>s the module) by the
+following magic incantation:
+
+ sub import {
+ shift;
+ $^H |= 0x20000;
+ $^H{charnames} = \&translator;
+ }
+
+Here translator() is a subroutine which takes C<CHARNAME> as an
+argument, and returns text to insert into the string instead of the
+C<\C{CHARNAME}> escape. Since the text to insert should be different
+in C<utf8> mode and out of it, the function should check the current
+state of C<utf8>-flag as in
+
+ sub translator {
+ if ($^H & 0x8) {
+ return utf_translator(@_);
+ } else {
+ return no_utf_translator(@_);
+ }
+ }
+
+=head1 BUGS
+
+Since evaluation of the translation function happens in a middle of
+compilation (of a string literal), the translation function should not
+do any C<eval>s or C<require>s. This restriction should be lifted in
+a future version of Perl.
+
+=cut
+
diff --git a/lib/utf8.pm b/lib/utf8.pm
index beb4568e2a..269a1c2177 100644
--- a/lib/utf8.pm
+++ b/lib/utf8.pm
@@ -71,9 +71,8 @@ attempt to canonicalize variable names for you.)
=item *
Regular expressions match characters instead of bytes. For instance,
-"." matches a character instead of a byte. (However, the C<\C> pattern
-is provided to force a match a single byte ("C<char>" in C, hence
-C<\C>).)
+"." matches a character instead of a byte. (However, the C<\O> pattern
+is provided to force a match a single byte ("octet", hence C<\O>).)
=item *
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index bffd191174..2542838d83 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1130,6 +1130,16 @@ workarounds.
inlining. See L<perlsub/"Constant Functions"> for commentary and
workarounds.
+=item constant(%s): %%^H is not localized
+
+(F) When setting compile-time-lexicalized hash %^H one should set the
+corresponding bit of $^H as well.
+
+=item constant(%s): %s
+
+(F) Compile-time-substitutions (such as overloaded constants and
+character names) were not correctly set up.
+
=item Copy method did not return a reference
(F) The method which overloads "=" is buggy. See L<overload/Copy Constructor>.
@@ -1662,6 +1672,11 @@ ended earlier on the current line.
mentioned with the $ in Perl, unlike in the shells, where it can vary from
one line to the next.
+=item Missing %sbrace%s on \C{}
+
+(F) Wrong syntax of character name literal C<\C{charname}> within
+double-quotish context.
+
=item Missing comma after first argument to %s function
(F) While certain functions allow you to specify a filehandle or an
diff --git a/pod/perlop.pod b/pod/perlop.pod
index 3234131f90..bd4ca1df7d 100644
--- a/pod/perlop.pod
+++ b/pod/perlop.pod
@@ -673,6 +673,7 @@ a transliteration, the first eleven of these sequences may be used.
\x1b hex char (ESC)
\x{263a} wide hex char (SMILEY)
\c[ control char (ESC)
+ \C{name} named char
\l lowercase next char
\u uppercase next char
@@ -682,7 +683,8 @@ a transliteration, the first eleven of these sequences may be used.
\Q quote non-word characters till \E
If C<use locale> is in effect, the case map used by C<\l>, C<\L>, C<\u>
-and C<\U> is taken from the current locale. See L<perllocale>.
+and C<\U> is taken from the current locale. See L<perllocale>. For
+documentation of C<\C{name}>, see L<charnames>.
All systems use the virtual C<"\n"> to represent a line terminator,
called a "newline". There is no such thing as an unvarying, physical
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 6c05efc66f..85b2a949c2 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -149,6 +149,7 @@ also work:
\x1B hex char
\x{263a} wide hex char (Unicode SMILEY)
\c[ control char
+ \C{name} named char
\l lowercase next char (think vi)
\u uppercase next char (think vi)
\L lowercase till \E (think vi)
@@ -157,7 +158,8 @@ also work:
\Q quote (disable) pattern metacharacters till \E
If C<use locale> is in effect, the case map used by C<\l>, C<\L>, C<\u>
-and C<\U> is taken from the current locale. See L<perllocale>.
+and C<\U> is taken from the current locale. See L<perllocale>. For
+documentation of C<\C{name}>, see L<charnames>.
You cannot include a literal C<$> or C<@> within a C<\Q> sequence.
An unescaped C<$> or C<@> interpolates the corresponding variable,
@@ -176,7 +178,7 @@ In addition, Perl defines the following:
\PP Match non-P
\X Match eXtended Unicode "combining character sequence",
equivalent to C<(?:\PM\pM*)>
- \C Match a single C char (octet) even under utf8.
+ \O Match a single C char (octet) even under utf8.
A C<\w> matches a single alphanumeric character, not a whole word.
Use C<\w+> to match a string of Perl-identifier characters (which isn't
diff --git a/regcomp.c b/regcomp.c
index b06077b419..df2fc0c796 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1779,7 +1779,7 @@ tryagain:
PL_seen_zerolen++; /* Do not optimize RE away */
nextchar();
break;
- case 'C':
+ case 'O':
ret = reg_node(SANY);
*flagp |= HASWIDTH|SIMPLE;
nextchar();
diff --git a/t/lib/charnames.t b/t/lib/charnames.t
new file mode 100644
index 0000000000..860cc03c75
--- /dev/null
+++ b/t/lib/charnames.t
@@ -0,0 +1,53 @@
+#!./perl
+
+BEGIN {
+ unless(grep /blib/, @INC) {
+ chdir 't' if -d 't';
+ unshift @INC, '../lib' if -d '../lib';
+ }
+}
+
+$| = 1;
+print "1..5\n";
+
+use charnames ':full';
+
+print "not " unless "Here\C{EXCLAMATION MARK}?" eq 'Here!?';
+print "ok 1\n";
+
+print "# \$res=$res \$\@='$@'\nnot "
+ if $res = eval <<'EOE'
+use charnames ":full";
+"Here: \C{CYRILLIC SMALL LETTER BE}!";
+1
+EOE
+ or $@ !~ /above 0xFF/;
+print "ok 2\n";
+# print "# \$res=$res \$\@='$@'\n";
+
+print "# \$res=$res \$\@='$@'\nnot "
+ if $res = eval <<'EOE'
+use charnames 'cyrillic';
+"Here: \C{Be}!";
+1
+EOE
+ or $@ !~ /CYRILLIC CAPITAL LETTER BE.*above 0xFF/;
+print "ok 3\n";
+
+# If octal representation of unicode char is \0xyzt, then the utf8 is \3xy\2zt
+$encoded_be = "\320\261";
+$encoded_alpha = "\316\261";
+$encoded_bet = "\327\221";
+{
+ use charnames ':full';
+ use utf8;
+
+ print "not " unless "\C{CYRILLIC SMALL LETTER BE}" eq $encoded_be;
+ print "ok 4\n";
+
+ use charnames qw(cyrillic greek :short);
+
+ print "not " unless "\C{be},\C{alpha},\C{hebrew:bet}"
+ eq "$encoded_be,$encoded_alpha,$encoded_bet";
+ print "ok 5\n";
+}
diff --git a/toke.c b/toke.c
index 64485ac75c..f351c96591 100644
--- a/toke.c
+++ b/toke.c
@@ -1132,7 +1132,7 @@ S_scan_const(pTHX_ char *start)
: UTF;
char *leaveit = /* set of acceptably-backslashed characters */
PL_lex_inpat
- ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
+ ? "\\.^$@AGZdDwWsSbBpPXO+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
: "";
while (s < send || dorange) {
@@ -1353,6 +1353,43 @@ S_scan_const(pTHX_ char *start)
}
continue;
+ /* \C{latin small letter a} is a named character */
+ case 'C':
+ ++s;
+ if (*s == '{') {
+ char* e = strchr(s, '}');
+ HV *hv;
+ SV **svp;
+ SV *res, *cv;
+ STRLEN len;
+ char *str;
+ char *why = Nullch;
+
+ if (!e) {
+ yyerror("Missing right brace on \\C{}");
+ e = s - 1;
+ goto cont_scan;
+ }
+ res = newSVpvn(s + 1, e - s - 1);
+ res = new_constant( Nullch, 0, "charnames",
+ res, Nullsv, "\\C{...}" );
+ str = SvPV(res,len);
+ if (len > e - s + 4) {
+ char *odest = SvPVX(sv);
+
+ SvGROW(sv, (SvCUR(sv) + len - (e - s + 4)));
+ d = SvPVX(sv) + (d - odest);
+ }
+ Copy(str, d, len, char);
+ d += len;
+ SvREFCNT_dec(res);
+ cont_scan:
+ s = e + 1;
+ }
+ else
+ yyerror("Missing braces on \\C{}");
+ continue;
+
/* \c is a control character */
case 'c':
s++;
@@ -5251,76 +5288,101 @@ S_checkcomma(pTHX_ register char *s, char *name, char *what)
}
}
+/* Either returns sv, or mortalizes sv and returns a new SV*.
+ Best used as sv=new_constant(..., sv, ...).
+ If s, pv are NULL, calls subroutine with one argument,
+ and type is used with error messages only. */
+
STATIC SV *
S_new_constant(pTHX_ char *s, STRLEN len, char *key, SV *sv, SV *pv, char *type)
{
dSP;
HV *table = GvHV(PL_hintgv); /* ^H */
- BINOP myop;
SV *res;
- bool oldcatch = CATCH_GET;
SV **cvp;
SV *cv, *typesv;
-
+ char *why, *why1, *why2;
+
+ if (!(PL_hints & HINT_LOCALIZE_HH)) {
+ SV *msg;
+
+ why = "%^H is not localized";
+ report_short:
+ why1 = why2 = "";
+ report:
+ msg = Perl_newSVpvf(aTHX_ "constant(%s): %s%s%s",
+ (type ? type: "undef"), why1, why2, why);
+ yyerror(SvPVX(msg));
+ SvREFCNT_dec(msg);
+ return sv;
+ }
if (!table) {
- yyerror("%^H is not defined");
- return sv;
+ why = "%^H is not defined";
+ goto report_short;
}
cvp = hv_fetch(table, key, strlen(key), FALSE);
if (!cvp || !SvOK(*cvp)) {
- char buf[128];
- sprintf(buf,"$^H{%s} is not defined", key);
- yyerror(buf);
- return sv;
+ why = "} is not defined";
+ why1 = "$^H{";
+ why2 = key;
+ goto report;
}
sv_2mortal(sv); /* Parent created it permanently */
cv = *cvp;
- if (!pv)
- pv = sv_2mortal(newSVpvn(s, len));
- if (type)
- typesv = sv_2mortal(newSVpv(type, 0));
+ if (!pv && s)
+ pv = sv_2mortal(newSVpvn(s, len));
+ if (type && pv)
+ typesv = sv_2mortal(newSVpv(type, 0));
else
- typesv = &PL_sv_undef;
- CATCH_SET(TRUE);
- Zero(&myop, 1, BINOP);
- myop.op_last = (OP *) &myop;
- myop.op_next = Nullop;
- myop.op_flags = OPf_WANT_SCALAR | OPf_STACKED;
-
+ typesv = &PL_sv_undef;
+
PUSHSTACKi(PERLSI_OVERLOAD);
- ENTER;
- SAVEOP();
- PL_op = (OP *) &myop;
- if (PERLDB_SUB && PL_curstash != PL_debstash)
- PL_op->op_private |= OPpENTERSUB_DB;
- PUTBACK;
- Perl_pp_pushmark(aTHX);
-
+ ENTER ;
+ SAVETMPS;
+
+ PUSHMARK(SP) ;
EXTEND(sp, 4);
- PUSHs(pv);
+ if (pv)
+ PUSHs(pv);
PUSHs(sv);
- PUSHs(typesv);
+ if (pv)
+ PUSHs(typesv);
PUSHs(cv);
PUTBACK;
-
- if (PL_op = Perl_pp_entersub(aTHX))
- CALLRUNOPS(aTHX);
- LEAVE;
- SPAGAIN;
-
- res = POPs;
- PUTBACK;
- CATCH_SET(oldcatch);
+ call_sv(cv, G_SCALAR | ( PL_in_eval ? 0 : G_EVAL));
+
+ SPAGAIN ;
+
+ /* Check the eval first */
+ if (!PL_in_eval && SvTRUE(ERRSV))
+ {
+ STRLEN n_a;
+ sv_catpv(ERRSV, "Propagated");
+ yyerror(SvPV(ERRSV, n_a)); /* Duplicates the message inside eval */
+ POPs ;
+ res = SvREFCNT_inc(sv);
+ }
+ else {
+ res = POPs;
+ SvREFCNT_inc(res);
+ }
+
+ PUTBACK ;
+ FREETMPS ;
+ LEAVE ;
POPSTACK;
-
+
if (!SvOK(res)) {
- char buf[128];
- sprintf(buf,"Call to &{$^H{%s}} did not return a defined value", key);
- yyerror(buf);
- }
- return SvREFCNT_inc(res);
+ why = "}} did not return a defined value";
+ why1 = "Call to &{$^H{";
+ why2 = key;
+ sv = res;
+ goto report;
+ }
+
+ return res;
}
-
+
STATIC char *
S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp)
{