Introduce the charnames pragma.

Subject: [PATCH 5.005_58] Free \C (for named chars), move to \O From: Ilya Zakharevich <[9]ilya@math.ohio-state.edu> To: Chip Salzenberg <[11]chip@perlsupport.com> Cc: Mailing list Perl5 <[12]perl5-porters@perl.org> Date: Sat, 31 Jul 1999 05:44:05 -0400 Message-Id: <[13]199907311407.IAA25042@localhost.frii.com> From: Ilya Zakharevich <ilya@math.ohio-state.edu> To: Mailing list Perl5 <perl5-porters@perl.org> Subject: [PATCH 5.005_58] Named characters in Perl Date: Mon, 2 Aug 1999 19:25:40 -0400 Message-ID: <19990802192540.B24407@monk.mps.ohio-state.edu> p4raw-id: //depot/cfgperl@3916
author: Jarkko Hietaniemi <jhi@iki.fi> 1999-08-04 07:59:05 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 1999-08-04 07:59:05 +0000
commit: 423cee853811c26846bd1948939b85f9866dfb4a (patch)
tree: a1adb3cff6fd86474503248d2eba6747f0adeb02
parent: fd1e013efb606b51dc27fba846b1bedb38910a76 (diff)
download: perl-423cee853811c26846bd1948939b85f9866dfb4a.tar.gz
10 files changed, 325 insertions, 54 deletions
diff --git a/MAINTAIN b/MAINTAIN
index 12f987d5bc..4507ca9e6e 100644
--- a/MAINTAIN
+++ b/MAINTAIN
@@ -477,6 +477,7 @@ lib/bigint.pl
 lib/bigrat.pl	
 lib/blib.pm	
 lib/cacheout.pl	
+lib/charnames.pm		ilya
 lib/chat2.pl	
 lib/complete.pl	
 lib/constant.pm	
@@ -683,6 +684,7 @@ t/lib/cgi-form.t
 t/lib/cgi-function.t	
 t/lib/cgi-html.t	
 t/lib/cgi-request.t	
+t/lib/charnames.t		ilya
 t/lib/checktree.t	
 t/lib/complex.t			complex
 t/lib/db-btree.t		pmarquess
diff --git a/MANIFEST b/MANIFEST
index 0db9a3f0e9..0c0c0771f8 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -648,6 +648,7 @@ lib/bigrat.pl		An arbitrary precision rational arithmetic package
 lib/blib.pm		For "use blib"
 lib/cacheout.pl		Manages output filehandles when you need too many
 lib/caller.pm		Inherit pragmatic attributes from caller's context
+lib/charnames.pm	Character names
 lib/chat2.pl            Obsolete ipc library (use Comm.pm etc instead)
 lib/complete.pl		A command completion subroutine
 lib/constant.pm		For "use constant"
@@ -1112,6 +1113,7 @@ t/lib/cgi-form.t	See if CGI.pm works
 t/lib/cgi-function.t	See if CGI.pm works
 t/lib/cgi-html.t	See if CGI.pm works
 t/lib/cgi-request.t	See if CGI.pm works
+t/lib/charnames.t	See if character names work
 t/lib/checktree.t	See if File::CheckTree works
 t/lib/complex.t		See if Math::Complex works
 t/lib/db-btree.t	See if DB_File works
diff --git a/lib/charnames.pm b/lib/charnames.pm
new file mode 100644
index 0000000000..e407ff7c8a
--- /dev/null
+++ b/lib/charnames.pm
@@ -0,0 +1,134 @@
+package charnames;
+
+my $fname = 'unicode/UnicodeData-Latest.txt';
+my $txt;
+
+# This is not optimized in any way yet
+sub charnames {
+  $name = shift;
+  $txt = do "unicode/Name.pl" unless $txt;
+  my @off;
+  if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
+    @off = ($-[0], $+[0]);
+  }
+  unless (@off) {
+    if ($^H{charnames_short} and $name =~ /^(.*?):(.*)/s) {
+      my ($script, $cname) = ($1,$2);
+      my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
+      if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
+	@off = ($-[0], $+[0]);
+      }
+    }
+  }
+  unless (@off) {
+    my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
+    for ( @{$^H{charnames_scripts}} ) {
+      (@off = ($-[0], $+[0])), last 
+	if $txt =~ m/\t\t$_ (?:$case )?LETTER \U$name$/m;
+    }
+  }
+  die "Unknown charname '$name'" unless @off;
+  
+  # use caller 'encoding';	# Does not work at compile time?
+
+  my $ord = hex substr $txt, $off[0] - 4, 4;
+  if ($^H & 0x8) {
+    use utf8;
+    return chr $ord;
+  }
+  return chr $ord if $ord <= 255;
+  my $hex = sprintf '%X=0%o', $ord, $ord;
+  my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
+  die "Character 0x$hex with name '$fname' is above 0xFF";
+}
+
+sub import {
+  shift;
+  die "No scripts for `use charnames'" unless @_;
+  $^H |= 0x20000;
+  $^H{charnames} = \&charnames ;
+  my %h;
+  @h{@_} = (1) x @_;
+  $^H{charnames_full} = delete $h{':full'};
+  $^H{charnames_short} = delete $h{':short'};
+  $^H{charnames_scripts} = [map uc, keys %h];
+}
+
+
+1;
+__END__
+
+=head1 NAME
+
+charnames - define character names for C<\C{named}> string literal escape.
+
+=head1 SYNOPSIS
+
+  use charnames ':full';
+  print "\C{GREEK SMALL LETTER SIGMA} is called sigma.\n";
+
+  use charnames ':short';
+  print "\C{greek:Sigma} is an upper-case sigma.\n";
+
+  use charnames qw(cyrillic greek);
+  print "\C{sigma} is Greek sigma, and \C{be} is Cyrillic b.\n";
+
+=head1 DESCRIPTION
+
+Pragma C<use charnames> supports arguments C<:full>, C<:short> and
+script names.  If C<:full> is present, for expansion of
+C<\C{CHARNAME}}> string C<CHARNAME> is first looked in the list of
+standard Unicode names of chars.  If C<:short> is present, and
+C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
+as a letter in script C<SCRIPT>.  If pragma C<use charnames> is used
+with script name arguments, then for C<\C{CHARNAME}}> the name
+C<CHARNAME> is looked up as a letter in the given scripts (in the
+specified order).
+
+For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
+F<charcodes.pm> looks for the names
+
+  SCRIPTNAME CAPITAL LETTER CHARNAME
+  SCRIPTNAME SMALL LETTER CHARNAME
+  SCRIPTNAME LETTER CHARNAME
+
+in the table of standard Unicode names.  If C<CHARNAME> is lowercase,
+then the C<CAPITAL> variant is ignored, otherwise C<SMALL> variant is
+ignored.
+
+=head1 CUSTOM TRANSLATORS
+
+The mechanism of translation is C<\C{...}> escapes is general and not
+hardwired into F<charnames.pm>.  A module can install custom
+translations (inside the scope which C<use>s the module) by the
+following magic incantation:
+
+  sub import {
+    shift;
+    $^H |= 0x20000;
+    $^H{charnames} = \&translator;
+  }
+
+Here translator() is a subroutine which takes C<CHARNAME> as an
+argument, and returns text to insert into the string instead of the
+C<\C{CHARNAME}> escape.  Since the text to insert should be different
+in C<utf8> mode and out of it, the function should check the current
+state of C<utf8>-flag as in
+
+  sub translator {
+    if ($^H & 0x8) {
+      return utf_translator(@_);
+    } else {
+      return no_utf_translator(@_);
+    }
+  }
+
+=head1 BUGS
+
+Since evaluation of the translation function happens in a middle of
+compilation (of a string literal), the translation function should not
+do any C<eval>s or C<require>s.  This restriction should be lifted in
+a future version of Perl.
+
+=cut
+
diff --git a/lib/utf8.pm b/lib/utf8.pm
index beb4568e2a..269a1c2177 100644
--- a/lib/utf8.pm
+++ b/lib/utf8.pm
@@ -71,9 +71,8 @@ attempt to canonicalize variable names for you.)
 =item *
 
 Regular expressions match characters instead of bytes.  For instance,
-"." matches a character instead of a byte.  (However, the C<\C> pattern
-is provided to force a match a single byte ("C<char>" in C, hence
-C<\C>).)
+"." matches a character instead of a byte.  (However, the C<\O> pattern
+is provided to force a match a single byte ("octet", hence C<\O>).)
 
 =item *
 
diff --git a/pod/perldiag.pod b/pod/perldiag.pod
index bffd191174..2542838d83 100644
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -1130,6 +1130,16 @@ workarounds.
 inlining.  See L<perlsub/"Constant Functions"> for commentary and
 workarounds.
 
+=item constant(%s): %%^H is not localized
+
+(F) When setting compile-time-lexicalized hash %^H one should set the 
+corresponding bit of $^H as well.
+
+=item constant(%s): %s
+
+(F) Compile-time-substitutions (such as overloaded constants and
+character names) were not correctly set up.
+
 =item Copy method did not return a reference
 
 (F) The method which overloads "=" is buggy. See L<overload/Copy Constructor>.
@@ -1662,6 +1672,11 @@ ended earlier on the current line.
 mentioned with the $ in Perl, unlike in the shells, where it can vary from
 one line to the next.
 
+=item Missing %sbrace%s on \C{}
+
+(F) Wrong syntax of character name literal C<\C{charname}> within
+double-quotish context.
+
 =item Missing comma after first argument to %s function
 
 (F) While certain functions allow you to specify a filehandle or an
diff --git a/pod/perlop.pod b/pod/perlop.pod
index 3234131f90..bd4ca1df7d 100644
--- a/pod/perlop.pod
+++ b/pod/perlop.pod
@@ -673,6 +673,7 @@ a transliteration, the first eleven of these sequences may be used.
     \x1b	hex char	(ESC)
     \x{263a}	wide hex char	(SMILEY)
     \c[		control char    (ESC)
+    \C{name}	named char
 
     \l		lowercase next char
     \u		uppercase next char
@@ -682,7 +683,8 @@ a transliteration, the first eleven of these sequences may be used.
     \Q		quote non-word characters till \E
 
 If C<use locale> is in effect, the case map used by C<\l>, C<\L>, C<\u>
-and C<\U> is taken from the current locale.  See L<perllocale>.
+and C<\U> is taken from the current locale.  See L<perllocale>.  For
+documentation of C<\C{name}>, see L<charnames>.
 
 All systems use the virtual C<"\n"> to represent a line terminator,
 called a "newline".  There is no such thing as an unvarying, physical
diff --git a/pod/perlre.pod b/pod/perlre.pod
index 6c05efc66f..85b2a949c2 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -149,6 +149,7 @@ also work:
     \x1B	hex char
     \x{263a}	wide hex char         (Unicode SMILEY)
     \c[		control char
+    \C{name}	named char
     \l		lowercase next char (think vi)
     \u		uppercase next char (think vi)
     \L		lowercase till \E (think vi)
@@ -157,7 +158,8 @@ also work:
     \Q		quote (disable) pattern metacharacters till \E
 
 If C<use locale> is in effect, the case map used by C<\l>, C<\L>, C<\u>
-and C<\U> is taken from the current locale.  See L<perllocale>.
+and C<\U> is taken from the current locale.  See L<perllocale>.  For
+documentation of C<\C{name}>, see L<charnames>.
 
 You cannot include a literal C<$> or C<@> within a C<\Q> sequence.
 An unescaped C<$> or C<@> interpolates the corresponding variable,
@@ -176,7 +178,7 @@ In addition, Perl defines the following:
     \PP	Match non-P
     \X	Match eXtended Unicode "combining character sequence",
         equivalent to C<(?:\PM\pM*)>
-    \C	Match a single C char (octet) even under utf8.
+    \O	Match a single C char (octet) even under utf8.
 
 A C<\w> matches a single alphanumeric character, not a whole word.
 Use C<\w+> to match a string of Perl-identifier characters (which isn't 
diff --git a/regcomp.c b/regcomp.c
index b06077b419..df2fc0c796 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1779,7 +1779,7 @@ tryagain:
 	    PL_seen_zerolen++;		/* Do not optimize RE away */
 	    nextchar();
 	    break;
-	case 'C':
+	case 'O':
 	    ret = reg_node(SANY);
 	    *flagp |= HASWIDTH|SIMPLE;
 	    nextchar();
diff --git a/t/lib/charnames.t b/t/lib/charnames.t
new file mode 100644
index 0000000000..860cc03c75
--- /dev/null
+++ b/t/lib/charnames.t
@@ -0,0 +1,53 @@
+#!./perl
+
+BEGIN {
+    unless(grep /blib/, @INC) {
+	chdir 't' if -d 't';
+	unshift @INC, '../lib' if -d '../lib';
+    }
+}
+
+$| = 1;
+print "1..5\n";
+
+use charnames ':full';
+
+print "not " unless "Here\C{EXCLAMATION MARK}?" eq 'Here!?';
+print "ok 1\n";
+
+print "# \$res=$res \$\@='$@'\nnot "
+  if $res = eval <<'EOE'
+use charnames ":full";
+"Here: \C{CYRILLIC SMALL LETTER BE}!";
+1
+EOE
+  or $@ !~ /above 0xFF/;
+print "ok 2\n";
+# print "# \$res=$res \$\@='$@'\n";
+
+print "# \$res=$res \$\@='$@'\nnot "
+  if $res = eval <<'EOE'
+use charnames 'cyrillic';
+"Here: \C{Be}!";
+1
+EOE
+  or $@ !~ /CYRILLIC CAPITAL LETTER BE.*above 0xFF/;
+print "ok 3\n";
+
+# If octal representation of unicode char is \0xyzt, then the utf8 is \3xy\2zt
+$encoded_be = "\320\261";
+$encoded_alpha = "\316\261";
+$encoded_bet = "\327\221";
+{
+  use charnames ':full';
+  use utf8;
+
+  print "not " unless "\C{CYRILLIC SMALL LETTER BE}" eq $encoded_be;
+  print "ok 4\n";
+
+  use charnames qw(cyrillic greek :short);
+
+  print "not " unless "\C{be},\C{alpha},\C{hebrew:bet}" 
+    eq "$encoded_be,$encoded_alpha,$encoded_bet";
+  print "ok 5\n";
+}
diff --git a/toke.c b/toke.c
index 64485ac75c..f351c96591 100644
--- a/toke.c
+++ b/toke.c
@@ -1132,7 +1132,7 @@ S_scan_const(pTHX_ char *start)
 	: UTF;
     char *leaveit =			/* set of acceptably-backslashed characters */
 	PL_lex_inpat
-	    ? "\\.^$@AGZdDwWsSbBpPXC+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
+	    ? "\\.^$@AGZdDwWsSbBpPXO+*?|()-nrtfeaxcz0123456789[{]} \t\n\r\f\v#"
 	    : "";
 
     while (s < send || dorange) {
@@ -1353,6 +1353,43 @@ S_scan_const(pTHX_ char *start)
 		}
 		continue;
 
+ 	    /* \C{latin small letter a} is a named character */
+ 	    case 'C':
+ 		++s;
+ 		if (*s == '{') {
+ 		    char* e = strchr(s, '}');
+ 		    HV *hv;
+ 		    SV **svp;
+ 		    SV *res, *cv;
+ 		    STRLEN len;
+ 		    char *str;
+ 		    char *why = Nullch;
+ 
+ 		    if (!e) {
+			yyerror("Missing right brace on \\C{}");
+			e = s - 1;
+			goto cont_scan;
+		    }
+		    res = newSVpvn(s + 1, e - s - 1);
+		    res = new_constant( Nullch, 0, "charnames", 
+					res, Nullsv, "\\C{...}" );
+		    str = SvPV(res,len);
+		    if (len > e - s + 4) {
+			char *odest = SvPVX(sv);
+
+			SvGROW(sv, (SvCUR(sv) + len - (e - s + 4)));
+			d = SvPVX(sv) + (d - odest);
+		    }
+		    Copy(str, d, len, char);
+		    d += len;
+		    SvREFCNT_dec(res);
+		  cont_scan:
+		    s = e + 1;
+		}
+		else
+		    yyerror("Missing braces on \\C{}");
+		continue;
+
 	    /* \c is a control character */
 	    case 'c':
 		s++;
@@ -5251,76 +5288,101 @@ S_checkcomma(pTHX_ register char *s, char *name, char *what)
     }
 }
 
+/* Either returns sv, or mortalizes sv and returns a new SV*.
+   Best used as sv=new_constant(..., sv, ...).
+   If s, pv are NULL, calls subroutine with one argument,
+   and type is used with error messages only. */
+
 STATIC SV *
 S_new_constant(pTHX_ char *s, STRLEN len, char *key, SV *sv, SV *pv, char *type) 
 {
     dSP;
     HV *table = GvHV(PL_hintgv);		 /* ^H */
-    BINOP myop;
     SV *res;
-    bool oldcatch = CATCH_GET;
     SV **cvp;
     SV *cv, *typesv;
-	    
+    char *why, *why1, *why2;
+    
+    if (!(PL_hints & HINT_LOCALIZE_HH)) {
+	SV *msg;
+	
+	why = "%^H is not localized";
+    report_short:
+	why1 = why2 = "";
+    report:
+	msg = Perl_newSVpvf(aTHX_ "constant(%s): %s%s%s", 
+			    (type ? type: "undef"), why1, why2, why);
+	yyerror(SvPVX(msg));
+ 	SvREFCNT_dec(msg);
+  	return sv;
+    }
     if (!table) {
-	yyerror("%^H is not defined");
-	return sv;
+	why = "%^H is not defined";
+	goto report_short;
     }
     cvp = hv_fetch(table, key, strlen(key), FALSE);
     if (!cvp || !SvOK(*cvp)) {
-	char buf[128];
-	sprintf(buf,"$^H{%s} is not defined", key);
-	yyerror(buf);
-	return sv;
+	why = "} is not defined";
+	why1 = "$^H{";
+	why2 = key;
+	goto report;
     }
     sv_2mortal(sv);			/* Parent created it permanently */
     cv = *cvp;
-    if (!pv)
-	pv = sv_2mortal(newSVpvn(s, len));
-    if (type)
-	typesv = sv_2mortal(newSVpv(type, 0));
+    if (!pv && s)
+  	pv = sv_2mortal(newSVpvn(s, len));
+    if (type && pv)
+  	typesv = sv_2mortal(newSVpv(type, 0));
     else
-	typesv = &PL_sv_undef;
-    CATCH_SET(TRUE);
-    Zero(&myop, 1, BINOP);
-    myop.op_last = (OP *) &myop;
-    myop.op_next = Nullop;
-    myop.op_flags = OPf_WANT_SCALAR | OPf_STACKED;
-
+  	typesv = &PL_sv_undef;
+    
     PUSHSTACKi(PERLSI_OVERLOAD);
-    ENTER;
-    SAVEOP();
-    PL_op = (OP *) &myop;
-    if (PERLDB_SUB && PL_curstash != PL_debstash)
-	PL_op->op_private |= OPpENTERSUB_DB;
-    PUTBACK;
-    Perl_pp_pushmark(aTHX);
-
+    ENTER ;
+    SAVETMPS;
+    
+    PUSHMARK(SP) ;
     EXTEND(sp, 4);
-    PUSHs(pv);
+    if (pv)
+ 	PUSHs(pv);
     PUSHs(sv);
-    PUSHs(typesv);
+    if (pv)
+ 	PUSHs(typesv);
     PUSHs(cv);
     PUTBACK;
-
-    if (PL_op = Perl_pp_entersub(aTHX))
-      CALLRUNOPS(aTHX);
-    LEAVE;
-    SPAGAIN;
-
-    res = POPs;
-    PUTBACK;
-    CATCH_SET(oldcatch);
+    call_sv(cv, G_SCALAR | ( PL_in_eval ? 0 : G_EVAL));
+    
+    SPAGAIN ;
+    
+    /* Check the eval first */
+    if (!PL_in_eval && SvTRUE(ERRSV))
+    {
+	STRLEN n_a;
+ 	sv_catpv(ERRSV, "Propagated");
+	yyerror(SvPV(ERRSV, n_a)); /* Duplicates the message inside eval */
+	POPs ;
+ 	res = SvREFCNT_inc(sv);
+    }
+    else {
+ 	res = POPs;
+ 	SvREFCNT_inc(res);
+    }
+    
+    PUTBACK ;
+    FREETMPS ;
+    LEAVE ;
     POPSTACK;
-
+    
     if (!SvOK(res)) {
-	char buf[128];
-	sprintf(buf,"Call to &{$^H{%s}} did not return a defined value", key);
-	yyerror(buf);
-    }
-    return SvREFCNT_inc(res);
+ 	why = "}} did not return a defined value";
+ 	why1 = "Call to &{$^H{";
+ 	why2 = key;
+ 	sv = res;
+ 	goto report;
+     }
+
+     return res;
 }
-
+  
 STATIC char *
 S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp)
 {
author	Jarkko Hietaniemi <jhi@iki.fi>	1999-08-04 07:59:05 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	1999-08-04 07:59:05 +0000
commit	423cee853811c26846bd1948939b85f9866dfb4a (patch)
tree	a1adb3cff6fd86474503248d2eba6747f0adeb02
parent	fd1e013efb606b51dc27fba846b1bedb38910a76 (diff)
download	perl-423cee853811c26846bd1948939b85f9866dfb4a.tar.gz