summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJarkko Hietaniemi <jhi@iki.fi>2002-03-28 16:20:03 +0000
committerJarkko Hietaniemi <jhi@iki.fi>2002-03-28 16:20:03 +0000
commit52ea3e69a0eb35af2d24bda5dabccf9b9600bfe4 (patch)
treeb5ce0317ddfc1eb5c4e4dc05e20c513d761bad04
parent8a22007576b03a2f42861e49c20ebb363ff4ba58 (diff)
downloadperl-52ea3e69a0eb35af2d24bda5dabccf9b9600bfe4.tar.gz
Support Unicode 3.1 names, names without the (XX), and BOM.
p4raw-id: //depot/perl@15585
-rw-r--r--lib/charnames.pm209
-rw-r--r--lib/charnames.t37
-rw-r--r--pp_pack.c8
3 files changed, 192 insertions, 62 deletions
diff --git a/lib/charnames.pm b/lib/charnames.pm
index 0241534250..6471d18058 100644
--- a/lib/charnames.pm
+++ b/lib/charnames.pm
@@ -7,6 +7,32 @@ our $VERSION = '1.01';
use bytes (); # for $bytes::hint_bits
$charnames::hint_bits = 0x20000;
+my %alias1 = (
+ # Icky 3.2 names with parentheses.
+ 'LINE FEED' => 'LINE FEED (LF)',
+ 'FORM FEED' => 'FORM FEED (FF)',
+ 'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
+ 'NEXT LINE' => 'NEXT LINE (NEL)',
+ # Convenience.
+ 'LF' => 'LINE FEED (LF)',
+ 'FF' => 'FORM FEED (FF)',
+ 'CR' => 'CARRIAGE RETURN (LF)',
+ 'NEL' => 'NEXT LINE (NEXT LINE)',
+ 'BOM' => 'BYTE ORDER MARK',
+ );
+
+my %alias2 = (
+ # Pre-3.2 compatibility (only for the first 256 characters).
+ 'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
+ 'VERTICAL TABULATION' => 'LINE TABULATION',
+ 'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
+ 'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
+ 'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
+ 'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
+ 'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
+ 'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
+ );
+
my $txt;
# This is not optimized in any way yet
@@ -14,78 +40,99 @@ sub charnames
{
my $name = shift;
- ## Suck in the code/name list as a big string.
- ## Lines look like:
- ## "0052\t\tLATIN CAPITAL LETTER R\n"
- $txt = do "unicore/Name.pl" unless $txt;
+ if (exists $alias1{$name}) {
+ $name = $alias1{$name};
+ }
+ if (exists $alias2{$name}) {
+ require warnings;
+ warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
+ $name = $alias2{$name};
+ }
- ## @off will hold the index into the code/name string of the start and
- ## end of the name as we find it.
+ my $ord;
my @off;
+ my $fname;
+
+ if ($name eq "BYTE ORDER MARK") {
+ $fname = $name;
+ $ord = 0xFFFE;
+ } else {
+ ## Suck in the code/name list as a big string.
+ ## Lines look like:
+ ## "0052\t\tLATIN CAPITAL LETTER R\n"
+ $txt = do "unicore/Name.pl" unless $txt;
- ## If :full, look for the the name exactly
- if ($^H{charnames_full} and $txt =~ /\t\t$name$/m) {
- @off = ($-[0], $+[0]);
- }
-
- ## If we didn't get above, and :short allowed, look for the short name.
- ## The short name is like "greek:Sigma"
- unless (@off) {
- if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
- my ($script, $cname) = ($1,$2);
- my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
- if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U$cname$/m) {
- @off = ($-[0], $+[0]);
+ ## @off will hold the index into the code/name string of the start and
+ ## end of the name as we find it.
+
+ ## If :full, look for the the name exactly
+ if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
+ @off = ($-[0], $+[0]);
}
- }
- }
- ## If we still don't have it, check for the name among the loaded
- ## scripts.
- if (not @off)
- {
- my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
- for my $script ( @{$^H{charnames_scripts}} )
+ ## If we didn't get above, and :short allowed, look for the short name.
+ ## The short name is like "greek:Sigma"
+ unless (@off) {
+ if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
+ my ($script, $cname) = ($1,$2);
+ my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
+ if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
+ @off = ($-[0], $+[0]);
+ }
+ }
+ }
+
+ ## If we still don't have it, check for the name among the loaded
+ ## scripts.
+ if (not @off)
{
- if ($txt =~ m/\t\t$script (?:$case )?LETTER \U$name$/m) {
- @off = ($-[0], $+[0]);
- last;
- }
+ my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
+ for my $script ( @{$^H{charnames_scripts}} )
+ {
+ if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
+ @off = ($-[0], $+[0]);
+ last;
+ }
+ }
}
+
+ ## If we don't have it by now, give up.
+ unless (@off) {
+ carp "Unknown charname '$name'";
+ return "\x{FFFD}";
+ }
+
+ ##
+ ## Now know where in the string the name starts.
+ ## The code, in hex, is befor that.
+ ##
+ ## The code can be 4-6 characters long, so we've got to sort of
+ ## go look for it, just after the newline that comes before $off[0].
+ ##
+ ## This would be much easier if unicore/Name.pl had info in
+ ## a name/code order, instead of code/name order.
+ ##
+ ## The +1 after the rindex() is to skip past the newline we're finding,
+ ## or, if the rindex() fails, to put us to an offset of zero.
+ ##
+ my $hexstart = rindex($txt, "\n", $off[0]) + 1;
+
+ ## we know where it starts, so turn into number -
+ ## the ordinal for the char.
+ $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
}
- ## If we don't have it by now, give up.
- unless (@off) {
- carp "Unknown charname '$name'";
- return "\x{FFFD}";
- }
-
- ##
- ## Now know where in the string the name starts.
- ## The code, in hex, is befor that.
- ##
- ## The code can be 4-6 characters long, so we've got to sort of
- ## go look for it, just after the newline that comes before $off[0].
- ##
- ## This would be much easier if unicore/Name.pl had info in
- ## a name/code order, instead of code/name order.
- ##
- ## The +1 after the rindex() is to skip past the newline we're finding,
- ## or, if the rindex() fails, to put us to an offset of zero.
- ##
- my $hexstart = rindex($txt, "\n", $off[0]) + 1;
-
- ## we know where it starts, so turn into number - the ordinal for the char.
- my $ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
-
if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
use bytes;
return chr $ord if $ord <= 255;
my $hex = sprintf "%04x", $ord;
- my $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
+ if (not defined $fname) {
+ $fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
+ }
croak "Character 0x$hex with name '$fname' is above 0xFF";
}
+ no warnings 'utf8'; # allow even illegal characters
return pack "U", $ord;
}
@@ -303,10 +350,56 @@ Returns undef if no name is known for the name.
This works only for the standard names, and does not yet aply
to custom translators.
+=head1 ALIASES
+
+A few aliases have been defined for convenience: instead of having
+to use the official names
+
+ LINE FEED (LF)
+ FORM FEED (FF)
+ CARRIAGE RETURN (CR)
+ NEXT LINE (NEL)
+
+(yes, with parentheses) one can use
+
+ LINE FEED
+ FORM FEED
+ CARRIAGE RETURN
+ NEXT LINE
+ LF
+ FF
+ CR
+ NEL
+
+One can also use
+
+ BYTE ORDER MARK
+ BOM
+
+though that is of course not a legal character as such.
+
+For backward compatibility one can use the old names for
+certain C0 and C1 controls
+
+ old new
+
+ HORIZONTAL TABULATION CHARACTER TABULATION
+ VERTICAL TABULATION LINE TABULATION
+ FILE SEPARATOR INFORMATION SEPARATOR FOUR
+ GROUP SEPARATOR INFORMATION SEPARATOR THREE
+ RECORD SEPARATOR INFORMATION SEPARATOR TWO
+ UNIT SEPARATOR INFORMATION SEPARATOR ONE
+ PARTIAL LINE DOWN PARTIAL LINE FORWARD
+ PARTIAL LINE UP PARTIAL LINE BACKWARD
+
+but the old names in addition to giving the character
+will also give a warning about being deprecated.
+
=head1 ILLEGAL CHARACTERS
-If you ask for a character that does not exist, a warning is given
-and the special Unicode I<replacement character> "\x{FFFD}" is returned.
+If you ask for a character that is illegal (like the byte order mark
+U+FFFE, or the U+FFFF) does not exist, a warning is given and the
+special Unicode I<replacement character> "\x{FFFD}" is returned.
=head1 BUGS
diff --git a/lib/charnames.t b/lib/charnames.t
index c800128efc..a8a063f096 100644
--- a/lib/charnames.t
+++ b/lib/charnames.t
@@ -1,15 +1,18 @@
#!./perl
+my @WARN;
+
BEGIN {
unless(grep /blib/, @INC) {
chdir 't' if -d 't';
@INC = '../lib';
}
+ $SIG{__WARN__} = sub { push @WARN, @_ };
}
$| = 1;
-print "1..25\n";
+print "1..34\n";
use charnames ':full';
@@ -169,6 +172,36 @@ print "ok 24\n";
print "not " unless "\N{NULL}" eq "\c@";
print "ok 25\n";
-# TODO: support 3.1 names, BOM. Generic aliasing?
+print "not " unless "\N{LINE FEED (LF)}" eq "\n";
+print "ok 26\n";
+
+print "not " unless "\N{LINE FEED}" eq "\n";
+print "ok 27\n";
+
+print "not " unless "\N{LF}" eq "\n";
+print "ok 28\n";
+
+print "not " unless "\N{BYTE ORDER MARK}" eq chr(0xFFFE);
+print "ok 29\n";
+
+print "not " unless "\N{BOM}" eq chr(0xFFFE);
+print "ok 30\n";
+
+{
+ use warnings 'deprecated';
+
+ print "not " unless "\N{HORIZONTAL TABULATION}" eq "\t";
+ print "ok 31\n";
+
+ print "not " unless grep { /"HORIZONTAL TABULATION" is deprecated/ } @WARN;
+ print "ok 32\n";
+ no warnings 'deprecated';
+
+ print "not " unless "\N{VERTICAL TABULATION}" eq "\013";
+ print "ok 33\n";
+
+ print "not " if grep { /"VERTICAL TABULATION" is deprecated/ } @WARN;
+ print "ok 34\n";
+}
diff --git a/pp_pack.c b/pp_pack.c
index 67f53e7718..1c5ee315ea 100644
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -2039,8 +2039,12 @@ Perl_pack_cat(pTHX_ SV *cat, char *pat, register char *patend, register SV **beg
fromstr = NEXTFROM;
auint = UNI_TO_NATIVE(SvUV(fromstr));
SvGROW(cat, SvCUR(cat) + UTF8_MAXLEN + 1);
- SvCUR_set(cat, (char*)uvchr_to_utf8((U8*)SvEND(cat),auint)
- - SvPVX(cat));
+ SvCUR_set(cat,
+ (char*)uvchr_to_utf8_flags((U8*)SvEND(cat),
+ auint,
+ ckWARN(WARN_UTF8) ?
+ 0 : UNICODE_ALLOW_ANY)
+ - SvPVX(cat));
}
*SvEND(cat) = '\0';
break;