diff options
author | Nick Ing-Simmons <nik@tiuk.ti.com> | 2002-02-05 16:01:15 +0000 |
---|---|---|
committer | Nick Ing-Simmons <nik@tiuk.ti.com> | 2002-02-05 16:01:15 +0000 |
commit | 4cfc977cb33a032e78e373bce7db50a1970926f3 (patch) | |
tree | 31a9f2e1abe7b30d2d9c4fe70c184c0cf532f0cd /ext | |
parent | 63ea68f84f0804c9f2dc0a3dbf173267d55e8144 (diff) | |
download | perl-4cfc977cb33a032e78e373bce7db50a1970926f3.tar.gz |
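More Encode tweaks:
- make the expensive, marginally useful substring search in `compile` optional (enabled with -O)
- enable -O for the ASCII-like encodings built by ext/Encode/Makefile.PL (their search space is small)
- add the ASCII-like jis0201 table to the basic Encode.so
- add some other Japanese encodings (jis0208, jis0212, shiftjis) to the EUC_JP bundle (built without -O)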
p4raw-id: //depot/perlio@14563
Diffstat (limited to 'ext')
-rw-r--r-- | ext/Encode/EUC_JP/Makefile.PL | 9 |
-rw-r--r-- | ext/Encode/Encode/jis0201.ucm | 231 |
-rw-r--r-- | ext/Encode/Makefile.PL | 4 |
-rwxr-xr-x | ext/Encode/compile | 27 |
4 files changed, 252 insertions, 19 deletions
diff --git a/ext/Encode/EUC_JP/Makefile.PL b/ext/Encode/EUC_JP/Makefile.PL index 03277418d5..ffa6902f5c 100644 --- a/ext/Encode/EUC_JP/Makefile.PL +++ b/ext/Encode/EUC_JP/Makefile.PL @@ -3,11 +3,10 @@ use strict; use ExtUtils::MakeMaker; my %tables = (EUC_JP => [ - 'euc-jp.ucm', -# 'jis0201.enc', -# 'jis0212.enc', -# 'jis0208.enc', -# 'shiftjis.enc', + 'euc-jp.ucm', + 'jis0208.enc', + 'jis0212.enc', + 'shiftjis.enc', ]); diff --git a/ext/Encode/Encode/jis0201.ucm b/ext/Encode/Encode/jis0201.ucm new file mode 100644 index 0000000000..a14f8ce703 --- /dev/null +++ b/ext/Encode/Encode/jis0201.ucm @@ -0,0 +1,231 @@ +# compile -o Encode/jis0201.ucm Encode/jis0201.enc +<code_set_name> "jis0201" +<mb_cur_min> 1 +<mb_cur_max> 1 +<subchar> \x3F +# +CHARMAP +<U0000> \x00 |0 # <control> +<U0001> \x01 |0 # <control> +<U0002> \x02 |0 # <control> +<U0003> \x03 |0 # <control> +<U0004> \x04 |0 # <control> +<U0005> \x05 |0 # <control> +<U0006> \x06 |0 # <control> +<U0007> \x07 |0 # <control> +<U0008> \x08 |0 # <control> +<U0009> \x09 |0 # <control> +<U000A> \x0A |0 # <control> +<U000B> \x0B |0 # <control> +<U000C> \x0C |0 # <control> +<U000D> \x0D |0 # <control> +<U000E> \x0E |0 # <control> +<U000F> \x0F |0 # <control> +<U0010> \x10 |0 # <control> +<U0011> \x11 |0 # <control> +<U0012> \x12 |0 # <control> +<U0013> \x13 |0 # <control> +<U0014> \x14 |0 # <control> +<U0015> \x15 |0 # <control> +<U0016> \x16 |0 # <control> +<U0017> \x17 |0 # <control> +<U0018> \x18 |0 # <control> +<U0019> \x19 |0 # <control> +<U001A> \x1A |0 # <control> +<U001B> \x1B |0 # <control> +<U001C> \x1C |0 # <control> +<U001D> \x1D |0 # <control> +<U001E> \x1E |0 # <control> +<U001F> \x1F |0 # <control> +<U0020> \x20 |0 # SPACE +<U0021> \x21 |0 # EXCLAMATION MARK +<U0022> \x22 |0 # QUOTATION MARK +<U0023> \x23 |0 # NUMBER SIGN +<U0024> \x24 |0 # DOLLAR SIGN +<U0025> \x25 |0 # PERCENT SIGN +<U0026> \x26 |0 # AMPERSAND +<U0027> \x27 |0 # APOSTROPHE +<U0028> \x28 |0 # LEFT PARENTHESIS +<U0029> \x29 |0 # RIGHT 
PARENTHESIS +<U002A> \x2A |0 # ASTERISK +<U002B> \x2B |0 # PLUS SIGN +<U002C> \x2C |0 # COMMA +<U002D> \x2D |0 # HYPHEN-MINUS +<U002E> \x2E |0 # FULL STOP +<U002F> \x2F |0 # SOLIDUS +<U0030> \x30 |0 # DIGIT ZERO +<U0031> \x31 |0 # DIGIT ONE +<U0032> \x32 |0 # DIGIT TWO +<U0033> \x33 |0 # DIGIT THREE +<U0034> \x34 |0 # DIGIT FOUR +<U0035> \x35 |0 # DIGIT FIVE +<U0036> \x36 |0 # DIGIT SIX +<U0037> \x37 |0 # DIGIT SEVEN +<U0038> \x38 |0 # DIGIT EIGHT +<U0039> \x39 |0 # DIGIT NINE +<U003A> \x3A |0 # COLON +<U003B> \x3B |0 # SEMICOLON +<U003C> \x3C |0 # LESS-THAN SIGN +<U003D> \x3D |0 # EQUALS SIGN +<U003E> \x3E |0 # GREATER-THAN SIGN +<U003F> \x3F |0 # QUESTION MARK +<U0040> \x40 |0 # COMMERCIAL AT +<U0041> \x41 |0 # LATIN CAPITAL LETTER A +<U0042> \x42 |0 # LATIN CAPITAL LETTER B +<U0043> \x43 |0 # LATIN CAPITAL LETTER C +<U0044> \x44 |0 # LATIN CAPITAL LETTER D +<U0045> \x45 |0 # LATIN CAPITAL LETTER E +<U0046> \x46 |0 # LATIN CAPITAL LETTER F +<U0047> \x47 |0 # LATIN CAPITAL LETTER G +<U0048> \x48 |0 # LATIN CAPITAL LETTER H +<U0049> \x49 |0 # LATIN CAPITAL LETTER I +<U004A> \x4A |0 # LATIN CAPITAL LETTER J +<U004B> \x4B |0 # LATIN CAPITAL LETTER K +<U004C> \x4C |0 # LATIN CAPITAL LETTER L +<U004D> \x4D |0 # LATIN CAPITAL LETTER M +<U004E> \x4E |0 # LATIN CAPITAL LETTER N +<U004F> \x4F |0 # LATIN CAPITAL LETTER O +<U0050> \x50 |0 # LATIN CAPITAL LETTER P +<U0051> \x51 |0 # LATIN CAPITAL LETTER Q +<U0052> \x52 |0 # LATIN CAPITAL LETTER R +<U0053> \x53 |0 # LATIN CAPITAL LETTER S +<U0054> \x54 |0 # LATIN CAPITAL LETTER T +<U0055> \x55 |0 # LATIN CAPITAL LETTER U +<U0056> \x56 |0 # LATIN CAPITAL LETTER V +<U0057> \x57 |0 # LATIN CAPITAL LETTER W +<U0058> \x58 |0 # LATIN CAPITAL LETTER X +<U0059> \x59 |0 # LATIN CAPITAL LETTER Y +<U005A> \x5A |0 # LATIN CAPITAL LETTER Z +<U005B> \x5B |0 # LEFT SQUARE BRACKET +<U005C> \x5C |0 # REVERSE SOLIDUS +<U005D> \x5D |0 # RIGHT SQUARE BRACKET +<U005E> \x5E |0 # CIRCUMFLEX ACCENT +<U005F> \x5F |0 # LOW LINE +<U0060> \x60 |0 # GRAVE 
ACCENT +<U0061> \x61 |0 # LATIN SMALL LETTER A +<U0062> \x62 |0 # LATIN SMALL LETTER B +<U0063> \x63 |0 # LATIN SMALL LETTER C +<U0064> \x64 |0 # LATIN SMALL LETTER D +<U0065> \x65 |0 # LATIN SMALL LETTER E +<U0066> \x66 |0 # LATIN SMALL LETTER F +<U0067> \x67 |0 # LATIN SMALL LETTER G +<U0068> \x68 |0 # LATIN SMALL LETTER H +<U0069> \x69 |0 # LATIN SMALL LETTER I +<U006A> \x6A |0 # LATIN SMALL LETTER J +<U006B> \x6B |0 # LATIN SMALL LETTER K +<U006C> \x6C |0 # LATIN SMALL LETTER L +<U006D> \x6D |0 # LATIN SMALL LETTER M +<U006E> \x6E |0 # LATIN SMALL LETTER N +<U006F> \x6F |0 # LATIN SMALL LETTER O +<U0070> \x70 |0 # LATIN SMALL LETTER P +<U0071> \x71 |0 # LATIN SMALL LETTER Q +<U0072> \x72 |0 # LATIN SMALL LETTER R +<U0073> \x73 |0 # LATIN SMALL LETTER S +<U0074> \x74 |0 # LATIN SMALL LETTER T +<U0075> \x75 |0 # LATIN SMALL LETTER U +<U0076> \x76 |0 # LATIN SMALL LETTER V +<U0077> \x77 |0 # LATIN SMALL LETTER W +<U0078> \x78 |0 # LATIN SMALL LETTER X +<U0079> \x79 |0 # LATIN SMALL LETTER Y +<U007A> \x7A |0 # LATIN SMALL LETTER Z +<U007B> \x7B |0 # LEFT CURLY BRACKET +<U007C> \x7C |0 # VERTICAL LINE +<U007D> \x7D |0 # RIGHT CURLY BRACKET +<U203E> \x7E |0 # OVERLINE +<U007F> \x7F |0 # <control> +<U0080> \x80 |0 # <control> +<U0081> \x81 |0 # <control> +<U0082> \x82 |0 # <control> +<U0083> \x83 |0 # <control> +<U0084> \x84 |0 # <control> +<U0085> \x85 |0 # <control> +<U0086> \x86 |0 # <control> +<U0087> \x87 |0 # <control> +<U0088> \x88 |0 # <control> +<U0089> \x89 |0 # <control> +<U008A> \x8A |0 # <control> +<U008B> \x8B |0 # <control> +<U008C> \x8C |0 # <control> +<U008D> \x8D |0 # <control> +<U008E> \x8E |0 # <control> +<U008F> \x8F |0 # <control> +<U0090> \x90 |0 # <control> +<U0091> \x91 |0 # <control> +<U0092> \x92 |0 # <control> +<U0093> \x93 |0 # <control> +<U0094> \x94 |0 # <control> +<U0095> \x95 |0 # <control> +<U0096> \x96 |0 # <control> +<U0097> \x97 |0 # <control> +<U0098> \x98 |0 # <control> +<U0099> \x99 |0 # <control> +<U009A> \x9A |0 # <control> 
+<U009B> \x9B |0 # <control> +<U009C> \x9C |0 # <control> +<U009D> \x9D |0 # <control> +<U009E> \x9E |0 # <control> +<U009F> \x9F |0 # <control> +<UFF61> \xA1 |0 # HALFWIDTH IDEOGRAPHIC FULL STOP +<UFF62> \xA2 |0 # HALFWIDTH LEFT CORNER BRACKET +<UFF63> \xA3 |0 # HALFWIDTH RIGHT CORNER BRACKET +<UFF64> \xA4 |0 # HALFWIDTH IDEOGRAPHIC COMMA +<UFF65> \xA5 |0 # HALFWIDTH KATAKANA MIDDLE DOT +<UFF66> \xA6 |0 # HALFWIDTH KATAKANA LETTER WO +<UFF67> \xA7 |0 # HALFWIDTH KATAKANA LETTER SMALL A +<UFF68> \xA8 |0 # HALFWIDTH KATAKANA LETTER SMALL I +<UFF69> \xA9 |0 # HALFWIDTH KATAKANA LETTER SMALL U +<UFF6A> \xAA |0 # HALFWIDTH KATAKANA LETTER SMALL E +<UFF6B> \xAB |0 # HALFWIDTH KATAKANA LETTER SMALL O +<UFF6C> \xAC |0 # HALFWIDTH KATAKANA LETTER SMALL YA +<UFF6D> \xAD |0 # HALFWIDTH KATAKANA LETTER SMALL YU +<UFF6E> \xAE |0 # HALFWIDTH KATAKANA LETTER SMALL YO +<UFF6F> \xAF |0 # HALFWIDTH KATAKANA LETTER SMALL TU +<UFF70> \xB0 |0 # HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK +<UFF71> \xB1 |0 # HALFWIDTH KATAKANA LETTER A +<UFF72> \xB2 |0 # HALFWIDTH KATAKANA LETTER I +<UFF73> \xB3 |0 # HALFWIDTH KATAKANA LETTER U +<UFF74> \xB4 |0 # HALFWIDTH KATAKANA LETTER E +<UFF75> \xB5 |0 # HALFWIDTH KATAKANA LETTER O +<UFF76> \xB6 |0 # HALFWIDTH KATAKANA LETTER KA +<UFF77> \xB7 |0 # HALFWIDTH KATAKANA LETTER KI +<UFF78> \xB8 |0 # HALFWIDTH KATAKANA LETTER KU +<UFF79> \xB9 |0 # HALFWIDTH KATAKANA LETTER KE +<UFF7A> \xBA |0 # HALFWIDTH KATAKANA LETTER KO +<UFF7B> \xBB |0 # HALFWIDTH KATAKANA LETTER SA +<UFF7C> \xBC |0 # HALFWIDTH KATAKANA LETTER SI +<UFF7D> \xBD |0 # HALFWIDTH KATAKANA LETTER SU +<UFF7E> \xBE |0 # HALFWIDTH KATAKANA LETTER SE +<UFF7F> \xBF |0 # HALFWIDTH KATAKANA LETTER SO +<UFF80> \xC0 |0 # HALFWIDTH KATAKANA LETTER TA +<UFF81> \xC1 |0 # HALFWIDTH KATAKANA LETTER TI +<UFF82> \xC2 |0 # HALFWIDTH KATAKANA LETTER TU +<UFF83> \xC3 |0 # HALFWIDTH KATAKANA LETTER TE +<UFF84> \xC4 |0 # HALFWIDTH KATAKANA LETTER TO +<UFF85> \xC5 |0 # HALFWIDTH KATAKANA LETTER NA +<UFF86> 
\xC6 |0 # HALFWIDTH KATAKANA LETTER NI +<UFF87> \xC7 |0 # HALFWIDTH KATAKANA LETTER NU +<UFF88> \xC8 |0 # HALFWIDTH KATAKANA LETTER NE +<UFF89> \xC9 |0 # HALFWIDTH KATAKANA LETTER NO +<UFF8A> \xCA |0 # HALFWIDTH KATAKANA LETTER HA +<UFF8B> \xCB |0 # HALFWIDTH KATAKANA LETTER HI +<UFF8C> \xCC |0 # HALFWIDTH KATAKANA LETTER HU +<UFF8D> \xCD |0 # HALFWIDTH KATAKANA LETTER HE +<UFF8E> \xCE |0 # HALFWIDTH KATAKANA LETTER HO +<UFF8F> \xCF |0 # HALFWIDTH KATAKANA LETTER MA +<UFF90> \xD0 |0 # HALFWIDTH KATAKANA LETTER MI +<UFF91> \xD1 |0 # HALFWIDTH KATAKANA LETTER MU +<UFF92> \xD2 |0 # HALFWIDTH KATAKANA LETTER ME +<UFF93> \xD3 |0 # HALFWIDTH KATAKANA LETTER MO +<UFF94> \xD4 |0 # HALFWIDTH KATAKANA LETTER YA +<UFF95> \xD5 |0 # HALFWIDTH KATAKANA LETTER YU +<UFF96> \xD6 |0 # HALFWIDTH KATAKANA LETTER YO +<UFF97> \xD7 |0 # HALFWIDTH KATAKANA LETTER RA +<UFF98> \xD8 |0 # HALFWIDTH KATAKANA LETTER RI +<UFF99> \xD9 |0 # HALFWIDTH KATAKANA LETTER RU +<UFF9A> \xDA |0 # HALFWIDTH KATAKANA LETTER RE +<UFF9B> \xDB |0 # HALFWIDTH KATAKANA LETTER RO +<UFF9C> \xDC |0 # HALFWIDTH KATAKANA LETTER WA +<UFF9D> \xDD |0 # HALFWIDTH KATAKANA LETTER N +<UFF9E> \xDE |0 # HALFWIDTH KATAKANA VOICED SOUND MARK +<UFF9F> \xDF |0 # HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK +END CHARMAP diff --git a/ext/Encode/Makefile.PL b/ext/Encode/Makefile.PL index 5193d0ec2a..d6db7795e9 100644 --- a/ext/Encode/Makefile.PL +++ b/ext/Encode/Makefile.PL @@ -2,7 +2,7 @@ use 5.7.2; use strict; use ExtUtils::MakeMaker; -my %tables = (8859 => ['ascii.ucm', 'cp1250.ucm', 'koi8-r.ucm' ], +my %tables = (8859 => ['ascii.ucm', 'cp1250.ucm', 'koi8-r.ucm', 'jis0201.ucm' ], EBCDIC => ['cp1047.ucm','cp37.ucm','posix-bc.ucm'], Symbols => ['symbol.ucm','dingbats.ucm'], ); @@ -93,7 +93,7 @@ sub postamble $continuator = ''; } } - $str .= "\n\t\$(PERL) compile -o \$\@ -f $table.fnm\n\n"; + $str .= "\n\t\$(PERL) compile -O -o \$\@ -f $table.fnm\n\n"; open (FILELIST, ">$table.fnm") || die "Could not open $table.fnm: $!"; foreach my 
$file (@{$tables{$table}}) diff --git a/ext/Encode/compile b/ext/Encode/compile index f52b4edb71..ee6d778d94 100755 --- a/ext/Encode/compile +++ b/ext/Encode/compile @@ -46,7 +46,7 @@ sub encode_M eval "\@ARGV = map(glob(\$_),\@ARGV)" if ($^O eq 'MSWin32'); my %opt; -getopts('qo:f:n:',\%opt); +getopts('qOo:f:n:',\%opt); my $cname = (exists $opt{'o'}) ? $opt{'o'} : shift(@ARGV); chmod(0666,$cname) if -f $cname && !-w $cname; open(C,">$cname") || die "Cannot open $cname:$!"; @@ -434,17 +434,20 @@ sub outstring } else { - foreach my $o (keys %strings) - { - my $i = index($o,$s); - if ($i >= 0) - { - $sym = $strings{$o}; - $sym .= sprintf("+0x%02x",$i) if ($i); - $subsave += length($s); - return $sym; - } - } + if ($opt{'O'}) { + foreach my $o (keys %strings) + { + my $i = index($o,$s); + if ($i >= 0) + { + $sym = $strings{$o}; + $sym .= sprintf("+0x%02x",$i) if ($i); + $subsave += length($s); + $strings{$s} = $sym; + return $sym; + } + } + } $strings{$s} = $sym = $name; $strings += length($s); printf $fh "\nstatic const U8 %s[%d] =\n",$name,length($s); |