1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
|
package Encode::Alias;
use strict;
use warnings;
no warnings 'redefine';
our $VERSION = do { my @r = ( q$Revision: 2.14 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
sub DEBUG () { 0 }
use base qw(Exporter);
# Public, encouraged API is exported by default
our @EXPORT =
qw (
define_alias
find_alias
);
our @Alias; # ordered matching list
our %Alias; # cached known aliases
sub find_alias {
require Encode;
my $class = shift;
my $find = shift;
unless ( exists $Alias{$find} ) {
$Alias{$find} = undef; # Recursion guard
for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
my $alias = $Alias[$i];
my $val = $Alias[ $i + 1 ];
my $new;
if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
DEBUG and warn "eval $val";
$new = eval $val;
DEBUG and $@ and warn "$val, $@";
}
elsif ( ref($alias) eq 'CODE' ) {
DEBUG and warn "$alias", "->", "($find)";
$new = $alias->($find);
}
elsif ( lc($find) eq lc($alias) ) {
$new = $val;
}
if ( defined($new) ) {
next if $new eq $find; # avoid (direct) recursion on bugs
DEBUG and warn "$alias, $new";
my $enc =
( ref($new) ) ? $new : Encode::find_encoding($new);
if ($enc) {
$Alias{$find} = $enc;
last;
}
}
}
# case insensitive search when canonical is not in all lowercase
# RT ticket #7835
unless ( $Alias{$find} ) {
my $lcfind = lc($find);
for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
{
$lcfind eq lc($name) or next;
$Alias{$find} = Encode::find_encoding($name);
DEBUG and warn "$find => $name";
}
}
}
if (DEBUG) {
my $name;
if ( my $e = $Alias{$find} ) {
$name = $e->name;
}
else {
$name = "";
}
warn "find_alias($class, $find)->name = $name";
}
return $Alias{$find};
}
sub define_alias {
while (@_) {
my ( $alias, $name ) = splice( @_, 0, 2 );
unshift( @Alias, $alias => $name ); # newer one has precedence
if ( ref($alias) ) {
# clear %Alias cache to allow overrides
my @a = keys %Alias;
for my $k (@a) {
if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
DEBUG and warn "delete \$Alias\{$k\}";
delete $Alias{$k};
}
elsif ( ref($alias) eq 'CODE' && $alias->($k) ) {
DEBUG and warn "delete \$Alias\{$k\}";
delete $Alias{$k};
}
}
}
else {
DEBUG and warn "delete \$Alias\{$alias\}";
delete $Alias{$alias};
}
}
}
# Allow latin-1 style names as well
# 0 1 2 3 4 5 6 7 8 9 10
our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
# Allow winlatin1 style names as well
our %Winlatin2cp = (
'latin1' => 1252,
'latin2' => 1250,
'cyrillic' => 1251,
'greek' => 1253,
'turkish' => 1254,
'hebrew' => 1255,
'arabic' => 1256,
'baltic' => 1257,
'vietnamese' => 1258,
);
init_aliases();
sub undef_aliases {
@Alias = ();
%Alias = ();
}
sub init_aliases {
require Encode;
undef_aliases();
# Try all-lower-case version should all else fails
define_alias( qr/^(.*)$/ => '"\L$1"' );
# UTF/UCS stuff
define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' );
define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
define_alias(
qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
qr/^UCS-?4-?(BE|LE)?$/i => 'uc("UTF-32$1")',
qr/^iso-10646-1$/i => '"UCS-2BE"'
);
define_alias(
qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
qr/^UTF-?(16|32)$/i => '"UTF-$1"',
);
# ASCII
define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
define_alias( 'C' => 'ascii' );
define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
# Allow variants of iso-8859-1 etc.
define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
# At least HP-UX has these.
define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
# More HP stuff.
define_alias(
qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
'"${1}8"' );
# The Official name of ASCII.
define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
# This is a font issue, not an encoding issue.
# (The currency symbol of the Latin 1 upper half
# has been redefined as the euro symbol.)
define_alias( qr/^(.+)\@euro$/i => '"$1"' );
define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
);
define_alias(
qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
hebrew|arabic|baltic|vietnamese)$/ix =>
'"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
);
# Common names for non-latin preferred MIME names
define_alias(
'ascii' => 'US-ascii',
'cyrillic' => 'iso-8859-5',
'arabic' => 'iso-8859-6',
'greek' => 'iso-8859-7',
'hebrew' => 'iso-8859-8',
'thai' => 'iso-8859-11',
);
# RT #20781
define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
# And Microsoft has their own naming (again, surprisingly).
# And windows-* is registered in IANA!
define_alias(
qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
# Sometimes seen with a leading zero.
# define_alias( qr/\bcp037\b/i => '"cp37"');
# Mac Mappings
# predefined in *.ucm; unneeded
# define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' );
# http://rt.cpan.org/Ticket/Display.html?id=36326
define_alias( qr/^macintosh$/i => '"MacRoman"' );
# Ououououou. gone. They are differente!
# define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
# Standardize on the dashed versions.
define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
unless ($Encode::ON_EBCDIC) {
# for Encode::CN
define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
# define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
# CP936 doesn't have vendor-addon for GBK, so they're identical.
define_alias( qr/^gbk$/i => '"cp936"' );
# This fixes gb2312 vs. euc-cn confusion, practically
define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
# for Encode::JP
define_alias( qr/\bjis$/i => '"7bit-jis"' );
define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
define_alias( qr/\bujis$/i => '"euc-jp"' );
define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
define_alias( qr/\bsjis$/i => '"shiftjis"' );
define_alias( qr/\bwindows-31j$/i => '"cp932"' );
# for Encode::KR
define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
# This fixes ksc5601 vs. euc-kr confusion, practically
define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
# for Encode::TW
define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
}
# utf8 is blessed :)
define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' );
# At last, Map white space and _ to '-'
define_alias( qr/^(\S+)[\s_]+(.*)$/i => '"$1-$2"' );
}
1;
__END__
# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
# TODO: HP-UX '15' encodings japanese15 korean15 roi15
# TODO: Cyrillic encoding ISO-IR-111 (useful?)
# TODO: Armenian encoding ARMSCII-8
# TODO: Hebrew encoding ISO-8859-8-1
# TODO: Thai encoding TCVN
# TODO: Vietnamese encodings VPS
# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
# Kannada Khmer Korean Laotian Malayalam Mongolian
# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
=head1 NAME
Encode::Alias - alias definitions to encodings
=head1 SYNOPSIS
use Encode;
use Encode::Alias;
define_alias( "newName" => ENCODING);
define_alias( qr/.../ => ENCODING);
define_alias( sub { return ENCODING if ...; } );
=head1 DESCRIPTION
Allows newName to be used as an alias for ENCODING. ENCODING may be
either the name of an encoding or an encoding object (as described
in L<Encode>).
Currently the first argument to define_alias() can be specified in the
following ways:
=over 4
=item As a simple string.
=item As a qr// compiled regular expression, e.g.:
define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
in order to allow C<$1> etc. to be substituted. The example is one
way to alias names as used in X11 fonts to the MIME names for the
iso-8859-* family. Note the double quotes inside the single quotes.
(or, you don't have to do this yourself because this example is predefined)
If you are using a regex here, you have to use the quotes as shown or
it won't work. Also note that regex handling is tricky even for the
experienced. Use this feature with caution.
=item As a code reference, e.g.:
define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
The same effect as the example above in a different way. The coderef
takes the alias name as an argument and returns a canonical name on
success or undef if not. Note the second argument is ignored if provided.
Use this with even more caution than the regex version.
=back
=head3 Changes in code reference aliasing
As of Encode 1.87, the older form
define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
no longer works.
Encode up to 1.86 internally used "local $_" to implement ths older
form. But consider the code below;
use Encode;
$_ = "eeeee" ;
while (/(e)/g) {
my $utf = decode('aliased-encoding-name', $1);
print "position:",pos,"\n";
}
Prior to Encode 1.86 this fails because of "local $_".
=head2 Alias overloading
You can override predefined aliases by simply applying define_alias().
The new alias is always evaluated first, and when necessary,
define_alias() flushes the internal cache to make the new definition
available.
# redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
# superset of SHIFT_JIS
define_alias( qr/shift.*jis$/i => '"cp932"' );
define_alias( qr/sjis$/i => '"cp932"' );
If you want to zap all predefined aliases, you can use
Encode::Alias->undef_aliases;
to do so. And
Encode::Alias->init_aliases;
gets the factory settings back.
Note that define_alias() will not be able to override the canonical name
of encodings. Encodings are first looked up by canonical name before
potential aliases are tried.
=head1 SEE ALSO
L<Encode>, L<Encode::Supported>
=cut
|