1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
package Encode::CN::HZ;
use strict;
no warnings 'redefine'; # to quell the "use Encode" below
use vars qw($VERSION);
$VERSION = do { my @r = (q$Revision: 0.92 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };
use Encode::CN;
use Encode qw|encode decode|;
use base 'Encode::Encoding';
# HZ is but escaped GB, so we implement it with the
# GB2312(raw) encoding here. Cf. RFC 1842 & 1843.
my $canon = 'hz';
my $obj = bless {name => $canon}, __PACKAGE__;
$obj->Define($canon);
sub decode
{
my ($obj,$str,$chk) = @_;
my $gb = Encode::find_encoding('gb2312');
$str =~ s{~ # starting tilde
(?:
(~) # another tilde - escaped (set $1)
| # or
\n # \n - output nothing
| # or
\{ # opening brace of GB data
( # set $2 to any number of...
(?:
[^~] # non-tilde GB character
| # or
~(?!\}) # tilde not followed by a closing brace
)*
)
~\} # closing brace of GB data
| # XXX: invalid escape - maybe die on $chk?
)
}{
(defined $1) ? '~' # two tildes make one tilde
:
(defined $2) ? $gb->decode($2, $chk) # decode the characters
:
'' # '' on ~\n and invalid escape
}egx;
return $str;
}
sub encode
{
my ($obj,$str,$chk) = @_;
my ($out, $in_gb);
my $gb = Encode::find_encoding('gb2312');
$str =~ s/~/~~/g;
# XXX: Since CHECK and partial decoding has not been implemented yet,
# we'll use a very crude way to test for GB2312ness.
for my $index (0 .. length($str) - 1) {
no warnings 'utf8';
my $char = substr($str, $index, 1);
my $try = $gb->encode($char); # try encode this char
if (defined($try)) { # is a GB character
if ($in_gb) {
$out .= $try; # in GB mode - just append it
}
else {
$out .= "~{$try"; # enter GB mode, then append it
$in_gb = 1;
}
}
elsif ($in_gb) {
$out .= "~}$char"; # leave GB mode, then append it
$in_gb = 0;
}
else {
$out .= $char; # not in GB mode - just append it
}
}
$out .= '~}' if $in_gb; # add closing brace as needed
return $out;
}
1;
__END__
|