summaryrefslogtreecommitdiff
path: root/ext/Encode/lib/Encode/CN/HZ.pm
blob: 6e382ff09ad278150a7ffb40f449593cb57e3f97 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
package Encode::CN::HZ;

use strict;
no warnings 'redefine'; # to quell the "use Encode" below

use vars qw($VERSION);
$VERSION = do { my @r = (q$Revision: 0.92 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r };

use Encode::CN;
use Encode qw|encode decode|;
use base 'Encode::Encoding';

# HZ is but escaped GB, so we implement it with the
# GB2312(raw) encoding here. Cf. RFC 1842 & 1843.

my $canon = 'hz';
my $obj = bless {name => $canon}, __PACKAGE__;
$obj->Define($canon);

sub decode
{
    my ($obj,$str,$chk) = @_;
    my $gb = Encode::find_encoding('gb2312');

    $str =~ s{~			# starting tilde
	(?:
	    (~)			# another tilde - escaped (set $1)
		|		#     or
	    \n			# \n - output nothing
		|		#     or
	    \{			# opening brace of GB data
		(		#  set $2 to any number of...
		    (?:	
			[^~]	#  non-tilde GB character
			    |   #     or
			~(?!\}) #  tilde not followed by a closing brace
		    )*
		)
	    ~\}			# closing brace of GB data
		|		# XXX: invalid escape - maybe die on $chk?
	)
    }{
	(defined $1)	? '~'			# two tildes make one tilde
	    :
	(defined $2)	? $gb->decode($2, $chk)	# decode the characters
	    :
	''					# '' on ~\n and invalid escape
    }egx;

    return $str;
}

sub encode
{
    my ($obj,$str,$chk) = @_;
    my ($out, $in_gb);
    my $gb = Encode::find_encoding('gb2312');

    $str =~ s/~/~~/g;

    # XXX: Since CHECK and partial decoding  has not been implemented yet,
    #      we'll use a very crude way to test for GB2312ness.

    for my $index (0 .. length($str) - 1) {
	no warnings 'utf8';

	my $char = substr($str, $index, 1);
	my $try  = $gb->encode($char);	# try encode this char

	if (defined($try)) {		# is a GB character
	    if ($in_gb) {
		$out .= $try;		# in GB mode - just append it
	    }
	    else {
		$out .= "~{$try";	# enter GB mode, then append it
		$in_gb = 1;
	    }
	}
	elsif ($in_gb) {
	    $out .= "~}$char";		# leave GB mode, then append it
	    $in_gb = 0;
	}
	else {
	    $out .= $char;		# not in GB mode - just append it
	}
    }

    $out .= '~}' if $in_gb;		# add closing brace as needed

    return $out;
}

1;
__END__