summaryrefslogtreecommitdiff
path: root/lib/I18N/Collate.pm
blob: 35c802536760fd16d298c79b7c123b2d3c33b280 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
package I18N::Collate;

=head1 NAME

Collate - compare 8-bit scalar data according to the current locale

=head1 SYNOPSIS

    use Collate;
    setlocale(LC_COLLATE, 'locale-of-your-choice'); 
    $s1 = new Collate "scalar_data_1";
    $s2 = new Collate "scalar_data_2";

=head1 DESCRIPTION

This module provides you with objects that will collate 
according to your national character set, providing the 
POSIX setlocale() function should be supported on your system.

You can compare $s1 and $s2 above with

    $s1 le $s2

to extract the data itself, you'll need a dereference: $$s1

This uses POSIX::setlocale The basic collation conversion is done by
strxfrm() which terminates at NUL characters being a decent C routine.
collate_xfrm() handles embedded NUL characters gracefully.  Due to C<cmp>
and overload magic, C<lt>, C<le>, C<eq>, C<ge>, and C<gt> work also.  The
available locales depend on your operating system; try whether C<locale
-a> shows them or the more direct approach C<ls /usr/lib/nls/loc> or C<ls
/usr/lib/nls>.  The locale names are probably something like
"xx_XX.(ISO)?8859-N".

=cut

# Collate.pm
#
# Author:	Jarkko Hietaniemi <Jarkko.Hietaniemi@hut.fi>
#		Helsinki University of Technology, Finland
#
# Acks:		Guy Decoux <decoux@moulon.inra.fr> understood
#		overloading magic much deeper than I and told
#		how to cut the size of this code by more than half.
#		(my first version did overload all of lt gt eq le ge cmp)
#
# Purpose:      compare 8-bit scalar data according to the current locale
#
# Requirements:	Perl5 POSIX::setlocale() and POSIX::strxfrm()
#
# Exports:	setlocale 1)
#		collate_xfrm 2)
#
# Overloads:	cmp # 3)
#
# Usage:	use Collate;
#	        setlocale(&LC_COLLATE, 'locale-of-your-choice'); # 4)
#		$s1 = new Collate "scalar_data_1";
#		$s2 = new Collate "scalar_data_2";
#		
#		now you can compare $s1 and $s2: $s1 le $s2
#		to extract the data itself, you need to deref: $$s1
#		
# Notes:	
#		1) this uses POSIX::setlocale
#		2) the basic collation conversion is done by strxfrm() which
#		   terminates at NUL characters being a decent C routine.
#		   collate_xfrm handles embedded NUL characters gracefully.
#		3) due to cmp and overload magic, lt le eq ge gt work also
#		4) the available locales depend on your operating system;
#		   try whether "locale -a" shows them or the more direct
#		   approach "ls /usr/lib/nls/loc" or "ls /usr/lib/nls".
#		   The locale names are probably something like
#		   'xx_XX.(ISO)?8859-N'.
#
# Updated:	19940913 1341 GMT
#
# ---

use POSIX qw(strxfrm LC_COLLATE);

require Exporter;

@ISA = qw(Exporter);
@EXPORT = qw(collate_xfrm setlocale LC_COLLATE);
@EXPORT_OK = qw();

%OVERLOAD = qw(
fallback	1
cmp		collate_cmp
);

sub new { my $new = $_[1]; bless \$new }

sub setlocale {
 my ($category, $locale) = @_[0,1];

 POSIX::setlocale($category, $locale) if (defined $category);
 # the current $LOCALE 
 $LOCALE = $locale || $ENV{'LC_COLLATE'} || $ENV{'LC_ALL'} || '';
}

sub C {
  my $s = ${$_[0]};

  $C->{$LOCALE}->{$s} = collate_xfrm($s)
    unless (defined $C->{$LOCALE}->{$s}); # cache when met

  $C->{$LOCALE}->{$s};
}

sub collate_xfrm {
  my $s = $_[0];
  my $x = '';
  
  for (split(/(\000+)/, $s)) {
    $x .= (/^\000/) ? $_ : strxfrm("$_\000");
  }

  $x;
}

sub collate_cmp {
  &C($_[0]) cmp &C($_[1]);
}

# init $LOCALE

&I18N::Collate::setlocale();

1; # keep require happy