summaryrefslogtreecommitdiff
path: root/lib/I18N/Collate.pm
blob: 18c46da83544b9fcfad8f6f6963fbfda90abdd9c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#-----------------------------------------------------------------------#
# NOTE! This module is deprecated (obsolete) after the Perl release     #
# 5.003_06 as the functionality has been integrated into the Perl core. #
#-----------------------------------------------------------------------#

package I18N::Collate;

=head1 NAME

I18N::Collate - compare 8-bit scalar data according to the current locale

=head1 SYNOPSIS

    use I18N::Collate;
    setlocale(LC_COLLATE, 'locale-of-your-choice'); 
    $s1 = new I18N::Collate "scalar_data_1";
    $s2 = new I18N::Collate "scalar_data_2";

=head1 DESCRIPTION

This module provides you with objects that will collate 
according to your national character set, provided that the 
POSIX setlocale() function is supported on your system.

You can compare $s1 and $s2 above with

    $s1 le $s2

to extract the data itself, you'll need a dereference: $$s1

This module uses POSIX::setlocale(). The basic collation conversion is
done by strxfrm() which terminates at NUL characters being a decent C
routine.  collate_xfrm() handles embedded NUL characters gracefully.

The available locales depend on your operating system; try whether
C<locale -a> shows them or man pages for "locale" or "nlsinfo" or the
direct approach C<ls /usr/lib/nls/loc> or C<ls /usr/lib/nls> or
C<ls /usr/lib/locale>.  Not all the locales that your vendor supports
are necessarily installed: please consult your operating system's
documentation and possibly your local system administration.  The
locale names are probably something like C<xx_XX.(ISO)?8859-N> or
C<xx_XX.(ISO)?8859N>, for example C<fr_CH.ISO8859-1> is the Swiss (CH)
variant of French (fr), ISO Latin (8859) 1 (-1) which is the Western
European character set.

=cut

# I18N::Collate.pm
#
# Author:	Jarkko Hietaniemi <Jarkko.Hietaniemi@hut.fi>
#		Helsinki University of Technology, Finland
#
# Acks:		Guy Decoux <decoux@moulon.inra.fr> understood
#		overloading magic much deeper than I and told
#		how to cut the size of this code by more than half.
#		(my first version did overload all of lt gt eq le ge cmp)
#
# Purpose:      compare 8-bit scalar data according to the current locale
#
# Requirements:	Perl5 POSIX::setlocale() and POSIX::strxfrm()
#
# Exports:	setlocale 1)
#		collate_xfrm 2)
#
# Overloads:	cmp # 3)
#
# Usage:	use I18N::Collate;
#	        setlocale(LC_COLLATE, 'locale-of-your-choice'); # 4)
#		$s1 = new I18N::Collate "scalar_data_1";
#		$s2 = new I18N::Collate "scalar_data_2";
#		
#		now you can compare $s1 and $s2: $s1 le $s2
#		to extract the data itself, you need to deref: $$s1
#		
# Notes:	
#		1) this uses POSIX::setlocale
#		2) the basic collation conversion is done by strxfrm() which
#		   terminates at NUL characters being a decent C routine.
#		   collate_xfrm handles embedded NUL characters gracefully.
#		3) due to cmp and overload magic, lt le eq ge gt work also
#		4) the available locales depend on your operating system;
#		   try whether "locale -a" shows them or man pages for
#		   "locale" or "nlsinfo" work or the more direct
#		   approach "ls /usr/lib/nls/loc" or "ls /usr/lib/nls".
#		   Not all the locales that your vendor supports
#		   are necessarily installed: please consult your
#		   operating system's documentation.
#		   The locale names are probably something like
#		   'xx_XX.(ISO)?8859-N' or 'xx_XX.(ISO)?8859N',
#		   for example 'fr_CH.ISO8859-1' is the Swiss (CH)
#		   variant of French (fr), ISO Latin (8859) 1 (-1)
#		   which is the Western European character set.
#
# Updated:	19961005
#
# ---

use POSIX qw(strxfrm LC_COLLATE);

require Exporter;

@ISA = qw(Exporter);
@EXPORT = qw(collate_xfrm setlocale LC_COLLATE);
@EXPORT_OK = qw();

use overload qw(
fallback	1
cmp		collate_cmp
);

sub new {
  my $new = $_[1];

  if ($^W && $] >= 5.003_06) {
    unless ($please_use_I18N_Collate_even_if_deprecated) {
      warn <<___EOD___;
***

  WARNING: starting from the Perl version 5.003_06 the I18N::Collate
  interface for comparing 8-bit scalar data according to the current locale

	HAS BEEN DEPRECATED

  (that is, please do not use it anymore for any new applications and please
  migrate the old applications away from it) because its functionality was
  integrated into the Perl core language in the release 5.003_06.

  See pod/perli18n.pod for further information.

***
___EOD___
      $please_use_I18N_Collate_even_if_deprecated++;
    }
  }

  bless \$new;
}

sub setlocale {
 my ($category, $locale) = @_[0,1];

 POSIX::setlocale($category, $locale) if (defined $category);
 # the current $LOCALE 
 $LOCALE = $locale || $ENV{'LC_COLLATE'} || $ENV{'LC_ALL'} || '';
}

sub C {
  my $s = ${$_[0]};

  $C->{$LOCALE}->{$s} = collate_xfrm($s)
    unless (defined $C->{$LOCALE}->{$s}); # cache when met

  $C->{$LOCALE}->{$s};
}

sub collate_xfrm {
  my $s = $_[0];
  my $x = '';
  
  for (split(/(\000+)/, $s)) {
    $x .= (/^\000/) ? $_ : strxfrm("$_\000");
  }

  $x;
}

sub collate_cmp {
  &C($_[0]) cmp &C($_[1]);
}

# init $LOCALE

&I18N::Collate::setlocale();

1; # keep require happy