t/lib/locale.t


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440

#!./perl -wT

# Announce the test plan: 104 numbered tests follow.
print "1..104\n";

# Run from the t/ directory when it exists, and search only ../lib for
# modules.
BEGIN {
    chdir 't' if -d 't';
    @INC = '../lib';
}

use strict;
use POSIX qw(locale_h);

# Package globals used by the tests below: the sample string, then for each
# probed language a scalar for the locale name and an array for its
# alphanumerics, then the tables derived from the selected locale.
use vars qw($a
	    $English $German $French $Spanish
	    @C @English @German @French @Spanish
	    $Locale @Locale %iLocale %UPPER %lower @Neoalpha);

# Sample string for the taint checks: it holds word characters, a space
# and a non-word character, so \w, \W, \s and \S can each match it.
$a = 'abc %';

# Print one line of test output.
#
#   $number - the test number to report
#   $passed - true for "ok N", false for "not ok N"
sub ok {
    my ($number, $passed) = @_;

    my $prefix = $passed ? '' : 'not ';
    print "${prefix}ok $number\n";
}

# First we'll do a lot of taint checking for locales.
# This is the easiest to test, actually, as any locale,
# even the default locale will taint under 'use locale'.

# Return true when any of the arguments is tainted, false otherwise.
#
# The join and the kill must stay inside one expression: the idiom relies
# on tainted data anywhere in an expression tainting the whole expression,
# so that kill dies and the eval never reaches its final 1.
sub is_tainted {
    my $sink;
    my $survived = eval {
        $sink = join("", @_), kill 0;
        1;
    };
    return not $survived;
}

# Report test number $_[0] as passing only when the value $_[1] is tainted.
sub check_taint ($$) {
    my $tainted = is_tainted($_[1]);
    ok($_[0], $tainted);
}

# Report test number $_[0] as passing only when the value $_[1] is NOT
# tainted.
sub check_taint_not ($$) {
    my $clean = !is_tainted($_[1]);
    ok($_[0], $clean);
}

# Enable locale semantics for the rest of the file.  The checks in this
# group expect that, with locale in effect, the results of case mapping
# and of floating-point formatting are tainted, while the sample string
# itself and integer/hex formatting are not.
use locale;	# engage locale and therefore locale taint.

# The plain sample string must start out untainted.
check_taint_not   1, $a;

# Case-mapping functions and their string-escape equivalents.
check_taint       2, uc($a);
check_taint       3, "\U$a";
check_taint       4, ucfirst($a);
check_taint       5, "\u$a";
check_taint       6, lc($a);
check_taint       7, "\L$a";
check_taint       8, lcfirst($a);
check_taint       9, "\l$a";

# Floating-point formats are expected to taint; %d and %x are not.
check_taint      10, sprintf('%e', 123.456);
check_taint      11, sprintf('%f', 123.456);
check_taint      12, sprintf('%g', 123.456);
check_taint_not  13, sprintf('%d', 123.456);
check_taint_not  14, sprintf('%x', 123.456);

# Match-variable taint with a tainted $_.  Each pattern below is matched
# against $_ and the match variables are checked afterwards: patterns using
# \w, \W, \s or \S are expected to taint $&, $`, $', $+ and $1, whereas
# the plain /(.)/ is expected to leave them untainted.  $2 is expected to
# be untainted after every one of these matches.

$_ = $a;	# untaint $_

$_ = uc($a);	# taint $_

check_taint      15, $_;

/(\w)/;	# taint $&, $`, $', $+, $1.
check_taint      16, $&;
check_taint      17, $`;
check_taint      18, $';
check_taint      19, $+;
check_taint      20, $1;
check_taint_not  21, $2;

/(.)/;	# untaint $&, $`, $', $+, $1.
check_taint_not  22, $&;
check_taint_not  23, $`;
check_taint_not  24, $';
check_taint_not  25, $+;
check_taint_not  26, $1;
check_taint_not  27, $2;

/(\W)/;	# taint $&, $`, $', $+, $1.
check_taint      28, $&;
check_taint      29, $`;
check_taint      30, $';
check_taint      31, $+;
check_taint      32, $1;
check_taint_not  33, $2;

/(\s)/;	# taint $&, $`, $', $+, $1.
check_taint      34, $&;
check_taint      35, $`;
check_taint      36, $';
check_taint      37, $+;
check_taint      38, $1;
check_taint_not  39, $2;

/(\S)/;	# taint $&, $`, $', $+, $1.
check_taint      40, $&;
check_taint      41, $`;
check_taint      42, $';
check_taint      43, $+;
check_taint      44, $1;
check_taint_not  45, $2;

# Patterns without character-class escapes, applied to an untainted $_.
# A literal match or a literal substitution is expected to taint nothing;
# substituting in a tainted replacement is expected to taint $_ only.

$_ = $a;	# untaint $_

check_taint_not  46, $_;

/(b)/;		# this must not taint
check_taint_not  47, $&;
check_taint_not  48, $`;
check_taint_not  49, $';
check_taint_not  50, $+;
check_taint_not  51, $1;
check_taint_not  52, $2;

$_ = $a;	# untaint $_

check_taint_not  53, $_;

# The replacement text is tainted, the pattern is not.
$b = uc($a);	# taint $b
s/(.+)/$b/;	# this must taint only the $_

check_taint      54, $_;
check_taint_not  55, $&;
check_taint_not  56, $`;
check_taint_not  57, $';
check_taint_not  58, $+;
check_taint_not  59, $1;
check_taint_not  60, $2;

$_ = $a;	# untaint $_

# Neither the pattern nor the replacement is tainted.
s/(.+)/b/;	# this must not taint
check_taint_not  61, $_;
check_taint_not  62, $&;
check_taint_not  63, $`;
check_taint_not  64, $';
check_taint_not  65, $+;
check_taint_not  66, $1;
check_taint_not  67, $2;

# Substitutions that use \w in the pattern, alone or together with a
# case-modifying escape (\l, \L, \u, \U) in the replacement.  Each is
# expected to taint the substituted variable and the match variables,
# except $2, and to leave the original $a untainted.

$b = $a;	# untaint $b

# Substitute on a copy: the copy is modified, $a itself is not.
($b = $a) =~ s/\w/$&/;
check_taint      68, $b;	# $b should be tainted.
check_taint_not  69, $a;	# $a should be not.

$_ = $a;	# untaint $_

s/(\w)/\l$1/;	# this must taint
check_taint      70, $_;
check_taint      71, $&;
check_taint      72, $`;
check_taint      73, $';
check_taint      74, $+;
check_taint      75, $1;
check_taint_not  76, $2;

$_ = $a;	# untaint $_

s/(\w)/\L$1/;	# this must taint
check_taint      77, $_;
check_taint      78, $&;
check_taint      79, $`;
check_taint      80, $';
check_taint      81, $+;
check_taint      82, $1;
check_taint_not  83, $2;

$_ = $a;	# untaint $_

s/(\w)/\u$1/;	# this must taint
check_taint      84, $_;
check_taint      85, $&;
check_taint      86, $`;
check_taint      87, $';
check_taint      88, $+;
check_taint      89, $1;
check_taint_not  90, $2;

$_ = $a;	# untaint $_

s/(\w)/\U$1/;	# this must taint
check_taint      91, $_;
check_taint      92, $&;
check_taint      93, $`;
check_taint      94, $';
check_taint      95, $+;
check_taint      96, $1;
check_taint_not  97, $2;

# After all this tainting $a should be cool.

check_taint_not  98, $a;

# I think we've seen quite enough of taint.
# Let us do some *real* locale work now.

# Return, in sorted order, every character with code 0..255 that matches
# \w.  Both the \w test and the sort follow whatever locale semantics are
# in effect where this sub is compiled.
sub getalnum {
    my @word_chars = grep { /\w/ } map { chr($_) } 0 .. 255;
    return sort @word_chars;
}

# Find the first locale name in a list that setlocale() accepts.
#
#   $name_ref   - reference to a scalar; set to the accepted locale name,
#                 or left undefined when none is accepted
#   $alnum_ref  - reference to an array; filled with getalnum()'s result
#                 for the accepted locale, or emptied when none is accepted
#   @candidates - locale names to try, in order of preference
#
# The accepted locale stays selected for LC_ALL when the sub returns.
sub locatelocale ($$@) {
    my ($name_ref, $alnum_ref, @candidates) = @_;

    undef ${$name_ref};

    CANDIDATE:
    foreach my $candidate (@candidates) {
        local $^W = 0; # suppress "Subroutine LC_ALL redefined"

        next CANDIDATE unless setlocale(LC_ALL, $candidate);

        ${$name_ref}  = $candidate;
        @{$alnum_ref} = getalnum();
        last CANDIDATE;
    }

    @{$alnum_ref} = () unless defined ${$name_ref};
}

# Probe for usable locales.  Each entry gives the scalar that receives the
# locale name, the array that receives its alphanumerics, and the locale
# names to try in order of preference.  Entries are probed in the order
# listed: a default locale, then English, German, French and Spanish.

for my $probe (
    # Some default locale.
    [ \$Locale, \@Locale,
      [ qw(C POSIX) ] ],

    # Some English locale.
    [ \$English, \@English,
      [ qw(en_US.ISO8859-1 en_GB.ISO8859-1
           en en_US en_UK en_IE en_CA en_AU en_NZ
           english english.iso88591
           american american.iso88591
           british british.iso88591) ] ],

    # Some German locale.
    [ \$German, \@German,
      [ qw(de_DE.ISO8859-1 de_AT.ISO8859-1 de_CH.ISO8859-1
           de de_DE de_AT de_CH
           german german.iso88591) ] ],

    # Some French locale.
    [ \$French, \@French,
      [ qw(fr_FR.ISO8859-1 fr_BE.ISO8859-1 fr_CA.ISO8859-1 fr_CH.ISO8859-1
           fr fr_FR fr_BE fr_CA fr_CH
           french french.iso88591) ] ],

    # Some Spanish locale.
    [ \$Spanish, \@Spanish,
      [ qw(es_AR.ISO8859-1 es_BO.ISO8859-1 es_CL.ISO8859-1
           es_CO.ISO8859-1 es_CR.ISO8859-1 es_EC.ISO8859-1
           es_ES.ISO8859-1 es_GT.ISO8859-1 es_MX.ISO8859-1
           es_NI.ISO8859-1 es_PA.ISO8859-1 es_PE.ISO8859-1
           es_PY.ISO8859-1 es_SV.ISO8859-1 es_UY.ISO8859-1 es_VE.ISO8859-1
           es es_AR es_BO es_CL
           es_CO es_CR es_EC
           es_ES es_GT es_MX
           es_NI es_PA es_PE
           es_PY es_SV es_UY es_VE
           spanish spanish.iso88591) ] ],
) {
    my ($name_ref, $alnum_ref, $candidates) = @$probe;
    locatelocale($name_ref, $alnum_ref, @$candidates);
}

# Select the largest of the alpha(num)bets.
#
# The arrays are compared numerically, i.e. by element count.  Using
# length(@array) here would compare the number of digits in each count
# instead (2 for 63 elements, 3 for anything from 100 to 999), so alphabets
# of clearly different sizes could look equal.  On a tie the candidate
# selected earlier is kept.

($Locale, @Locale) = ($English, @English)
    if (@English > @Locale);
($Locale, @Locale) = ($German, @German)
    if (@German  > @Locale);
($Locale, @Locale) = ($French, @French)
    if (@French  > @Locale);
($Locale, @Locale) = ($Spanish, @Spanish)
    if (@Spanish > @Locale);

# Report the chosen locale and its alphanumerics as diagnostic lines.
print "# Locale = $Locale\n";
print "# Alnum_ = @Locale\n";

# Select the chosen locale for LC_ALL; the probing calls to setlocale()
# earlier in the file may have left a different locale selected.
# Warnings are silenced around the call.
{
    local $^W = 0;
    setlocale(LC_ALL, $Locale);
}

# Record each character's position within @Locale, so that the collation
# rank of a character can be looked up later.
@iLocale{@Locale} = (0 .. $#Locale);

# Sieve the uppercase and the lowercase.
#
# Digits and the underscore are skipped.  A character that lc() leaves
# unchanged is entered in %UPPER with its uc() form as the value; every
# other character is entered in %lower with its lc() form as the value.

foreach my $char (@Locale) {
    next unless $char =~ /[^\d_]/;   # skip digits and the _

    if (lc($char) eq $char) {
        $UPPER{$char} = uc($char);
    }
    else {
        $lower{$char} = lc($char);
    }
}

# Cross-check the upper and the lower: wherever a character's case mapping
# can itself be mapped back, the round trip must yield the original
# character.
#
# This is known to be wrong when changing case changes the number of
# glyphs (e.g. the German sharp-s aka double-s aka sz-ligature, the Dutch
# IJ, the Spanish LL, ...).  So far all the implementations get this wrong,
# so the test does too.

# Test 99: %UPPER entries survive a trip through %lower.
print 'not '
    if grep { defined $lower{$UPPER{$_}} and $_ ne $lower{$UPPER{$_}} }
            keys %UPPER;
print "ok 99\n";

# Test 100: %lower entries survive a trip through %UPPER.
print 'not '
    if grep { defined $UPPER{$lower{$_}} and $_ ne $UPPER{$lower{$_}} }
            keys %lower;
print "ok 100\n";

# Find the alphabets that are not alphabets in the default locale:
# with locale semantics switched off, keep the cased characters that
# match \W.

{
    no locale;

    push @Neoalpha, grep { /\W/ } keys(%UPPER), keys(%lower);
}

# This sort is outside the "no locale" block.
@Neoalpha = sort @Neoalpha;

# Test \w: the string made of all the @Neoalpha characters must consist
# entirely of word characters, i.e. \w* anchored at both ends must capture
# the whole string.

{
    my $neoalpha_string = join '', @Neoalpha;

    $neoalpha_string =~ /^(\w*)$/;

    print 'not ' unless $1 eq $neoalpha_string;
}
print "ok 101\n";

# Find places where the collation order differs from the default locale.
#
# The cased characters are first sorted without locale semantics.  Every
# pair whose order in that list disagrees with the positions recorded in
# %iLocale is collected as [earlier-in-locale, later-in-locale].

{
    my @chars;

    {
	no locale;

	@chars = sort (keys %UPPER, keys %lower);
    }

    my @reordered;

    foreach my $lo (0 .. $#chars) {
        foreach my $hi ($lo + 1 .. $#chars) {
            push @reordered, [ $chars[$hi], $chars[$lo] ]
                if $iLocale{ $chars[$hi] } < $iLocale{ $chars[$lo] };
        }
    }

    # Cross-check those places: the string comparison operators must not
    # place the first member of a pair after the second.  Stop at the
    # first offender, after printing a diagnostic line for it.

    foreach my $pair (@reordered) {
        my ($first, $second) = @$pair;

        next unless $first gt $second;

        print "# i = $first, j = $second, i ",
              $first le $second ? 'le' : 'gt', " j\n";
        print 'not ';
        last;
    }
}
print "ok 102\n";

# Cross-check whole character set.
#
# For every character code 0..255: no character may match both a class
# and its complement, and every word character that is neither a digit nor
# the underscore must have been entered in %UPPER or %lower.

foreach my $code (0 .. 255) {
    my $char = chr $code;

    my $contradictory =
           ($char =~ /\w/ && $char =~ /\W/)
        || ($char =~ /\d/ && $char =~ /\D/)
        || ($char =~ /\s/ && $char =~ /\S/);

    my $unclassified_letter =
           $char =~ /\w/
        && $char =~ /\D/
        && $char !~ /_/
        && !(exists $UPPER{$char} || exists $lower{$char});

    if ($contradictory || $unclassified_letter) {
        print 'not ';
        last;
    }
}
print "ok 103\n";

# The @Locale should be internally consistent.
#
# Ten times over, a slice of @Locale is joined into a string and compared
# with the string made from the slice one character further on.  Every
# string comparison operator must agree that the first string sorts
# strictly before the second.

{
    foreach my $tenth (0 .. 9) {
        # Select a slice.
        my $from = int(($tenth * @Locale) / 10);
        my $to   = $from + int(@Locale / 10);
        $to = $#Locale if $to > $#Locale;
        my $lesser = join('', @Locale[$from .. $to]);

        # Select a slice one character on.
        $from++;
        $to++;
        $to = $#Locale if $to > $#Locale;
        my $greater = join('', @Locale[$from .. $to]);

        # Comparisons that must all be true.
        my @must_hold = (
            $lesser  lt $greater,
            $lesser  le $greater,
            $lesser  ne $greater,
            $greater ne $lesser,
            $greater ge $lesser,
            $greater gt $lesser,
            # Well, these two are sort of redundant because @Locale
            # was derived using cmp.
            ($lesser  cmp $greater) == -1,
            ($greater cmp $lesser ) ==  1,
        );

        # Comparisons that must all be false.
        my @must_fail = (
            $lesser  eq $greater,
            $lesser  ge $greater,
            $lesser  gt $greater,
            $greater lt $lesser,
            $greater le $lesser,
            $greater eq $lesser,
        );

        my $broken_truths   = grep { !$_ } @must_hold;
        my $broken_falsehoods = grep { $_ } @must_fail;

        if ($broken_truths or $broken_falsehoods) {
            print 'not ';
            last;
        }
    }
}
print "ok 104\n";