Try to handle UTF-8 locales.

p4raw-id: //depot/perl@15561
author: Jarkko Hietaniemi <jhi@iki.fi> 2002-03-27 23:48:15 +0000
committer: Jarkko Hietaniemi <jhi@iki.fi> 2002-03-27 23:48:15 +0000
commit: ef4a39e599a07c97b0213b41bfa69f4a6f4a17ed (patch)
tree: e41527ecd5558d8b2c680ca4126516bafad5685e /lib/locale.t
parent: 3eb9224ade6d1fb49a2cd92811272334fefa9fea (diff)
download: perl-ef4a39e599a07c97b0213b41bfa69f4a6f4a17ed.tar.gz
1 files changed, 45 insertions, 52 deletions
diff --git a/lib/locale.t b/lib/locale.t
index e9ba3fad21..ee75c650e9 100644
--- a/lib/locale.t
+++ b/lib/locale.t
@@ -308,7 +308,7 @@ if ($^O eq 'os390') {
     $locales =~ s/Thai:th:th:11 tis620\n//;
 }
 
-sub in_utf8 () { $^H & 0x08 }
+sub in_utf8 () { $^H & 0x08 || (${^OPEN} || "") =~ /:utf8/ }
 
 if (in_utf8) {
     require "lib/locale/utf8";
@@ -430,8 +430,6 @@ if (-x "/usr/bin/locale" && open(LOCALES, "/usr/bin/locale -a 2>/dev/null|")) {
 
 setlocale(LC_ALL, "C");
 
-sub utf8locale { $_[0] =~ /utf-?8/i }
-
 @Locale = sort @Locale;
 
 debug "# Locales = @Locale\n";
@@ -520,18 +518,19 @@ foreach $Locale (@Locale) {
 
 	# Test \w.
     
-	if (utf8locale($Locale)) {
-	    # utf8 and locales do not mix.
-	    debug "# skipping UTF-8 locale '$Locale'\n";
-	    push @utf8locale, $Locale;
-            @utf8skip{99..102} = ();
-	} else {
-	    my $word = join('', @Neoalpha);
+	my $word = join('', @Neoalpha);
 
-	    $word =~ /^(\w+)$/;
- 
-	    tryneoalpha($Locale, 99, $1 eq $word);
+	if ($Locale =~ /utf-?8/i) {
+	    debug "# unknown whether locale and Unicode have the same \\w, skipping test 99 for locale '$Locale'\n";
+	    push @{$Okay{99}}, $Locale;
+	} else {
+	    if ($word =~ /^(\w+)$/) {
+		tryneoalpha($Locale, 99, 1);
+	    } else {
+		tryneoalpha($Locale, 99, 0);
+	    }
 	}
+
 	# Cross-check the whole 8-bit character set.
 
 	for (map { chr } 0..255) {
@@ -712,50 +711,44 @@ foreach $Locale (@Locale) {
     # case-insensitively the UPPER, and does the UPPER match
     # case-insensitively the lc of the UPPER.  And vice versa.
     {
-        if (utf8locale($Locale)) {
-	    # utf8 and locales do not mix.
-	    debug "# skipping UTF-8 locale '$Locale'\n";
-	    push @utf8locale, $Locale;
-            $utf8skip{117}++;
-	} else {
-	    use locale;
-	    no utf8;
-            my $re = qr/[\[\(\{\*\+\?\|\^\$\\]/;
-
-	    my @f = ();
-	    foreach my $x (keys %UPPER) {
-		my $y = lc $x;
-		next unless uc $y eq $x;
-		print "# UPPER $x lc $y ",
-			$x =~ /$y/i ? 1 : 0, " ",
-			$y =~ /$x/i ? 1 : 0, "\n" if 0;
-		# If $x and $y contain regular expression characters
-		# AND THEY lowercase (/i) to regular expression characters,
-		# regcomp() will be mightily confused.  No, the \Q doesn't
-		# help here (maybe regex engine internal lowercasing
-		# is done after the \Q?)  An example of this happening is
-		# the bg_BG (Bulgarian) locale under EBCDIC (OS/390 USS):
-		# the chr(173) (the "[") is the lowercase of the chr(235).
-		# Similarly losing EBCDIC locales include cs_cz, cs_CZ,
-		# el_gr, el_GR, en_us.IBM-037 (!), en_US.IBM-037 (!),
-		# et_ee, et_EE, hr_hr, hr_HR, hu_hu, hu_HU, lt_LT,
-		# mk_mk, mk_MK, nl_nl.IBM-037, nl_NL.IBM-037,
-		# pl_pl, pl_PL, ro_ro, ro_RO, ru_ru, ru_RU,
-		# sk_sk, sk_SK, sl_si, sl_SI, tr_tr, tr_TR.
-		if ($x =~ $re || $y =~ $re) {
-		    print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n";
-		    next;
-		}
-		# With utf8 both will fail since the locale concept
-		# of upper/lower does not work well in Unicode.
-		push @f, $x unless $x =~ /$y/i == $y =~ /$x/i;
+        use locale;
+        no utf8;
+        my $re = qr/[\[\(\{\*\+\?\|\^\$\\]/;
+
+        my @f = ();
+        foreach my $x (keys %UPPER) {
+	    my $y = lc $x;
+	    next unless uc $y eq $x;
+	    print "# UPPER $x lc $y ",
+	    $x =~ /$y/i ? 1 : 0, " ",
+	    $y =~ /$x/i ? 1 : 0, "\n" if 0;
+	    # If $x and $y contain regular expression characters
+	    # AND THEY lowercase (/i) to regular expression characters,
+	    # regcomp() will be mightily confused.  No, the \Q doesn't
+	    # help here (maybe regex engine internal lowercasing
+	    # is done after the \Q?)  An example of this happening is
+	    # the bg_BG (Bulgarian) locale under EBCDIC (OS/390 USS):
+	    # the chr(173) (the "[") is the lowercase of the chr(235).
+	    # Similarly losing EBCDIC locales include cs_cz, cs_CZ,
+	    # el_gr, el_GR, en_us.IBM-037 (!), en_US.IBM-037 (!),
+	    # et_ee, et_EE, hr_hr, hr_HR, hu_hu, hu_HU, lt_LT,
+	    # mk_mk, mk_MK, nl_nl.IBM-037, nl_NL.IBM-037,
+	    # pl_pl, pl_PL, ro_ro, ro_RO, ru_ru, ru_RU,
+	    # sk_sk, sk_SK, sl_si, sl_SI, tr_tr, tr_TR.
+	    if ($x =~ $re || $y =~ $re) {
+		print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n";
+		next;
 	    }
+	    # With utf8 both will fail since the locale concept
+	    # of upper/lower does not work well in Unicode.
+	    push @f, $x unless $x =~ /$y/i == $y =~ /$x/i;
+
 	    foreach my $x (keys %lower) {
 		my $y = uc $x;
 		next unless lc $y eq $x;
 		print "# lower $x uc $y ",
-			$x =~ /$y/i ? 1 : 0, " ",
-			$y =~ /$x/i ? 1 : 0, "\n" if 0;
+		$x =~ /$y/i ? 1 : 0, " ",
+		$y =~ /$x/i ? 1 : 0, "\n" if 0;
 		if ($x =~ $re || $y =~ $re) { # See above.
 		    print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n";
 		    next;
author	Jarkko Hietaniemi <jhi@iki.fi>	2002-03-27 23:48:15 +0000
committer	Jarkko Hietaniemi <jhi@iki.fi>	2002-03-27 23:48:15 +0000
commit	ef4a39e599a07c97b0213b41bfa69f4a6f4a17ed (patch)
tree	e41527ecd5558d8b2c680ca4126516bafad5685e /lib/locale.t
parent	3eb9224ade6d1fb49a2cd92811272334fefa9fea (diff)
download	perl-ef4a39e599a07c97b0213b41bfa69f4a6f4a17ed.tar.gz