diff options
author | Nicholas Clark <nick@ccl4.org> | 2009-09-28 14:44:14 +0100 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2009-09-29 11:12:37 +0100 |
commit | 8b2306352e674fdd7eb8b61ff2ce78864a87ed9c (patch) | |
tree | 0fe3e18be16f3a42c67c96ea9cf175bd2b63184c /dist | |
parent | ba41a17c3d9714aa11c7464401bafde353d96d54 (diff) | |
download | perl-8b2306352e674fdd7eb8b61ff2ce78864a87ed9c.tar.gz |
Move I18N::LangTags from ext/ to dist/
Diffstat (limited to 'dist')
-rw-r--r-- | dist/I18N-LangTags/ChangeLog | 195 | ||||
-rw-r--r-- | dist/I18N-LangTags/README | 78 | ||||
-rw-r--r-- | dist/I18N-LangTags/lib/I18N/LangTags.pm | 887 | ||||
-rw-r--r-- | dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm | 237 | ||||
-rw-r--r-- | dist/I18N-LangTags/lib/I18N/LangTags/List.pm | 1779 | ||||
-rw-r--r-- | dist/I18N-LangTags/t/01_about_verbose.t | 89 | ||||
-rw-r--r-- | dist/I18N-LangTags/t/05_main.t | 98 | ||||
-rw-r--r-- | dist/I18N-LangTags/t/07_listy.t | 30 | ||||
-rw-r--r-- | dist/I18N-LangTags/t/10_http.t | 104 | ||||
-rw-r--r-- | dist/I18N-LangTags/t/20_locales.t | 38 | ||||
-rw-r--r-- | dist/I18N-LangTags/t/50_super.t | 88 | ||||
-rw-r--r-- | dist/I18N-LangTags/t/55_supers_strict.t | 78 | ||||
-rw-r--r-- | dist/I18N-LangTags/t/80_all_env.t | 115 |
13 files changed, 3816 insertions, 0 deletions
diff --git a/dist/I18N-LangTags/ChangeLog b/dist/I18N-LangTags/ChangeLog new file mode 100644 index 0000000000..6cd744ef46 --- /dev/null +++ b/dist/I18N-LangTags/ChangeLog @@ -0,0 +1,195 @@ +Revision history for Perl module I18N::LangTags. + Time-stamp: "2004-10-06 23:26:53 ADT" + +2004-10-06 Sean M. Burke sburke@cpan.org + + * Release 0.35 + + Bugfix version: locale2language_tag now correctly understands + locale-IDs with at-signs in them, like 'it_IT.utf8@euro' or + 'it_IT@euro'. This is now enforced by the new t/20_locales.t + + Thanks to Luca 'loopback' Cavalli for letting me know about these + new locale-ID name-styles. + + + * Release 0.34 -- never happened, because of an upload error + + +2004-07-01 Sean M. Burke sburke@cpan.org + + * Release 0.33 + + Minor bugfix version: + The test 80_all_env.t was erroneously failing for people with + LC_ALL or LC_MESSAGES set. Fixed. Thanks to everyone, especially + Nicholas Clark, who patiently helped out with this. + + +2004-06-20 Sean M. Burke sburke@cpan.org + + * Release 0.32 + + Minor bugfix version: + The test 80_all_env.t was erroneously failing under MSWins that + had Win32::Locale installed. A workaround added. + + +2004-06-17 Sean M. Burke sburke@cpan.org + + * Release 0.31 + + Corrected some unevennesses in when/whether the return values from + I18N::LangTags::Detect's various internal functions would be + downcased. Now they're /always/ downcased, and are /always/ fed + thru alternate_language_tags()! + + Also, spiffed up and generally improved the earlier test + 80_all_env.t, which not even I could make sense of, and I wrote + the damned thing. Now it's sane, and checks both scalar and + list return values. Thanks to Rafael Garcia-Suarez and the + various CPAN-Testers for prodding me to fix this. (Hopefully the + earlier problems /are/ now fixed! Otherwise there'll be another + version of this module out real soon!) + + +2004-03-30 Sean M. Burke sburke@cpan.org + + * Release 0.30 + + New in I18N::LangTags : implicate_supers and + implicate_supers_strictly. + + New module: I18N::LangTags::Detect. + + Some new tests. + + Thanks to Autrijus Tang for catching some errors in my makefile! + + + +2003-10-10 Sean M. Burke sburke@cpan.org + + * Release 0.29 + + Minor bugfix to I18N::LangTags::List code. Addition of the + is_decent function, and the 02decency.t test for it. + + Better Makefile. Thanks to everyone who told me about the + INSTALLDIRS trick. + + + +2003-07-20 Sean M. Burke sburke@cpan.org + + * Release 0.28 + Doc fixes in I18N::LangTags, plus a few added variances (jw/jv, + cre/cr, etc.) + Lots of updates to I18N::LangTags::List + Deleted rfc3066.txt from dist. + Moved test.pl to t/01test.t and added more tests. + +2002-02-02 Sean M. Burke sburke@cpan.org + + * Release 0.27 -- minor mods to ::List: + Fixing its entries for sv-se and sv-fi. + Typo-fixes and rewordings in the incidental Pod text elsewhere. + +2001-06-21 Sean M. Burke sburke@cpan.org + + * Release 0.26 -- just making cosmetic changes + to test.pl, at Jarkko's request. + +2001-06-20 Sean M. Burke sburke@cpan.org + + * Release 0.25 -- just tweaking panic_languages behavior + for Scandinavian languages. Much better now. + Slight tweak to ::List's entries for Greek. + +2001-06-20 Sean M. Burke sburke@cpan.org + + * Release 0.24 + + * I18N::LangTags -- some elaborate hacks to make us + recognize legacy aliases like no-nyn == nn. + Added panic_languages(). + Added :ALL export tag. + Minor docs fixes, and spiffing up test.pl. + + * I18N::LangTags::List -- minor corrections; added + a few aliases. + +2001-05-29 Sean M. Burke sburke@cpan.org + + * Release 0.23 + + * I18N::LangTags::List -- minor corrections. And is now + a module, not just documentation. + +2001-05-27 Sean M. Burke sburke@cpan.org + + * Release 0.22 + + * Now bundling I18N::LangTags::List, a reference for lang tags, + replacing generate_language_table.plx and language_codes.txt + +2001-05-25 Sean M. Burke sburke@cpan.org + + * Release 0.21 + + * extract_language_tags and locale2langauge_tag now + return untainted output. Useful if you feed tainted + things, like $ENV{'LANG'}. + +2001-03-13 Sean M. Burke sburke@cpan.org + + * Release 0.20 + + * Added support for RFC 3066 tags: allowing three-letter primary + tags ("nav"), and allowing digits in subtags ("x-borg-prot3252"). + + * Changed all references from RFC 1766 to RFC 3066. + + * Now bundling fulltext of RFC 3066 in the dist. + + * Now bundling generate_language_table.plx and language_codes.txt + + * Added some nice tests to test.pl + + * Inverting order of listings in this ChangeLog file. + +2000-05-13 Sean M. Burke sburke@cpan.org + + * Release 0.13 + + * Just noting my new email address. + +1999-03-06 Sean M. Burke sburke@netadventure.net + + * Release 0.11 + + * Added functions + similarity_language_tag, is_dialect_of, + locale2language_tag, alternate_language_tags, and + encode_language_tag + +1998-12-14 Sean M. Burke sburke@netadventure.net + + * Release 0.09 + + * Added function super_languages() + +1998-10-31 Sean M. Burke sburke@netadventure.net + + * Release 0.08 + + * Just changes in the docs and bundle -- no change + in functionality. + +1998-04-02 Sean M. Burke sburke@netadventure.net + + * Release 0.07 + + * First public release. + +[END OF CHANGELOG] diff --git a/dist/I18N-LangTags/README b/dist/I18N-LangTags/README new file mode 100644 index 0000000000..ef0eb7b2f9 --- /dev/null +++ b/dist/I18N-LangTags/README @@ -0,0 +1,78 @@ +README for I18N::LangTags + Time-stamp: "2004-10-06 23:19:39 ADT" + + I18N::LangTags + +I18N::LangTags - functions for dealing with RFC3066-style language +tags + +Language tags are a formalism, described in RFC 3066 (obsoleting +1766), for declaring what language form (language and possibly +dialect) a given chunk of information is in. + +This library provides functions for common tasks involving language +tags (notably the extraction of them, comparing them, and testing the +formal validity of them) as is needed in a variety of protocols and +applications. + + +I18N::LangTags::List -- tags and names for human languages. This +module goes from known language tag names ("fr-CA") to their English +names ("Canadian French"). Its documentation also lists the several +hundred known tags and some common subforms. You may find this useful +as a reference. + + +See the POD for more information. + + +INSTALLATION + +You install I18N::LangTags and I18N::LangTags::List, as you would +install any perl module library, by running these commands: + + perl Makefile.PL + make + make test + make install + +If you want to install a private copy of I18N::LangTags in your home +directory, then you should try to produce the initial Makefile with +something like this command: + + perl Makefile.PL LIB=~/perl + +See perldoc perlmodinstall for more information on installing modules. + + +DOCUMENTATION + +POD-format documentation is included in LangTags.pm. POD is readable +with the 'perldoc' utility. See ChangeLog for recent changes. + + +SUPPORT + +Questions, bug reports, useful code bits, and suggestions for +I18N::LangTags should just be sent to me at sburke@cpan.org + + +AVAILABILITY + +The latest version of I18N::LangTags is available from the +Comprehensive Perl Archive Network (CPAN). Visit +<http://www.perl.com/CPAN/> to find a CPAN site near you. + + +COPYRIGHT + +Copyright 1998+, Sean M. Burke <sburke@cpan.org>, all rights +reserved. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. diff --git a/dist/I18N-LangTags/lib/I18N/LangTags.pm b/dist/I18N-LangTags/lib/I18N/LangTags.pm new file mode 100644 index 0000000000..0bdc65fed1 --- /dev/null +++ b/dist/I18N-LangTags/lib/I18N/LangTags.pm @@ -0,0 +1,887 @@ + +# Time-stamp: "2004-10-06 23:26:33 ADT" +# Sean M. Burke <sburke@cpan.org> + +require 5.000; +package I18N::LangTags; +use strict; +use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); +require Exporter; +@ISA = qw(Exporter); +@EXPORT = qw(); +@EXPORT_OK = qw(is_language_tag same_language_tag + extract_language_tags super_languages + similarity_language_tag is_dialect_of + locale2language_tag alternate_language_tags + encode_language_tag panic_languages + implicate_supers + implicate_supers_strictly + ); +%EXPORT_TAGS = ('ALL' => \@EXPORT_OK); + +$VERSION = "0.35"; + +sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function + + +=head1 NAME + +I18N::LangTags - functions for dealing with RFC3066-style language tags + +=head1 SYNOPSIS + + use I18N::LangTags(); + +...or specify whichever of those functions you want to import, like so: + + use I18N::LangTags qw(implicate_supers similarity_language_tag); + +All the exportable functions are listed below -- you're free to import +only some, or none at all. By default, none are imported. If you +say: + + use I18N::LangTags qw(:ALL) + +...then all are exported. (This saves you from having to use +something less obvious like C<use I18N::LangTags qw(/./)>.) + +If you don't import any of these functions, assume a C<&I18N::LangTags::> +in front of all the function names in the following examples. + +=head1 DESCRIPTION + +Language tags are a formalism, described in RFC 3066 (obsoleting +1766), for declaring what language form (language and possibly +dialect) a given chunk of information is in. + +This library provides functions for common tasks involving language +tags as they are needed in a variety of protocols and applications. + +Please see the "See Also" references for a thorough explanation +of how to correctly use language tags. + +=over + +=cut + +########################################################################### + +=item * the function is_language_tag($lang1) + +Returns true iff $lang1 is a formally valid language tag. + + is_language_tag("fr") is TRUE + is_language_tag("x-jicarilla") is FALSE + (Subtags can be 8 chars long at most -- 'jicarilla' is 9) + + is_language_tag("sgn-US") is TRUE + (That's American Sign Language) + + is_language_tag("i-Klikitat") is TRUE + (True without regard to the fact noone has actually + registered Klikitat -- it's a formally valid tag) + + is_language_tag("fr-patois") is TRUE + (Formally valid -- altho descriptively weak!) + + is_language_tag("Spanish") is FALSE + is_language_tag("french-patois") is FALSE + (No good -- first subtag has to match + /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) + + is_language_tag("x-borg-prot2532") is TRUE + (Yes, subtags can contain digits, as of RFC3066) + +=cut + +sub is_language_tag { + + ## Changes in the language tagging standards may have to be reflected here. + + my($tag) = lc($_[0]); + + return 0 if $tag eq "i" or $tag eq "x"; + # Bad degenerate cases that the following + # regexp would erroneously let pass + + return $tag =~ + /^(?: # First subtag + [xi] | [a-z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-z0-9]{1,8} # subtag + )* + $/xs ? 1 : 0; +} + +########################################################################### + +=item * the function extract_language_tags($whatever) + +Returns a list of whatever looks like formally valid language tags +in $whatever. Not very smart, so don't get too creative with +what you want to feed it. + + extract_language_tags("fr, fr-ca, i-mingo") + returns: ('fr', 'fr-ca', 'i-mingo') + + extract_language_tags("It's like this: I'm in fr -- French!") + returns: ('It', 'in', 'fr') + (So don't just feed it any old thing.) + +The output is untainted. If you don't know what tainting is, +don't worry about it. + +=cut + +sub extract_language_tags { + + ## Changes in the language tagging standards may have to be reflected here. + + my($text) = + $_[0] =~ m/(.+)/ # to make for an untainted result + ? $1 : '' + ; + + return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags + $text =~ + m/ + \b + (?: # First subtag + [iIxX] | [a-zA-Z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-zA-Z0-9]{1,8} # subtag + )* + \b + /xsg + ); +} + +########################################################################### + +=item * the function same_language_tag($lang1, $lang2) + +Returns true iff $lang1 and $lang2 are acceptable variant tags +representing the same language-form. + + same_language_tag('x-kadara', 'i-kadara') is TRUE + (The x/i- alternation doesn't matter) + same_language_tag('X-KADARA', 'i-kadara') is TRUE + (...and neither does case) + same_language_tag('en', 'en-US') is FALSE + (all-English is not the SAME as US English) + same_language_tag('x-kadara', 'x-kadar') is FALSE + (these are totally unrelated tags) + same_language_tag('no-bok', 'nb') is TRUE + (no-bok is a legacy tag for nb (Norwegian Bokmal)) + +C<same_language_tag> works by just seeing whether +C<encode_language_tag($lang1)> is the same as +C<encode_language_tag($lang2)>. + +(Yes, I know this function is named a bit oddly. Call it historic +reasons.) + +=cut + +sub same_language_tag { + my $el1 = &encode_language_tag($_[0]); + return 0 unless defined $el1; + # this avoids the problem of + # encode_language_tag($lang1) eq and encode_language_tag($lang2) + # being true if $lang1 and $lang2 are both undef + + return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; +} + +########################################################################### + +=item * the function similarity_language_tag($lang1, $lang2) + +Returns an integer representing the degree of similarity between +tags $lang1 and $lang2 (the order of which does not matter), where +similarity is the number of common elements on the left, +without regard to case and to x/i- alternation. + + similarity_language_tag('fr', 'fr-ca') is 1 + (one element in common) + similarity_language_tag('fr-ca', 'fr-FR') is 1 + (one element in common) + + similarity_language_tag('fr-CA-joual', + 'fr-CA-PEI') is 2 + similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 + (two elements in common) + + similarity_language_tag('x-kadara', 'i-kadara') is 1 + (x/i- doesn't matter) + + similarity_language_tag('en', 'x-kadar') is 0 + similarity_language_tag('x-kadara', 'x-kadar') is 0 + (unrelated tags -- no similarity) + + similarity_language_tag('i-cree-syllabic', + 'i-cherokee-syllabic') is 0 + (no B<leftmost> elements in common!) + +=cut + +sub similarity_language_tag { + my $lang1 = &encode_language_tag($_[0]); + my $lang2 = &encode_language_tag($_[1]); + # And encode_language_tag takes care of the whole + # no-nyn==nn, i-hakka==zh-hakka, etc, things + + # NB: (i-sil-...)? (i-sgn-...)? + + return undef if !defined($lang1) and !defined($lang2); + return 0 if !defined($lang1) or !defined($lang2); + + my @l1_subtags = split('-', $lang1); + my @l2_subtags = split('-', $lang2); + my $similarity = 0; + + while(@l1_subtags and @l2_subtags) { + if(shift(@l1_subtags) eq shift(@l2_subtags)) { + ++$similarity; + } else { + last; + } + } + return $similarity; +} + +########################################################################### + +=item * the function is_dialect_of($lang1, $lang2) + +Returns true iff language tag $lang1 represents a subform of +language tag $lang2. + +B<Get the order right! It doesn't work the other way around!> + + is_dialect_of('en-US', 'en') is TRUE + (American English IS a dialect of all-English) + + is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE + is_dialect_of('fr-CA-joual', 'fr') is TRUE + (Joual is a dialect of (a dialect of) French) + + is_dialect_of('en', 'en-US') is FALSE + (all-English is a NOT dialect of American English) + + is_dialect_of('fr', 'en-CA') is FALSE + + is_dialect_of('en', 'en' ) is TRUE + is_dialect_of('en-US', 'en-US') is TRUE + (B<Note:> these are degenerate cases) + + is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE + (the x/i thing doesn't matter, nor does case) + + is_dialect_of('nn', 'no') is TRUE + (because 'nn' (New Norse) is aliased to 'no-nyn', + as a special legacy case, and 'no-nyn' is a + subform of 'no' (Norwegian)) + +=cut + +sub is_dialect_of { + + my $lang1 = &encode_language_tag($_[0]); + my $lang2 = &encode_language_tag($_[1]); + + return undef if !defined($lang1) and !defined($lang2); + return 0 if !defined($lang1) or !defined($lang2); + + return 1 if $lang1 eq $lang2; + return 0 if length($lang1) < length($lang2); + + $lang1 .= '-'; + $lang2 .= '-'; + return + (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; +} + +########################################################################### + +=item * the function super_languages($lang1) + +Returns a list of language tags that are superordinate tags to $lang1 +-- it gets this by removing subtags from the end of $lang1 until +nothing (or just "i" or "x") is left. + + super_languages("fr-CA-joual") is ("fr-CA", "fr") + + super_languages("en-AU") is ("en") + + super_languages("en") is empty-list, () + + super_languages("i-cherokee") is empty-list, () + ...not ("i"), which would be illegal as well as pointless. + +If $lang1 is not a valid language tag, returns empty-list in +a list context, undef in a scalar context. + +A notable and rather unavoidable problem with this method: +"x-mingo-tom" has an "x" because the whole tag isn't an +IANA-registered tag -- but super_languages('x-mingo-tom') is +('x-mingo') -- which isn't really right, since 'i-mingo' is +registered. But this module has no way of knowing that. (But note +that same_language_tag('x-mingo', 'i-mingo') is TRUE.) + +More importantly, you assume I<at your peril> that superordinates of +$lang1 are mutually intelligible with $lang1. Consider this +carefully. + +=cut + +sub super_languages { + my $lang1 = $_[0]; + return() unless defined($lang1) && &is_language_tag($lang1); + + # a hack for those annoying new (2001) tags: + $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards + $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards + $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way + # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark + + my @l1_subtags = split('-', $lang1); + + ## Changes in the language tagging standards may have to be reflected here. + + # NB: (i-sil-...)? + + my @supers = (); + foreach my $bit (@l1_subtags) { + push @supers, + scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; + } + pop @supers if @supers; + shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; + return reverse @supers; +} + +########################################################################### + +=item * the function locale2language_tag($locale_identifier) + +This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") +and maps it to a language tag. If it's not mappable (as with, +notably, "C" and "POSIX"), this returns empty-list in a list context, +or undef in a scalar context. + + locale2language_tag("en") is "en" + + locale2language_tag("en_US") is "en-US" + + locale2language_tag("en_US.ISO8859-1") is "en-US" + + locale2language_tag("C") is undef or () + + locale2language_tag("POSIX") is undef or () + + locale2language_tag("POSIX") is undef or () + +I'm not totally sure that locale names map satisfactorily to language +tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. + +The output is untainted. If you don't know what tainting is, +don't worry about it. + +=cut + +sub locale2language_tag { + my $lang = + $_[0] =~ m/(.+)/ # to make for an untainted result + ? $1 : '' + ; + + return $lang if &is_language_tag($lang); # like "en" + + $lang =~ tr<_><->; # "en_US" -> en-US + $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US + # it_IT.utf8@euro => it-IT + + return $lang if &is_language_tag($lang); + + return; +} + +########################################################################### + +=item * the function encode_language_tag($lang1) + +This function, if given a language tag, returns an encoding of it such +that: + +* tags representing different languages never get the same encoding. + +* tags representing the same language always get the same encoding. + +* an encoding of a formally valid language tag always is a string +value that is defined, has length, and is true if considered as a +boolean. + +Note that the encoding itself is B<not> a formally valid language tag. +Note also that you cannot, currently, go from an encoding back to a +language tag that it's an encoding of. + +Note also that you B<must> consider the encoded value as atomic; i.e., +you should not consider it as anything but an opaque, unanalysable +string value. (The internals of the encoding method may change in +future versions, as the language tagging standard changes over time.) + +C<encode_language_tag> returns undef if given anything other than a +formally valid language tag. + +The reason C<encode_language_tag> exists is because different language +tags may represent the same language; this is normally treatable with +C<same_language_tag>, but consider this situation: + +You have a data file that expresses greetings in different languages. +Its format is "[language tag]=[how to say 'Hello']", like: + + en-US=Hiho + fr=Bonjour + i-mingo=Hau' + +And suppose you write a program that reads that file and then runs as +a daemon, answering client requests that specify a language tag and +then expect the string that says how to greet in that language. So an +interaction looks like: + + greeting-client asks: fr + greeting-server answers: Bonjour + +So far so good. But suppose the way you're implementing this is: + + my %greetings; + die unless open(IN, "<in.dat"); + while(<IN>) { + chomp; + next unless /^([^=]+)=(.+)/s; + my($lang, $expr) = ($1, $2); + $greetings{$lang} = $expr; + } + close(IN); + +at which point %greetings has the contents: + + "en-US" => "Hiho" + "fr" => "Bonjour" + "i-mingo" => "Hau'" + +And suppose then that you answer client requests for language $wanted +by just looking up $greetings{$wanted}. + +If the client asks for "fr", that will look up successfully in +%greetings, to the value "Bonjour". And if the client asks for +"i-mingo", that will look up successfully in %greetings, to the value +"Hau'". + +But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the +lookup in %greetings fails. That's the Wrong Thing. + +You could instead do lookups on $wanted with: + + use I18N::LangTags qw(same_language_tag); + my $response = ''; + foreach my $l2 (keys %greetings) { + if(same_language_tag($wanted, $l2)) { + $response = $greetings{$l2}; + last; + } + } + +But that's rather inefficient. A better way to do it is to start your +program with: + + use I18N::LangTags qw(encode_language_tag); + my %greetings; + die unless open(IN, "<in.dat"); + while(<IN>) { + chomp; + next unless /^([^=]+)=(.+)/s; + my($lang, $expr) = ($1, $2); + $greetings{ + encode_language_tag($lang) + } = $expr; + } + close(IN); + +and then just answer client requests for language $wanted by just +looking up + + $greetings{encode_language_tag($wanted)} + +And that does the Right Thing. + +=cut + +sub encode_language_tag { + # Only similarity_language_tag() is allowed to analyse encodings! + + ## Changes in the language tagging standards may have to be reflected here. + + my($tag) = $_[0] || return undef; + return undef unless &is_language_tag($tag); + + # For the moment, these legacy variances are few enough that + # we can just handle them here with regexps. + $tag =~ s/^iw\b/he/i; # Hebrew + $tag =~ s/^in\b/id/i; # Indonesian + $tag =~ s/^cre\b/cr/i; # Cree + $tag =~ s/^jw\b/jv/i; # Javanese + $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger + $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo + $tag =~ s/^ji\b/yi/i; # Yiddish + # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now, + # but maybe they're all so obscure I can ignore them. "Obscure" + # meaning either that the language is obscure, and/or that the + # XXX form was extant so briefly that it's unlikely it was ever + # used. I hope. + # + # These go FROM the simplex to complex form, to get + # similarity-comparison right. And that's okay, since + # similarity_language_tag is the only thing that + # analyzes our output. + $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka + $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal + $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk + + $tag =~ s/^[xiXI]-//s; + # Just lop off any leading "x/i-" + + return "~" . uc($tag); +} + +#-------------------------------------------------------------------------- + +=item * the function alternate_language_tags($lang1) + +This function, if given a language tag, returns all language tags that +are alternate forms of this language tag. (I.e., tags which refer to +the same language.) This is meant to handle legacy tags caused by +the minor changes in language tag standards over the years; and +the x-/i- alternation is also dealt with. + +Note that this function does I<not> try to equate new (and never-used, +and unusable) +ISO639-2 three-letter tags to old (and still in use) ISO639-1 +two-letter equivalents -- like "ara" -> "ar" -- because +"ara" has I<never> been in use as an Internet language tag, +and RFC 3066 stipulates that it never should be, since a shorter +tag ("ar") exists. + +Examples: + + alternate_language_tags('no-bok') is ('nb') + alternate_language_tags('nb') is ('no-bok') + alternate_language_tags('he') is ('iw') + alternate_language_tags('iw') is ('he') + alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') + alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') + alternate_language_tags('en') is () + alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') + alternate_language_tags('x-klikitat') is ('i-klikitat') + alternate_language_tags('i-klikitat') is ('x-klikitat') + +This function returns empty-list if given anything other than a formally +valid language tag. + +=cut + +my %alt = qw( i x x i I X X I ); +sub alternate_language_tags { + my $tag = $_[0]; + return() unless &is_language_tag($tag); + + my @em; # push 'em real goood! + + # For the moment, these legacy variances are few enough that + # we can just handle them here with regexps. + + if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; + } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; + + } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; + } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; + + } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; + } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; + + } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; + } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; + + } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; + } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; + + } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; + } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; + + } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; + } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; + + } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; + } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; + } + + push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; + return @em; +} + +########################################################################### + +{ + # Init %Panic... + + my @panic = ( # MUST all be lowercase! + # Only large ("national") languages make it in this list. + # If you, as a user, are so bizarre that the /only/ language + # you claim to accept is Galician, then no, we won't do you + # the favor of providing Catalan as a panic-fallback for + # you. Because if I start trying to add "little languages" in + # here, I'll just go crazy. + + # Scandinavian lgs. All based on opinion and hearsay. + 'sv' => [qw(nb no da nn)], + 'da' => [qw(nb no sv nn)], # I guess + [qw(no nn nb)], [qw(no nn nb sv da)], + 'is' => [qw(da sv no nb nn)], + 'fo' => [qw(da is no nb nn sv)], # I guess + + # I think this is about the extent of tolerable intelligibility + # among large modern Romance languages. + 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French + 'ca' => [qw(es pt it fr)], + 'es' => [qw(ca it fr pt)], + 'it' => [qw(es fr ca pt)], + 'fr' => [qw(es it ca pt)], + + # Also assume that speakers of the main Indian languages prefer + # to read/hear Hindi over English + [qw( + as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur + )] => 'hi', + # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, + # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, + # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. + 'hi' => [qw(bn pa as or)], + # I welcome finer data for the other Indian languages. + # E.g., what should Oriya's list be, besides just Hindi? + + # And the panic languages for English is, of course, nil! + + # My guesses at Slavic intelligibility: + ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian + 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat + 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak + + 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian + + 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish + + #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai + + ); + my($k,$v); + while(@panic) { + ($k,$v) = splice(@panic,0,2); + foreach my $k (ref($k) ? @$k : $k) { + foreach my $v (ref($v) ? @$v : $v) { + push @{$Panic{$k} ||= []}, $v unless $k eq $v; + } + } + } +} + +=item * the function @langs = panic_languages(@accept_languages) + +This function takes a list of 0 or more language +tags that constitute a given user's Accept-Language list, and +returns a list of tags for I<other> (non-super) +languages that are probably acceptable to the user, to be +used I<if all else fails>. + +For example, if a user accepts only 'ca' (Catalan) and +'es' (Spanish), and the documents/interfaces you have +available are just in German, Italian, and Chinese, then +the user will most likely want the Italian one (and not +the Chinese or German one!), instead of getting +nothing. So C<panic_languages('ca', 'es')> returns +a list containing 'it' (Italian). + +English ('en') is I<always> in the return list, but +whether it's at the very end or not depends +on the input languages. This function works by consulting +an internal table that stipulates what common +languages are "close" to each other. + +A useful construct you might consider using is: + + @fallbacks = super_languages(@accept_languages); + push @fallbacks, panic_languages( + @accept_languages, @fallbacks, + ); + +=cut + +sub panic_languages { + # When in panic or in doubt, run in circles, scream, and shout! + my(@out, %seen); + foreach my $t (@_) { + next unless $t; + next if $seen{$t}++; # so we don't return it or hit it again + # push @out, super_languages($t); # nah, keep that separate + push @out, @{ $Panic{lc $t} || next }; + } + return grep !$seen{$_}++, @out, 'en'; +} + +#--------------------------------------------------------------------------- +#--------------------------------------------------------------------------- + +=item * the function implicate_supers( ...languages... ) + +This takes a list of strings (which are presumed to be language-tags; +strings that aren't, are ignored); and after each one, this function +inserts super-ordinate forms that don't already appear in the list. +The original list, plus these insertions, is returned. + +In other words, it takes this: + + pt-br de-DE en-US fr pt-br-janeiro + +and returns this: + + pt-br pt de-DE de en-US en fr pt-br-janeiro + +This function is most useful in the idiom + + implicate_supers( I18N::LangTags::Detect::detect() ); + +(See L<I18N::LangTags::Detect>.) + + +=item * the function implicate_supers_strictly( ...languages... ) + +This works like C<implicate_supers> except that the implicated +forms are added to the end of the return list. + +In other words, implicate_supers_strictly takes a list of strings +(which are presumed to be language-tags; strings that aren't, are +ignored) and after the whole given list, it inserts the super-ordinate forms +of all given tags, minus any tags that already appear in the input list. + +In other words, it takes this: + + pt-br de-DE en-US fr pt-br-janeiro + +and returns this: + + pt-br de-DE en-US fr pt-br-janeiro pt de en + +The reason this function has "_strictly" in its name is that when +you're processing an Accept-Language list according to the RFCs, if +you interpret the RFCs quite strictly, then you would use +implicate_supers_strictly, but for normal use (i.e., common-sense use, +as far as I'm concerned) you'd use implicate_supers. + +=cut + +sub implicate_supers { + my @languages = grep is_language_tag($_), @_; + my %seen_encoded; + foreach my $lang (@languages) { + $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1 + } + + my(@output_languages); + foreach my $lang (@languages) { + push @output_languages, $lang; + foreach my $s ( I18N::LangTags::super_languages($lang) ) { + # Note that super_languages returns the longest first. + last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) }; + push @output_languages, $s; + } + } + return uniq( @output_languages ); + +} + +sub implicate_supers_strictly { + my @tags = grep is_language_tag($_), @_; + return uniq( @_, map super_languages($_), @_ ); +} + + + +########################################################################### +1; +__END__ + +=back + +=head1 ABOUT LOWERCASING + +I've considered making all the above functions that output language +tags return all those tags strictly in lowercase. Having all your +language tags in lowercase does make some things easier. But you +might as well just lowercase as you like, or call +C<encode_language_tag($lang1)> where appropriate. + +=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS + +In some future version of I18N::LangTags, I plan to include support +for RFC2482-style language tags -- which are basically just normal +language tags with their ASCII characters shifted into Plane 14. + +=head1 SEE ALSO + +* L<I18N::LangTags::List|I18N::LangTags::List> + +* RFC 3066, C<ftp://ftp.isi.edu/in-notes/rfc3066.txt>, "Tags for the +Identification of Languages". (Obsoletes RFC 1766) + +* RFC 2277, C<ftp://ftp.isi.edu/in-notes/rfc2277.txt>, "IETF Policy on +Character Sets and Languages". + +* RFC 2231, C<ftp://ftp.isi.edu/in-notes/rfc2231.txt>, "MIME Parameter +Value and Encoded Word Extensions: Character Sets, Languages, and +Continuations". + +* RFC 2482, C<ftp://ftp.isi.edu/in-notes/rfc2482.txt>, +"Language Tagging in Unicode Plain Text". + +* Locale::Codes, in +C<http://www.perl.com/CPAN/modules/by-module/Locale/> + +* ISO 639-2, "Codes for the representation of names of languages", +including two-letter and three-letter codes, +C<http://www.loc.gov/standards/iso639-2/langcodes.html> + +* The IANA list of registered languages (hopefully up-to-date), +C<http://www.iana.org/assignments/language-tags> + +=head1 COPYRIGHT + +Copyright (c) 1998+ Sean M. Burke. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + +=head1 AUTHOR + +Sean M. Burke C<sburke@cpan.org> + +=cut + diff --git a/dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm b/dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm new file mode 100644 index 0000000000..3f1b7c006a --- /dev/null +++ b/dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm @@ -0,0 +1,237 @@ + +# Time-stamp: "2004-06-20 21:47:55 ADT" + +require 5; +package I18N::LangTags::Detect; +use strict; + +use vars qw( @ISA $VERSION $MATCH_SUPERS $USING_LANGUAGE_TAGS + $USE_LITERALS $MATCH_SUPERS_TIGHTLY); + +BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } } + # define the constant 'DEBUG' at compile-time + +$VERSION = "1.03"; +@ISA = (); +use I18N::LangTags qw(alternate_language_tags locale2language_tag); + +sub _uniq { my %seen; return grep(!($seen{$_}++), @_); } +sub _normalize { + my(@languages) = + map lc($_), + grep $_, + map {; $_, alternate_language_tags($_) } @_; + return _uniq(@languages) if wantarray; + return $languages[0]; +} + +#--------------------------------------------------------------------------- +# The extent of our functional interface: + +sub detect () { return __PACKAGE__->ambient_langprefs; } + +#=========================================================================== + +sub ambient_langprefs { # always returns things untainted + my $base_class = $_[0]; + + return $base_class->http_accept_langs + if length( $ENV{'REQUEST_METHOD'} || '' ); # I'm a CGI + # it's off in its own routine because it's complicated + + # Not running as a CGI: try to puzzle out from the environment + my @languages; + + foreach my $envname (qw( LANGUAGE LC_ALL LC_MESSAGES LANG )) { + next unless $ENV{$envname}; + DEBUG and print "Noting \$$envname: $ENV{$envname}\n"; + push @languages, + map locale2language_tag($_), + # if it's a lg tag, fine, pass thru (untainted) + # if it's a locale ID, try converting to a lg tag (untainted), + # otherwise nix it. + + split m/[,:]/, + $ENV{$envname} + ; + last; # first one wins + } + + if($ENV{'IGNORE_WIN32_LOCALE'}) { + # no-op + } elsif(&_try_use('Win32::Locale')) { + # If we have that module installed... + push @languages, Win32::Locale::get_language() || '' + if defined &Win32::Locale::get_language; + } + return _normalize @languages; +} + +#--------------------------------------------------------------------------- + +sub http_accept_langs { + # Deal with HTTP "Accept-Language:" stuff. Hassle. + # This code is more lenient than RFC 3282, which you must read. + # Hm. Should I just move this into I18N::LangTags at some point? + no integer; + + my $in = (@_ > 1) ? $_[1] : $ENV{'HTTP_ACCEPT_LANGUAGE'}; + # (always ends up untainting) + + return() unless defined $in and length $in; + + $in =~ s/\([^\)]*\)//g; # nix just about any comment + + if( $in =~ m/^\s*([a-zA-Z][-a-zA-Z]+)\s*$/s ) { + # Very common case: just one language tag + return _normalize $1; + } elsif( $in =~ m/^\s*[a-zA-Z][-a-zA-Z]+(?:\s*,\s*[a-zA-Z][-a-zA-Z]+)*\s*$/s ) { + # Common case these days: just "foo, bar, baz" + return _normalize( $in =~ m/([a-zA-Z][-a-zA-Z]+)/g ); + } + + # Else it's complicated... + + $in =~ s/\s+//g; # Yes, we can just do without the WS! + my @in = $in =~ m/([^,]+)/g; + my %pref; + + my $q; + foreach my $tag (@in) { + next unless $tag =~ + m/^([a-zA-Z][-a-zA-Z]+) + (?: + ;q= + ( + \d* # a bit too broad of a RE, but so what. + (?: + \.\d+ + )? + ) + )? + $ + /sx + ; + $q = (defined $2 and length $2) ? $2 : 1; + #print "$1 with q=$q\n"; + push @{ $pref{$q} }, lc $1; + } + + return _normalize( + # Read off %pref, in descending key order... + map @{$pref{$_}}, + sort {$b <=> $a} + keys %pref + ); +} + +#=========================================================================== + +my %tried = (); + # memoization of whether we've used this module, or found it unusable. + +sub _try_use { # Basically a wrapper around "require Modulename" + # "Many men have tried..." "They tried and failed?" "They tried and died." + return $tried{$_[0]} if exists $tried{$_[0]}; # memoization + + my $module = $_[0]; # ASSUME sane module name! + { no strict 'refs'; + return($tried{$module} = 1) + if defined(%{$module . "::Lexicon"}) or defined(@{$module . "::ISA"}); + # weird case: we never use'd it, but there it is! + } + + print " About to use $module ...\n" if DEBUG; + { + local $SIG{'__DIE__'}; + eval "require $module"; # used to be "use $module", but no point in that. + } + if($@) { + print "Error using $module \: $@\n" if DEBUG > 1; + return $tried{$module} = 0; + } else { + print " OK, $module is used\n" if DEBUG; + return $tried{$module} = 1; + } +} + +#--------------------------------------------------------------------------- +1; +__END__ + + +=head1 NAME + +I18N::LangTags::Detect - detect the user's language preferences + +=head1 SYNOPSIS + + use I18N::LangTags::Detect; + my @user_wants = I18N::LangTags::Detect::detect(); + +=head1 DESCRIPTION + +It is a common problem to want to detect what language(s) the user would +prefer output in. + +=head1 FUNCTIONS + +This module defines one public function, +C<I18N::LangTags::Detect::detect()>. This function is not exported +(nor is even exportable), and it takes no parameters. + +In scalar context, the function returns the most preferred language +tag (or undef if no preference was seen). + +In list context (which is usually what you want), +the function returns a +(possibly empty) list of language tags representing (best first) what +languages the user apparently would accept output in. You will +probably want to pass the output of this through +C<I18N::LangTags::implicate_supers_tightly(...)> +or +C<I18N::LangTags::implicate_supers(...)>, like so: + + my @languages = + I18N::LangTags::implicate_supers_tightly( + I18N::LangTags::Detect::detect() + ); + + +=head1 ENVIRONMENT + +This module looks for several environment variables, including +REQUEST_METHOD, HTTP_ACCEPT_LANGUAGE, +LANGUAGE, LC_ALL, LC_MESSAGES, and LANG. + +It will also use the L<Win32::Locale> module, if it's installed. + + +=head1 SEE ALSO + +L<I18N::LangTags>, L<Win32::Locale>, L<Locale::Maketext>. + +(This module's core code started out as a routine in Locale::Maketext; +but I moved it here once I realized it was more generally useful.) + + +=head1 COPYRIGHT + +Copyright (c) 1998-2004 Sean M. Burke. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + + +=head1 AUTHOR + +Sean M. Burke C<sburke@cpan.org> + +=cut + +# a tip: Put a bit of chopped up pickled ginger in your salad. It's tasty! diff --git a/dist/I18N-LangTags/lib/I18N/LangTags/List.pm b/dist/I18N-LangTags/lib/I18N/LangTags/List.pm new file mode 100644 index 0000000000..5494bea21e --- /dev/null +++ b/dist/I18N-LangTags/lib/I18N/LangTags/List.pm @@ -0,0 +1,1779 @@ + +require 5; +package I18N::LangTags::List; +# Time-stamp: "2004-10-06 23:26:21 ADT" +use strict; +use vars qw(%Name %Is_Disrec $Debug $VERSION); +$VERSION = '0.35'; +# POD at the end. + +#---------------------------------------------------------------------- +{ +# read the table out of our own POD! + my $seeking = 1; + my $count = 0; + my($disrec,$tag,$name); + my $last_name = ''; + while(<I18N::LangTags::List::DATA>) { + if($seeking) { + $seeking = 0 if m/=for woohah/; + } elsif( ($disrec, $tag, $name) = + m/(\[?)\{([-0-9a-zA-Z]+)\}(?:\s*:)?\s*([^\[\]]+)/ + ) { + $name =~ s/\s*[;\.]*\s*$//g; + next unless $name; + ++$count; + print "<$tag> <$name>\n" if $Debug; + $last_name = $Name{$tag} = $name; + $Is_Disrec{$tag} = 1 if $disrec; + } elsif (m/[Ff]ormerly \"([-a-z0-9]+)\"/) { + $Name{$1} = "$last_name (old tag)" if $last_name; + $Is_Disrec{$1} = 1; + } + } + die "No tags read??" unless $count; +} +#---------------------------------------------------------------------- + +sub name { + my $tag = lc($_[0] || return); + $tag =~ s/^\s+//s; + $tag =~ s/\s+$//s; + + my $alt; + if($tag =~ m/^x-(.+)/) { + $alt = "i-$1"; + } elsif($tag =~ m/^i-(.+)/) { + $alt = "x-$1"; + } else { + $alt = ''; + } + + my $subform = ''; + my $name = ''; + print "Input: {$tag}\n" if $Debug; + while(length $tag) { + last if $name = $Name{$tag}; + last if $name = $Name{$alt}; + if($tag =~ s/(-[a-z0-9]+)$//s) { + print "Shaving off: $1 leaving $tag\n" if $Debug; + $subform = "$1$subform"; + # and loop around again + + $alt =~ s/(-[a-z0-9]+)$//s && $Debug && print " alt -> $alt\n"; + } else { + # we're trying to pull a subform off a primary tag. TILT! + print "Aborting on: {$name}{$subform}\n" if $Debug; + last; + } + } + print "Output: {$name}{$subform}\n" if $Debug; + + return unless $name; # Failure + return $name unless $subform; # Exact match + $subform =~ s/^-//s; + $subform =~ s/-$//s; + return "$name (Subform \"$subform\")"; +} + +#-------------------------------------------------------------------------- + +sub is_decent { + my $tag = lc($_[0] || return 0); + #require I18N::LangTags; + + return 0 unless + $tag =~ + /^(?: # First subtag + [xi] | [a-z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-z0-9]{1,8} # subtag + )* + $/xs; + + my @supers = (); + foreach my $bit (split('-', $tag)) { + push @supers, + scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; + } + return 0 unless @supers; + shift @supers if $supers[0] =~ m<^(i|x|sgn)$>s; + return 0 unless @supers; + + foreach my $f ($tag, @supers) { + return 0 if $Is_Disrec{$f}; + return 2 if $Name{$f}; + # so that decent subforms of indecent tags are decent + } + return 2 if $Name{$tag}; # not only is it decent, it's known! + return 1; +} + +#-------------------------------------------------------------------------- +1; + +__DATA__ + +=head1 NAME + +I18N::LangTags::List -- tags and names for human languages + +=head1 SYNOPSIS + + use I18N::LangTags::List; + print "Parlez-vous... ", join(', ', + I18N::LangTags::List::name('elx') || 'unknown_language', + I18N::LangTags::List::name('ar-Kw') || 'unknown_language', + I18N::LangTags::List::name('en') || 'unknown_language', + I18N::LangTags::List::name('en-CA') || 'unknown_language', + ), "?\n"; + +prints: + + Parlez-vous... Elamite, Kuwait Arabic, English, Canadian English? + +=head1 DESCRIPTION + +This module provides a function +C<I18N::LangTags::List::name( I<langtag> ) > that takes +a language tag (see L<I18N::LangTags|I18N::LangTags>) +and returns the best attempt at an English name for it, or +undef if it can't make sense of the tag. + +The function I18N::LangTags::List::name(...) is not exported. + +This module also provides a function +C<I18N::LangTags::List::is_decent( I<langtag> )> that returns true iff +the language tag is syntactically valid and is for general use (like +"fr" or "fr-ca", below). That is, it returns false for tags that are +syntactically invalid and for tags, like "aus", that are listed in +brackets below. This function is not exported. + +The map of tags-to-names that it uses is accessible as +%I18N::LangTags::List::Name, and it's the same as the list +that follows in this documentation, which should be useful +to you even if you don't use this module. + +=head1 ABOUT LANGUAGE TAGS + +Internet language tags, as defined in RFC 3066, are a formalism +for denoting human languages. The two-letter ISO 639-1 language +codes are well known (as "en" for English), as are their forms +when qualified by a country code ("en-US"). Less well-known are the +arbitrary-length non-ISO codes (like "i-mingo"), and the +recently (in 2001) introduced three-letter ISO-639-2 codes. + +Remember these important facts: + +=over + +=item * + +Language tags are not locale IDs. A locale ID is written with a "_" +instead of a "-", (almost?) always matches C<m/^\w\w_\w\w\b/>, and +I<means> something different than a language tag. A language tag +denotes a language. A locale ID denotes a language I<as used in> +a particular place, in combination with non-linguistic +location-specific information such as what currency is used +there. Locales I<also> often denote character set information, +as in "en_US.ISO8859-1". + +=item * + +Language tags are not for computer languages. + +=item * + +"Dialect" is not a useful term, since there is no objective +criterion for establishing when two language-forms are +dialects of eachother, or are separate languages. + +=item * + +Language tags are not case-sensitive. en-US, en-us, En-Us, etc., +are all the same tag, and denote the same language. + +=item * + +Not every language tag really refers to a single language. Some +language tags refer to conditions: i-default (system-message text +in English plus maybe other languages), und (undetermined +language). Others (notably lots of the three-letter codes) are +bibliographic tags that classify whole groups of languages, as +with cus "Cushitic (Other)" (i.e., a +language that has been classed as Cushtic, but which has no more +specific code) or the even less linguistically coherent +sai for "South American Indian (Other)". Though useful in +bibliography, B<SUCH TAGS ARE NOT +FOR GENERAL USE>. For further guidance, email me. + +=item * + +Language tags are not country codes. In fact, they are often +distinct codes, as with language tag ja for Japanese, and +ISO 3166 country code C<.jp> for Japan. + +=back + +=head1 LIST OF LANGUAGES + +The first part of each item is the language tag, between +{...}. It +is followed by an English name for the language or language-group. +Language tags that I judge to be not for general use, are bracketed. + +This list is in alphabetical order by English name of the language. + +=for reminder + The name in the =item line MUST NOT have E<...>'s in it!! + +=for woohah START + +=over + +=item {ab} : Abkhazian + +eq Abkhaz + +=item {ace} : Achinese + +=item {ach} : Acoli + +=item {ada} : Adangme + +=item {ady} : Adyghe + +eq Adygei + +=item {aa} : Afar + +=item {afh} : Afrihili + +(Artificial) + +=item {af} : Afrikaans + +=item [{afa} : Afro-Asiatic (Other)] + +=item {ak} : Akan + +(Formerly "aka".) + +=item {akk} : Akkadian + +(Historical) + +=item {sq} : Albanian + +=item {ale} : Aleut + +=item [{alg} : Algonquian languages] + +NOT Algonquin! + +=item [{tut} : Altaic (Other)] + +=item {am} : Amharic + +NOT Aramaic! + +=item {i-ami} : Ami + +eq Amis. eq 'Amis. eq Pangca. + +=item [{apa} : Apache languages] + +=item {ar} : Arabic + +Many forms are mutually un-intelligible in spoken media. +Notable forms: +{ar-ae} UAE Arabic; +{ar-bh} Bahrain Arabic; +{ar-dz} Algerian Arabic; +{ar-eg} Egyptian Arabic; +{ar-iq} Iraqi Arabic; +{ar-jo} Jordanian Arabic; +{ar-kw} Kuwait Arabic; +{ar-lb} Lebanese Arabic; +{ar-ly} Libyan Arabic; +{ar-ma} Moroccan Arabic; +{ar-om} Omani Arabic; +{ar-qa} Qatari Arabic; +{ar-sa} Sauda Arabic; +{ar-sy} Syrian Arabic; +{ar-tn} Tunisian Arabic; +{ar-ye} Yemen Arabic. + +=item {arc} : Aramaic + +NOT Amharic! NOT Samaritan Aramaic! + +=item {arp} : Arapaho + +=item {arn} : Araucanian + +=item {arw} : Arawak + +=item {hy} : Armenian + +=item {an} : Aragonese + +=item [{art} : Artificial (Other)] + +=item {ast} : Asturian + +eq Bable. + +=item {as} : Assamese + +=item [{ath} : Athapascan languages] + +eq Athabaskan. eq Athapaskan. eq Athabascan. + +=item [{aus} : Australian languages] + +=item [{map} : Austronesian (Other)] + +=item {av} : Avaric + +(Formerly "ava".) + +=item {ae} : Avestan + +eq Zend + +=item {awa} : Awadhi + +=item {ay} : Aymara + +=item {az} : Azerbaijani + +eq Azeri + +Notable forms: +{az-Arab} Azerbaijani in Arabic script; +{az-Cyrl} Azerbaijani in Cyrillic script; +{az-Latn} Azerbaijani in Latin script. + +=item {ban} : Balinese + +=item [{bat} : Baltic (Other)] + +=item {bal} : Baluchi + +=item {bm} : Bambara + +(Formerly "bam".) + +=item [{bai} : Bamileke languages] + +=item {bad} : Banda + +=item [{bnt} : Bantu (Other)] + +=item {bas} : Basa + +=item {ba} : Bashkir + +=item {eu} : Basque + +=item {btk} : Batak (Indonesia) + +=item {bej} : Beja + +=item {be} : Belarusian + +eq Belarussian. eq Byelarussian. +eq Belorussian. eq Byelorussian. +eq White Russian. eq White Ruthenian. +NOT Ruthenian! + +=item {bem} : Bemba + +=item {bn} : Bengali + +eq Bangla. + +=item [{ber} : Berber (Other)] + +=item {bho} : Bhojpuri + +=item {bh} : Bihari + +=item {bik} : Bikol + +=item {bin} : Bini + +=item {bi} : Bislama + +eq Bichelamar. + +=item {bs} : Bosnian + +=item {bra} : Braj + +=item {br} : Breton + +=item {bug} : Buginese + +=item {bg} : Bulgarian + +=item {i-bnn} : Bunun + +=item {bua} : Buriat + +=item {my} : Burmese + +=item {cad} : Caddo + +=item {car} : Carib + +=item {ca} : Catalan + +eq CatalE<aacute>n. eq Catalonian. + +=item [{cau} : Caucasian (Other)] + +=item {ceb} : Cebuano + +=item [{cel} : Celtic (Other)] + +Notable forms: +{cel-gaulish} Gaulish (Historical) + +=item [{cai} : Central American Indian (Other)] + +=item {chg} : Chagatai + +(Historical?) + +=item [{cmc} : Chamic languages] + +=item {ch} : Chamorro + +=item {ce} : Chechen + +=item {chr} : Cherokee + +eq Tsalagi + +=item {chy} : Cheyenne + +=item {chb} : Chibcha + +(Historical) NOT Chibchan (which is a language family). + +=item {ny} : Chichewa + +eq Nyanja. eq Chinyanja. + +=item {zh} : Chinese + +Many forms are mutually un-intelligible in spoken media. +Notable forms: +{zh-Hans} Chinese, in simplified script; +{zh-Hant} Chinese, in traditional script; +{zh-tw} Taiwan Chinese; +{zh-cn} PRC Chinese; +{zh-sg} Singapore Chinese; +{zh-mo} Macau Chinese; +{zh-hk} Hong Kong Chinese; +{zh-guoyu} Mandarin [Putonghua/Guoyu]; +{zh-hakka} Hakka [formerly "i-hakka"]; +{zh-min} Hokkien; +{zh-min-nan} Southern Hokkien; +{zh-wuu} Shanghaiese; +{zh-xiang} Hunanese; +{zh-gan} Gan; +{zh-yue} Cantonese. + +=for etc +{i-hakka} Hakka (old tag) + +=item {chn} : Chinook Jargon + +eq Chinook Wawa. + +=item {chp} : Chipewyan + +=item {cho} : Choctaw + +=item {cu} : Church Slavic + +eq Old Church Slavonic. + +=item {chk} : Chuukese + +eq Trukese. eq Chuuk. eq Truk. eq Ruk. + +=item {cv} : Chuvash + +=item {cop} : Coptic + +=item {kw} : Cornish + +=item {co} : Corsican + +eq Corse. + +=item {cr} : Cree + +NOT Creek! (Formerly "cre".) + +=item {mus} : Creek + +NOT Cree! + +=item [{cpe} : English-based Creoles and pidgins (Other)] + +=item [{cpf} : French-based Creoles and pidgins (Other)] + +=item [{cpp} : Portuguese-based Creoles and pidgins (Other)] + +=item [{crp} : Creoles and pidgins (Other)] + +=item {hr} : Croatian + +eq Croat. + +=item [{cus} : Cushitic (Other)] + +=item {cs} : Czech + +=item {dak} : Dakota + +eq Nakota. eq Latoka. + +=item {da} : Danish + +=item {dar} : Dargwa + +=item {day} : Dayak + +=item {i-default} : Default (Fallthru) Language + +Defined in RFC 2277, this is for tagging text +(which must include English text, and might/should include text +in other appropriate languages) that is emitted in a context +where language-negotiation wasn't possible -- in SMTP mail failure +messages, for example. + +=item {del} : Delaware + +=item {din} : Dinka + +=item {dv} : Divehi + +eq Maldivian. (Formerly "div".) + +=item {doi} : Dogri + +NOT Dogrib! + +=item {dgr} : Dogrib + +NOT Dogri! + +=item [{dra} : Dravidian (Other)] + +=item {dua} : Duala + +=item {nl} : Dutch + +eq Netherlander. Notable forms: +{nl-nl} Netherlands Dutch; +{nl-be} Belgian Dutch. + +=item {dum} : Middle Dutch (ca.1050-1350) + +(Historical) + +=item {dyu} : Dyula + +=item {dz} : Dzongkha + +=item {efi} : Efik + +=item {egy} : Ancient Egyptian + +(Historical) + +=item {eka} : Ekajuk + +=item {elx} : Elamite + +(Historical) + +=item {en} : English + +Notable forms: +{en-au} Australian English; +{en-bz} Belize English; +{en-ca} Canadian English; +{en-gb} UK English; +{en-ie} Irish English; +{en-jm} Jamaican English; +{en-nz} New Zealand English; +{en-ph} Philippine English; +{en-tt} Trinidad English; +{en-us} US English; +{en-za} South African English; +{en-zw} Zimbabwe English. + +=item {enm} : Old English (1100-1500) + +(Historical) + +=item {ang} : Old English (ca.450-1100) + +eq Anglo-Saxon. (Historical) + +=item {i-enochian} : Enochian (Artificial) + +=item {myv} : Erzya + +=item {eo} : Esperanto + +(Artificial) + +=item {et} : Estonian + +=item {ee} : Ewe + +(Formerly "ewe".) + +=item {ewo} : Ewondo + +=item {fan} : Fang + +=item {fat} : Fanti + +=item {fo} : Faroese + +=item {fj} : Fijian + +=item {fi} : Finnish + +=item [{fiu} : Finno-Ugrian (Other)] + +eq Finno-Ugric. NOT Ugaritic! + +=item {fon} : Fon + +=item {fr} : French + +Notable forms: +{fr-fr} France French; +{fr-be} Belgian French; +{fr-ca} Canadian French; +{fr-ch} Swiss French; +{fr-lu} Luxembourg French; +{fr-mc} Monaco French. + +=item {frm} : Middle French (ca.1400-1600) + +(Historical) + +=item {fro} : Old French (842-ca.1400) + +(Historical) + +=item {fy} : Frisian + +=item {fur} : Friulian + +=item {ff} : Fulah + +(Formerly "ful".) + +=item {gaa} : Ga + +=item {gd} : Scots Gaelic + +NOT Scots! + +=item {gl} : Gallegan + +eq Galician + +=item {lg} : Ganda + +(Formerly "lug".) + +=item {gay} : Gayo + +=item {gba} : Gbaya + +=item {gez} : Geez + +eq Ge'ez + +=item {ka} : Georgian + +=item {de} : German + +Notable forms: +{de-at} Austrian German; +{de-be} Belgian German; +{de-ch} Swiss German; +{de-de} Germany German; +{de-li} Liechtenstein German; +{de-lu} Luxembourg German. + +=item {gmh} : Middle High German (ca.1050-1500) + +(Historical) + +=item {goh} : Old High German (ca.750-1050) + +(Historical) + +=item [{gem} : Germanic (Other)] + +=item {gil} : Gilbertese + +=item {gon} : Gondi + +=item {gor} : Gorontalo + +=item {got} : Gothic + +(Historical) + +=item {grb} : Grebo + +=item {grc} : Ancient Greek + +(Historical) (Until 15th century or so.) + +=item {el} : Modern Greek + +(Since 15th century or so.) + +=item {gn} : Guarani + +GuaranE<iacute> + +=item {gu} : Gujarati + +=item {gwi} : Gwich'in + +eq Gwichin + +=item {hai} : Haida + +=item {ht} : Haitian + +eq Haitian Creole + +=item {ha} : Hausa + +=item {haw} : Hawaiian + +Hawai'ian + +=item {he} : Hebrew + +(Formerly "iw".) + +=for etc +{iw} Hebrew (old tag) + +=item {hz} : Herero + +=item {hil} : Hiligaynon + +=item {him} : Himachali + +=item {hi} : Hindi + +=item {ho} : Hiri Motu + +=item {hit} : Hittite + +(Historical) + +=item {hmn} : Hmong + +=item {hu} : Hungarian + +=item {hup} : Hupa + +=item {iba} : Iban + +=item {is} : Icelandic + +=item {io} : Ido + +(Artificial) + +=item {ig} : Igbo + +(Formerly "ibo".) + +=item {ijo} : Ijo + +=item {ilo} : Iloko + +=item [{inc} : Indic (Other)] + +=item [{ine} : Indo-European (Other)] + +=item {id} : Indonesian + +(Formerly "in".) + +=for etc +{in} Indonesian (old tag) + +=item {inh} : Ingush + +=item {ia} : Interlingua (International Auxiliary Language Association) + +(Artificial) NOT Interlingue! + +=item {ie} : Interlingue + +(Artificial) NOT Interlingua! + +=item {iu} : Inuktitut + +A subform of "Eskimo". + +=item {ik} : Inupiaq + +A subform of "Eskimo". + +=item [{ira} : Iranian (Other)] + +=item {ga} : Irish + +=item {mga} : Middle Irish (900-1200) + +(Historical) + +=item {sga} : Old Irish (to 900) + +(Historical) + +=item [{iro} : Iroquoian languages] + +=item {it} : Italian + +Notable forms: +{it-it} Italy Italian; +{it-ch} Swiss Italian. + +=item {ja} : Japanese + +(NOT "jp"!) + +=item {jv} : Javanese + +(Formerly "jw" because of a typo.) + +=item {jrb} : Judeo-Arabic + +=item {jpr} : Judeo-Persian + +=item {kbd} : Kabardian + +=item {kab} : Kabyle + +=item {kac} : Kachin + +=item {kl} : Kalaallisut + +eq Greenlandic "Eskimo" + +=item {xal} : Kalmyk + +=item {kam} : Kamba + +=item {kn} : Kannada + +eq Kanarese. NOT Canadian! + +=item {kr} : Kanuri + +(Formerly "kau".) + +=item {krc} : Karachay-Balkar + +=item {kaa} : Kara-Kalpak + +=item {kar} : Karen + +=item {ks} : Kashmiri + +=item {csb} : Kashubian + +eq Kashub + +=item {kaw} : Kawi + +=item {kk} : Kazakh + +=item {kha} : Khasi + +=item {km} : Khmer + +eq Cambodian. eq Kampuchean. + +=item [{khi} : Khoisan (Other)] + +=item {kho} : Khotanese + +=item {ki} : Kikuyu + +eq Gikuyu. + +=item {kmb} : Kimbundu + +=item {rw} : Kinyarwanda + +=item {ky} : Kirghiz + +=item {i-klingon} : Klingon + +=item {kv} : Komi + +=item {kg} : Kongo + +(Formerly "kon".) + +=item {kok} : Konkani + +=item {ko} : Korean + +=item {kos} : Kosraean + +=item {kpe} : Kpelle + +=item {kro} : Kru + +=item {kj} : Kuanyama + +=item {kum} : Kumyk + +=item {ku} : Kurdish + +=item {kru} : Kurukh + +=item {kut} : Kutenai + +=item {lad} : Ladino + +eq Judeo-Spanish. NOT Ladin (a minority language in Italy). + +=item {lah} : Lahnda + +NOT Lamba! + +=item {lam} : Lamba + +NOT Lahnda! + +=item {lo} : Lao + +eq Laotian. + +=item {la} : Latin + +(Historical) NOT Ladin! NOT Ladino! + +=item {lv} : Latvian + +eq Lettish. + +=item {lb} : Letzeburgesch + +eq Luxemburgian, eq Luxemburger. (Formerly "i-lux".) + +=for etc +{i-lux} Letzeburgesch (old tag) + +=item {lez} : Lezghian + +=item {li} : Limburgish + +eq Limburger, eq Limburgan. NOT Letzeburgesch! + +=item {ln} : Lingala + +=item {lt} : Lithuanian + +=item {nds} : Low German + +eq Low Saxon. eq Low German. eq Low Saxon. + +=item {art-lojban} : Lojban (Artificial) + +=item {loz} : Lozi + +=item {lu} : Luba-Katanga + +(Formerly "lub".) + +=item {lua} : Luba-Lulua + +=item {lui} : Luiseno + +eq LuiseE<ntilde>o. + +=item {lun} : Lunda + +=item {luo} : Luo (Kenya and Tanzania) + +=item {lus} : Lushai + +=item {mk} : Macedonian + +eq the modern Slavic language spoken in what was Yugoslavia. +NOT the form of Greek spoken in Greek Macedonia! + +=item {mad} : Madurese + +=item {mag} : Magahi + +=item {mai} : Maithili + +=item {mak} : Makasar + +=item {mg} : Malagasy + +=item {ms} : Malay + +NOT Malayalam! + +=item {ml} : Malayalam + +NOT Malay! + +=item {mt} : Maltese + +=item {mnc} : Manchu + +=item {mdr} : Mandar + +NOT Mandarin! + +=item {man} : Mandingo + +=item {mni} : Manipuri + +eq Meithei. + +=item [{mno} : Manobo languages] + +=item {gv} : Manx + +=item {mi} : Maori + +NOT Mari! + +=item {mr} : Marathi + +=item {chm} : Mari + +NOT Maori! + +=item {mh} : Marshall + +eq Marshallese. + +=item {mwr} : Marwari + +=item {mas} : Masai + +=item [{myn} : Mayan languages] + +=item {men} : Mende + +=item {mic} : Micmac + +=item {min} : Minangkabau + +=item {i-mingo} : Mingo + +eq the Irquoian language West Virginia Seneca. NOT New York Seneca! + +=item [{mis} : Miscellaneous languages] + +Don't use this. + +=item {moh} : Mohawk + +=item {mdf} : Moksha + +=item {mo} : Moldavian + +eq Moldovan. + +=item [{mkh} : Mon-Khmer (Other)] + +=item {lol} : Mongo + +=item {mn} : Mongolian + +eq Mongol. + +=item {mos} : Mossi + +=item [{mul} : Multiple languages] + +Not for normal use. + +=item [{mun} : Munda languages] + +=item {nah} : Nahuatl + +=item {nap} : Neapolitan + +=item {na} : Nauru + +=item {nv} : Navajo + +eq Navaho. (Formerly "i-navajo".) + +=for etc +{i-navajo} Navajo (old tag) + +=item {nd} : North Ndebele + +=item {nr} : South Ndebele + +=item {ng} : Ndonga + +=item {ne} : Nepali + +eq Nepalese. Notable forms: +{ne-np} Nepal Nepali; +{ne-in} India Nepali. + +=item {new} : Newari + +=item {nia} : Nias + +=item [{nic} : Niger-Kordofanian (Other)] + +=item [{ssa} : Nilo-Saharan (Other)] + +=item {niu} : Niuean + +=item {nog} : Nogai + +=item {non} : Old Norse + +(Historical) + +=item [{nai} : North American Indian] + +Do not use this. + +=item {no} : Norwegian + +Note the two following forms: + +=item {nb} : Norwegian Bokmal + +eq BokmE<aring>l, (A form of Norwegian.) (Formerly "no-bok".) + +=for etc +{no-bok} Norwegian Bokmal (old tag) + +=item {nn} : Norwegian Nynorsk + +(A form of Norwegian.) (Formerly "no-nyn".) + +=for etc +{no-nyn} Norwegian Nynorsk (old tag) + +=item [{nub} : Nubian languages] + +=item {nym} : Nyamwezi + +=item {nyn} : Nyankole + +=item {nyo} : Nyoro + +=item {nzi} : Nzima + +=item {oc} : Occitan (post 1500) + +eq ProvenE<ccedil>al, eq Provencal + +=item {oj} : Ojibwa + +eq Ojibwe. (Formerly "oji".) + +=item {or} : Oriya + +=item {om} : Oromo + +=item {osa} : Osage + +=item {os} : Ossetian; Ossetic + +=item [{oto} : Otomian languages] + +Group of languages collectively called "OtomE<iacute>". + +=item {pal} : Pahlavi + +eq Pahlevi + +=item {i-pwn} : Paiwan + +eq Pariwan + +=item {pau} : Palauan + +=item {pi} : Pali + +(Historical?) + +=item {pam} : Pampanga + +=item {pag} : Pangasinan + +=item {pa} : Panjabi + +eq Punjabi + +=item {pap} : Papiamento + +eq Papiamentu. + +=item [{paa} : Papuan (Other)] + +=item {fa} : Persian + +eq Farsi. eq Iranian. + +=item {peo} : Old Persian (ca.600-400 B.C.) + +=item [{phi} : Philippine (Other)] + +=item {phn} : Phoenician + +(Historical) + +=item {pon} : Pohnpeian + +NOT Pompeiian! + +=item {pl} : Polish + +=item {pt} : Portuguese + +eq Portugese. Notable forms: +{pt-pt} Portugal Portuguese; +{pt-br} Brazilian Portuguese. + +=item [{pra} : Prakrit languages] + +=item {pro} : Old Provencal (to 1500) + +eq Old ProvenE<ccedil>al. (Historical.) + +=item {ps} : Pushto + +eq Pashto. eq Pushtu. + +=item {qu} : Quechua + +eq Quecha. + +=item {rm} : Raeto-Romance + +eq Romansh. + +=item {raj} : Rajasthani + +=item {rap} : Rapanui + +=item {rar} : Rarotongan + +=item [{qaa - qtz} : Reserved for local use.] + +=item [{roa} : Romance (Other)] + +NOT Romanian! NOT Romany! NOT Romansh! + +=item {ro} : Romanian + +eq Rumanian. NOT Romany! + +=item {rom} : Romany + +eq Rom. NOT Romanian! + +=item {rn} : Rundi + +=item {ru} : Russian + +NOT White Russian! NOT Rusyn! + +=item [{sal} : Salishan languages] + +Large language group. + +=item {sam} : Samaritan Aramaic + +NOT Aramaic! + +=item {se} : Northern Sami + +eq Lappish. eq Lapp. eq (Northern) Saami. + +=item {sma} : Southern Sami + +=item {smn} : Inari Sami + +=item {smj} : Lule Sami + +=item {sms} : Skolt Sami + +=item [{smi} : Sami languages (Other)] + +=item {sm} : Samoan + +=item {sad} : Sandawe + +=item {sg} : Sango + +=item {sa} : Sanskrit + +(Historical) + +=item {sat} : Santali + +=item {sc} : Sardinian + +eq Sard. + +=item {sas} : Sasak + +=item {sco} : Scots + +NOT Scots Gaelic! + +=item {sel} : Selkup + +=item [{sem} : Semitic (Other)] + +=item {sr} : Serbian + +eq Serb. NOT Sorbian. + +Notable forms: +{sr-Cyrl} : Serbian in Cyrillic script; +{sr-Latn} : Serbian in Latin script. + +=item {srr} : Serer + +=item {shn} : Shan + +=item {sn} : Shona + +=item {sid} : Sidamo + +=item {sgn-...} : Sign Languages + +Always use with a subtag. Notable forms: +{sgn-gb} British Sign Language (BSL); +{sgn-ie} Irish Sign Language (ESL); +{sgn-ni} Nicaraguan Sign Language (ISN); +{sgn-us} American Sign Language (ASL). + +(And so on with other country codes as the subtag.) + +=item {bla} : Siksika + +eq Blackfoot. eq Pikanii. + +=item {sd} : Sindhi + +=item {si} : Sinhalese + +eq Sinhala. + +=item [{sit} : Sino-Tibetan (Other)] + +=item [{sio} : Siouan languages] + +=item {den} : Slave (Athapascan) + +("Slavey" is a subform.) + +=item [{sla} : Slavic (Other)] + +=item {sk} : Slovak + +eq Slovakian. + +=item {sl} : Slovenian + +eq Slovene. + +=item {sog} : Sogdian + +=item {so} : Somali + +=item {son} : Songhai + +=item {snk} : Soninke + +=item {wen} : Sorbian languages + +eq Wendish. eq Sorb. eq Lusatian. eq Wend. NOT Venda! NOT Serbian! + +=item {nso} : Northern Sotho + +=item {st} : Southern Sotho + +eq Sutu. eq Sesotho. + +=item [{sai} : South American Indian (Other)] + +=item {es} : Spanish + +Notable forms: +{es-ar} Argentine Spanish; +{es-bo} Bolivian Spanish; +{es-cl} Chilean Spanish; +{es-co} Colombian Spanish; +{es-do} Dominican Spanish; +{es-ec} Ecuadorian Spanish; +{es-es} Spain Spanish; +{es-gt} Guatemalan Spanish; +{es-hn} Honduran Spanish; +{es-mx} Mexican Spanish; +{es-pa} Panamanian Spanish; +{es-pe} Peruvian Spanish; +{es-pr} Puerto Rican Spanish; +{es-py} Paraguay Spanish; +{es-sv} Salvadoran Spanish; +{es-us} US Spanish; +{es-uy} Uruguayan Spanish; +{es-ve} Venezuelan Spanish. + +=item {suk} : Sukuma + +=item {sux} : Sumerian + +(Historical) + +=item {su} : Sundanese + +=item {sus} : Susu + +=item {sw} : Swahili + +eq Kiswahili + +=item {ss} : Swati + +=item {sv} : Swedish + +Notable forms: +{sv-se} Sweden Swedish; +{sv-fi} Finland Swedish. + +=item {syr} : Syriac + +=item {tl} : Tagalog + +=item {ty} : Tahitian + +=item [{tai} : Tai (Other)] + +NOT Thai! + +=item {tg} : Tajik + +=item {tmh} : Tamashek + +=item {ta} : Tamil + +=item {i-tao} : Tao + +eq Yami. + +=item {tt} : Tatar + +=item {i-tay} : Tayal + +eq Atayal. eq Atayan. + +=item {te} : Telugu + +=item {ter} : Tereno + +=item {tet} : Tetum + +=item {th} : Thai + +NOT Tai! + +=item {bo} : Tibetan + +=item {tig} : Tigre + +=item {ti} : Tigrinya + +=item {tem} : Timne + +eq Themne. eq Timene. + +=item {tiv} : Tiv + +=item {tli} : Tlingit + +=item {tpi} : Tok Pisin + +=item {tkl} : Tokelau + +=item {tog} : Tonga (Nyasa) + +NOT Tsonga! + +=item {to} : Tonga (Tonga Islands) + +(Pronounced "Tong-a", not "Tong-ga") + +NOT Tsonga! + +=item {tsi} : Tsimshian + +eq Sm'algyax + +=item {ts} : Tsonga + +NOT Tonga! + +=item {i-tsu} : Tsou + +=item {tn} : Tswana + +Same as Setswana. + +=item {tum} : Tumbuka + +=item [{tup} : Tupi languages] + +=item {tr} : Turkish + +(Typically in Roman script) + +=item {ota} : Ottoman Turkish (1500-1928) + +(Typically in Arabic script) (Historical) + +=item {crh} : Crimean Turkish + +eq Crimean Tatar + +=item {tk} : Turkmen + +eq Turkmeni. + +=item {tvl} : Tuvalu + +=item {tyv} : Tuvinian + +eq Tuvan. eq Tuvin. + +=item {tw} : Twi + +=item {udm} : Udmurt + +=item {uga} : Ugaritic + +NOT Ugric! + +=item {ug} : Uighur + +=item {uk} : Ukrainian + +=item {umb} : Umbundu + +=item {und} : Undetermined + +Not a tag for normal use. + +=item {ur} : Urdu + +=item {uz} : Uzbek + +eq E<Ouml>zbek + +Notable forms: +{uz-Cyrl} Uzbek in Cyrillic script; +{uz-Latn} Uzbek in Latin script. + +=item {vai} : Vai + +=item {ve} : Venda + +NOT Wendish! NOT Wend! NOT Avestan! (Formerly "ven".) + +=item {vi} : Vietnamese + +eq Viet. + +=item {vo} : Volapuk + +eq VolapE<uuml>k. (Artificial) + +=item {vot} : Votic + +eq Votian. eq Vod. + +=item [{wak} : Wakashan languages] + +=item {wa} : Walloon + +=item {wal} : Walamo + +eq Wolaytta. + +=item {war} : Waray + +Presumably the Philippine language Waray-Waray (SamareE<ntilde>o), +not the smaller Philippine language Waray Sorsogon, nor the extinct +Australian language Waray. + +=item {was} : Washo + +eq Washoe + +=item {cy} : Welsh + +=item {wo} : Wolof + +=item {x-...} : Unregistered (Semi-Private Use) + +"x-" is a prefix for language tags that are not registered with ISO +or IANA. Example, x-double-dutch + +=item {xh} : Xhosa + +=item {sah} : Yakut + +=item {yao} : Yao + +(The Yao in Malawi?) + +=item {yap} : Yapese + +eq Yap + +=item {ii} : Sichuan Yi + +=item {yi} : Yiddish + +Formerly "ji". Usually in Hebrew script. + +Notable forms: +{yi-latn} Yiddish in Latin script + +=item {yo} : Yoruba + +=item [{ypk} : Yupik languages] + +Several "Eskimo" languages. + +=item {znd} : Zande + +=item [{zap} : Zapotec] + +(A group of languages.) + +=item {zen} : Zenaga + +NOT Zend. + +=item {za} : Zhuang + +=item {zu} : Zulu + +=item {zun} : Zuni + +eq ZuE<ntilde>i + +=back + +=for woohah END + +=head1 SEE ALSO + +L<I18N::LangTags|I18N::LangTags> and its "See Also" section. + +=head1 COPYRIGHT AND DISCLAIMER + +Copyright (c) 2001+ Sean M. Burke. All rights reserved. + +You can redistribute and/or +modify this document under the same terms as Perl itself. + +This document is provided in the hope that it will be +useful, but without any warranty; +without even the implied warranty of accuracy, authoritativeness, +completeness, merchantability, or fitness for a particular purpose. + +Email any corrections or questions to me. + +=head1 AUTHOR + +Sean M. Burke, sburkeE<64>cpan.org + +=cut + + +# To generate a list of just the two and three-letter codes: + +#!/usr/local/bin/perl -w + +require 5; # Time-stamp: "2001-03-13 21:53:39 MST" + # Sean M. Burke, sburke@cpan.org + # This program is for generating the language_codes.txt file +use strict; +use LWP::Simple; +use HTML::TreeBuilder 3.10; +my $root = HTML::TreeBuilder->new(); +my $url = 'http://lcweb.loc.gov/standards/iso639-2/bibcodes.html'; +$root->parse(get($url) || die "Can't get $url"); +$root->eof(); + +my @codes; + +foreach my $tr ($root->find_by_tag_name('tr')) { + my @f = map $_->as_text(), $tr->content_list(); + #print map("<$_> ", @f), "\n"; + next unless @f == 5; + pop @f; # nix the French name + next if $f[-1] eq 'Language Name (English)'; # it's a header line + my $xx = splice(@f, 2,1); # pull out the two-letter code + $f[-1] =~ s/^\s+//; + $f[-1] =~ s/\s+$//; + if($xx =~ m/[a-zA-Z]/) { # there's a two-letter code for it + push @codes, [ lc($f[-1]), "$xx\t$f[-1]\n" ]; + } else { # print the three-letter codes. + if($f[0] eq $f[1]) { + push @codes, [ lc($f[-1]), "$f[1]\t$f[2]\n" ]; + } else { # shouldn't happen + push @codes, [ lc($f[-1]), "@f !!!!!!!!!!\n" ]; + } + } +} + +print map $_->[1], sort {; $a->[0] cmp $b->[0] } @codes; +print "[ based on $url\n at ", scalar(localtime), "]\n", + "[Note: doesn't include IANA-registered codes.]\n"; +exit; +__END__ + diff --git a/dist/I18N-LangTags/t/01_about_verbose.t b/dist/I18N-LangTags/t/01_about_verbose.t new file mode 100644 index 0000000000..3abc68d537 --- /dev/null +++ b/dist/I18N-LangTags/t/01_about_verbose.t @@ -0,0 +1,89 @@ + +require 5; +# Time-stamp: "2004-03-30 17:02:59 AST" + +# Summary of, well, things. + +use Test; +BEGIN {plan tests => 2}; + +ok 1; + +use I18N::LangTags; +use I18N::LangTags::List; +use I18N::LangTags::Detect; + +#chdir "t" if -e "t"; + +{ + my @out; + push @out, + "\n\nPerl v", + defined($^V) ? sprintf('%vd', $^V) : $], + " under $^O ", + (defined(&Win32::BuildNumber) and defined &Win32::BuildNumber()) + ? ("(Win32::BuildNumber ", &Win32::BuildNumber(), ")") : (), + (defined $MacPerl::Version) + ? ("(MacPerl version $MacPerl::Version)") : (), + "\n" + ; + + # Ugly code to walk the symbol tables: + my %v; + my @stack = (''); # start out in %:: + my $this; + my $count = 0; + my $pref; + while(@stack) { + $this = shift @stack; + die "Too many packages?" if ++$count > 1000; + next if exists $v{$this}; + next if $this eq 'main'; # %main:: is %:: + + #print "Peeking at $this => ${$this . '::VERSION'}\n"; + + if(defined ${$this . '::VERSION'} ) { + $v{$this} = ${$this . '::VERSION'} + } elsif( + defined *{$this . '::ISA'} or defined &{$this . '::import'} + or ($this ne '' and grep defined *{$_}{'CODE'}, values %{$this . "::"}) + # If it has an ISA, an import, or any subs... + ) { + # It's a class/module with no version. + $v{$this} = undef; + } else { + # It's probably an unpopulated package. + ## $v{$this} = '...'; + } + + $pref = length($this) ? "$this\::" : ''; + push @stack, map m/^(.+)::$/ ? "$pref$1" : (), keys %{$this . '::'}; + #print "Stack: @stack\n"; + } + push @out, " Modules in memory:\n"; + delete @v{'', '[none]'}; + foreach my $p (sort {lc($a) cmp lc($b)} keys %v) { + $indent = ' ' x (2 + ($p =~ tr/:/:/)); + push @out, ' ', $indent, $p, defined($v{$p}) ? " v$v{$p};\n" : ";\n"; + } + push @out, sprintf "[at %s (local) / %s (GMT)]\n", + scalar(gmtime), scalar(localtime); + my $x = join '', @out; + $x =~ s/^/#/mg; + print $x; +} + +print "# Running", + (chr(65) eq 'A') ? " in an ASCII world.\n" : " in a non-ASCII world.\n", + "#\n", +; + +print "# \@INC:\n", map("# [$_]\n", @INC), "#\n#\n"; + +print "# \%INC:\n"; +foreach my $x (sort {lc($a) cmp lc($b)} keys %INC) { + print "# [$x] = [", $INC{$x} || '', "]\n"; +} + +ok 1; + diff --git a/dist/I18N-LangTags/t/05_main.t b/dist/I18N-LangTags/t/05_main.t new file mode 100644 index 0000000000..056baafc55 --- /dev/null +++ b/dist/I18N-LangTags/t/05_main.t @@ -0,0 +1,98 @@ + +require 5; + # Time-stamp: "2004-03-30 17:52:14 AST" +use strict; +use Test; +BEGIN { plan tests => 64 }; +BEGIN { ok 1 } +use I18N::LangTags (':ALL'); + +print "# Perl v$], I18N::LangTags v$I18N::LangTags::VERSION\n"; + +ok !is_language_tag(''); +ok is_language_tag('fr'); +ok is_language_tag('fr-ca'); +ok is_language_tag('fr-CA'); +ok !is_language_tag('fr-CA-'); +ok !is_language_tag('fr_CA'); +ok is_language_tag('fr-ca-joual'); +ok !is_language_tag('frca'); +ok is_language_tag('nav'); # (not actual tag) +ok is_language_tag('nav-shiprock'); # (not actual tag) +ok !is_language_tag('nav-ceremonial'); # subtag too long +ok !is_language_tag('x'); +ok !is_language_tag('i'); +ok is_language_tag('i-borg'); # NB: fictitious tag +ok is_language_tag('x-borg'); +ok is_language_tag('x-borg-prot5123'); +ok same_language_tag('x-borg-prot5123', 'i-BORG-Prot5123' ); +ok !same_language_tag('en', 'en-us' ); + +ok 0 == similarity_language_tag('en-ca', 'fr-ca'); +ok 1 == similarity_language_tag('en-ca', 'en-us'); +ok 2 == similarity_language_tag('en-us-southern', 'en-us-western'); +ok 2 == similarity_language_tag('en-us-southern', 'en-us'); + +ok grep $_ eq 'hi', panic_languages('kok'); +ok grep $_ eq 'en', panic_languages('x-woozle-wuzzle'); +ok ! grep $_ eq 'mr', panic_languages('it'); +ok grep $_ eq 'es', panic_languages('it'); +ok grep $_ eq 'it', panic_languages('es'); + + +print "# Now the ::List tests...\n"; +print "# Perl v$], I18N::LangTags::List v$I18N::LangTags::List::VERSION\n"; + +use I18N::LangTags::List; +foreach my $lt (qw( + en + en-us + en-kr + el + elx + i-mingo + i-mingo-tom + x-mingo-tom + it + it-it + it-IT + it-FR + ak + aka + jv + jw + no + no-nyn + nn + i-lux + lb + wa + yi + ji + den-syllabic + den-syllabic-western + den-western + den-latin + cre-syllabic + cre-syllabic-western + cre-western + cre-latin + cr-syllabic + cr-syllabic-western + cr-western + cr-latin +)) { + my $name = I18N::LangTags::List::name($lt); + if($name) { + ok(1); + print "# $lt -> $name\n"; + } else { + ok(0); + print "# Failed lookup on $lt\n"; + } +} + + + +print "# So there!\n"; + diff --git a/dist/I18N-LangTags/t/07_listy.t b/dist/I18N-LangTags/t/07_listy.t new file mode 100644 index 0000000000..a56a798a33 --- /dev/null +++ b/dist/I18N-LangTags/t/07_listy.t @@ -0,0 +1,30 @@ + +require 5; + # Time-stamp: "2003-10-10 17:37:34 ADT" +use strict; +use Test; +BEGIN { plan tests => 17 }; +BEGIN { ok 1 } +use I18N::LangTags::List; + +print "# Perl v$], I18N::LangTags::List v$I18N::LangTags::List::VERSION\n"; + +ok I18N::LangTags::List::name('fr'), 'French'; +ok I18N::LangTags::List::name('fr-fr'); +ok !I18N::LangTags::List::name('El Zorcho'); +ok !I18N::LangTags::List::name(); + + +ok !I18N::LangTags::List::is_decent(); +ok I18N::LangTags::List::is_decent('fr'); +ok I18N::LangTags::List::is_decent('fr-blorch'); +ok !I18N::LangTags::List::is_decent('El Zorcho'); +ok !I18N::LangTags::List::is_decent('sgn'); +ok I18N::LangTags::List::is_decent('sgn-us'); +ok !I18N::LangTags::List::is_decent('i'); +ok I18N::LangTags::List::is_decent('i-mingo'); +ok I18N::LangTags::List::is_decent('i-mingo-tom'); +ok !I18N::LangTags::List::is_decent('cel'); +ok I18N::LangTags::List::is_decent('cel-gaulish'); + +ok 1; # one for the road diff --git a/dist/I18N-LangTags/t/10_http.t b/dist/I18N-LangTags/t/10_http.t new file mode 100644 index 0000000000..36341f77cf --- /dev/null +++ b/dist/I18N-LangTags/t/10_http.t @@ -0,0 +1,104 @@ + +# Time-stamp: "2004-06-17 23:06:22 PDT" + +use I18N::LangTags::Detect; + +use Test; +BEGIN { plan tests => 87 }; + +my @in = grep m/\S/, split /\n/, q{ + +[ sv ] sv +[ en ] en +[ en fi ] en, fi +[ en-us ] en-us +[ en-us ] en-US +[ en-us ] EN-US + +[ en-au en i-klingon x-klingon en-gb en-us mt-mt mt ja ] EN-au, JA;q=0.14, i-klingon;q=0.83, en-gb;q=0.71, en-us;q=0.57, mt-mt;q=0.43, mt;q=0.29, en;q=0.86 +[ en-au en i-klingon x-klingon en-gb en-us mt-mt mt tli ja ] EN-au, tli;q=0.201, JA;q=0.14, i-klingon;q=0.83, en-gb;q=0.71, en-us;q=0.57, mt-mt;q=0.43, mt;q=0.29, en;q=0.86 +[ en-au en en-gb en-us ja ] en-au, ja;q=0.20, en-gb;q=0.60, en-us;q=0.40, en;q=0.80 + +[ en-au en en-gb en-us mt-mt mt ja ] EN-au, JA;q=0.14, en-gb;q=0.71, en-us;q=0.57, mt-mt;q=0.43, mt;q=0.29, en;q=0.86 +[ en-au en en-gb en-us ja ] en-au, ja;q=0.20, en-gb;q=0.60, en-us;q=0.40, en;q=0.80 +[ en fr ] en;q=1,fr;q=.5 +[ en fr ] en;q=1,fr;q=.99 +[ en ru ko ] en, ru;q=0.7, ko;q=0.3 +[ en ru ko ] en, ru;q=0.7, KO;q=0.3 +[ en-us en ] en-us, en;q=0.50 +[ en fr ] fr ; q = 0.9, en +[ en fr ] en,fr;q=.90 +[ ru en-uk en fr ] ru, en-UK;q=0.5, en;q=0.3, fr;q=0.1 +[ en-us fr es-mx ] en-us,fr;q=0.7,es-mx;q=0.3 +[ en-us en ] en-us, en;q=0.50 + +[ da en-gb en ] da, en-gb;q=0.8, en;q=0.7 +[ da en-gb en ] da, en;q=0.7, en-gb;q=0.8 +[ da en-gb en ] da, en-gb;q=0.8, en;q=0.7 +[ da en-gb en ] da,en;q=0.7,en-gb;q=0.8 +[ da en-gb en ] da, en-gb ; q=0.8, en ; q=0.7 +[ da en-gb en ] da , en-gb ; q = 0.8 , en ; q =0.7 +[ da en-gb en ] da (yup, Danish) , en-gb ; q = 0.8 , en ; q =0.7 + +[ no dk en-uk en-us ] en-UK;q=0.7, en-US;q=0.6, no;q=1.0, dk;q=0.8 +[ no dk en-uk en-us ] en-US;q=0.6, en-UK;q=0.7, no;q=1.0, dk;q=0.8 +[ no dk en-uk en-us ] en-UK;q=0.7, no;q=1.0, en-US;q=0.6, dk;q=0.8 +[ no dk en-uk en-us ] en-UK;q=0.7, no;q=1.0, dk;q=0.8, en-US;q=0.6 + +[ fi en ] fi;q=1, en;q=0.2 +[ de-de de en en-us en-gb ] de-DE, de;q=0.80, en;q=0.60, en-US;q=0.40, en-GB;q=0.20 +[ ru ] ru; q=1, *; q=0.1 +[ ru en ] ru, en; q=0.1 +[ ja en ] ja,en;q=0.5 +[ en ] en; q=1.0 +[ ja ] ja; q=1.0 +[ ja ] ja; q=1.0 +[ en ja ] en; q=0.5, ja; q=0.5 +[ fr-ca fr en ] fr-ca, fr;q=0.8, en;q=0.7 +[ NIX ] NIX +}; + +foreach my $in (@in) { + $in =~ s/^\s*\[([^\]]+)\]\s*//s or die "Bad input: $in"; + my @should = do { my $x = $1; $x =~ m/(\S+)/g }; + + if($in eq 'NIX') { $in = ''; @should = (); } + + local $ENV{'HTTP_ACCEPT_LANGUAGE'}; + + foreach my $modus ( + sub { + print "# Testing with arg...\n"; + $ENV{'HTTP_ACCEPT_LANGUAGE'} = 'PLORK'; + return $_[0]; + }, + sub { + print "# Testing wath HTTP_ACCEPT_LANGUAGE...\n"; + $ENV{'HTTP_ACCEPT_LANGUAGE'} = $_[0]; + return(); + }, + ) { + my @args = &$modus($in); + + # //////////////////////////////////////////////////// + my @out = I18N::LangTags::Detect->http_accept_langs(@args); + # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ + + if( + @out == @should + and lc( join "\e", @out ) eq lc( join "\e", @should ) + ) { + print "# Happily got [@out] from [$in]\n"; + ok 1; + } else { + ok 0; + print "#Got: [@out]\n", + "# but wanted: [@should]\n", + "# < \"$in\"\n#\n"; + } + } +} + +print "#\n#\n# Bye-bye!\n"; +ok 1; + diff --git a/dist/I18N-LangTags/t/20_locales.t b/dist/I18N-LangTags/t/20_locales.t new file mode 100644 index 0000000000..ae04812ff2 --- /dev/null +++ b/dist/I18N-LangTags/t/20_locales.t @@ -0,0 +1,38 @@ +require 5; + # Time-stamp: "2004-10-06 23:07:06 ADT" +use strict; +use Test; +BEGIN { plan tests => 22 }; +BEGIN { ok 1 } +use I18N::LangTags (':ALL'); + +print "# Perl v$], I18N::LangTags v$I18N::LangTags::VERSION\n"; +print "# Loaded from ", $INC{'I18N/LangTags.pm'} || "??", "\n"; + +ok lc locale2language_tag('en'), 'en'; +ok lc locale2language_tag('en_US'), 'en-us'; +ok lc locale2language_tag('en_US.ISO8859-1'), 'en-us'; +ok lc(locale2language_tag('C')||''), ''; +ok lc(locale2language_tag('POSIX')||''), ''; + + +ok lc locale2language_tag('eu_mt'), 'eu-mt'; +ok lc locale2language_tag('eu'), 'eu'; +ok lc locale2language_tag('it'), 'it'; +ok lc locale2language_tag('it_IT'), 'it-it'; +ok lc locale2language_tag('it_IT.utf8'), 'it-it'; +ok lc locale2language_tag('it_IT.utf8@euro'), 'it-it'; +ok lc locale2language_tag('it_IT@euro'), 'it-it'; + + +ok lc locale2language_tag('zh_CN.gb18030'), 'zh-cn'; +ok lc locale2language_tag('zh_CN.gbk'), 'zh-cn'; +ok lc locale2language_tag('zh_CN.utf8'), 'zh-cn'; +ok lc locale2language_tag('zh_HK'), 'zh-hk'; +ok lc locale2language_tag('zh_HK.utf8'), 'zh-hk'; +ok lc locale2language_tag('zh_TW'), 'zh-tw'; +ok lc locale2language_tag('zh_TW.euctw'), 'zh-tw'; +ok lc locale2language_tag('zh_TW.utf8'), 'zh-tw'; + +print "# So there!\n"; +ok 1; diff --git a/dist/I18N-LangTags/t/50_super.t b/dist/I18N-LangTags/t/50_super.t new file mode 100644 index 0000000000..9923c84068 --- /dev/null +++ b/dist/I18N-LangTags/t/50_super.t @@ -0,0 +1,88 @@ + +# Time-stamp: "2004-03-30 17:46:17 AST" + +use Test; +BEGIN { plan tests => 26 }; +print "#\n# Testing normal (tight) insertion of super-ordinate language tags...\n#\n"; + +use I18N::LangTags qw(implicate_supers); + +my @in = grep m/\S/, split /[\n\r]/, q{ + NIX => NIX + sv => sv + en => en + hai => hai + + pt-br => pt-br pt + pt-br fr => pt-br pt fr + pt-br fr pt => pt-br fr pt + + pt-br fr pt de => pt-br fr pt de + de pt-br fr pt => de pt-br fr pt + de pt-br fr => de pt-br pt fr + hai pt-br fr => hai pt-br pt fr + + # Now test multi-part complicateds: + pt-br-janeiro => pt-br-janeiro pt-br pt + pt-br-janeiro fr => pt-br-janeiro pt-br pt fr + pt-br-janeiro de fr => pt-br-janeiro pt-br pt de fr + pt-br-janeiro de pt fr => pt-br-janeiro pt-br de pt fr + + pt-br-janeiro pt-br-saopaolo => pt-br-janeiro pt-br pt pt-br-saopaolo + pt-br-janeiro fr pt-br-saopaolo => pt-br-janeiro pt-br pt fr pt-br-saopaolo + pt-br-janeiro de pt-br-saopaolo fr => pt-br-janeiro pt-br pt de pt-br-saopaolo fr + pt-br-janeiro de pt-br fr pt-br-saopaolo => pt-br-janeiro de pt-br pt fr pt-br-saopaolo + + pt-br de en fr pt-br-janeiro => pt-br pt de en fr pt-br-janeiro + pt-br de en fr => pt-br pt de en fr + + ja pt-br-janeiro fr => ja pt-br-janeiro pt-br pt fr + ja pt-br-janeiro de fr => ja pt-br-janeiro pt-br pt de fr + ja pt-br-janeiro de pt fr => ja pt-br-janeiro pt-br de pt fr + + pt-br-janeiro de pt-br fr => pt-br-janeiro de pt-br pt fr +# an odd case, since we don't filter for uniqueness in this sub + +}; + +sub uniq { my %seen; return grep(!($seen{$_}++), @_); } + +foreach my $in (@in) { + $in =~ s/^\s+//s; + $in =~ s/\s+$//s; + $in =~ s/#.+//s; + next unless $in =~ m/\S/; + + my(@in, @should); + { + die "What kind of line is <$in>?!" + unless $in =~ m/^(.+)=>(.+)$/s; + + my($i,$s) = ($1, $2); + @in = ($i =~ m/(\S+)/g); + @should = ($s =~ m/(\S+)/g); + #print "{@in}{@should}\n"; + } + my @out = implicate_supers( + ("@in" eq 'NIX') ? () : @in + ); + #print "O: ", join(' ', map "<$_>", @out), "\n"; + @out = 'NIX' unless @out; + + + if( @out == @should + and lc( join "\e", @out ) eq lc( join "\e", @should ) + ) { + print "# Happily got [@out] from [$in]\n"; + ok 1; + } else { + ok 0; + print "#!!Got: [@out]\n", + "#!! but wanted: [@should]\n", + "#!! from \"$in\"\n#\n"; + } +} + +print "#\n#\n# Bye-bye!\n"; +ok 1; + diff --git a/dist/I18N-LangTags/t/55_supers_strict.t b/dist/I18N-LangTags/t/55_supers_strict.t new file mode 100644 index 0000000000..3b285157a1 --- /dev/null +++ b/dist/I18N-LangTags/t/55_supers_strict.t @@ -0,0 +1,78 @@ + +# Time-stamp: "2004-03-30 17:49:58 AST" +#sub I18N::LangTags::Detect::DEBUG () {10} +use I18N::LangTags qw(implicate_supers_strictly); + +use Test; +BEGIN { plan tests => 19 }; + +print "#\n# Testing strict (non-tight) insertion of super-ordinate language tags...\n#\n"; + +my @in = grep m/\S/, split /[\n\r]/, q{ + NIX => NIX + sv => sv + en => en + hai => hai + + pt-br => pt-br pt + pt-br fr => pt-br fr pt + pt-br fr pt => pt-br fr pt + pt-br fr pt de => pt-br fr pt de + de pt-br fr pt => de pt-br fr pt + de pt-br fr => de pt-br fr pt + hai pt-br fr => hai pt-br fr pt + +# Now test multi-part complicateds: + pt-br-janeiro fr => pt-br-janeiro fr pt-br pt +pt-br-janeiro de fr => pt-br-janeiro de fr pt-br pt +pt-br-janeiro de pt fr => pt-br-janeiro de pt fr pt-br + +ja pt-br-janeiro fr => ja pt-br-janeiro fr pt-br pt +ja pt-br-janeiro de fr => ja pt-br-janeiro de fr pt-br pt +ja pt-br-janeiro de pt fr => ja pt-br-janeiro de pt fr pt-br + +pt-br-janeiro de pt-br fr => pt-br-janeiro de pt-br fr pt + # an odd case, since we don't filter for uniqueness in this sub + +}; + + +foreach my $in (@in) { + $in =~ s/^\s+//s; + $in =~ s/\s+$//s; + $in =~ s/#.+//s; + next unless $in =~ m/\S/; + + my(@in, @should); + { + die "What kind of line is <$in>?!" + unless $in =~ m/^(.+)=>(.+)$/s; + + my($i,$s) = ($1, $2); + @in = ($i =~ m/(\S+)/g); + @should = ($s =~ m/(\S+)/g); + #print "{@in}{@should}\n"; + } + my @out = I18N::LangTags::implicate_supers_strictly( + ("@in" eq 'NIX') ? () : @in + ); + #print "O: ", join(' ', map "<$_>", @out), "\n"; + @out = 'NIX' unless @out; + + + if( @out == @should + and lc( join "\e", @out ) eq lc( join "\e", @should ) + ) { + print "# Happily got [@out] from [$in]\n"; + ok 1; + } else { + ok 0; + print "#!!Got: [@out]\n", + "#!! but wanted: [@should]\n", + "#!! from \"$in\"\n#\n"; + } +} + +print "#\n#\n# Bye-bye!\n"; +ok 1; + diff --git a/dist/I18N-LangTags/t/80_all_env.t b/dist/I18N-LangTags/t/80_all_env.t new file mode 100644 index 0000000000..1362d5ebf5 --- /dev/null +++ b/dist/I18N-LangTags/t/80_all_env.t @@ -0,0 +1,115 @@ + +require 5; +use Test; +# Time-stamp: "2004-07-01 14:33:50 ADT" +BEGIN { plan tests => 20; } +use I18N::LangTags::Detect 1.01; +print "# Hi there...\n"; +ok 1; + +print "# Using I18N::LangTags::Detect v$I18N::LangTags::Detect::VERSION\n"; + +print "# Make sure we can assign to ENV entries\n", + "# (Otherwise we can't run the subsequent tests)...\n"; +$ENV{'MYORP'} = 'Zing'; ok $ENV{'MYORP'}, 'Zing'; +$ENV{'SWUZ'} = 'KLORTHO HOOBOY'; ok $ENV{'SWUZ'}, 'KLORTHO HOOBOY'; + +delete $ENV{'MYORP'}; +delete $ENV{'SWUZ'}; + +sub j { "[" . join(' ', map "\"$_\"", @_) . "]" ;} + +sub show { + print "# (Seeing {", join(' ', + map(qq{<$_>}, @_)), "} at line ", (caller)[2], ")\n"; + printenv(); + return $_[0] || ''; +} +sub printenv { + print "# ENV:\n"; + foreach my $k (sort keys %ENV) { + my $p = $ENV{$k}; $p =~ s/\n/\n#/g; + print "# [$k] = [$p]\n"; } + print "# [end of ENV]\n#\n"; +} + +$ENV{'IGNORE_WIN32_LOCALE'} = 1; # a hack, just for testing's sake. + + +print "# Test LANGUAGE...\n"; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LANGUAGE'} = 'Eu-MT'; +$ENV{'LC_ALL'} = ''; +$ENV{'LC_MESSAGES'} = ''; +$ENV{'LANG'} = ''; +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + +print "# Test LC_ALL...\n"; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LANGUAGE'} = ''; +$ENV{'LC_ALL'} = 'Eu-MT'; +$ENV{'LC_MESSAGES'} = ''; +$ENV{'LANG'} = ''; + +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + +print "# Test LC_MESSAGES...\n"; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LANGUAGE'} = ''; +$ENV{'LC_ALL'} = ''; +$ENV{'LC_MESSAGES'} = 'Eu-MT'; +$ENV{'LANG'} = ''; + +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + +print "# Test LANG...\n"; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LANGUAGE'} = ''; +$ENV{'LC_ALL'} = ''; +$ENV{'LC_MESSAGES'} = ''; +$ENV{'LANG'} = 'Eu_MT'; + +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + + +print "# Test LANG...\n"; +$ENV{'LANGUAGE'} = ''; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LC_ALL'} = ''; +$ENV{'LC_MESSAGES'} = ''; +$ENV{'LANG'} = 'Eu_MT'; + +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + + + +print "# Test HTTP_ACCEPT_LANGUAGE...\n"; +$ENV{'REQUEST_METHOD'} = 'GET'; +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'eu-MT'; +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'x-plorp, zaz, eu-MT, i-klung'; +ok show( scalar I18N::LangTags::Detect::detect()), "x-plorp"; +ok show( j I18N::LangTags::Detect::detect()), qq{["x-plorp" "i-plorp" "zaz" "eu-mt" "i-klung" "x-klung"]}; + +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'x-plorp, zaz, eU-Mt, i-klung'; +ok show( scalar I18N::LangTags::Detect::detect()), "x-plorp"; +ok show( j I18N::LangTags::Detect::detect()), qq{["x-plorp" "i-plorp" "zaz" "eu-mt" "i-klung" "x-klung"]}; + + + + +print "# Byebye!\n"; +ok 1; + |