From 8b2306352e674fdd7eb8b61ff2ce78864a87ed9c Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Mon, 28 Sep 2009 14:44:14 +0100 Subject: Move I18N::LangTags from ext/ to dist/ --- MANIFEST | 26 +- Porting/Maintainers.pl | 2 +- dist/I18N-LangTags/ChangeLog | 195 +++ dist/I18N-LangTags/README | 78 ++ dist/I18N-LangTags/lib/I18N/LangTags.pm | 887 ++++++++++++ dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm | 237 ++++ dist/I18N-LangTags/lib/I18N/LangTags/List.pm | 1779 ++++++++++++++++++++++++ dist/I18N-LangTags/t/01_about_verbose.t | 89 ++ dist/I18N-LangTags/t/05_main.t | 98 ++ dist/I18N-LangTags/t/07_listy.t | 30 + dist/I18N-LangTags/t/10_http.t | 104 ++ dist/I18N-LangTags/t/20_locales.t | 38 + dist/I18N-LangTags/t/50_super.t | 88 ++ dist/I18N-LangTags/t/55_supers_strict.t | 78 ++ dist/I18N-LangTags/t/80_all_env.t | 115 ++ ext/I18N-LangTags/ChangeLog | 195 --- ext/I18N-LangTags/README | 78 -- ext/I18N-LangTags/lib/I18N/LangTags.pm | 887 ------------ ext/I18N-LangTags/lib/I18N/LangTags/Detect.pm | 237 ---- ext/I18N-LangTags/lib/I18N/LangTags/List.pm | 1779 ------------------------ ext/I18N-LangTags/t/01_about_verbose.t | 89 -- ext/I18N-LangTags/t/05_main.t | 98 -- ext/I18N-LangTags/t/07_listy.t | 30 - ext/I18N-LangTags/t/10_http.t | 104 -- ext/I18N-LangTags/t/20_locales.t | 38 - ext/I18N-LangTags/t/50_super.t | 88 -- ext/I18N-LangTags/t/55_supers_strict.t | 78 -- ext/I18N-LangTags/t/80_all_env.t | 115 -- 28 files changed, 3830 insertions(+), 3830 deletions(-) create mode 100644 dist/I18N-LangTags/ChangeLog create mode 100644 dist/I18N-LangTags/README create mode 100644 dist/I18N-LangTags/lib/I18N/LangTags.pm create mode 100644 dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm create mode 100644 dist/I18N-LangTags/lib/I18N/LangTags/List.pm create mode 100644 dist/I18N-LangTags/t/01_about_verbose.t create mode 100644 dist/I18N-LangTags/t/05_main.t create mode 100644 dist/I18N-LangTags/t/07_listy.t create mode 100644 dist/I18N-LangTags/t/10_http.t create mode 100644 dist/I18N-LangTags/t/20_locales.t create mode 100644 dist/I18N-LangTags/t/50_super.t create mode 100644 dist/I18N-LangTags/t/55_supers_strict.t create mode 100644 dist/I18N-LangTags/t/80_all_env.t delete mode 100644 ext/I18N-LangTags/ChangeLog delete mode 100644 ext/I18N-LangTags/README delete mode 100644 ext/I18N-LangTags/lib/I18N/LangTags.pm delete mode 100644 ext/I18N-LangTags/lib/I18N/LangTags/Detect.pm delete mode 100644 ext/I18N-LangTags/lib/I18N/LangTags/List.pm delete mode 100644 ext/I18N-LangTags/t/01_about_verbose.t delete mode 100644 ext/I18N-LangTags/t/05_main.t delete mode 100644 ext/I18N-LangTags/t/07_listy.t delete mode 100644 ext/I18N-LangTags/t/10_http.t delete mode 100644 ext/I18N-LangTags/t/20_locales.t delete mode 100644 ext/I18N-LangTags/t/50_super.t delete mode 100644 ext/I18N-LangTags/t/55_supers_strict.t delete mode 100644 ext/I18N-LangTags/t/80_all_env.t diff --git a/MANIFEST b/MANIFEST index f7d71c5fec..c2f3a30b88 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1067,6 +1067,19 @@ dist/Filter-Simple/t/lib/Filter/Simple/ExportTest.pm Helper file for Filter::Si dist/Filter-Simple/t/lib/Filter/Simple/FilterOnlyTest.pm Helper file for Filter::Simple tests dist/Filter-Simple/t/lib/Filter/Simple/FilterTest.pm Helper file for Filter::Simple tests dist/Filter-Simple/t/lib/Filter/Simple/ImportTest.pm Helper file for Filter::Simple tests +dist/I18N-LangTags/ChangeLog I18N::LangTags +dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm Detect language preferences +dist/I18N-LangTags/lib/I18N/LangTags/List.pm List of tags for human languages +dist/I18N-LangTags/lib/I18N/LangTags.pm I18N::LangTags +dist/I18N-LangTags/README I18N::LangTags +dist/I18N-LangTags/t/01_about_verbose.t See whether I18N::LangTags works +dist/I18N-LangTags/t/05_main.t See whether I18N::LangTags works +dist/I18N-LangTags/t/07_listy.t See whether I18N::LangTags works +dist/I18N-LangTags/t/10_http.t See whether I18N::LangTags works +dist/I18N-LangTags/t/20_locales.t See whether I18N::LangTags works +dist/I18N-LangTags/t/50_super.t See whether I18N::LangTags works +dist/I18N-LangTags/t/55_supers_strict.t See whether I18N::LangTags works +dist/I18N-LangTags/t/80_all_env.t See whether I18N::LangTags works djgpp/config.over DOS/DJGPP port djgpp/configure.bat DOS/DJGPP port djgpp/djgpp.c DOS/DJGPP port @@ -1598,19 +1611,6 @@ ext/I18N-Langinfo/Langinfo.pm I18N::Langinfo ext/I18N-Langinfo/Langinfo.xs I18N::Langinfo ext/I18N-Langinfo/Makefile.PL I18N::Langinfo ext/I18N-Langinfo/t/Langinfo.t See whether I18N::Langinfo works -ext/I18N-LangTags/ChangeLog I18N::LangTags -ext/I18N-LangTags/lib/I18N/LangTags/Detect.pm Detect language preferences -ext/I18N-LangTags/lib/I18N/LangTags/List.pm List of tags for human languages -ext/I18N-LangTags/lib/I18N/LangTags.pm I18N::LangTags -ext/I18N-LangTags/README I18N::LangTags -ext/I18N-LangTags/t/01_about_verbose.t See whether I18N::LangTags works -ext/I18N-LangTags/t/05_main.t See whether I18N::LangTags works -ext/I18N-LangTags/t/07_listy.t See whether I18N::LangTags works -ext/I18N-LangTags/t/10_http.t See whether I18N::LangTags works -ext/I18N-LangTags/t/20_locales.t See whether I18N::LangTags works -ext/I18N-LangTags/t/50_super.t See whether I18N::LangTags works -ext/I18N-LangTags/t/55_supers_strict.t See whether I18N::LangTags works -ext/I18N-LangTags/t/80_all_env.t See whether I18N::LangTags works ext/if/if.pm For "use if" ext/if/t/if.t Tests for "use if" ext/IO/ChangeLog IO perl module change log diff --git a/Porting/Maintainers.pl b/Porting/Maintainers.pl index b7538c784f..8ce849f0c3 100755 --- a/Porting/Maintainers.pl +++ b/Porting/Maintainers.pl @@ -753,7 +753,7 @@ use File::Glob qw(:case); { 'MAINTAINER' => 'p5p', 'DISTRIBUTION' => 'SBURKE/I18N-LangTags-0.35.tar.gz', - 'FILES' => q[ext/I18N-LangTags], + 'FILES' => q[dist/I18N-LangTags], 'CPAN' => 0, 'UPSTREAM' => 'blead', }, diff --git a/dist/I18N-LangTags/ChangeLog b/dist/I18N-LangTags/ChangeLog new file mode 100644 index 0000000000..6cd744ef46 --- /dev/null +++ b/dist/I18N-LangTags/ChangeLog @@ -0,0 +1,195 @@ +Revision history for Perl module I18N::LangTags. + Time-stamp: "2004-10-06 23:26:53 ADT" + +2004-10-06 Sean M. Burke sburke@cpan.org + + * Release 0.35 + + Bugfix version: locale2language_tag now correctly understands + locale-IDs with at-signs in them, like 'it_IT.utf8@euro' or + 'it_IT@euro'. This is now enforced by the new t/20_locales.t + + Thanks to Luca 'loopback' Cavalli for letting me know about these + new locale-ID name-styles. + + + * Release 0.34 -- never happened, because of an upload error + + +2004-07-01 Sean M. Burke sburke@cpan.org + + * Release 0.33 + + Minor bugfix version: + The test 80_all_env.t was erroneously failing for people with + LC_ALL or LC_MESSAGES set. Fixed. Thanks to everyone, especially + Nicholas Clark, who patiently helped out with this. + + +2004-06-20 Sean M. Burke sburke@cpan.org + + * Release 0.32 + + Minor bugfix version: + The test 80_all_env.t was erroneously failing under MSWins that + had Win32::Locale installed. A workaround added. + + +2004-06-17 Sean M. Burke sburke@cpan.org + + * Release 0.31 + + Corrected some unevennesses in when/whether the return values from + I18N::LangTags::Detect's various internal functions would be + downcased. Now they're /always/ downcased, and are /always/ fed + thru alternate_language_tags()! + + Also, spiffed up and generally improved the earlier test + 80_all_env.t, which not even I could make sense of, and I wrote + the damned thing. Now it's sane, and checks both scalar and + list return values. Thanks to Rafael Garcia-Suarez and the + various CPAN-Testers for prodding me to fix this. (Hopefully the + earlier problems /are/ now fixed! Otherwise there'll be another + version of this module out real soon!) + + +2004-03-30 Sean M. Burke sburke@cpan.org + + * Release 0.30 + + New in I18N::LangTags : implicate_supers and + implicate_supers_strictly. + + New module: I18N::LangTags::Detect. + + Some new tests. + + Thanks to Autrijus Tang for catching some errors in my makefile! + + + +2003-10-10 Sean M. Burke sburke@cpan.org + + * Release 0.29 + + Minor bugfix to I18N::LangTags::List code. Addition of the + is_decent function, and the 02decency.t test for it. + + Better Makefile. Thanks to everyone who told me about the + INSTALLDIRS trick. + + + +2003-07-20 Sean M. Burke sburke@cpan.org + + * Release 0.28 + Doc fixes in I18N::LangTags, plus a few added variances (jw/jv, + cre/cr, etc.) + Lots of updates to I18N::LangTags::List + Deleted rfc3066.txt from dist. + Moved test.pl to t/01test.t and added more tests. + +2002-02-02 Sean M. Burke sburke@cpan.org + + * Release 0.27 -- minor mods to ::List: + Fixing its entries for sv-se and sv-fi. + Typo-fixes and rewordings in the incidental Pod text elsewhere. + +2001-06-21 Sean M. Burke sburke@cpan.org + + * Release 0.26 -- just making cosmetic changes + to test.pl, at Jarkko's request. + +2001-06-20 Sean M. Burke sburke@cpan.org + + * Release 0.25 -- just tweaking panic_languages behavior + for Scandinavian languages. Much better now. + Slight tweak to ::List's entries for Greek. + +2001-06-20 Sean M. Burke sburke@cpan.org + + * Release 0.24 + + * I18N::LangTags -- some elaborate hacks to make us + recognize legacy aliases like no-nyn == nn. + Added panic_languages(). + Added :ALL export tag. + Minor docs fixes, and spiffing up test.pl. + + * I18N::LangTags::List -- minor corrections; added + a few aliases. + +2001-05-29 Sean M. Burke sburke@cpan.org + + * Release 0.23 + + * I18N::LangTags::List -- minor corrections. And is now + a module, not just documentation. + +2001-05-27 Sean M. Burke sburke@cpan.org + + * Release 0.22 + + * Now bundling I18N::LangTags::List, a reference for lang tags, + replacing generate_language_table.plx and language_codes.txt + +2001-05-25 Sean M. Burke sburke@cpan.org + + * Release 0.21 + + * extract_language_tags and locale2langauge_tag now + return untainted output. Useful if you feed tainted + things, like $ENV{'LANG'}. + +2001-03-13 Sean M. Burke sburke@cpan.org + + * Release 0.20 + + * Added support for RFC 3066 tags: allowing three-letter primary + tags ("nav"), and allowing digits in subtags ("x-borg-prot3252"). + + * Changed all references from RFC 1766 to RFC 3066. + + * Now bundling fulltext of RFC 3066 in the dist. + + * Now bundling generate_language_table.plx and language_codes.txt + + * Added some nice tests to test.pl + + * Inverting order of listings in this ChangeLog file. + +2000-05-13 Sean M. Burke sburke@cpan.org + + * Release 0.13 + + * Just noting my new email address. + +1999-03-06 Sean M. Burke sburke@netadventure.net + + * Release 0.11 + + * Added functions + similarity_language_tag, is_dialect_of, + locale2language_tag, alternate_language_tags, and + encode_language_tag + +1998-12-14 Sean M. Burke sburke@netadventure.net + + * Release 0.09 + + * Added function super_languages() + +1998-10-31 Sean M. Burke sburke@netadventure.net + + * Release 0.08 + + * Just changes in the docs and bundle -- no change + in functionality. + +1998-04-02 Sean M. Burke sburke@netadventure.net + + * Release 0.07 + + * First public release. + +[END OF CHANGELOG] diff --git a/dist/I18N-LangTags/README b/dist/I18N-LangTags/README new file mode 100644 index 0000000000..ef0eb7b2f9 --- /dev/null +++ b/dist/I18N-LangTags/README @@ -0,0 +1,78 @@ +README for I18N::LangTags + Time-stamp: "2004-10-06 23:19:39 ADT" + + I18N::LangTags + +I18N::LangTags - functions for dealing with RFC3066-style language +tags + +Language tags are a formalism, described in RFC 3066 (obsoleting +1766), for declaring what language form (language and possibly +dialect) a given chunk of information is in. + +This library provides functions for common tasks involving language +tags (notably the extraction of them, comparing them, and testing the +formal validity of them) as is needed in a variety of protocols and +applications. + + +I18N::LangTags::List -- tags and names for human languages. This +module goes from known language tag names ("fr-CA") to their English +names ("Canadian French"). Its documentation also lists the several +hundred known tags and some common subforms. You may find this useful +as a reference. + + +See the POD for more information. + + +INSTALLATION + +You install I18N::LangTags and I18N::LangTags::List, as you would +install any perl module library, by running these commands: + + perl Makefile.PL + make + make test + make install + +If you want to install a private copy of I18N::LangTags in your home +directory, then you should try to produce the initial Makefile with +something like this command: + + perl Makefile.PL LIB=~/perl + +See perldoc perlmodinstall for more information on installing modules. + + +DOCUMENTATION + +POD-format documentation is included in LangTags.pm. POD is readable +with the 'perldoc' utility. See ChangeLog for recent changes. + + +SUPPORT + +Questions, bug reports, useful code bits, and suggestions for +I18N::LangTags should just be sent to me at sburke@cpan.org + + +AVAILABILITY + +The latest version of I18N::LangTags is available from the +Comprehensive Perl Archive Network (CPAN). Visit + to find a CPAN site near you. + + +COPYRIGHT + +Copyright 1998+, Sean M. Burke , all rights +reserved. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. diff --git a/dist/I18N-LangTags/lib/I18N/LangTags.pm b/dist/I18N-LangTags/lib/I18N/LangTags.pm new file mode 100644 index 0000000000..0bdc65fed1 --- /dev/null +++ b/dist/I18N-LangTags/lib/I18N/LangTags.pm @@ -0,0 +1,887 @@ + +# Time-stamp: "2004-10-06 23:26:33 ADT" +# Sean M. Burke + +require 5.000; +package I18N::LangTags; +use strict; +use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); +require Exporter; +@ISA = qw(Exporter); +@EXPORT = qw(); +@EXPORT_OK = qw(is_language_tag same_language_tag + extract_language_tags super_languages + similarity_language_tag is_dialect_of + locale2language_tag alternate_language_tags + encode_language_tag panic_languages + implicate_supers + implicate_supers_strictly + ); +%EXPORT_TAGS = ('ALL' => \@EXPORT_OK); + +$VERSION = "0.35"; + +sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function + + +=head1 NAME + +I18N::LangTags - functions for dealing with RFC3066-style language tags + +=head1 SYNOPSIS + + use I18N::LangTags(); + +...or specify whichever of those functions you want to import, like so: + + use I18N::LangTags qw(implicate_supers similarity_language_tag); + +All the exportable functions are listed below -- you're free to import +only some, or none at all. By default, none are imported. If you +say: + + use I18N::LangTags qw(:ALL) + +...then all are exported. (This saves you from having to use +something less obvious like C.) + +If you don't import any of these functions, assume a C<&I18N::LangTags::> +in front of all the function names in the following examples. + +=head1 DESCRIPTION + +Language tags are a formalism, described in RFC 3066 (obsoleting +1766), for declaring what language form (language and possibly +dialect) a given chunk of information is in. + +This library provides functions for common tasks involving language +tags as they are needed in a variety of protocols and applications. + +Please see the "See Also" references for a thorough explanation +of how to correctly use language tags. + +=over + +=cut + +########################################################################### + +=item * the function is_language_tag($lang1) + +Returns true iff $lang1 is a formally valid language tag. + + is_language_tag("fr") is TRUE + is_language_tag("x-jicarilla") is FALSE + (Subtags can be 8 chars long at most -- 'jicarilla' is 9) + + is_language_tag("sgn-US") is TRUE + (That's American Sign Language) + + is_language_tag("i-Klikitat") is TRUE + (True without regard to the fact noone has actually + registered Klikitat -- it's a formally valid tag) + + is_language_tag("fr-patois") is TRUE + (Formally valid -- altho descriptively weak!) + + is_language_tag("Spanish") is FALSE + is_language_tag("french-patois") is FALSE + (No good -- first subtag has to match + /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) + + is_language_tag("x-borg-prot2532") is TRUE + (Yes, subtags can contain digits, as of RFC3066) + +=cut + +sub is_language_tag { + + ## Changes in the language tagging standards may have to be reflected here. + + my($tag) = lc($_[0]); + + return 0 if $tag eq "i" or $tag eq "x"; + # Bad degenerate cases that the following + # regexp would erroneously let pass + + return $tag =~ + /^(?: # First subtag + [xi] | [a-z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-z0-9]{1,8} # subtag + )* + $/xs ? 1 : 0; +} + +########################################################################### + +=item * the function extract_language_tags($whatever) + +Returns a list of whatever looks like formally valid language tags +in $whatever. Not very smart, so don't get too creative with +what you want to feed it. + + extract_language_tags("fr, fr-ca, i-mingo") + returns: ('fr', 'fr-ca', 'i-mingo') + + extract_language_tags("It's like this: I'm in fr -- French!") + returns: ('It', 'in', 'fr') + (So don't just feed it any old thing.) + +The output is untainted. If you don't know what tainting is, +don't worry about it. + +=cut + +sub extract_language_tags { + + ## Changes in the language tagging standards may have to be reflected here. + + my($text) = + $_[0] =~ m/(.+)/ # to make for an untainted result + ? $1 : '' + ; + + return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags + $text =~ + m/ + \b + (?: # First subtag + [iIxX] | [a-zA-Z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-zA-Z0-9]{1,8} # subtag + )* + \b + /xsg + ); +} + +########################################################################### + +=item * the function same_language_tag($lang1, $lang2) + +Returns true iff $lang1 and $lang2 are acceptable variant tags +representing the same language-form. + + same_language_tag('x-kadara', 'i-kadara') is TRUE + (The x/i- alternation doesn't matter) + same_language_tag('X-KADARA', 'i-kadara') is TRUE + (...and neither does case) + same_language_tag('en', 'en-US') is FALSE + (all-English is not the SAME as US English) + same_language_tag('x-kadara', 'x-kadar') is FALSE + (these are totally unrelated tags) + same_language_tag('no-bok', 'nb') is TRUE + (no-bok is a legacy tag for nb (Norwegian Bokmal)) + +C works by just seeing whether +C is the same as +C. + +(Yes, I know this function is named a bit oddly. Call it historic +reasons.) + +=cut + +sub same_language_tag { + my $el1 = &encode_language_tag($_[0]); + return 0 unless defined $el1; + # this avoids the problem of + # encode_language_tag($lang1) eq and encode_language_tag($lang2) + # being true if $lang1 and $lang2 are both undef + + return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; +} + +########################################################################### + +=item * the function similarity_language_tag($lang1, $lang2) + +Returns an integer representing the degree of similarity between +tags $lang1 and $lang2 (the order of which does not matter), where +similarity is the number of common elements on the left, +without regard to case and to x/i- alternation. + + similarity_language_tag('fr', 'fr-ca') is 1 + (one element in common) + similarity_language_tag('fr-ca', 'fr-FR') is 1 + (one element in common) + + similarity_language_tag('fr-CA-joual', + 'fr-CA-PEI') is 2 + similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 + (two elements in common) + + similarity_language_tag('x-kadara', 'i-kadara') is 1 + (x/i- doesn't matter) + + similarity_language_tag('en', 'x-kadar') is 0 + similarity_language_tag('x-kadara', 'x-kadar') is 0 + (unrelated tags -- no similarity) + + similarity_language_tag('i-cree-syllabic', + 'i-cherokee-syllabic') is 0 + (no B elements in common!) + +=cut + +sub similarity_language_tag { + my $lang1 = &encode_language_tag($_[0]); + my $lang2 = &encode_language_tag($_[1]); + # And encode_language_tag takes care of the whole + # no-nyn==nn, i-hakka==zh-hakka, etc, things + + # NB: (i-sil-...)? (i-sgn-...)? + + return undef if !defined($lang1) and !defined($lang2); + return 0 if !defined($lang1) or !defined($lang2); + + my @l1_subtags = split('-', $lang1); + my @l2_subtags = split('-', $lang2); + my $similarity = 0; + + while(@l1_subtags and @l2_subtags) { + if(shift(@l1_subtags) eq shift(@l2_subtags)) { + ++$similarity; + } else { + last; + } + } + return $similarity; +} + +########################################################################### + +=item * the function is_dialect_of($lang1, $lang2) + +Returns true iff language tag $lang1 represents a subform of +language tag $lang2. + +B + + is_dialect_of('en-US', 'en') is TRUE + (American English IS a dialect of all-English) + + is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE + is_dialect_of('fr-CA-joual', 'fr') is TRUE + (Joual is a dialect of (a dialect of) French) + + is_dialect_of('en', 'en-US') is FALSE + (all-English is a NOT dialect of American English) + + is_dialect_of('fr', 'en-CA') is FALSE + + is_dialect_of('en', 'en' ) is TRUE + is_dialect_of('en-US', 'en-US') is TRUE + (B these are degenerate cases) + + is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE + (the x/i thing doesn't matter, nor does case) + + is_dialect_of('nn', 'no') is TRUE + (because 'nn' (New Norse) is aliased to 'no-nyn', + as a special legacy case, and 'no-nyn' is a + subform of 'no' (Norwegian)) + +=cut + +sub is_dialect_of { + + my $lang1 = &encode_language_tag($_[0]); + my $lang2 = &encode_language_tag($_[1]); + + return undef if !defined($lang1) and !defined($lang2); + return 0 if !defined($lang1) or !defined($lang2); + + return 1 if $lang1 eq $lang2; + return 0 if length($lang1) < length($lang2); + + $lang1 .= '-'; + $lang2 .= '-'; + return + (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; +} + +########################################################################### + +=item * the function super_languages($lang1) + +Returns a list of language tags that are superordinate tags to $lang1 +-- it gets this by removing subtags from the end of $lang1 until +nothing (or just "i" or "x") is left. + + super_languages("fr-CA-joual") is ("fr-CA", "fr") + + super_languages("en-AU") is ("en") + + super_languages("en") is empty-list, () + + super_languages("i-cherokee") is empty-list, () + ...not ("i"), which would be illegal as well as pointless. + +If $lang1 is not a valid language tag, returns empty-list in +a list context, undef in a scalar context. + +A notable and rather unavoidable problem with this method: +"x-mingo-tom" has an "x" because the whole tag isn't an +IANA-registered tag -- but super_languages('x-mingo-tom') is +('x-mingo') -- which isn't really right, since 'i-mingo' is +registered. But this module has no way of knowing that. (But note +that same_language_tag('x-mingo', 'i-mingo') is TRUE.) + +More importantly, you assume I that superordinates of +$lang1 are mutually intelligible with $lang1. Consider this +carefully. + +=cut + +sub super_languages { + my $lang1 = $_[0]; + return() unless defined($lang1) && &is_language_tag($lang1); + + # a hack for those annoying new (2001) tags: + $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards + $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards + $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way + # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark + + my @l1_subtags = split('-', $lang1); + + ## Changes in the language tagging standards may have to be reflected here. + + # NB: (i-sil-...)? + + my @supers = (); + foreach my $bit (@l1_subtags) { + push @supers, + scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; + } + pop @supers if @supers; + shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; + return reverse @supers; +} + +########################################################################### + +=item * the function locale2language_tag($locale_identifier) + +This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") +and maps it to a language tag. If it's not mappable (as with, +notably, "C" and "POSIX"), this returns empty-list in a list context, +or undef in a scalar context. + + locale2language_tag("en") is "en" + + locale2language_tag("en_US") is "en-US" + + locale2language_tag("en_US.ISO8859-1") is "en-US" + + locale2language_tag("C") is undef or () + + locale2language_tag("POSIX") is undef or () + + locale2language_tag("POSIX") is undef or () + +I'm not totally sure that locale names map satisfactorily to language +tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. + +The output is untainted. If you don't know what tainting is, +don't worry about it. + +=cut + +sub locale2language_tag { + my $lang = + $_[0] =~ m/(.+)/ # to make for an untainted result + ? $1 : '' + ; + + return $lang if &is_language_tag($lang); # like "en" + + $lang =~ tr<_><->; # "en_US" -> en-US + $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US + # it_IT.utf8@euro => it-IT + + return $lang if &is_language_tag($lang); + + return; +} + +########################################################################### + +=item * the function encode_language_tag($lang1) + +This function, if given a language tag, returns an encoding of it such +that: + +* tags representing different languages never get the same encoding. + +* tags representing the same language always get the same encoding. + +* an encoding of a formally valid language tag always is a string +value that is defined, has length, and is true if considered as a +boolean. + +Note that the encoding itself is B a formally valid language tag. +Note also that you cannot, currently, go from an encoding back to a +language tag that it's an encoding of. + +Note also that you B consider the encoded value as atomic; i.e., +you should not consider it as anything but an opaque, unanalysable +string value. (The internals of the encoding method may change in +future versions, as the language tagging standard changes over time.) + +C returns undef if given anything other than a +formally valid language tag. + +The reason C exists is because different language +tags may represent the same language; this is normally treatable with +C, but consider this situation: + +You have a data file that expresses greetings in different languages. +Its format is "[language tag]=[how to say 'Hello']", like: + + en-US=Hiho + fr=Bonjour + i-mingo=Hau' + +And suppose you write a program that reads that file and then runs as +a daemon, answering client requests that specify a language tag and +then expect the string that says how to greet in that language. So an +interaction looks like: + + greeting-client asks: fr + greeting-server answers: Bonjour + +So far so good. But suppose the way you're implementing this is: + + my %greetings; + die unless open(IN, ") { + chomp; + next unless /^([^=]+)=(.+)/s; + my($lang, $expr) = ($1, $2); + $greetings{$lang} = $expr; + } + close(IN); + +at which point %greetings has the contents: + + "en-US" => "Hiho" + "fr" => "Bonjour" + "i-mingo" => "Hau'" + +And suppose then that you answer client requests for language $wanted +by just looking up $greetings{$wanted}. + +If the client asks for "fr", that will look up successfully in +%greetings, to the value "Bonjour". And if the client asks for +"i-mingo", that will look up successfully in %greetings, to the value +"Hau'". + +But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the +lookup in %greetings fails. That's the Wrong Thing. + +You could instead do lookups on $wanted with: + + use I18N::LangTags qw(same_language_tag); + my $response = ''; + foreach my $l2 (keys %greetings) { + if(same_language_tag($wanted, $l2)) { + $response = $greetings{$l2}; + last; + } + } + +But that's rather inefficient. A better way to do it is to start your +program with: + + use I18N::LangTags qw(encode_language_tag); + my %greetings; + die unless open(IN, ") { + chomp; + next unless /^([^=]+)=(.+)/s; + my($lang, $expr) = ($1, $2); + $greetings{ + encode_language_tag($lang) + } = $expr; + } + close(IN); + +and then just answer client requests for language $wanted by just +looking up + + $greetings{encode_language_tag($wanted)} + +And that does the Right Thing. + +=cut + +sub encode_language_tag { + # Only similarity_language_tag() is allowed to analyse encodings! + + ## Changes in the language tagging standards may have to be reflected here. + + my($tag) = $_[0] || return undef; + return undef unless &is_language_tag($tag); + + # For the moment, these legacy variances are few enough that + # we can just handle them here with regexps. + $tag =~ s/^iw\b/he/i; # Hebrew + $tag =~ s/^in\b/id/i; # Indonesian + $tag =~ s/^cre\b/cr/i; # Cree + $tag =~ s/^jw\b/jv/i; # Javanese + $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger + $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo + $tag =~ s/^ji\b/yi/i; # Yiddish + # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now, + # but maybe they're all so obscure I can ignore them. "Obscure" + # meaning either that the language is obscure, and/or that the + # XXX form was extant so briefly that it's unlikely it was ever + # used. I hope. + # + # These go FROM the simplex to complex form, to get + # similarity-comparison right. And that's okay, since + # similarity_language_tag is the only thing that + # analyzes our output. + $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka + $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal + $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk + + $tag =~ s/^[xiXI]-//s; + # Just lop off any leading "x/i-" + + return "~" . uc($tag); +} + +#-------------------------------------------------------------------------- + +=item * the function alternate_language_tags($lang1) + +This function, if given a language tag, returns all language tags that +are alternate forms of this language tag. (I.e., tags which refer to +the same language.) This is meant to handle legacy tags caused by +the minor changes in language tag standards over the years; and +the x-/i- alternation is also dealt with. + +Note that this function does I try to equate new (and never-used, +and unusable) +ISO639-2 three-letter tags to old (and still in use) ISO639-1 +two-letter equivalents -- like "ara" -> "ar" -- because +"ara" has I been in use as an Internet language tag, +and RFC 3066 stipulates that it never should be, since a shorter +tag ("ar") exists. + +Examples: + + alternate_language_tags('no-bok') is ('nb') + alternate_language_tags('nb') is ('no-bok') + alternate_language_tags('he') is ('iw') + alternate_language_tags('iw') is ('he') + alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') + alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') + alternate_language_tags('en') is () + alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') + alternate_language_tags('x-klikitat') is ('i-klikitat') + alternate_language_tags('i-klikitat') is ('x-klikitat') + +This function returns empty-list if given anything other than a formally +valid language tag. + +=cut + +my %alt = qw( i x x i I X X I ); +sub alternate_language_tags { + my $tag = $_[0]; + return() unless &is_language_tag($tag); + + my @em; # push 'em real goood! + + # For the moment, these legacy variances are few enough that + # we can just handle them here with regexps. + + if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; + } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; + + } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; + } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; + + } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; + } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; + + } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; + } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; + + } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; + } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; + + } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; + } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; + + } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; + } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; + + } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; + } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; + } + + push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; + return @em; +} + +########################################################################### + +{ + # Init %Panic... + + my @panic = ( # MUST all be lowercase! + # Only large ("national") languages make it in this list. + # If you, as a user, are so bizarre that the /only/ language + # you claim to accept is Galician, then no, we won't do you + # the favor of providing Catalan as a panic-fallback for + # you. Because if I start trying to add "little languages" in + # here, I'll just go crazy. + + # Scandinavian lgs. All based on opinion and hearsay. + 'sv' => [qw(nb no da nn)], + 'da' => [qw(nb no sv nn)], # I guess + [qw(no nn nb)], [qw(no nn nb sv da)], + 'is' => [qw(da sv no nb nn)], + 'fo' => [qw(da is no nb nn sv)], # I guess + + # I think this is about the extent of tolerable intelligibility + # among large modern Romance languages. + 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French + 'ca' => [qw(es pt it fr)], + 'es' => [qw(ca it fr pt)], + 'it' => [qw(es fr ca pt)], + 'fr' => [qw(es it ca pt)], + + # Also assume that speakers of the main Indian languages prefer + # to read/hear Hindi over English + [qw( + as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur + )] => 'hi', + # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, + # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, + # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. + 'hi' => [qw(bn pa as or)], + # I welcome finer data for the other Indian languages. + # E.g., what should Oriya's list be, besides just Hindi? + + # And the panic languages for English is, of course, nil! + + # My guesses at Slavic intelligibility: + ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian + 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat + 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak + + 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian + + 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish + + #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai + + ); + my($k,$v); + while(@panic) { + ($k,$v) = splice(@panic,0,2); + foreach my $k (ref($k) ? @$k : $k) { + foreach my $v (ref($v) ? @$v : $v) { + push @{$Panic{$k} ||= []}, $v unless $k eq $v; + } + } + } +} + +=item * the function @langs = panic_languages(@accept_languages) + +This function takes a list of 0 or more language +tags that constitute a given user's Accept-Language list, and +returns a list of tags for I (non-super) +languages that are probably acceptable to the user, to be +used I. + +For example, if a user accepts only 'ca' (Catalan) and +'es' (Spanish), and the documents/interfaces you have +available are just in German, Italian, and Chinese, then +the user will most likely want the Italian one (and not +the Chinese or German one!), instead of getting +nothing. So C returns +a list containing 'it' (Italian). + +English ('en') is I in the return list, but +whether it's at the very end or not depends +on the input languages. This function works by consulting +an internal table that stipulates what common +languages are "close" to each other. + +A useful construct you might consider using is: + + @fallbacks = super_languages(@accept_languages); + push @fallbacks, panic_languages( + @accept_languages, @fallbacks, + ); + +=cut + +sub panic_languages { + # When in panic or in doubt, run in circles, scream, and shout! + my(@out, %seen); + foreach my $t (@_) { + next unless $t; + next if $seen{$t}++; # so we don't return it or hit it again + # push @out, super_languages($t); # nah, keep that separate + push @out, @{ $Panic{lc $t} || next }; + } + return grep !$seen{$_}++, @out, 'en'; +} + +#--------------------------------------------------------------------------- +#--------------------------------------------------------------------------- + +=item * the function implicate_supers( ...languages... ) + +This takes a list of strings (which are presumed to be language-tags; +strings that aren't, are ignored); and after each one, this function +inserts super-ordinate forms that don't already appear in the list. +The original list, plus these insertions, is returned. + +In other words, it takes this: + + pt-br de-DE en-US fr pt-br-janeiro + +and returns this: + + pt-br pt de-DE de en-US en fr pt-br-janeiro + +This function is most useful in the idiom + + implicate_supers( I18N::LangTags::Detect::detect() ); + +(See L.) + + +=item * the function implicate_supers_strictly( ...languages... ) + +This works like C except that the implicated +forms are added to the end of the return list. + +In other words, implicate_supers_strictly takes a list of strings +(which are presumed to be language-tags; strings that aren't, are +ignored) and after the whole given list, it inserts the super-ordinate forms +of all given tags, minus any tags that already appear in the input list. + +In other words, it takes this: + + pt-br de-DE en-US fr pt-br-janeiro + +and returns this: + + pt-br de-DE en-US fr pt-br-janeiro pt de en + +The reason this function has "_strictly" in its name is that when +you're processing an Accept-Language list according to the RFCs, if +you interpret the RFCs quite strictly, then you would use +implicate_supers_strictly, but for normal use (i.e., common-sense use, +as far as I'm concerned) you'd use implicate_supers. + +=cut + +sub implicate_supers { + my @languages = grep is_language_tag($_), @_; + my %seen_encoded; + foreach my $lang (@languages) { + $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1 + } + + my(@output_languages); + foreach my $lang (@languages) { + push @output_languages, $lang; + foreach my $s ( I18N::LangTags::super_languages($lang) ) { + # Note that super_languages returns the longest first. + last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) }; + push @output_languages, $s; + } + } + return uniq( @output_languages ); + +} + +sub implicate_supers_strictly { + my @tags = grep is_language_tag($_), @_; + return uniq( @_, map super_languages($_), @_ ); +} + + + +########################################################################### +1; +__END__ + +=back + +=head1 ABOUT LOWERCASING + +I've considered making all the above functions that output language +tags return all those tags strictly in lowercase. Having all your +language tags in lowercase does make some things easier. But you +might as well just lowercase as you like, or call +C where appropriate. + +=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS + +In some future version of I18N::LangTags, I plan to include support +for RFC2482-style language tags -- which are basically just normal +language tags with their ASCII characters shifted into Plane 14. + +=head1 SEE ALSO + +* L + +* RFC 3066, C, "Tags for the +Identification of Languages". (Obsoletes RFC 1766) + +* RFC 2277, C, "IETF Policy on +Character Sets and Languages". + +* RFC 2231, C, "MIME Parameter +Value and Encoded Word Extensions: Character Sets, Languages, and +Continuations". + +* RFC 2482, C, +"Language Tagging in Unicode Plain Text". + +* Locale::Codes, in +C + +* ISO 639-2, "Codes for the representation of names of languages", +including two-letter and three-letter codes, +C + +* The IANA list of registered languages (hopefully up-to-date), +C + +=head1 COPYRIGHT + +Copyright (c) 1998+ Sean M. Burke. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + +=head1 AUTHOR + +Sean M. Burke C + +=cut + diff --git a/dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm b/dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm new file mode 100644 index 0000000000..3f1b7c006a --- /dev/null +++ b/dist/I18N-LangTags/lib/I18N/LangTags/Detect.pm @@ -0,0 +1,237 @@ + +# Time-stamp: "2004-06-20 21:47:55 ADT" + +require 5; +package I18N::LangTags::Detect; +use strict; + +use vars qw( @ISA $VERSION $MATCH_SUPERS $USING_LANGUAGE_TAGS + $USE_LITERALS $MATCH_SUPERS_TIGHTLY); + +BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } } + # define the constant 'DEBUG' at compile-time + +$VERSION = "1.03"; +@ISA = (); +use I18N::LangTags qw(alternate_language_tags locale2language_tag); + +sub _uniq { my %seen; return grep(!($seen{$_}++), @_); } +sub _normalize { + my(@languages) = + map lc($_), + grep $_, + map {; $_, alternate_language_tags($_) } @_; + return _uniq(@languages) if wantarray; + return $languages[0]; +} + +#--------------------------------------------------------------------------- +# The extent of our functional interface: + +sub detect () { return __PACKAGE__->ambient_langprefs; } + +#=========================================================================== + +sub ambient_langprefs { # always returns things untainted + my $base_class = $_[0]; + + return $base_class->http_accept_langs + if length( $ENV{'REQUEST_METHOD'} || '' ); # I'm a CGI + # it's off in its own routine because it's complicated + + # Not running as a CGI: try to puzzle out from the environment + my @languages; + + foreach my $envname (qw( LANGUAGE LC_ALL LC_MESSAGES LANG )) { + next unless $ENV{$envname}; + DEBUG and print "Noting \$$envname: $ENV{$envname}\n"; + push @languages, + map locale2language_tag($_), + # if it's a lg tag, fine, pass thru (untainted) + # if it's a locale ID, try converting to a lg tag (untainted), + # otherwise nix it. + + split m/[,:]/, + $ENV{$envname} + ; + last; # first one wins + } + + if($ENV{'IGNORE_WIN32_LOCALE'}) { + # no-op + } elsif(&_try_use('Win32::Locale')) { + # If we have that module installed... + push @languages, Win32::Locale::get_language() || '' + if defined &Win32::Locale::get_language; + } + return _normalize @languages; +} + +#--------------------------------------------------------------------------- + +sub http_accept_langs { + # Deal with HTTP "Accept-Language:" stuff. Hassle. + # This code is more lenient than RFC 3282, which you must read. + # Hm. Should I just move this into I18N::LangTags at some point? + no integer; + + my $in = (@_ > 1) ? $_[1] : $ENV{'HTTP_ACCEPT_LANGUAGE'}; + # (always ends up untainting) + + return() unless defined $in and length $in; + + $in =~ s/\([^\)]*\)//g; # nix just about any comment + + if( $in =~ m/^\s*([a-zA-Z][-a-zA-Z]+)\s*$/s ) { + # Very common case: just one language tag + return _normalize $1; + } elsif( $in =~ m/^\s*[a-zA-Z][-a-zA-Z]+(?:\s*,\s*[a-zA-Z][-a-zA-Z]+)*\s*$/s ) { + # Common case these days: just "foo, bar, baz" + return _normalize( $in =~ m/([a-zA-Z][-a-zA-Z]+)/g ); + } + + # Else it's complicated... + + $in =~ s/\s+//g; # Yes, we can just do without the WS! + my @in = $in =~ m/([^,]+)/g; + my %pref; + + my $q; + foreach my $tag (@in) { + next unless $tag =~ + m/^([a-zA-Z][-a-zA-Z]+) + (?: + ;q= + ( + \d* # a bit too broad of a RE, but so what. + (?: + \.\d+ + )? + ) + )? + $ + /sx + ; + $q = (defined $2 and length $2) ? $2 : 1; + #print "$1 with q=$q\n"; + push @{ $pref{$q} }, lc $1; + } + + return _normalize( + # Read off %pref, in descending key order... + map @{$pref{$_}}, + sort {$b <=> $a} + keys %pref + ); +} + +#=========================================================================== + +my %tried = (); + # memoization of whether we've used this module, or found it unusable. + +sub _try_use { # Basically a wrapper around "require Modulename" + # "Many men have tried..." "They tried and failed?" "They tried and died." + return $tried{$_[0]} if exists $tried{$_[0]}; # memoization + + my $module = $_[0]; # ASSUME sane module name! + { no strict 'refs'; + return($tried{$module} = 1) + if defined(%{$module . "::Lexicon"}) or defined(@{$module . "::ISA"}); + # weird case: we never use'd it, but there it is! + } + + print " About to use $module ...\n" if DEBUG; + { + local $SIG{'__DIE__'}; + eval "require $module"; # used to be "use $module", but no point in that. + } + if($@) { + print "Error using $module \: $@\n" if DEBUG > 1; + return $tried{$module} = 0; + } else { + print " OK, $module is used\n" if DEBUG; + return $tried{$module} = 1; + } +} + +#--------------------------------------------------------------------------- +1; +__END__ + + +=head1 NAME + +I18N::LangTags::Detect - detect the user's language preferences + +=head1 SYNOPSIS + + use I18N::LangTags::Detect; + my @user_wants = I18N::LangTags::Detect::detect(); + +=head1 DESCRIPTION + +It is a common problem to want to detect what language(s) the user would +prefer output in. + +=head1 FUNCTIONS + +This module defines one public function, +C. This function is not exported +(nor is even exportable), and it takes no parameters. + +In scalar context, the function returns the most preferred language +tag (or undef if no preference was seen). + +In list context (which is usually what you want), +the function returns a +(possibly empty) list of language tags representing (best first) what +languages the user apparently would accept output in. You will +probably want to pass the output of this through +C +or +C, like so: + + my @languages = + I18N::LangTags::implicate_supers_tightly( + I18N::LangTags::Detect::detect() + ); + + +=head1 ENVIRONMENT + +This module looks for several environment variables, including +REQUEST_METHOD, HTTP_ACCEPT_LANGUAGE, +LANGUAGE, LC_ALL, LC_MESSAGES, and LANG. + +It will also use the L module, if it's installed. + + +=head1 SEE ALSO + +L, L, L. + +(This module's core code started out as a routine in Locale::Maketext; +but I moved it here once I realized it was more generally useful.) + + +=head1 COPYRIGHT + +Copyright (c) 1998-2004 Sean M. Burke. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + + +=head1 AUTHOR + +Sean M. Burke C + +=cut + +# a tip: Put a bit of chopped up pickled ginger in your salad. It's tasty! diff --git a/dist/I18N-LangTags/lib/I18N/LangTags/List.pm b/dist/I18N-LangTags/lib/I18N/LangTags/List.pm new file mode 100644 index 0000000000..5494bea21e --- /dev/null +++ b/dist/I18N-LangTags/lib/I18N/LangTags/List.pm @@ -0,0 +1,1779 @@ + +require 5; +package I18N::LangTags::List; +# Time-stamp: "2004-10-06 23:26:21 ADT" +use strict; +use vars qw(%Name %Is_Disrec $Debug $VERSION); +$VERSION = '0.35'; +# POD at the end. + +#---------------------------------------------------------------------- +{ +# read the table out of our own POD! + my $seeking = 1; + my $count = 0; + my($disrec,$tag,$name); + my $last_name = ''; + while() { + if($seeking) { + $seeking = 0 if m/=for woohah/; + } elsif( ($disrec, $tag, $name) = + m/(\[?)\{([-0-9a-zA-Z]+)\}(?:\s*:)?\s*([^\[\]]+)/ + ) { + $name =~ s/\s*[;\.]*\s*$//g; + next unless $name; + ++$count; + print "<$tag> <$name>\n" if $Debug; + $last_name = $Name{$tag} = $name; + $Is_Disrec{$tag} = 1 if $disrec; + } elsif (m/[Ff]ormerly \"([-a-z0-9]+)\"/) { + $Name{$1} = "$last_name (old tag)" if $last_name; + $Is_Disrec{$1} = 1; + } + } + die "No tags read??" unless $count; +} +#---------------------------------------------------------------------- + +sub name { + my $tag = lc($_[0] || return); + $tag =~ s/^\s+//s; + $tag =~ s/\s+$//s; + + my $alt; + if($tag =~ m/^x-(.+)/) { + $alt = "i-$1"; + } elsif($tag =~ m/^i-(.+)/) { + $alt = "x-$1"; + } else { + $alt = ''; + } + + my $subform = ''; + my $name = ''; + print "Input: {$tag}\n" if $Debug; + while(length $tag) { + last if $name = $Name{$tag}; + last if $name = $Name{$alt}; + if($tag =~ s/(-[a-z0-9]+)$//s) { + print "Shaving off: $1 leaving $tag\n" if $Debug; + $subform = "$1$subform"; + # and loop around again + + $alt =~ s/(-[a-z0-9]+)$//s && $Debug && print " alt -> $alt\n"; + } else { + # we're trying to pull a subform off a primary tag. TILT! + print "Aborting on: {$name}{$subform}\n" if $Debug; + last; + } + } + print "Output: {$name}{$subform}\n" if $Debug; + + return unless $name; # Failure + return $name unless $subform; # Exact match + $subform =~ s/^-//s; + $subform =~ s/-$//s; + return "$name (Subform \"$subform\")"; +} + +#-------------------------------------------------------------------------- + +sub is_decent { + my $tag = lc($_[0] || return 0); + #require I18N::LangTags; + + return 0 unless + $tag =~ + /^(?: # First subtag + [xi] | [a-z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-z0-9]{1,8} # subtag + )* + $/xs; + + my @supers = (); + foreach my $bit (split('-', $tag)) { + push @supers, + scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; + } + return 0 unless @supers; + shift @supers if $supers[0] =~ m<^(i|x|sgn)$>s; + return 0 unless @supers; + + foreach my $f ($tag, @supers) { + return 0 if $Is_Disrec{$f}; + return 2 if $Name{$f}; + # so that decent subforms of indecent tags are decent + } + return 2 if $Name{$tag}; # not only is it decent, it's known! + return 1; +} + +#-------------------------------------------------------------------------- +1; + +__DATA__ + +=head1 NAME + +I18N::LangTags::List -- tags and names for human languages + +=head1 SYNOPSIS + + use I18N::LangTags::List; + print "Parlez-vous... ", join(', ', + I18N::LangTags::List::name('elx') || 'unknown_language', + I18N::LangTags::List::name('ar-Kw') || 'unknown_language', + I18N::LangTags::List::name('en') || 'unknown_language', + I18N::LangTags::List::name('en-CA') || 'unknown_language', + ), "?\n"; + +prints: + + Parlez-vous... Elamite, Kuwait Arabic, English, Canadian English? + +=head1 DESCRIPTION + +This module provides a function +C ) > that takes +a language tag (see L) +and returns the best attempt at an English name for it, or +undef if it can't make sense of the tag. + +The function I18N::LangTags::List::name(...) is not exported. + +This module also provides a function +C )> that returns true iff +the language tag is syntactically valid and is for general use (like +"fr" or "fr-ca", below). That is, it returns false for tags that are +syntactically invalid and for tags, like "aus", that are listed in +brackets below. This function is not exported. + +The map of tags-to-names that it uses is accessible as +%I18N::LangTags::List::Name, and it's the same as the list +that follows in this documentation, which should be useful +to you even if you don't use this module. + +=head1 ABOUT LANGUAGE TAGS + +Internet language tags, as defined in RFC 3066, are a formalism +for denoting human languages. The two-letter ISO 639-1 language +codes are well known (as "en" for English), as are their forms +when qualified by a country code ("en-US"). Less well-known are the +arbitrary-length non-ISO codes (like "i-mingo"), and the +recently (in 2001) introduced three-letter ISO-639-2 codes. + +Remember these important facts: + +=over + +=item * + +Language tags are not locale IDs. A locale ID is written with a "_" +instead of a "-", (almost?) always matches C, and +I something different than a language tag. A language tag +denotes a language. A locale ID denotes a language I +a particular place, in combination with non-linguistic +location-specific information such as what currency is used +there. Locales I often denote character set information, +as in "en_US.ISO8859-1". + +=item * + +Language tags are not for computer languages. + +=item * + +"Dialect" is not a useful term, since there is no objective +criterion for establishing when two language-forms are +dialects of eachother, or are separate languages. + +=item * + +Language tags are not case-sensitive. en-US, en-us, En-Us, etc., +are all the same tag, and denote the same language. + +=item * + +Not every language tag really refers to a single language. Some +language tags refer to conditions: i-default (system-message text +in English plus maybe other languages), und (undetermined +language). Others (notably lots of the three-letter codes) are +bibliographic tags that classify whole groups of languages, as +with cus "Cushitic (Other)" (i.e., a +language that has been classed as Cushtic, but which has no more +specific code) or the even less linguistically coherent +sai for "South American Indian (Other)". Though useful in +bibliography, B. For further guidance, email me. + +=item * + +Language tags are not country codes. In fact, they are often +distinct codes, as with language tag ja for Japanese, and +ISO 3166 country code C<.jp> for Japan. + +=back + +=head1 LIST OF LANGUAGES + +The first part of each item is the language tag, between +{...}. It +is followed by an English name for the language or language-group. +Language tags that I judge to be not for general use, are bracketed. + +This list is in alphabetical order by English name of the language. + +=for reminder + The name in the =item line MUST NOT have E<...>'s in it!! + +=for woohah START + +=over + +=item {ab} : Abkhazian + +eq Abkhaz + +=item {ace} : Achinese + +=item {ach} : Acoli + +=item {ada} : Adangme + +=item {ady} : Adyghe + +eq Adygei + +=item {aa} : Afar + +=item {afh} : Afrihili + +(Artificial) + +=item {af} : Afrikaans + +=item [{afa} : Afro-Asiatic (Other)] + +=item {ak} : Akan + +(Formerly "aka".) + +=item {akk} : Akkadian + +(Historical) + +=item {sq} : Albanian + +=item {ale} : Aleut + +=item [{alg} : Algonquian languages] + +NOT Algonquin! + +=item [{tut} : Altaic (Other)] + +=item {am} : Amharic + +NOT Aramaic! + +=item {i-ami} : Ami + +eq Amis. eq 'Amis. eq Pangca. + +=item [{apa} : Apache languages] + +=item {ar} : Arabic + +Many forms are mutually un-intelligible in spoken media. +Notable forms: +{ar-ae} UAE Arabic; +{ar-bh} Bahrain Arabic; +{ar-dz} Algerian Arabic; +{ar-eg} Egyptian Arabic; +{ar-iq} Iraqi Arabic; +{ar-jo} Jordanian Arabic; +{ar-kw} Kuwait Arabic; +{ar-lb} Lebanese Arabic; +{ar-ly} Libyan Arabic; +{ar-ma} Moroccan Arabic; +{ar-om} Omani Arabic; +{ar-qa} Qatari Arabic; +{ar-sa} Sauda Arabic; +{ar-sy} Syrian Arabic; +{ar-tn} Tunisian Arabic; +{ar-ye} Yemen Arabic. + +=item {arc} : Aramaic + +NOT Amharic! NOT Samaritan Aramaic! + +=item {arp} : Arapaho + +=item {arn} : Araucanian + +=item {arw} : Arawak + +=item {hy} : Armenian + +=item {an} : Aragonese + +=item [{art} : Artificial (Other)] + +=item {ast} : Asturian + +eq Bable. + +=item {as} : Assamese + +=item [{ath} : Athapascan languages] + +eq Athabaskan. eq Athapaskan. eq Athabascan. + +=item [{aus} : Australian languages] + +=item [{map} : Austronesian (Other)] + +=item {av} : Avaric + +(Formerly "ava".) + +=item {ae} : Avestan + +eq Zend + +=item {awa} : Awadhi + +=item {ay} : Aymara + +=item {az} : Azerbaijani + +eq Azeri + +Notable forms: +{az-Arab} Azerbaijani in Arabic script; +{az-Cyrl} Azerbaijani in Cyrillic script; +{az-Latn} Azerbaijani in Latin script. + +=item {ban} : Balinese + +=item [{bat} : Baltic (Other)] + +=item {bal} : Baluchi + +=item {bm} : Bambara + +(Formerly "bam".) + +=item [{bai} : Bamileke languages] + +=item {bad} : Banda + +=item [{bnt} : Bantu (Other)] + +=item {bas} : Basa + +=item {ba} : Bashkir + +=item {eu} : Basque + +=item {btk} : Batak (Indonesia) + +=item {bej} : Beja + +=item {be} : Belarusian + +eq Belarussian. eq Byelarussian. +eq Belorussian. eq Byelorussian. +eq White Russian. eq White Ruthenian. +NOT Ruthenian! + +=item {bem} : Bemba + +=item {bn} : Bengali + +eq Bangla. + +=item [{ber} : Berber (Other)] + +=item {bho} : Bhojpuri + +=item {bh} : Bihari + +=item {bik} : Bikol + +=item {bin} : Bini + +=item {bi} : Bislama + +eq Bichelamar. + +=item {bs} : Bosnian + +=item {bra} : Braj + +=item {br} : Breton + +=item {bug} : Buginese + +=item {bg} : Bulgarian + +=item {i-bnn} : Bunun + +=item {bua} : Buriat + +=item {my} : Burmese + +=item {cad} : Caddo + +=item {car} : Carib + +=item {ca} : Catalan + +eq CatalEn. eq Catalonian. + +=item [{cau} : Caucasian (Other)] + +=item {ceb} : Cebuano + +=item [{cel} : Celtic (Other)] + +Notable forms: +{cel-gaulish} Gaulish (Historical) + +=item [{cai} : Central American Indian (Other)] + +=item {chg} : Chagatai + +(Historical?) + +=item [{cmc} : Chamic languages] + +=item {ch} : Chamorro + +=item {ce} : Chechen + +=item {chr} : Cherokee + +eq Tsalagi + +=item {chy} : Cheyenne + +=item {chb} : Chibcha + +(Historical) NOT Chibchan (which is a language family). + +=item {ny} : Chichewa + +eq Nyanja. eq Chinyanja. + +=item {zh} : Chinese + +Many forms are mutually un-intelligible in spoken media. +Notable forms: +{zh-Hans} Chinese, in simplified script; +{zh-Hant} Chinese, in traditional script; +{zh-tw} Taiwan Chinese; +{zh-cn} PRC Chinese; +{zh-sg} Singapore Chinese; +{zh-mo} Macau Chinese; +{zh-hk} Hong Kong Chinese; +{zh-guoyu} Mandarin [Putonghua/Guoyu]; +{zh-hakka} Hakka [formerly "i-hakka"]; +{zh-min} Hokkien; +{zh-min-nan} Southern Hokkien; +{zh-wuu} Shanghaiese; +{zh-xiang} Hunanese; +{zh-gan} Gan; +{zh-yue} Cantonese. + +=for etc +{i-hakka} Hakka (old tag) + +=item {chn} : Chinook Jargon + +eq Chinook Wawa. + +=item {chp} : Chipewyan + +=item {cho} : Choctaw + +=item {cu} : Church Slavic + +eq Old Church Slavonic. + +=item {chk} : Chuukese + +eq Trukese. eq Chuuk. eq Truk. eq Ruk. + +=item {cv} : Chuvash + +=item {cop} : Coptic + +=item {kw} : Cornish + +=item {co} : Corsican + +eq Corse. + +=item {cr} : Cree + +NOT Creek! (Formerly "cre".) + +=item {mus} : Creek + +NOT Cree! + +=item [{cpe} : English-based Creoles and pidgins (Other)] + +=item [{cpf} : French-based Creoles and pidgins (Other)] + +=item [{cpp} : Portuguese-based Creoles and pidgins (Other)] + +=item [{crp} : Creoles and pidgins (Other)] + +=item {hr} : Croatian + +eq Croat. + +=item [{cus} : Cushitic (Other)] + +=item {cs} : Czech + +=item {dak} : Dakota + +eq Nakota. eq Latoka. + +=item {da} : Danish + +=item {dar} : Dargwa + +=item {day} : Dayak + +=item {i-default} : Default (Fallthru) Language + +Defined in RFC 2277, this is for tagging text +(which must include English text, and might/should include text +in other appropriate languages) that is emitted in a context +where language-negotiation wasn't possible -- in SMTP mail failure +messages, for example. + +=item {del} : Delaware + +=item {din} : Dinka + +=item {dv} : Divehi + +eq Maldivian. (Formerly "div".) + +=item {doi} : Dogri + +NOT Dogrib! + +=item {dgr} : Dogrib + +NOT Dogri! + +=item [{dra} : Dravidian (Other)] + +=item {dua} : Duala + +=item {nl} : Dutch + +eq Netherlander. Notable forms: +{nl-nl} Netherlands Dutch; +{nl-be} Belgian Dutch. + +=item {dum} : Middle Dutch (ca.1050-1350) + +(Historical) + +=item {dyu} : Dyula + +=item {dz} : Dzongkha + +=item {efi} : Efik + +=item {egy} : Ancient Egyptian + +(Historical) + +=item {eka} : Ekajuk + +=item {elx} : Elamite + +(Historical) + +=item {en} : English + +Notable forms: +{en-au} Australian English; +{en-bz} Belize English; +{en-ca} Canadian English; +{en-gb} UK English; +{en-ie} Irish English; +{en-jm} Jamaican English; +{en-nz} New Zealand English; +{en-ph} Philippine English; +{en-tt} Trinidad English; +{en-us} US English; +{en-za} South African English; +{en-zw} Zimbabwe English. + +=item {enm} : Old English (1100-1500) + +(Historical) + +=item {ang} : Old English (ca.450-1100) + +eq Anglo-Saxon. (Historical) + +=item {i-enochian} : Enochian (Artificial) + +=item {myv} : Erzya + +=item {eo} : Esperanto + +(Artificial) + +=item {et} : Estonian + +=item {ee} : Ewe + +(Formerly "ewe".) + +=item {ewo} : Ewondo + +=item {fan} : Fang + +=item {fat} : Fanti + +=item {fo} : Faroese + +=item {fj} : Fijian + +=item {fi} : Finnish + +=item [{fiu} : Finno-Ugrian (Other)] + +eq Finno-Ugric. NOT Ugaritic! + +=item {fon} : Fon + +=item {fr} : French + +Notable forms: +{fr-fr} France French; +{fr-be} Belgian French; +{fr-ca} Canadian French; +{fr-ch} Swiss French; +{fr-lu} Luxembourg French; +{fr-mc} Monaco French. + +=item {frm} : Middle French (ca.1400-1600) + +(Historical) + +=item {fro} : Old French (842-ca.1400) + +(Historical) + +=item {fy} : Frisian + +=item {fur} : Friulian + +=item {ff} : Fulah + +(Formerly "ful".) + +=item {gaa} : Ga + +=item {gd} : Scots Gaelic + +NOT Scots! + +=item {gl} : Gallegan + +eq Galician + +=item {lg} : Ganda + +(Formerly "lug".) + +=item {gay} : Gayo + +=item {gba} : Gbaya + +=item {gez} : Geez + +eq Ge'ez + +=item {ka} : Georgian + +=item {de} : German + +Notable forms: +{de-at} Austrian German; +{de-be} Belgian German; +{de-ch} Swiss German; +{de-de} Germany German; +{de-li} Liechtenstein German; +{de-lu} Luxembourg German. + +=item {gmh} : Middle High German (ca.1050-1500) + +(Historical) + +=item {goh} : Old High German (ca.750-1050) + +(Historical) + +=item [{gem} : Germanic (Other)] + +=item {gil} : Gilbertese + +=item {gon} : Gondi + +=item {gor} : Gorontalo + +=item {got} : Gothic + +(Historical) + +=item {grb} : Grebo + +=item {grc} : Ancient Greek + +(Historical) (Until 15th century or so.) + +=item {el} : Modern Greek + +(Since 15th century or so.) + +=item {gn} : Guarani + +GuaranE + +=item {gu} : Gujarati + +=item {gwi} : Gwich'in + +eq Gwichin + +=item {hai} : Haida + +=item {ht} : Haitian + +eq Haitian Creole + +=item {ha} : Hausa + +=item {haw} : Hawaiian + +Hawai'ian + +=item {he} : Hebrew + +(Formerly "iw".) + +=for etc +{iw} Hebrew (old tag) + +=item {hz} : Herero + +=item {hil} : Hiligaynon + +=item {him} : Himachali + +=item {hi} : Hindi + +=item {ho} : Hiri Motu + +=item {hit} : Hittite + +(Historical) + +=item {hmn} : Hmong + +=item {hu} : Hungarian + +=item {hup} : Hupa + +=item {iba} : Iban + +=item {is} : Icelandic + +=item {io} : Ido + +(Artificial) + +=item {ig} : Igbo + +(Formerly "ibo".) + +=item {ijo} : Ijo + +=item {ilo} : Iloko + +=item [{inc} : Indic (Other)] + +=item [{ine} : Indo-European (Other)] + +=item {id} : Indonesian + +(Formerly "in".) + +=for etc +{in} Indonesian (old tag) + +=item {inh} : Ingush + +=item {ia} : Interlingua (International Auxiliary Language Association) + +(Artificial) NOT Interlingue! + +=item {ie} : Interlingue + +(Artificial) NOT Interlingua! + +=item {iu} : Inuktitut + +A subform of "Eskimo". + +=item {ik} : Inupiaq + +A subform of "Eskimo". + +=item [{ira} : Iranian (Other)] + +=item {ga} : Irish + +=item {mga} : Middle Irish (900-1200) + +(Historical) + +=item {sga} : Old Irish (to 900) + +(Historical) + +=item [{iro} : Iroquoian languages] + +=item {it} : Italian + +Notable forms: +{it-it} Italy Italian; +{it-ch} Swiss Italian. + +=item {ja} : Japanese + +(NOT "jp"!) + +=item {jv} : Javanese + +(Formerly "jw" because of a typo.) + +=item {jrb} : Judeo-Arabic + +=item {jpr} : Judeo-Persian + +=item {kbd} : Kabardian + +=item {kab} : Kabyle + +=item {kac} : Kachin + +=item {kl} : Kalaallisut + +eq Greenlandic "Eskimo" + +=item {xal} : Kalmyk + +=item {kam} : Kamba + +=item {kn} : Kannada + +eq Kanarese. NOT Canadian! + +=item {kr} : Kanuri + +(Formerly "kau".) + +=item {krc} : Karachay-Balkar + +=item {kaa} : Kara-Kalpak + +=item {kar} : Karen + +=item {ks} : Kashmiri + +=item {csb} : Kashubian + +eq Kashub + +=item {kaw} : Kawi + +=item {kk} : Kazakh + +=item {kha} : Khasi + +=item {km} : Khmer + +eq Cambodian. eq Kampuchean. + +=item [{khi} : Khoisan (Other)] + +=item {kho} : Khotanese + +=item {ki} : Kikuyu + +eq Gikuyu. + +=item {kmb} : Kimbundu + +=item {rw} : Kinyarwanda + +=item {ky} : Kirghiz + +=item {i-klingon} : Klingon + +=item {kv} : Komi + +=item {kg} : Kongo + +(Formerly "kon".) + +=item {kok} : Konkani + +=item {ko} : Korean + +=item {kos} : Kosraean + +=item {kpe} : Kpelle + +=item {kro} : Kru + +=item {kj} : Kuanyama + +=item {kum} : Kumyk + +=item {ku} : Kurdish + +=item {kru} : Kurukh + +=item {kut} : Kutenai + +=item {lad} : Ladino + +eq Judeo-Spanish. NOT Ladin (a minority language in Italy). + +=item {lah} : Lahnda + +NOT Lamba! + +=item {lam} : Lamba + +NOT Lahnda! + +=item {lo} : Lao + +eq Laotian. + +=item {la} : Latin + +(Historical) NOT Ladin! NOT Ladino! + +=item {lv} : Latvian + +eq Lettish. + +=item {lb} : Letzeburgesch + +eq Luxemburgian, eq Luxemburger. (Formerly "i-lux".) + +=for etc +{i-lux} Letzeburgesch (old tag) + +=item {lez} : Lezghian + +=item {li} : Limburgish + +eq Limburger, eq Limburgan. NOT Letzeburgesch! + +=item {ln} : Lingala + +=item {lt} : Lithuanian + +=item {nds} : Low German + +eq Low Saxon. eq Low German. eq Low Saxon. + +=item {art-lojban} : Lojban (Artificial) + +=item {loz} : Lozi + +=item {lu} : Luba-Katanga + +(Formerly "lub".) + +=item {lua} : Luba-Lulua + +=item {lui} : Luiseno + +eq LuiseEo. + +=item {lun} : Lunda + +=item {luo} : Luo (Kenya and Tanzania) + +=item {lus} : Lushai + +=item {mk} : Macedonian + +eq the modern Slavic language spoken in what was Yugoslavia. +NOT the form of Greek spoken in Greek Macedonia! + +=item {mad} : Madurese + +=item {mag} : Magahi + +=item {mai} : Maithili + +=item {mak} : Makasar + +=item {mg} : Malagasy + +=item {ms} : Malay + +NOT Malayalam! + +=item {ml} : Malayalam + +NOT Malay! + +=item {mt} : Maltese + +=item {mnc} : Manchu + +=item {mdr} : Mandar + +NOT Mandarin! + +=item {man} : Mandingo + +=item {mni} : Manipuri + +eq Meithei. + +=item [{mno} : Manobo languages] + +=item {gv} : Manx + +=item {mi} : Maori + +NOT Mari! + +=item {mr} : Marathi + +=item {chm} : Mari + +NOT Maori! + +=item {mh} : Marshall + +eq Marshallese. + +=item {mwr} : Marwari + +=item {mas} : Masai + +=item [{myn} : Mayan languages] + +=item {men} : Mende + +=item {mic} : Micmac + +=item {min} : Minangkabau + +=item {i-mingo} : Mingo + +eq the Irquoian language West Virginia Seneca. NOT New York Seneca! + +=item [{mis} : Miscellaneous languages] + +Don't use this. + +=item {moh} : Mohawk + +=item {mdf} : Moksha + +=item {mo} : Moldavian + +eq Moldovan. + +=item [{mkh} : Mon-Khmer (Other)] + +=item {lol} : Mongo + +=item {mn} : Mongolian + +eq Mongol. + +=item {mos} : Mossi + +=item [{mul} : Multiple languages] + +Not for normal use. + +=item [{mun} : Munda languages] + +=item {nah} : Nahuatl + +=item {nap} : Neapolitan + +=item {na} : Nauru + +=item {nv} : Navajo + +eq Navaho. (Formerly "i-navajo".) + +=for etc +{i-navajo} Navajo (old tag) + +=item {nd} : North Ndebele + +=item {nr} : South Ndebele + +=item {ng} : Ndonga + +=item {ne} : Nepali + +eq Nepalese. Notable forms: +{ne-np} Nepal Nepali; +{ne-in} India Nepali. + +=item {new} : Newari + +=item {nia} : Nias + +=item [{nic} : Niger-Kordofanian (Other)] + +=item [{ssa} : Nilo-Saharan (Other)] + +=item {niu} : Niuean + +=item {nog} : Nogai + +=item {non} : Old Norse + +(Historical) + +=item [{nai} : North American Indian] + +Do not use this. + +=item {no} : Norwegian + +Note the two following forms: + +=item {nb} : Norwegian Bokmal + +eq BokmEl, (A form of Norwegian.) (Formerly "no-bok".) + +=for etc +{no-bok} Norwegian Bokmal (old tag) + +=item {nn} : Norwegian Nynorsk + +(A form of Norwegian.) (Formerly "no-nyn".) + +=for etc +{no-nyn} Norwegian Nynorsk (old tag) + +=item [{nub} : Nubian languages] + +=item {nym} : Nyamwezi + +=item {nyn} : Nyankole + +=item {nyo} : Nyoro + +=item {nzi} : Nzima + +=item {oc} : Occitan (post 1500) + +eq ProvenEal, eq Provencal + +=item {oj} : Ojibwa + +eq Ojibwe. (Formerly "oji".) + +=item {or} : Oriya + +=item {om} : Oromo + +=item {osa} : Osage + +=item {os} : Ossetian; Ossetic + +=item [{oto} : Otomian languages] + +Group of languages collectively called "OtomE". + +=item {pal} : Pahlavi + +eq Pahlevi + +=item {i-pwn} : Paiwan + +eq Pariwan + +=item {pau} : Palauan + +=item {pi} : Pali + +(Historical?) + +=item {pam} : Pampanga + +=item {pag} : Pangasinan + +=item {pa} : Panjabi + +eq Punjabi + +=item {pap} : Papiamento + +eq Papiamentu. + +=item [{paa} : Papuan (Other)] + +=item {fa} : Persian + +eq Farsi. eq Iranian. + +=item {peo} : Old Persian (ca.600-400 B.C.) + +=item [{phi} : Philippine (Other)] + +=item {phn} : Phoenician + +(Historical) + +=item {pon} : Pohnpeian + +NOT Pompeiian! + +=item {pl} : Polish + +=item {pt} : Portuguese + +eq Portugese. Notable forms: +{pt-pt} Portugal Portuguese; +{pt-br} Brazilian Portuguese. + +=item [{pra} : Prakrit languages] + +=item {pro} : Old Provencal (to 1500) + +eq Old ProvenEal. (Historical.) + +=item {ps} : Pushto + +eq Pashto. eq Pushtu. + +=item {qu} : Quechua + +eq Quecha. + +=item {rm} : Raeto-Romance + +eq Romansh. + +=item {raj} : Rajasthani + +=item {rap} : Rapanui + +=item {rar} : Rarotongan + +=item [{qaa - qtz} : Reserved for local use.] + +=item [{roa} : Romance (Other)] + +NOT Romanian! NOT Romany! NOT Romansh! + +=item {ro} : Romanian + +eq Rumanian. NOT Romany! + +=item {rom} : Romany + +eq Rom. NOT Romanian! + +=item {rn} : Rundi + +=item {ru} : Russian + +NOT White Russian! NOT Rusyn! + +=item [{sal} : Salishan languages] + +Large language group. + +=item {sam} : Samaritan Aramaic + +NOT Aramaic! + +=item {se} : Northern Sami + +eq Lappish. eq Lapp. eq (Northern) Saami. + +=item {sma} : Southern Sami + +=item {smn} : Inari Sami + +=item {smj} : Lule Sami + +=item {sms} : Skolt Sami + +=item [{smi} : Sami languages (Other)] + +=item {sm} : Samoan + +=item {sad} : Sandawe + +=item {sg} : Sango + +=item {sa} : Sanskrit + +(Historical) + +=item {sat} : Santali + +=item {sc} : Sardinian + +eq Sard. + +=item {sas} : Sasak + +=item {sco} : Scots + +NOT Scots Gaelic! + +=item {sel} : Selkup + +=item [{sem} : Semitic (Other)] + +=item {sr} : Serbian + +eq Serb. NOT Sorbian. + +Notable forms: +{sr-Cyrl} : Serbian in Cyrillic script; +{sr-Latn} : Serbian in Latin script. + +=item {srr} : Serer + +=item {shn} : Shan + +=item {sn} : Shona + +=item {sid} : Sidamo + +=item {sgn-...} : Sign Languages + +Always use with a subtag. Notable forms: +{sgn-gb} British Sign Language (BSL); +{sgn-ie} Irish Sign Language (ESL); +{sgn-ni} Nicaraguan Sign Language (ISN); +{sgn-us} American Sign Language (ASL). + +(And so on with other country codes as the subtag.) + +=item {bla} : Siksika + +eq Blackfoot. eq Pikanii. + +=item {sd} : Sindhi + +=item {si} : Sinhalese + +eq Sinhala. + +=item [{sit} : Sino-Tibetan (Other)] + +=item [{sio} : Siouan languages] + +=item {den} : Slave (Athapascan) + +("Slavey" is a subform.) + +=item [{sla} : Slavic (Other)] + +=item {sk} : Slovak + +eq Slovakian. + +=item {sl} : Slovenian + +eq Slovene. + +=item {sog} : Sogdian + +=item {so} : Somali + +=item {son} : Songhai + +=item {snk} : Soninke + +=item {wen} : Sorbian languages + +eq Wendish. eq Sorb. eq Lusatian. eq Wend. NOT Venda! NOT Serbian! + +=item {nso} : Northern Sotho + +=item {st} : Southern Sotho + +eq Sutu. eq Sesotho. + +=item [{sai} : South American Indian (Other)] + +=item {es} : Spanish + +Notable forms: +{es-ar} Argentine Spanish; +{es-bo} Bolivian Spanish; +{es-cl} Chilean Spanish; +{es-co} Colombian Spanish; +{es-do} Dominican Spanish; +{es-ec} Ecuadorian Spanish; +{es-es} Spain Spanish; +{es-gt} Guatemalan Spanish; +{es-hn} Honduran Spanish; +{es-mx} Mexican Spanish; +{es-pa} Panamanian Spanish; +{es-pe} Peruvian Spanish; +{es-pr} Puerto Rican Spanish; +{es-py} Paraguay Spanish; +{es-sv} Salvadoran Spanish; +{es-us} US Spanish; +{es-uy} Uruguayan Spanish; +{es-ve} Venezuelan Spanish. + +=item {suk} : Sukuma + +=item {sux} : Sumerian + +(Historical) + +=item {su} : Sundanese + +=item {sus} : Susu + +=item {sw} : Swahili + +eq Kiswahili + +=item {ss} : Swati + +=item {sv} : Swedish + +Notable forms: +{sv-se} Sweden Swedish; +{sv-fi} Finland Swedish. + +=item {syr} : Syriac + +=item {tl} : Tagalog + +=item {ty} : Tahitian + +=item [{tai} : Tai (Other)] + +NOT Thai! + +=item {tg} : Tajik + +=item {tmh} : Tamashek + +=item {ta} : Tamil + +=item {i-tao} : Tao + +eq Yami. + +=item {tt} : Tatar + +=item {i-tay} : Tayal + +eq Atayal. eq Atayan. + +=item {te} : Telugu + +=item {ter} : Tereno + +=item {tet} : Tetum + +=item {th} : Thai + +NOT Tai! + +=item {bo} : Tibetan + +=item {tig} : Tigre + +=item {ti} : Tigrinya + +=item {tem} : Timne + +eq Themne. eq Timene. + +=item {tiv} : Tiv + +=item {tli} : Tlingit + +=item {tpi} : Tok Pisin + +=item {tkl} : Tokelau + +=item {tog} : Tonga (Nyasa) + +NOT Tsonga! + +=item {to} : Tonga (Tonga Islands) + +(Pronounced "Tong-a", not "Tong-ga") + +NOT Tsonga! + +=item {tsi} : Tsimshian + +eq Sm'algyax + +=item {ts} : Tsonga + +NOT Tonga! + +=item {i-tsu} : Tsou + +=item {tn} : Tswana + +Same as Setswana. + +=item {tum} : Tumbuka + +=item [{tup} : Tupi languages] + +=item {tr} : Turkish + +(Typically in Roman script) + +=item {ota} : Ottoman Turkish (1500-1928) + +(Typically in Arabic script) (Historical) + +=item {crh} : Crimean Turkish + +eq Crimean Tatar + +=item {tk} : Turkmen + +eq Turkmeni. + +=item {tvl} : Tuvalu + +=item {tyv} : Tuvinian + +eq Tuvan. eq Tuvin. + +=item {tw} : Twi + +=item {udm} : Udmurt + +=item {uga} : Ugaritic + +NOT Ugric! + +=item {ug} : Uighur + +=item {uk} : Ukrainian + +=item {umb} : Umbundu + +=item {und} : Undetermined + +Not a tag for normal use. + +=item {ur} : Urdu + +=item {uz} : Uzbek + +eq Ezbek + +Notable forms: +{uz-Cyrl} Uzbek in Cyrillic script; +{uz-Latn} Uzbek in Latin script. + +=item {vai} : Vai + +=item {ve} : Venda + +NOT Wendish! NOT Wend! NOT Avestan! (Formerly "ven".) + +=item {vi} : Vietnamese + +eq Viet. + +=item {vo} : Volapuk + +eq VolapEk. (Artificial) + +=item {vot} : Votic + +eq Votian. eq Vod. + +=item [{wak} : Wakashan languages] + +=item {wa} : Walloon + +=item {wal} : Walamo + +eq Wolaytta. + +=item {war} : Waray + +Presumably the Philippine language Waray-Waray (SamareEo), +not the smaller Philippine language Waray Sorsogon, nor the extinct +Australian language Waray. + +=item {was} : Washo + +eq Washoe + +=item {cy} : Welsh + +=item {wo} : Wolof + +=item {x-...} : Unregistered (Semi-Private Use) + +"x-" is a prefix for language tags that are not registered with ISO +or IANA. Example, x-double-dutch + +=item {xh} : Xhosa + +=item {sah} : Yakut + +=item {yao} : Yao + +(The Yao in Malawi?) + +=item {yap} : Yapese + +eq Yap + +=item {ii} : Sichuan Yi + +=item {yi} : Yiddish + +Formerly "ji". Usually in Hebrew script. + +Notable forms: +{yi-latn} Yiddish in Latin script + +=item {yo} : Yoruba + +=item [{ypk} : Yupik languages] + +Several "Eskimo" languages. + +=item {znd} : Zande + +=item [{zap} : Zapotec] + +(A group of languages.) + +=item {zen} : Zenaga + +NOT Zend. + +=item {za} : Zhuang + +=item {zu} : Zulu + +=item {zun} : Zuni + +eq ZuEi + +=back + +=for woohah END + +=head1 SEE ALSO + +L and its "See Also" section. + +=head1 COPYRIGHT AND DISCLAIMER + +Copyright (c) 2001+ Sean M. Burke. All rights reserved. + +You can redistribute and/or +modify this document under the same terms as Perl itself. + +This document is provided in the hope that it will be +useful, but without any warranty; +without even the implied warranty of accuracy, authoritativeness, +completeness, merchantability, or fitness for a particular purpose. + +Email any corrections or questions to me. + +=head1 AUTHOR + +Sean M. Burke, sburkeE<64>cpan.org + +=cut + + +# To generate a list of just the two and three-letter codes: + +#!/usr/local/bin/perl -w + +require 5; # Time-stamp: "2001-03-13 21:53:39 MST" + # Sean M. Burke, sburke@cpan.org + # This program is for generating the language_codes.txt file +use strict; +use LWP::Simple; +use HTML::TreeBuilder 3.10; +my $root = HTML::TreeBuilder->new(); +my $url = 'http://lcweb.loc.gov/standards/iso639-2/bibcodes.html'; +$root->parse(get($url) || die "Can't get $url"); +$root->eof(); + +my @codes; + +foreach my $tr ($root->find_by_tag_name('tr')) { + my @f = map $_->as_text(), $tr->content_list(); + #print map("<$_> ", @f), "\n"; + next unless @f == 5; + pop @f; # nix the French name + next if $f[-1] eq 'Language Name (English)'; # it's a header line + my $xx = splice(@f, 2,1); # pull out the two-letter code + $f[-1] =~ s/^\s+//; + $f[-1] =~ s/\s+$//; + if($xx =~ m/[a-zA-Z]/) { # there's a two-letter code for it + push @codes, [ lc($f[-1]), "$xx\t$f[-1]\n" ]; + } else { # print the three-letter codes. + if($f[0] eq $f[1]) { + push @codes, [ lc($f[-1]), "$f[1]\t$f[2]\n" ]; + } else { # shouldn't happen + push @codes, [ lc($f[-1]), "@f !!!!!!!!!!\n" ]; + } + } +} + +print map $_->[1], sort {; $a->[0] cmp $b->[0] } @codes; +print "[ based on $url\n at ", scalar(localtime), "]\n", + "[Note: doesn't include IANA-registered codes.]\n"; +exit; +__END__ + diff --git a/dist/I18N-LangTags/t/01_about_verbose.t b/dist/I18N-LangTags/t/01_about_verbose.t new file mode 100644 index 0000000000..3abc68d537 --- /dev/null +++ b/dist/I18N-LangTags/t/01_about_verbose.t @@ -0,0 +1,89 @@ + +require 5; +# Time-stamp: "2004-03-30 17:02:59 AST" + +# Summary of, well, things. + +use Test; +BEGIN {plan tests => 2}; + +ok 1; + +use I18N::LangTags; +use I18N::LangTags::List; +use I18N::LangTags::Detect; + +#chdir "t" if -e "t"; + +{ + my @out; + push @out, + "\n\nPerl v", + defined($^V) ? sprintf('%vd', $^V) : $], + " under $^O ", + (defined(&Win32::BuildNumber) and defined &Win32::BuildNumber()) + ? ("(Win32::BuildNumber ", &Win32::BuildNumber(), ")") : (), + (defined $MacPerl::Version) + ? ("(MacPerl version $MacPerl::Version)") : (), + "\n" + ; + + # Ugly code to walk the symbol tables: + my %v; + my @stack = (''); # start out in %:: + my $this; + my $count = 0; + my $pref; + while(@stack) { + $this = shift @stack; + die "Too many packages?" if ++$count > 1000; + next if exists $v{$this}; + next if $this eq 'main'; # %main:: is %:: + + #print "Peeking at $this => ${$this . '::VERSION'}\n"; + + if(defined ${$this . '::VERSION'} ) { + $v{$this} = ${$this . '::VERSION'} + } elsif( + defined *{$this . '::ISA'} or defined &{$this . '::import'} + or ($this ne '' and grep defined *{$_}{'CODE'}, values %{$this . "::"}) + # If it has an ISA, an import, or any subs... + ) { + # It's a class/module with no version. + $v{$this} = undef; + } else { + # It's probably an unpopulated package. + ## $v{$this} = '...'; + } + + $pref = length($this) ? "$this\::" : ''; + push @stack, map m/^(.+)::$/ ? "$pref$1" : (), keys %{$this . '::'}; + #print "Stack: @stack\n"; + } + push @out, " Modules in memory:\n"; + delete @v{'', '[none]'}; + foreach my $p (sort {lc($a) cmp lc($b)} keys %v) { + $indent = ' ' x (2 + ($p =~ tr/:/:/)); + push @out, ' ', $indent, $p, defined($v{$p}) ? " v$v{$p};\n" : ";\n"; + } + push @out, sprintf "[at %s (local) / %s (GMT)]\n", + scalar(gmtime), scalar(localtime); + my $x = join '', @out; + $x =~ s/^/#/mg; + print $x; +} + +print "# Running", + (chr(65) eq 'A') ? " in an ASCII world.\n" : " in a non-ASCII world.\n", + "#\n", +; + +print "# \@INC:\n", map("# [$_]\n", @INC), "#\n#\n"; + +print "# \%INC:\n"; +foreach my $x (sort {lc($a) cmp lc($b)} keys %INC) { + print "# [$x] = [", $INC{$x} || '', "]\n"; +} + +ok 1; + diff --git a/dist/I18N-LangTags/t/05_main.t b/dist/I18N-LangTags/t/05_main.t new file mode 100644 index 0000000000..056baafc55 --- /dev/null +++ b/dist/I18N-LangTags/t/05_main.t @@ -0,0 +1,98 @@ + +require 5; + # Time-stamp: "2004-03-30 17:52:14 AST" +use strict; +use Test; +BEGIN { plan tests => 64 }; +BEGIN { ok 1 } +use I18N::LangTags (':ALL'); + +print "# Perl v$], I18N::LangTags v$I18N::LangTags::VERSION\n"; + +ok !is_language_tag(''); +ok is_language_tag('fr'); +ok is_language_tag('fr-ca'); +ok is_language_tag('fr-CA'); +ok !is_language_tag('fr-CA-'); +ok !is_language_tag('fr_CA'); +ok is_language_tag('fr-ca-joual'); +ok !is_language_tag('frca'); +ok is_language_tag('nav'); # (not actual tag) +ok is_language_tag('nav-shiprock'); # (not actual tag) +ok !is_language_tag('nav-ceremonial'); # subtag too long +ok !is_language_tag('x'); +ok !is_language_tag('i'); +ok is_language_tag('i-borg'); # NB: fictitious tag +ok is_language_tag('x-borg'); +ok is_language_tag('x-borg-prot5123'); +ok same_language_tag('x-borg-prot5123', 'i-BORG-Prot5123' ); +ok !same_language_tag('en', 'en-us' ); + +ok 0 == similarity_language_tag('en-ca', 'fr-ca'); +ok 1 == similarity_language_tag('en-ca', 'en-us'); +ok 2 == similarity_language_tag('en-us-southern', 'en-us-western'); +ok 2 == similarity_language_tag('en-us-southern', 'en-us'); + +ok grep $_ eq 'hi', panic_languages('kok'); +ok grep $_ eq 'en', panic_languages('x-woozle-wuzzle'); +ok ! grep $_ eq 'mr', panic_languages('it'); +ok grep $_ eq 'es', panic_languages('it'); +ok grep $_ eq 'it', panic_languages('es'); + + +print "# Now the ::List tests...\n"; +print "# Perl v$], I18N::LangTags::List v$I18N::LangTags::List::VERSION\n"; + +use I18N::LangTags::List; +foreach my $lt (qw( + en + en-us + en-kr + el + elx + i-mingo + i-mingo-tom + x-mingo-tom + it + it-it + it-IT + it-FR + ak + aka + jv + jw + no + no-nyn + nn + i-lux + lb + wa + yi + ji + den-syllabic + den-syllabic-western + den-western + den-latin + cre-syllabic + cre-syllabic-western + cre-western + cre-latin + cr-syllabic + cr-syllabic-western + cr-western + cr-latin +)) { + my $name = I18N::LangTags::List::name($lt); + if($name) { + ok(1); + print "# $lt -> $name\n"; + } else { + ok(0); + print "# Failed lookup on $lt\n"; + } +} + + + +print "# So there!\n"; + diff --git a/dist/I18N-LangTags/t/07_listy.t b/dist/I18N-LangTags/t/07_listy.t new file mode 100644 index 0000000000..a56a798a33 --- /dev/null +++ b/dist/I18N-LangTags/t/07_listy.t @@ -0,0 +1,30 @@ + +require 5; + # Time-stamp: "2003-10-10 17:37:34 ADT" +use strict; +use Test; +BEGIN { plan tests => 17 }; +BEGIN { ok 1 } +use I18N::LangTags::List; + +print "# Perl v$], I18N::LangTags::List v$I18N::LangTags::List::VERSION\n"; + +ok I18N::LangTags::List::name('fr'), 'French'; +ok I18N::LangTags::List::name('fr-fr'); +ok !I18N::LangTags::List::name('El Zorcho'); +ok !I18N::LangTags::List::name(); + + +ok !I18N::LangTags::List::is_decent(); +ok I18N::LangTags::List::is_decent('fr'); +ok I18N::LangTags::List::is_decent('fr-blorch'); +ok !I18N::LangTags::List::is_decent('El Zorcho'); +ok !I18N::LangTags::List::is_decent('sgn'); +ok I18N::LangTags::List::is_decent('sgn-us'); +ok !I18N::LangTags::List::is_decent('i'); +ok I18N::LangTags::List::is_decent('i-mingo'); +ok I18N::LangTags::List::is_decent('i-mingo-tom'); +ok !I18N::LangTags::List::is_decent('cel'); +ok I18N::LangTags::List::is_decent('cel-gaulish'); + +ok 1; # one for the road diff --git a/dist/I18N-LangTags/t/10_http.t b/dist/I18N-LangTags/t/10_http.t new file mode 100644 index 0000000000..36341f77cf --- /dev/null +++ b/dist/I18N-LangTags/t/10_http.t @@ -0,0 +1,104 @@ + +# Time-stamp: "2004-06-17 23:06:22 PDT" + +use I18N::LangTags::Detect; + +use Test; +BEGIN { plan tests => 87 }; + +my @in = grep m/\S/, split /\n/, q{ + +[ sv ] sv +[ en ] en +[ en fi ] en, fi +[ en-us ] en-us +[ en-us ] en-US +[ en-us ] EN-US + +[ en-au en i-klingon x-klingon en-gb en-us mt-mt mt ja ] EN-au, JA;q=0.14, i-klingon;q=0.83, en-gb;q=0.71, en-us;q=0.57, mt-mt;q=0.43, mt;q=0.29, en;q=0.86 +[ en-au en i-klingon x-klingon en-gb en-us mt-mt mt tli ja ] EN-au, tli;q=0.201, JA;q=0.14, i-klingon;q=0.83, en-gb;q=0.71, en-us;q=0.57, mt-mt;q=0.43, mt;q=0.29, en;q=0.86 +[ en-au en en-gb en-us ja ] en-au, ja;q=0.20, en-gb;q=0.60, en-us;q=0.40, en;q=0.80 + +[ en-au en en-gb en-us mt-mt mt ja ] EN-au, JA;q=0.14, en-gb;q=0.71, en-us;q=0.57, mt-mt;q=0.43, mt;q=0.29, en;q=0.86 +[ en-au en en-gb en-us ja ] en-au, ja;q=0.20, en-gb;q=0.60, en-us;q=0.40, en;q=0.80 +[ en fr ] en;q=1,fr;q=.5 +[ en fr ] en;q=1,fr;q=.99 +[ en ru ko ] en, ru;q=0.7, ko;q=0.3 +[ en ru ko ] en, ru;q=0.7, KO;q=0.3 +[ en-us en ] en-us, en;q=0.50 +[ en fr ] fr ; q = 0.9, en +[ en fr ] en,fr;q=.90 +[ ru en-uk en fr ] ru, en-UK;q=0.5, en;q=0.3, fr;q=0.1 +[ en-us fr es-mx ] en-us,fr;q=0.7,es-mx;q=0.3 +[ en-us en ] en-us, en;q=0.50 + +[ da en-gb en ] da, en-gb;q=0.8, en;q=0.7 +[ da en-gb en ] da, en;q=0.7, en-gb;q=0.8 +[ da en-gb en ] da, en-gb;q=0.8, en;q=0.7 +[ da en-gb en ] da,en;q=0.7,en-gb;q=0.8 +[ da en-gb en ] da, en-gb ; q=0.8, en ; q=0.7 +[ da en-gb en ] da , en-gb ; q = 0.8 , en ; q =0.7 +[ da en-gb en ] da (yup, Danish) , en-gb ; q = 0.8 , en ; q =0.7 + +[ no dk en-uk en-us ] en-UK;q=0.7, en-US;q=0.6, no;q=1.0, dk;q=0.8 +[ no dk en-uk en-us ] en-US;q=0.6, en-UK;q=0.7, no;q=1.0, dk;q=0.8 +[ no dk en-uk en-us ] en-UK;q=0.7, no;q=1.0, en-US;q=0.6, dk;q=0.8 +[ no dk en-uk en-us ] en-UK;q=0.7, no;q=1.0, dk;q=0.8, en-US;q=0.6 + +[ fi en ] fi;q=1, en;q=0.2 +[ de-de de en en-us en-gb ] de-DE, de;q=0.80, en;q=0.60, en-US;q=0.40, en-GB;q=0.20 +[ ru ] ru; q=1, *; q=0.1 +[ ru en ] ru, en; q=0.1 +[ ja en ] ja,en;q=0.5 +[ en ] en; q=1.0 +[ ja ] ja; q=1.0 +[ ja ] ja; q=1.0 +[ en ja ] en; q=0.5, ja; q=0.5 +[ fr-ca fr en ] fr-ca, fr;q=0.8, en;q=0.7 +[ NIX ] NIX +}; + +foreach my $in (@in) { + $in =~ s/^\s*\[([^\]]+)\]\s*//s or die "Bad input: $in"; + my @should = do { my $x = $1; $x =~ m/(\S+)/g }; + + if($in eq 'NIX') { $in = ''; @should = (); } + + local $ENV{'HTTP_ACCEPT_LANGUAGE'}; + + foreach my $modus ( + sub { + print "# Testing with arg...\n"; + $ENV{'HTTP_ACCEPT_LANGUAGE'} = 'PLORK'; + return $_[0]; + }, + sub { + print "# Testing wath HTTP_ACCEPT_LANGUAGE...\n"; + $ENV{'HTTP_ACCEPT_LANGUAGE'} = $_[0]; + return(); + }, + ) { + my @args = &$modus($in); + + # //////////////////////////////////////////////////// + my @out = I18N::LangTags::Detect->http_accept_langs(@args); + # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ + + if( + @out == @should + and lc( join "\e", @out ) eq lc( join "\e", @should ) + ) { + print "# Happily got [@out] from [$in]\n"; + ok 1; + } else { + ok 0; + print "#Got: [@out]\n", + "# but wanted: [@should]\n", + "# < \"$in\"\n#\n"; + } + } +} + +print "#\n#\n# Bye-bye!\n"; +ok 1; + diff --git a/dist/I18N-LangTags/t/20_locales.t b/dist/I18N-LangTags/t/20_locales.t new file mode 100644 index 0000000000..ae04812ff2 --- /dev/null +++ b/dist/I18N-LangTags/t/20_locales.t @@ -0,0 +1,38 @@ +require 5; + # Time-stamp: "2004-10-06 23:07:06 ADT" +use strict; +use Test; +BEGIN { plan tests => 22 }; +BEGIN { ok 1 } +use I18N::LangTags (':ALL'); + +print "# Perl v$], I18N::LangTags v$I18N::LangTags::VERSION\n"; +print "# Loaded from ", $INC{'I18N/LangTags.pm'} || "??", "\n"; + +ok lc locale2language_tag('en'), 'en'; +ok lc locale2language_tag('en_US'), 'en-us'; +ok lc locale2language_tag('en_US.ISO8859-1'), 'en-us'; +ok lc(locale2language_tag('C')||''), ''; +ok lc(locale2language_tag('POSIX')||''), ''; + + +ok lc locale2language_tag('eu_mt'), 'eu-mt'; +ok lc locale2language_tag('eu'), 'eu'; +ok lc locale2language_tag('it'), 'it'; +ok lc locale2language_tag('it_IT'), 'it-it'; +ok lc locale2language_tag('it_IT.utf8'), 'it-it'; +ok lc locale2language_tag('it_IT.utf8@euro'), 'it-it'; +ok lc locale2language_tag('it_IT@euro'), 'it-it'; + + +ok lc locale2language_tag('zh_CN.gb18030'), 'zh-cn'; +ok lc locale2language_tag('zh_CN.gbk'), 'zh-cn'; +ok lc locale2language_tag('zh_CN.utf8'), 'zh-cn'; +ok lc locale2language_tag('zh_HK'), 'zh-hk'; +ok lc locale2language_tag('zh_HK.utf8'), 'zh-hk'; +ok lc locale2language_tag('zh_TW'), 'zh-tw'; +ok lc locale2language_tag('zh_TW.euctw'), 'zh-tw'; +ok lc locale2language_tag('zh_TW.utf8'), 'zh-tw'; + +print "# So there!\n"; +ok 1; diff --git a/dist/I18N-LangTags/t/50_super.t b/dist/I18N-LangTags/t/50_super.t new file mode 100644 index 0000000000..9923c84068 --- /dev/null +++ b/dist/I18N-LangTags/t/50_super.t @@ -0,0 +1,88 @@ + +# Time-stamp: "2004-03-30 17:46:17 AST" + +use Test; +BEGIN { plan tests => 26 }; +print "#\n# Testing normal (tight) insertion of super-ordinate language tags...\n#\n"; + +use I18N::LangTags qw(implicate_supers); + +my @in = grep m/\S/, split /[\n\r]/, q{ + NIX => NIX + sv => sv + en => en + hai => hai + + pt-br => pt-br pt + pt-br fr => pt-br pt fr + pt-br fr pt => pt-br fr pt + + pt-br fr pt de => pt-br fr pt de + de pt-br fr pt => de pt-br fr pt + de pt-br fr => de pt-br pt fr + hai pt-br fr => hai pt-br pt fr + + # Now test multi-part complicateds: + pt-br-janeiro => pt-br-janeiro pt-br pt + pt-br-janeiro fr => pt-br-janeiro pt-br pt fr + pt-br-janeiro de fr => pt-br-janeiro pt-br pt de fr + pt-br-janeiro de pt fr => pt-br-janeiro pt-br de pt fr + + pt-br-janeiro pt-br-saopaolo => pt-br-janeiro pt-br pt pt-br-saopaolo + pt-br-janeiro fr pt-br-saopaolo => pt-br-janeiro pt-br pt fr pt-br-saopaolo + pt-br-janeiro de pt-br-saopaolo fr => pt-br-janeiro pt-br pt de pt-br-saopaolo fr + pt-br-janeiro de pt-br fr pt-br-saopaolo => pt-br-janeiro de pt-br pt fr pt-br-saopaolo + + pt-br de en fr pt-br-janeiro => pt-br pt de en fr pt-br-janeiro + pt-br de en fr => pt-br pt de en fr + + ja pt-br-janeiro fr => ja pt-br-janeiro pt-br pt fr + ja pt-br-janeiro de fr => ja pt-br-janeiro pt-br pt de fr + ja pt-br-janeiro de pt fr => ja pt-br-janeiro pt-br de pt fr + + pt-br-janeiro de pt-br fr => pt-br-janeiro de pt-br pt fr +# an odd case, since we don't filter for uniqueness in this sub + +}; + +sub uniq { my %seen; return grep(!($seen{$_}++), @_); } + +foreach my $in (@in) { + $in =~ s/^\s+//s; + $in =~ s/\s+$//s; + $in =~ s/#.+//s; + next unless $in =~ m/\S/; + + my(@in, @should); + { + die "What kind of line is <$in>?!" + unless $in =~ m/^(.+)=>(.+)$/s; + + my($i,$s) = ($1, $2); + @in = ($i =~ m/(\S+)/g); + @should = ($s =~ m/(\S+)/g); + #print "{@in}{@should}\n"; + } + my @out = implicate_supers( + ("@in" eq 'NIX') ? () : @in + ); + #print "O: ", join(' ', map "<$_>", @out), "\n"; + @out = 'NIX' unless @out; + + + if( @out == @should + and lc( join "\e", @out ) eq lc( join "\e", @should ) + ) { + print "# Happily got [@out] from [$in]\n"; + ok 1; + } else { + ok 0; + print "#!!Got: [@out]\n", + "#!! but wanted: [@should]\n", + "#!! from \"$in\"\n#\n"; + } +} + +print "#\n#\n# Bye-bye!\n"; +ok 1; + diff --git a/dist/I18N-LangTags/t/55_supers_strict.t b/dist/I18N-LangTags/t/55_supers_strict.t new file mode 100644 index 0000000000..3b285157a1 --- /dev/null +++ b/dist/I18N-LangTags/t/55_supers_strict.t @@ -0,0 +1,78 @@ + +# Time-stamp: "2004-03-30 17:49:58 AST" +#sub I18N::LangTags::Detect::DEBUG () {10} +use I18N::LangTags qw(implicate_supers_strictly); + +use Test; +BEGIN { plan tests => 19 }; + +print "#\n# Testing strict (non-tight) insertion of super-ordinate language tags...\n#\n"; + +my @in = grep m/\S/, split /[\n\r]/, q{ + NIX => NIX + sv => sv + en => en + hai => hai + + pt-br => pt-br pt + pt-br fr => pt-br fr pt + pt-br fr pt => pt-br fr pt + pt-br fr pt de => pt-br fr pt de + de pt-br fr pt => de pt-br fr pt + de pt-br fr => de pt-br fr pt + hai pt-br fr => hai pt-br fr pt + +# Now test multi-part complicateds: + pt-br-janeiro fr => pt-br-janeiro fr pt-br pt +pt-br-janeiro de fr => pt-br-janeiro de fr pt-br pt +pt-br-janeiro de pt fr => pt-br-janeiro de pt fr pt-br + +ja pt-br-janeiro fr => ja pt-br-janeiro fr pt-br pt +ja pt-br-janeiro de fr => ja pt-br-janeiro de fr pt-br pt +ja pt-br-janeiro de pt fr => ja pt-br-janeiro de pt fr pt-br + +pt-br-janeiro de pt-br fr => pt-br-janeiro de pt-br fr pt + # an odd case, since we don't filter for uniqueness in this sub + +}; + + +foreach my $in (@in) { + $in =~ s/^\s+//s; + $in =~ s/\s+$//s; + $in =~ s/#.+//s; + next unless $in =~ m/\S/; + + my(@in, @should); + { + die "What kind of line is <$in>?!" + unless $in =~ m/^(.+)=>(.+)$/s; + + my($i,$s) = ($1, $2); + @in = ($i =~ m/(\S+)/g); + @should = ($s =~ m/(\S+)/g); + #print "{@in}{@should}\n"; + } + my @out = I18N::LangTags::implicate_supers_strictly( + ("@in" eq 'NIX') ? () : @in + ); + #print "O: ", join(' ', map "<$_>", @out), "\n"; + @out = 'NIX' unless @out; + + + if( @out == @should + and lc( join "\e", @out ) eq lc( join "\e", @should ) + ) { + print "# Happily got [@out] from [$in]\n"; + ok 1; + } else { + ok 0; + print "#!!Got: [@out]\n", + "#!! but wanted: [@should]\n", + "#!! from \"$in\"\n#\n"; + } +} + +print "#\n#\n# Bye-bye!\n"; +ok 1; + diff --git a/dist/I18N-LangTags/t/80_all_env.t b/dist/I18N-LangTags/t/80_all_env.t new file mode 100644 index 0000000000..1362d5ebf5 --- /dev/null +++ b/dist/I18N-LangTags/t/80_all_env.t @@ -0,0 +1,115 @@ + +require 5; +use Test; +# Time-stamp: "2004-07-01 14:33:50 ADT" +BEGIN { plan tests => 20; } +use I18N::LangTags::Detect 1.01; +print "# Hi there...\n"; +ok 1; + +print "# Using I18N::LangTags::Detect v$I18N::LangTags::Detect::VERSION\n"; + +print "# Make sure we can assign to ENV entries\n", + "# (Otherwise we can't run the subsequent tests)...\n"; +$ENV{'MYORP'} = 'Zing'; ok $ENV{'MYORP'}, 'Zing'; +$ENV{'SWUZ'} = 'KLORTHO HOOBOY'; ok $ENV{'SWUZ'}, 'KLORTHO HOOBOY'; + +delete $ENV{'MYORP'}; +delete $ENV{'SWUZ'}; + +sub j { "[" . join(' ', map "\"$_\"", @_) . "]" ;} + +sub show { + print "# (Seeing {", join(' ', + map(qq{<$_>}, @_)), "} at line ", (caller)[2], ")\n"; + printenv(); + return $_[0] || ''; +} +sub printenv { + print "# ENV:\n"; + foreach my $k (sort keys %ENV) { + my $p = $ENV{$k}; $p =~ s/\n/\n#/g; + print "# [$k] = [$p]\n"; } + print "# [end of ENV]\n#\n"; +} + +$ENV{'IGNORE_WIN32_LOCALE'} = 1; # a hack, just for testing's sake. + + +print "# Test LANGUAGE...\n"; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LANGUAGE'} = 'Eu-MT'; +$ENV{'LC_ALL'} = ''; +$ENV{'LC_MESSAGES'} = ''; +$ENV{'LANG'} = ''; +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + +print "# Test LC_ALL...\n"; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LANGUAGE'} = ''; +$ENV{'LC_ALL'} = 'Eu-MT'; +$ENV{'LC_MESSAGES'} = ''; +$ENV{'LANG'} = ''; + +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + +print "# Test LC_MESSAGES...\n"; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LANGUAGE'} = ''; +$ENV{'LC_ALL'} = ''; +$ENV{'LC_MESSAGES'} = 'Eu-MT'; +$ENV{'LANG'} = ''; + +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + +print "# Test LANG...\n"; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LANGUAGE'} = ''; +$ENV{'LC_ALL'} = ''; +$ENV{'LC_MESSAGES'} = ''; +$ENV{'LANG'} = 'Eu_MT'; + +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + + +print "# Test LANG...\n"; +$ENV{'LANGUAGE'} = ''; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LC_ALL'} = ''; +$ENV{'LC_MESSAGES'} = ''; +$ENV{'LANG'} = 'Eu_MT'; + +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + + + +print "# Test HTTP_ACCEPT_LANGUAGE...\n"; +$ENV{'REQUEST_METHOD'} = 'GET'; +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'eu-MT'; +ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; +ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; + + +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'x-plorp, zaz, eu-MT, i-klung'; +ok show( scalar I18N::LangTags::Detect::detect()), "x-plorp"; +ok show( j I18N::LangTags::Detect::detect()), qq{["x-plorp" "i-plorp" "zaz" "eu-mt" "i-klung" "x-klung"]}; + +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'x-plorp, zaz, eU-Mt, i-klung'; +ok show( scalar I18N::LangTags::Detect::detect()), "x-plorp"; +ok show( j I18N::LangTags::Detect::detect()), qq{["x-plorp" "i-plorp" "zaz" "eu-mt" "i-klung" "x-klung"]}; + + + + +print "# Byebye!\n"; +ok 1; + diff --git a/ext/I18N-LangTags/ChangeLog b/ext/I18N-LangTags/ChangeLog deleted file mode 100644 index 6cd744ef46..0000000000 --- a/ext/I18N-LangTags/ChangeLog +++ /dev/null @@ -1,195 +0,0 @@ -Revision history for Perl module I18N::LangTags. - Time-stamp: "2004-10-06 23:26:53 ADT" - -2004-10-06 Sean M. Burke sburke@cpan.org - - * Release 0.35 - - Bugfix version: locale2language_tag now correctly understands - locale-IDs with at-signs in them, like 'it_IT.utf8@euro' or - 'it_IT@euro'. This is now enforced by the new t/20_locales.t - - Thanks to Luca 'loopback' Cavalli for letting me know about these - new locale-ID name-styles. - - - * Release 0.34 -- never happened, because of an upload error - - -2004-07-01 Sean M. Burke sburke@cpan.org - - * Release 0.33 - - Minor bugfix version: - The test 80_all_env.t was erroneously failing for people with - LC_ALL or LC_MESSAGES set. Fixed. Thanks to everyone, especially - Nicholas Clark, who patiently helped out with this. - - -2004-06-20 Sean M. Burke sburke@cpan.org - - * Release 0.32 - - Minor bugfix version: - The test 80_all_env.t was erroneously failing under MSWins that - had Win32::Locale installed. A workaround added. - - -2004-06-17 Sean M. Burke sburke@cpan.org - - * Release 0.31 - - Corrected some unevennesses in when/whether the return values from - I18N::LangTags::Detect's various internal functions would be - downcased. Now they're /always/ downcased, and are /always/ fed - thru alternate_language_tags()! - - Also, spiffed up and generally improved the earlier test - 80_all_env.t, which not even I could make sense of, and I wrote - the damned thing. Now it's sane, and checks both scalar and - list return values. Thanks to Rafael Garcia-Suarez and the - various CPAN-Testers for prodding me to fix this. (Hopefully the - earlier problems /are/ now fixed! Otherwise there'll be another - version of this module out real soon!) - - -2004-03-30 Sean M. Burke sburke@cpan.org - - * Release 0.30 - - New in I18N::LangTags : implicate_supers and - implicate_supers_strictly. - - New module: I18N::LangTags::Detect. - - Some new tests. - - Thanks to Autrijus Tang for catching some errors in my makefile! - - - -2003-10-10 Sean M. Burke sburke@cpan.org - - * Release 0.29 - - Minor bugfix to I18N::LangTags::List code. Addition of the - is_decent function, and the 02decency.t test for it. - - Better Makefile. Thanks to everyone who told me about the - INSTALLDIRS trick. - - - -2003-07-20 Sean M. Burke sburke@cpan.org - - * Release 0.28 - Doc fixes in I18N::LangTags, plus a few added variances (jw/jv, - cre/cr, etc.) - Lots of updates to I18N::LangTags::List - Deleted rfc3066.txt from dist. - Moved test.pl to t/01test.t and added more tests. - -2002-02-02 Sean M. Burke sburke@cpan.org - - * Release 0.27 -- minor mods to ::List: - Fixing its entries for sv-se and sv-fi. - Typo-fixes and rewordings in the incidental Pod text elsewhere. - -2001-06-21 Sean M. Burke sburke@cpan.org - - * Release 0.26 -- just making cosmetic changes - to test.pl, at Jarkko's request. - -2001-06-20 Sean M. Burke sburke@cpan.org - - * Release 0.25 -- just tweaking panic_languages behavior - for Scandinavian languages. Much better now. - Slight tweak to ::List's entries for Greek. - -2001-06-20 Sean M. Burke sburke@cpan.org - - * Release 0.24 - - * I18N::LangTags -- some elaborate hacks to make us - recognize legacy aliases like no-nyn == nn. - Added panic_languages(). - Added :ALL export tag. - Minor docs fixes, and spiffing up test.pl. - - * I18N::LangTags::List -- minor corrections; added - a few aliases. - -2001-05-29 Sean M. Burke sburke@cpan.org - - * Release 0.23 - - * I18N::LangTags::List -- minor corrections. And is now - a module, not just documentation. - -2001-05-27 Sean M. Burke sburke@cpan.org - - * Release 0.22 - - * Now bundling I18N::LangTags::List, a reference for lang tags, - replacing generate_language_table.plx and language_codes.txt - -2001-05-25 Sean M. Burke sburke@cpan.org - - * Release 0.21 - - * extract_language_tags and locale2langauge_tag now - return untainted output. Useful if you feed tainted - things, like $ENV{'LANG'}. - -2001-03-13 Sean M. Burke sburke@cpan.org - - * Release 0.20 - - * Added support for RFC 3066 tags: allowing three-letter primary - tags ("nav"), and allowing digits in subtags ("x-borg-prot3252"). - - * Changed all references from RFC 1766 to RFC 3066. - - * Now bundling fulltext of RFC 3066 in the dist. - - * Now bundling generate_language_table.plx and language_codes.txt - - * Added some nice tests to test.pl - - * Inverting order of listings in this ChangeLog file. - -2000-05-13 Sean M. Burke sburke@cpan.org - - * Release 0.13 - - * Just noting my new email address. - -1999-03-06 Sean M. Burke sburke@netadventure.net - - * Release 0.11 - - * Added functions - similarity_language_tag, is_dialect_of, - locale2language_tag, alternate_language_tags, and - encode_language_tag - -1998-12-14 Sean M. Burke sburke@netadventure.net - - * Release 0.09 - - * Added function super_languages() - -1998-10-31 Sean M. Burke sburke@netadventure.net - - * Release 0.08 - - * Just changes in the docs and bundle -- no change - in functionality. - -1998-04-02 Sean M. Burke sburke@netadventure.net - - * Release 0.07 - - * First public release. - -[END OF CHANGELOG] diff --git a/ext/I18N-LangTags/README b/ext/I18N-LangTags/README deleted file mode 100644 index ef0eb7b2f9..0000000000 --- a/ext/I18N-LangTags/README +++ /dev/null @@ -1,78 +0,0 @@ -README for I18N::LangTags - Time-stamp: "2004-10-06 23:19:39 ADT" - - I18N::LangTags - -I18N::LangTags - functions for dealing with RFC3066-style language -tags - -Language tags are a formalism, described in RFC 3066 (obsoleting -1766), for declaring what language form (language and possibly -dialect) a given chunk of information is in. - -This library provides functions for common tasks involving language -tags (notably the extraction of them, comparing them, and testing the -formal validity of them) as is needed in a variety of protocols and -applications. - - -I18N::LangTags::List -- tags and names for human languages. This -module goes from known language tag names ("fr-CA") to their English -names ("Canadian French"). Its documentation also lists the several -hundred known tags and some common subforms. You may find this useful -as a reference. - - -See the POD for more information. - - -INSTALLATION - -You install I18N::LangTags and I18N::LangTags::List, as you would -install any perl module library, by running these commands: - - perl Makefile.PL - make - make test - make install - -If you want to install a private copy of I18N::LangTags in your home -directory, then you should try to produce the initial Makefile with -something like this command: - - perl Makefile.PL LIB=~/perl - -See perldoc perlmodinstall for more information on installing modules. - - -DOCUMENTATION - -POD-format documentation is included in LangTags.pm. POD is readable -with the 'perldoc' utility. See ChangeLog for recent changes. - - -SUPPORT - -Questions, bug reports, useful code bits, and suggestions for -I18N::LangTags should just be sent to me at sburke@cpan.org - - -AVAILABILITY - -The latest version of I18N::LangTags is available from the -Comprehensive Perl Archive Network (CPAN). Visit - to find a CPAN site near you. - - -COPYRIGHT - -Copyright 1998+, Sean M. Burke , all rights -reserved. - -The programs and documentation in this dist are distributed in -the hope that they will be useful, but without any warranty; without -even the implied warranty of merchantability or fitness for a -particular purpose. - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. diff --git a/ext/I18N-LangTags/lib/I18N/LangTags.pm b/ext/I18N-LangTags/lib/I18N/LangTags.pm deleted file mode 100644 index 0bdc65fed1..0000000000 --- a/ext/I18N-LangTags/lib/I18N/LangTags.pm +++ /dev/null @@ -1,887 +0,0 @@ - -# Time-stamp: "2004-10-06 23:26:33 ADT" -# Sean M. Burke - -require 5.000; -package I18N::LangTags; -use strict; -use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); -require Exporter; -@ISA = qw(Exporter); -@EXPORT = qw(); -@EXPORT_OK = qw(is_language_tag same_language_tag - extract_language_tags super_languages - similarity_language_tag is_dialect_of - locale2language_tag alternate_language_tags - encode_language_tag panic_languages - implicate_supers - implicate_supers_strictly - ); -%EXPORT_TAGS = ('ALL' => \@EXPORT_OK); - -$VERSION = "0.35"; - -sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function - - -=head1 NAME - -I18N::LangTags - functions for dealing with RFC3066-style language tags - -=head1 SYNOPSIS - - use I18N::LangTags(); - -...or specify whichever of those functions you want to import, like so: - - use I18N::LangTags qw(implicate_supers similarity_language_tag); - -All the exportable functions are listed below -- you're free to import -only some, or none at all. By default, none are imported. If you -say: - - use I18N::LangTags qw(:ALL) - -...then all are exported. (This saves you from having to use -something less obvious like C.) - -If you don't import any of these functions, assume a C<&I18N::LangTags::> -in front of all the function names in the following examples. - -=head1 DESCRIPTION - -Language tags are a formalism, described in RFC 3066 (obsoleting -1766), for declaring what language form (language and possibly -dialect) a given chunk of information is in. - -This library provides functions for common tasks involving language -tags as they are needed in a variety of protocols and applications. - -Please see the "See Also" references for a thorough explanation -of how to correctly use language tags. - -=over - -=cut - -########################################################################### - -=item * the function is_language_tag($lang1) - -Returns true iff $lang1 is a formally valid language tag. - - is_language_tag("fr") is TRUE - is_language_tag("x-jicarilla") is FALSE - (Subtags can be 8 chars long at most -- 'jicarilla' is 9) - - is_language_tag("sgn-US") is TRUE - (That's American Sign Language) - - is_language_tag("i-Klikitat") is TRUE - (True without regard to the fact noone has actually - registered Klikitat -- it's a formally valid tag) - - is_language_tag("fr-patois") is TRUE - (Formally valid -- altho descriptively weak!) - - is_language_tag("Spanish") is FALSE - is_language_tag("french-patois") is FALSE - (No good -- first subtag has to match - /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) - - is_language_tag("x-borg-prot2532") is TRUE - (Yes, subtags can contain digits, as of RFC3066) - -=cut - -sub is_language_tag { - - ## Changes in the language tagging standards may have to be reflected here. - - my($tag) = lc($_[0]); - - return 0 if $tag eq "i" or $tag eq "x"; - # Bad degenerate cases that the following - # regexp would erroneously let pass - - return $tag =~ - /^(?: # First subtag - [xi] | [a-z]{2,3} - ) - (?: # Subtags thereafter - - # separator - [a-z0-9]{1,8} # subtag - )* - $/xs ? 1 : 0; -} - -########################################################################### - -=item * the function extract_language_tags($whatever) - -Returns a list of whatever looks like formally valid language tags -in $whatever. Not very smart, so don't get too creative with -what you want to feed it. - - extract_language_tags("fr, fr-ca, i-mingo") - returns: ('fr', 'fr-ca', 'i-mingo') - - extract_language_tags("It's like this: I'm in fr -- French!") - returns: ('It', 'in', 'fr') - (So don't just feed it any old thing.) - -The output is untainted. If you don't know what tainting is, -don't worry about it. - -=cut - -sub extract_language_tags { - - ## Changes in the language tagging standards may have to be reflected here. - - my($text) = - $_[0] =~ m/(.+)/ # to make for an untainted result - ? $1 : '' - ; - - return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags - $text =~ - m/ - \b - (?: # First subtag - [iIxX] | [a-zA-Z]{2,3} - ) - (?: # Subtags thereafter - - # separator - [a-zA-Z0-9]{1,8} # subtag - )* - \b - /xsg - ); -} - -########################################################################### - -=item * the function same_language_tag($lang1, $lang2) - -Returns true iff $lang1 and $lang2 are acceptable variant tags -representing the same language-form. - - same_language_tag('x-kadara', 'i-kadara') is TRUE - (The x/i- alternation doesn't matter) - same_language_tag('X-KADARA', 'i-kadara') is TRUE - (...and neither does case) - same_language_tag('en', 'en-US') is FALSE - (all-English is not the SAME as US English) - same_language_tag('x-kadara', 'x-kadar') is FALSE - (these are totally unrelated tags) - same_language_tag('no-bok', 'nb') is TRUE - (no-bok is a legacy tag for nb (Norwegian Bokmal)) - -C works by just seeing whether -C is the same as -C. - -(Yes, I know this function is named a bit oddly. Call it historic -reasons.) - -=cut - -sub same_language_tag { - my $el1 = &encode_language_tag($_[0]); - return 0 unless defined $el1; - # this avoids the problem of - # encode_language_tag($lang1) eq and encode_language_tag($lang2) - # being true if $lang1 and $lang2 are both undef - - return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; -} - -########################################################################### - -=item * the function similarity_language_tag($lang1, $lang2) - -Returns an integer representing the degree of similarity between -tags $lang1 and $lang2 (the order of which does not matter), where -similarity is the number of common elements on the left, -without regard to case and to x/i- alternation. - - similarity_language_tag('fr', 'fr-ca') is 1 - (one element in common) - similarity_language_tag('fr-ca', 'fr-FR') is 1 - (one element in common) - - similarity_language_tag('fr-CA-joual', - 'fr-CA-PEI') is 2 - similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 - (two elements in common) - - similarity_language_tag('x-kadara', 'i-kadara') is 1 - (x/i- doesn't matter) - - similarity_language_tag('en', 'x-kadar') is 0 - similarity_language_tag('x-kadara', 'x-kadar') is 0 - (unrelated tags -- no similarity) - - similarity_language_tag('i-cree-syllabic', - 'i-cherokee-syllabic') is 0 - (no B elements in common!) - -=cut - -sub similarity_language_tag { - my $lang1 = &encode_language_tag($_[0]); - my $lang2 = &encode_language_tag($_[1]); - # And encode_language_tag takes care of the whole - # no-nyn==nn, i-hakka==zh-hakka, etc, things - - # NB: (i-sil-...)? (i-sgn-...)? - - return undef if !defined($lang1) and !defined($lang2); - return 0 if !defined($lang1) or !defined($lang2); - - my @l1_subtags = split('-', $lang1); - my @l2_subtags = split('-', $lang2); - my $similarity = 0; - - while(@l1_subtags and @l2_subtags) { - if(shift(@l1_subtags) eq shift(@l2_subtags)) { - ++$similarity; - } else { - last; - } - } - return $similarity; -} - -########################################################################### - -=item * the function is_dialect_of($lang1, $lang2) - -Returns true iff language tag $lang1 represents a subform of -language tag $lang2. - -B - - is_dialect_of('en-US', 'en') is TRUE - (American English IS a dialect of all-English) - - is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE - is_dialect_of('fr-CA-joual', 'fr') is TRUE - (Joual is a dialect of (a dialect of) French) - - is_dialect_of('en', 'en-US') is FALSE - (all-English is a NOT dialect of American English) - - is_dialect_of('fr', 'en-CA') is FALSE - - is_dialect_of('en', 'en' ) is TRUE - is_dialect_of('en-US', 'en-US') is TRUE - (B these are degenerate cases) - - is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE - (the x/i thing doesn't matter, nor does case) - - is_dialect_of('nn', 'no') is TRUE - (because 'nn' (New Norse) is aliased to 'no-nyn', - as a special legacy case, and 'no-nyn' is a - subform of 'no' (Norwegian)) - -=cut - -sub is_dialect_of { - - my $lang1 = &encode_language_tag($_[0]); - my $lang2 = &encode_language_tag($_[1]); - - return undef if !defined($lang1) and !defined($lang2); - return 0 if !defined($lang1) or !defined($lang2); - - return 1 if $lang1 eq $lang2; - return 0 if length($lang1) < length($lang2); - - $lang1 .= '-'; - $lang2 .= '-'; - return - (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; -} - -########################################################################### - -=item * the function super_languages($lang1) - -Returns a list of language tags that are superordinate tags to $lang1 --- it gets this by removing subtags from the end of $lang1 until -nothing (or just "i" or "x") is left. - - super_languages("fr-CA-joual") is ("fr-CA", "fr") - - super_languages("en-AU") is ("en") - - super_languages("en") is empty-list, () - - super_languages("i-cherokee") is empty-list, () - ...not ("i"), which would be illegal as well as pointless. - -If $lang1 is not a valid language tag, returns empty-list in -a list context, undef in a scalar context. - -A notable and rather unavoidable problem with this method: -"x-mingo-tom" has an "x" because the whole tag isn't an -IANA-registered tag -- but super_languages('x-mingo-tom') is -('x-mingo') -- which isn't really right, since 'i-mingo' is -registered. But this module has no way of knowing that. (But note -that same_language_tag('x-mingo', 'i-mingo') is TRUE.) - -More importantly, you assume I that superordinates of -$lang1 are mutually intelligible with $lang1. Consider this -carefully. - -=cut - -sub super_languages { - my $lang1 = $_[0]; - return() unless defined($lang1) && &is_language_tag($lang1); - - # a hack for those annoying new (2001) tags: - $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards - $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards - $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way - # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark - - my @l1_subtags = split('-', $lang1); - - ## Changes in the language tagging standards may have to be reflected here. - - # NB: (i-sil-...)? - - my @supers = (); - foreach my $bit (@l1_subtags) { - push @supers, - scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; - } - pop @supers if @supers; - shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; - return reverse @supers; -} - -########################################################################### - -=item * the function locale2language_tag($locale_identifier) - -This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") -and maps it to a language tag. If it's not mappable (as with, -notably, "C" and "POSIX"), this returns empty-list in a list context, -or undef in a scalar context. - - locale2language_tag("en") is "en" - - locale2language_tag("en_US") is "en-US" - - locale2language_tag("en_US.ISO8859-1") is "en-US" - - locale2language_tag("C") is undef or () - - locale2language_tag("POSIX") is undef or () - - locale2language_tag("POSIX") is undef or () - -I'm not totally sure that locale names map satisfactorily to language -tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. - -The output is untainted. If you don't know what tainting is, -don't worry about it. - -=cut - -sub locale2language_tag { - my $lang = - $_[0] =~ m/(.+)/ # to make for an untainted result - ? $1 : '' - ; - - return $lang if &is_language_tag($lang); # like "en" - - $lang =~ tr<_><->; # "en_US" -> en-US - $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US - # it_IT.utf8@euro => it-IT - - return $lang if &is_language_tag($lang); - - return; -} - -########################################################################### - -=item * the function encode_language_tag($lang1) - -This function, if given a language tag, returns an encoding of it such -that: - -* tags representing different languages never get the same encoding. - -* tags representing the same language always get the same encoding. - -* an encoding of a formally valid language tag always is a string -value that is defined, has length, and is true if considered as a -boolean. - -Note that the encoding itself is B a formally valid language tag. -Note also that you cannot, currently, go from an encoding back to a -language tag that it's an encoding of. - -Note also that you B consider the encoded value as atomic; i.e., -you should not consider it as anything but an opaque, unanalysable -string value. (The internals of the encoding method may change in -future versions, as the language tagging standard changes over time.) - -C returns undef if given anything other than a -formally valid language tag. - -The reason C exists is because different language -tags may represent the same language; this is normally treatable with -C, but consider this situation: - -You have a data file that expresses greetings in different languages. -Its format is "[language tag]=[how to say 'Hello']", like: - - en-US=Hiho - fr=Bonjour - i-mingo=Hau' - -And suppose you write a program that reads that file and then runs as -a daemon, answering client requests that specify a language tag and -then expect the string that says how to greet in that language. So an -interaction looks like: - - greeting-client asks: fr - greeting-server answers: Bonjour - -So far so good. But suppose the way you're implementing this is: - - my %greetings; - die unless open(IN, ") { - chomp; - next unless /^([^=]+)=(.+)/s; - my($lang, $expr) = ($1, $2); - $greetings{$lang} = $expr; - } - close(IN); - -at which point %greetings has the contents: - - "en-US" => "Hiho" - "fr" => "Bonjour" - "i-mingo" => "Hau'" - -And suppose then that you answer client requests for language $wanted -by just looking up $greetings{$wanted}. - -If the client asks for "fr", that will look up successfully in -%greetings, to the value "Bonjour". And if the client asks for -"i-mingo", that will look up successfully in %greetings, to the value -"Hau'". - -But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the -lookup in %greetings fails. That's the Wrong Thing. - -You could instead do lookups on $wanted with: - - use I18N::LangTags qw(same_language_tag); - my $response = ''; - foreach my $l2 (keys %greetings) { - if(same_language_tag($wanted, $l2)) { - $response = $greetings{$l2}; - last; - } - } - -But that's rather inefficient. A better way to do it is to start your -program with: - - use I18N::LangTags qw(encode_language_tag); - my %greetings; - die unless open(IN, ") { - chomp; - next unless /^([^=]+)=(.+)/s; - my($lang, $expr) = ($1, $2); - $greetings{ - encode_language_tag($lang) - } = $expr; - } - close(IN); - -and then just answer client requests for language $wanted by just -looking up - - $greetings{encode_language_tag($wanted)} - -And that does the Right Thing. - -=cut - -sub encode_language_tag { - # Only similarity_language_tag() is allowed to analyse encodings! - - ## Changes in the language tagging standards may have to be reflected here. - - my($tag) = $_[0] || return undef; - return undef unless &is_language_tag($tag); - - # For the moment, these legacy variances are few enough that - # we can just handle them here with regexps. - $tag =~ s/^iw\b/he/i; # Hebrew - $tag =~ s/^in\b/id/i; # Indonesian - $tag =~ s/^cre\b/cr/i; # Cree - $tag =~ s/^jw\b/jv/i; # Javanese - $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger - $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo - $tag =~ s/^ji\b/yi/i; # Yiddish - # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now, - # but maybe they're all so obscure I can ignore them. "Obscure" - # meaning either that the language is obscure, and/or that the - # XXX form was extant so briefly that it's unlikely it was ever - # used. I hope. - # - # These go FROM the simplex to complex form, to get - # similarity-comparison right. And that's okay, since - # similarity_language_tag is the only thing that - # analyzes our output. - $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka - $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal - $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk - - $tag =~ s/^[xiXI]-//s; - # Just lop off any leading "x/i-" - - return "~" . uc($tag); -} - -#-------------------------------------------------------------------------- - -=item * the function alternate_language_tags($lang1) - -This function, if given a language tag, returns all language tags that -are alternate forms of this language tag. (I.e., tags which refer to -the same language.) This is meant to handle legacy tags caused by -the minor changes in language tag standards over the years; and -the x-/i- alternation is also dealt with. - -Note that this function does I try to equate new (and never-used, -and unusable) -ISO639-2 three-letter tags to old (and still in use) ISO639-1 -two-letter equivalents -- like "ara" -> "ar" -- because -"ara" has I been in use as an Internet language tag, -and RFC 3066 stipulates that it never should be, since a shorter -tag ("ar") exists. - -Examples: - - alternate_language_tags('no-bok') is ('nb') - alternate_language_tags('nb') is ('no-bok') - alternate_language_tags('he') is ('iw') - alternate_language_tags('iw') is ('he') - alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') - alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') - alternate_language_tags('en') is () - alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') - alternate_language_tags('x-klikitat') is ('i-klikitat') - alternate_language_tags('i-klikitat') is ('x-klikitat') - -This function returns empty-list if given anything other than a formally -valid language tag. - -=cut - -my %alt = qw( i x x i I X X I ); -sub alternate_language_tags { - my $tag = $_[0]; - return() unless &is_language_tag($tag); - - my @em; # push 'em real goood! - - # For the moment, these legacy variances are few enough that - # we can just handle them here with regexps. - - if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; - } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; - - } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; - } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; - - } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; - } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; - - } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; - } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; - - } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; - } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; - - } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; - } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; - - } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; - } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; - - } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; - } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; - } - - push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; - return @em; -} - -########################################################################### - -{ - # Init %Panic... - - my @panic = ( # MUST all be lowercase! - # Only large ("national") languages make it in this list. - # If you, as a user, are so bizarre that the /only/ language - # you claim to accept is Galician, then no, we won't do you - # the favor of providing Catalan as a panic-fallback for - # you. Because if I start trying to add "little languages" in - # here, I'll just go crazy. - - # Scandinavian lgs. All based on opinion and hearsay. - 'sv' => [qw(nb no da nn)], - 'da' => [qw(nb no sv nn)], # I guess - [qw(no nn nb)], [qw(no nn nb sv da)], - 'is' => [qw(da sv no nb nn)], - 'fo' => [qw(da is no nb nn sv)], # I guess - - # I think this is about the extent of tolerable intelligibility - # among large modern Romance languages. - 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French - 'ca' => [qw(es pt it fr)], - 'es' => [qw(ca it fr pt)], - 'it' => [qw(es fr ca pt)], - 'fr' => [qw(es it ca pt)], - - # Also assume that speakers of the main Indian languages prefer - # to read/hear Hindi over English - [qw( - as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur - )] => 'hi', - # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, - # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, - # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. - 'hi' => [qw(bn pa as or)], - # I welcome finer data for the other Indian languages. - # E.g., what should Oriya's list be, besides just Hindi? - - # And the panic languages for English is, of course, nil! - - # My guesses at Slavic intelligibility: - ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian - 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat - 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak - - 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian - - 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish - - #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai - - ); - my($k,$v); - while(@panic) { - ($k,$v) = splice(@panic,0,2); - foreach my $k (ref($k) ? @$k : $k) { - foreach my $v (ref($v) ? @$v : $v) { - push @{$Panic{$k} ||= []}, $v unless $k eq $v; - } - } - } -} - -=item * the function @langs = panic_languages(@accept_languages) - -This function takes a list of 0 or more language -tags that constitute a given user's Accept-Language list, and -returns a list of tags for I (non-super) -languages that are probably acceptable to the user, to be -used I. - -For example, if a user accepts only 'ca' (Catalan) and -'es' (Spanish), and the documents/interfaces you have -available are just in German, Italian, and Chinese, then -the user will most likely want the Italian one (and not -the Chinese or German one!), instead of getting -nothing. So C returns -a list containing 'it' (Italian). - -English ('en') is I in the return list, but -whether it's at the very end or not depends -on the input languages. This function works by consulting -an internal table that stipulates what common -languages are "close" to each other. - -A useful construct you might consider using is: - - @fallbacks = super_languages(@accept_languages); - push @fallbacks, panic_languages( - @accept_languages, @fallbacks, - ); - -=cut - -sub panic_languages { - # When in panic or in doubt, run in circles, scream, and shout! - my(@out, %seen); - foreach my $t (@_) { - next unless $t; - next if $seen{$t}++; # so we don't return it or hit it again - # push @out, super_languages($t); # nah, keep that separate - push @out, @{ $Panic{lc $t} || next }; - } - return grep !$seen{$_}++, @out, 'en'; -} - -#--------------------------------------------------------------------------- -#--------------------------------------------------------------------------- - -=item * the function implicate_supers( ...languages... ) - -This takes a list of strings (which are presumed to be language-tags; -strings that aren't, are ignored); and after each one, this function -inserts super-ordinate forms that don't already appear in the list. -The original list, plus these insertions, is returned. - -In other words, it takes this: - - pt-br de-DE en-US fr pt-br-janeiro - -and returns this: - - pt-br pt de-DE de en-US en fr pt-br-janeiro - -This function is most useful in the idiom - - implicate_supers( I18N::LangTags::Detect::detect() ); - -(See L.) - - -=item * the function implicate_supers_strictly( ...languages... ) - -This works like C except that the implicated -forms are added to the end of the return list. - -In other words, implicate_supers_strictly takes a list of strings -(which are presumed to be language-tags; strings that aren't, are -ignored) and after the whole given list, it inserts the super-ordinate forms -of all given tags, minus any tags that already appear in the input list. - -In other words, it takes this: - - pt-br de-DE en-US fr pt-br-janeiro - -and returns this: - - pt-br de-DE en-US fr pt-br-janeiro pt de en - -The reason this function has "_strictly" in its name is that when -you're processing an Accept-Language list according to the RFCs, if -you interpret the RFCs quite strictly, then you would use -implicate_supers_strictly, but for normal use (i.e., common-sense use, -as far as I'm concerned) you'd use implicate_supers. - -=cut - -sub implicate_supers { - my @languages = grep is_language_tag($_), @_; - my %seen_encoded; - foreach my $lang (@languages) { - $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1 - } - - my(@output_languages); - foreach my $lang (@languages) { - push @output_languages, $lang; - foreach my $s ( I18N::LangTags::super_languages($lang) ) { - # Note that super_languages returns the longest first. - last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) }; - push @output_languages, $s; - } - } - return uniq( @output_languages ); - -} - -sub implicate_supers_strictly { - my @tags = grep is_language_tag($_), @_; - return uniq( @_, map super_languages($_), @_ ); -} - - - -########################################################################### -1; -__END__ - -=back - -=head1 ABOUT LOWERCASING - -I've considered making all the above functions that output language -tags return all those tags strictly in lowercase. Having all your -language tags in lowercase does make some things easier. But you -might as well just lowercase as you like, or call -C where appropriate. - -=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS - -In some future version of I18N::LangTags, I plan to include support -for RFC2482-style language tags -- which are basically just normal -language tags with their ASCII characters shifted into Plane 14. - -=head1 SEE ALSO - -* L - -* RFC 3066, C, "Tags for the -Identification of Languages". (Obsoletes RFC 1766) - -* RFC 2277, C, "IETF Policy on -Character Sets and Languages". - -* RFC 2231, C, "MIME Parameter -Value and Encoded Word Extensions: Character Sets, Languages, and -Continuations". - -* RFC 2482, C, -"Language Tagging in Unicode Plain Text". - -* Locale::Codes, in -C - -* ISO 639-2, "Codes for the representation of names of languages", -including two-letter and three-letter codes, -C - -* The IANA list of registered languages (hopefully up-to-date), -C - -=head1 COPYRIGHT - -Copyright (c) 1998+ Sean M. Burke. All rights reserved. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -The programs and documentation in this dist are distributed in -the hope that they will be useful, but without any warranty; without -even the implied warranty of merchantability or fitness for a -particular purpose. - -=head1 AUTHOR - -Sean M. Burke C - -=cut - diff --git a/ext/I18N-LangTags/lib/I18N/LangTags/Detect.pm b/ext/I18N-LangTags/lib/I18N/LangTags/Detect.pm deleted file mode 100644 index 3f1b7c006a..0000000000 --- a/ext/I18N-LangTags/lib/I18N/LangTags/Detect.pm +++ /dev/null @@ -1,237 +0,0 @@ - -# Time-stamp: "2004-06-20 21:47:55 ADT" - -require 5; -package I18N::LangTags::Detect; -use strict; - -use vars qw( @ISA $VERSION $MATCH_SUPERS $USING_LANGUAGE_TAGS - $USE_LITERALS $MATCH_SUPERS_TIGHTLY); - -BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } } - # define the constant 'DEBUG' at compile-time - -$VERSION = "1.03"; -@ISA = (); -use I18N::LangTags qw(alternate_language_tags locale2language_tag); - -sub _uniq { my %seen; return grep(!($seen{$_}++), @_); } -sub _normalize { - my(@languages) = - map lc($_), - grep $_, - map {; $_, alternate_language_tags($_) } @_; - return _uniq(@languages) if wantarray; - return $languages[0]; -} - -#--------------------------------------------------------------------------- -# The extent of our functional interface: - -sub detect () { return __PACKAGE__->ambient_langprefs; } - -#=========================================================================== - -sub ambient_langprefs { # always returns things untainted - my $base_class = $_[0]; - - return $base_class->http_accept_langs - if length( $ENV{'REQUEST_METHOD'} || '' ); # I'm a CGI - # it's off in its own routine because it's complicated - - # Not running as a CGI: try to puzzle out from the environment - my @languages; - - foreach my $envname (qw( LANGUAGE LC_ALL LC_MESSAGES LANG )) { - next unless $ENV{$envname}; - DEBUG and print "Noting \$$envname: $ENV{$envname}\n"; - push @languages, - map locale2language_tag($_), - # if it's a lg tag, fine, pass thru (untainted) - # if it's a locale ID, try converting to a lg tag (untainted), - # otherwise nix it. - - split m/[,:]/, - $ENV{$envname} - ; - last; # first one wins - } - - if($ENV{'IGNORE_WIN32_LOCALE'}) { - # no-op - } elsif(&_try_use('Win32::Locale')) { - # If we have that module installed... - push @languages, Win32::Locale::get_language() || '' - if defined &Win32::Locale::get_language; - } - return _normalize @languages; -} - -#--------------------------------------------------------------------------- - -sub http_accept_langs { - # Deal with HTTP "Accept-Language:" stuff. Hassle. - # This code is more lenient than RFC 3282, which you must read. - # Hm. Should I just move this into I18N::LangTags at some point? - no integer; - - my $in = (@_ > 1) ? $_[1] : $ENV{'HTTP_ACCEPT_LANGUAGE'}; - # (always ends up untainting) - - return() unless defined $in and length $in; - - $in =~ s/\([^\)]*\)//g; # nix just about any comment - - if( $in =~ m/^\s*([a-zA-Z][-a-zA-Z]+)\s*$/s ) { - # Very common case: just one language tag - return _normalize $1; - } elsif( $in =~ m/^\s*[a-zA-Z][-a-zA-Z]+(?:\s*,\s*[a-zA-Z][-a-zA-Z]+)*\s*$/s ) { - # Common case these days: just "foo, bar, baz" - return _normalize( $in =~ m/([a-zA-Z][-a-zA-Z]+)/g ); - } - - # Else it's complicated... - - $in =~ s/\s+//g; # Yes, we can just do without the WS! - my @in = $in =~ m/([^,]+)/g; - my %pref; - - my $q; - foreach my $tag (@in) { - next unless $tag =~ - m/^([a-zA-Z][-a-zA-Z]+) - (?: - ;q= - ( - \d* # a bit too broad of a RE, but so what. - (?: - \.\d+ - )? - ) - )? - $ - /sx - ; - $q = (defined $2 and length $2) ? $2 : 1; - #print "$1 with q=$q\n"; - push @{ $pref{$q} }, lc $1; - } - - return _normalize( - # Read off %pref, in descending key order... - map @{$pref{$_}}, - sort {$b <=> $a} - keys %pref - ); -} - -#=========================================================================== - -my %tried = (); - # memoization of whether we've used this module, or found it unusable. - -sub _try_use { # Basically a wrapper around "require Modulename" - # "Many men have tried..." "They tried and failed?" "They tried and died." - return $tried{$_[0]} if exists $tried{$_[0]}; # memoization - - my $module = $_[0]; # ASSUME sane module name! - { no strict 'refs'; - return($tried{$module} = 1) - if defined(%{$module . "::Lexicon"}) or defined(@{$module . "::ISA"}); - # weird case: we never use'd it, but there it is! - } - - print " About to use $module ...\n" if DEBUG; - { - local $SIG{'__DIE__'}; - eval "require $module"; # used to be "use $module", but no point in that. - } - if($@) { - print "Error using $module \: $@\n" if DEBUG > 1; - return $tried{$module} = 0; - } else { - print " OK, $module is used\n" if DEBUG; - return $tried{$module} = 1; - } -} - -#--------------------------------------------------------------------------- -1; -__END__ - - -=head1 NAME - -I18N::LangTags::Detect - detect the user's language preferences - -=head1 SYNOPSIS - - use I18N::LangTags::Detect; - my @user_wants = I18N::LangTags::Detect::detect(); - -=head1 DESCRIPTION - -It is a common problem to want to detect what language(s) the user would -prefer output in. - -=head1 FUNCTIONS - -This module defines one public function, -C. This function is not exported -(nor is even exportable), and it takes no parameters. - -In scalar context, the function returns the most preferred language -tag (or undef if no preference was seen). - -In list context (which is usually what you want), -the function returns a -(possibly empty) list of language tags representing (best first) what -languages the user apparently would accept output in. You will -probably want to pass the output of this through -C -or -C, like so: - - my @languages = - I18N::LangTags::implicate_supers_tightly( - I18N::LangTags::Detect::detect() - ); - - -=head1 ENVIRONMENT - -This module looks for several environment variables, including -REQUEST_METHOD, HTTP_ACCEPT_LANGUAGE, -LANGUAGE, LC_ALL, LC_MESSAGES, and LANG. - -It will also use the L module, if it's installed. - - -=head1 SEE ALSO - -L, L, L. - -(This module's core code started out as a routine in Locale::Maketext; -but I moved it here once I realized it was more generally useful.) - - -=head1 COPYRIGHT - -Copyright (c) 1998-2004 Sean M. Burke. All rights reserved. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -The programs and documentation in this dist are distributed in -the hope that they will be useful, but without any warranty; without -even the implied warranty of merchantability or fitness for a -particular purpose. - - -=head1 AUTHOR - -Sean M. Burke C - -=cut - -# a tip: Put a bit of chopped up pickled ginger in your salad. It's tasty! diff --git a/ext/I18N-LangTags/lib/I18N/LangTags/List.pm b/ext/I18N-LangTags/lib/I18N/LangTags/List.pm deleted file mode 100644 index 5494bea21e..0000000000 --- a/ext/I18N-LangTags/lib/I18N/LangTags/List.pm +++ /dev/null @@ -1,1779 +0,0 @@ - -require 5; -package I18N::LangTags::List; -# Time-stamp: "2004-10-06 23:26:21 ADT" -use strict; -use vars qw(%Name %Is_Disrec $Debug $VERSION); -$VERSION = '0.35'; -# POD at the end. - -#---------------------------------------------------------------------- -{ -# read the table out of our own POD! - my $seeking = 1; - my $count = 0; - my($disrec,$tag,$name); - my $last_name = ''; - while() { - if($seeking) { - $seeking = 0 if m/=for woohah/; - } elsif( ($disrec, $tag, $name) = - m/(\[?)\{([-0-9a-zA-Z]+)\}(?:\s*:)?\s*([^\[\]]+)/ - ) { - $name =~ s/\s*[;\.]*\s*$//g; - next unless $name; - ++$count; - print "<$tag> <$name>\n" if $Debug; - $last_name = $Name{$tag} = $name; - $Is_Disrec{$tag} = 1 if $disrec; - } elsif (m/[Ff]ormerly \"([-a-z0-9]+)\"/) { - $Name{$1} = "$last_name (old tag)" if $last_name; - $Is_Disrec{$1} = 1; - } - } - die "No tags read??" unless $count; -} -#---------------------------------------------------------------------- - -sub name { - my $tag = lc($_[0] || return); - $tag =~ s/^\s+//s; - $tag =~ s/\s+$//s; - - my $alt; - if($tag =~ m/^x-(.+)/) { - $alt = "i-$1"; - } elsif($tag =~ m/^i-(.+)/) { - $alt = "x-$1"; - } else { - $alt = ''; - } - - my $subform = ''; - my $name = ''; - print "Input: {$tag}\n" if $Debug; - while(length $tag) { - last if $name = $Name{$tag}; - last if $name = $Name{$alt}; - if($tag =~ s/(-[a-z0-9]+)$//s) { - print "Shaving off: $1 leaving $tag\n" if $Debug; - $subform = "$1$subform"; - # and loop around again - - $alt =~ s/(-[a-z0-9]+)$//s && $Debug && print " alt -> $alt\n"; - } else { - # we're trying to pull a subform off a primary tag. TILT! - print "Aborting on: {$name}{$subform}\n" if $Debug; - last; - } - } - print "Output: {$name}{$subform}\n" if $Debug; - - return unless $name; # Failure - return $name unless $subform; # Exact match - $subform =~ s/^-//s; - $subform =~ s/-$//s; - return "$name (Subform \"$subform\")"; -} - -#-------------------------------------------------------------------------- - -sub is_decent { - my $tag = lc($_[0] || return 0); - #require I18N::LangTags; - - return 0 unless - $tag =~ - /^(?: # First subtag - [xi] | [a-z]{2,3} - ) - (?: # Subtags thereafter - - # separator - [a-z0-9]{1,8} # subtag - )* - $/xs; - - my @supers = (); - foreach my $bit (split('-', $tag)) { - push @supers, - scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; - } - return 0 unless @supers; - shift @supers if $supers[0] =~ m<^(i|x|sgn)$>s; - return 0 unless @supers; - - foreach my $f ($tag, @supers) { - return 0 if $Is_Disrec{$f}; - return 2 if $Name{$f}; - # so that decent subforms of indecent tags are decent - } - return 2 if $Name{$tag}; # not only is it decent, it's known! - return 1; -} - -#-------------------------------------------------------------------------- -1; - -__DATA__ - -=head1 NAME - -I18N::LangTags::List -- tags and names for human languages - -=head1 SYNOPSIS - - use I18N::LangTags::List; - print "Parlez-vous... ", join(', ', - I18N::LangTags::List::name('elx') || 'unknown_language', - I18N::LangTags::List::name('ar-Kw') || 'unknown_language', - I18N::LangTags::List::name('en') || 'unknown_language', - I18N::LangTags::List::name('en-CA') || 'unknown_language', - ), "?\n"; - -prints: - - Parlez-vous... Elamite, Kuwait Arabic, English, Canadian English? - -=head1 DESCRIPTION - -This module provides a function -C ) > that takes -a language tag (see L) -and returns the best attempt at an English name for it, or -undef if it can't make sense of the tag. - -The function I18N::LangTags::List::name(...) is not exported. - -This module also provides a function -C )> that returns true iff -the language tag is syntactically valid and is for general use (like -"fr" or "fr-ca", below). That is, it returns false for tags that are -syntactically invalid and for tags, like "aus", that are listed in -brackets below. This function is not exported. - -The map of tags-to-names that it uses is accessible as -%I18N::LangTags::List::Name, and it's the same as the list -that follows in this documentation, which should be useful -to you even if you don't use this module. - -=head1 ABOUT LANGUAGE TAGS - -Internet language tags, as defined in RFC 3066, are a formalism -for denoting human languages. The two-letter ISO 639-1 language -codes are well known (as "en" for English), as are their forms -when qualified by a country code ("en-US"). Less well-known are the -arbitrary-length non-ISO codes (like "i-mingo"), and the -recently (in 2001) introduced three-letter ISO-639-2 codes. - -Remember these important facts: - -=over - -=item * - -Language tags are not locale IDs. A locale ID is written with a "_" -instead of a "-", (almost?) always matches C, and -I something different than a language tag. A language tag -denotes a language. A locale ID denotes a language I -a particular place, in combination with non-linguistic -location-specific information such as what currency is used -there. Locales I often denote character set information, -as in "en_US.ISO8859-1". - -=item * - -Language tags are not for computer languages. - -=item * - -"Dialect" is not a useful term, since there is no objective -criterion for establishing when two language-forms are -dialects of eachother, or are separate languages. - -=item * - -Language tags are not case-sensitive. en-US, en-us, En-Us, etc., -are all the same tag, and denote the same language. - -=item * - -Not every language tag really refers to a single language. Some -language tags refer to conditions: i-default (system-message text -in English plus maybe other languages), und (undetermined -language). Others (notably lots of the three-letter codes) are -bibliographic tags that classify whole groups of languages, as -with cus "Cushitic (Other)" (i.e., a -language that has been classed as Cushtic, but which has no more -specific code) or the even less linguistically coherent -sai for "South American Indian (Other)". Though useful in -bibliography, B. For further guidance, email me. - -=item * - -Language tags are not country codes. In fact, they are often -distinct codes, as with language tag ja for Japanese, and -ISO 3166 country code C<.jp> for Japan. - -=back - -=head1 LIST OF LANGUAGES - -The first part of each item is the language tag, between -{...}. It -is followed by an English name for the language or language-group. -Language tags that I judge to be not for general use, are bracketed. - -This list is in alphabetical order by English name of the language. - -=for reminder - The name in the =item line MUST NOT have E<...>'s in it!! - -=for woohah START - -=over - -=item {ab} : Abkhazian - -eq Abkhaz - -=item {ace} : Achinese - -=item {ach} : Acoli - -=item {ada} : Adangme - -=item {ady} : Adyghe - -eq Adygei - -=item {aa} : Afar - -=item {afh} : Afrihili - -(Artificial) - -=item {af} : Afrikaans - -=item [{afa} : Afro-Asiatic (Other)] - -=item {ak} : Akan - -(Formerly "aka".) - -=item {akk} : Akkadian - -(Historical) - -=item {sq} : Albanian - -=item {ale} : Aleut - -=item [{alg} : Algonquian languages] - -NOT Algonquin! - -=item [{tut} : Altaic (Other)] - -=item {am} : Amharic - -NOT Aramaic! - -=item {i-ami} : Ami - -eq Amis. eq 'Amis. eq Pangca. - -=item [{apa} : Apache languages] - -=item {ar} : Arabic - -Many forms are mutually un-intelligible in spoken media. -Notable forms: -{ar-ae} UAE Arabic; -{ar-bh} Bahrain Arabic; -{ar-dz} Algerian Arabic; -{ar-eg} Egyptian Arabic; -{ar-iq} Iraqi Arabic; -{ar-jo} Jordanian Arabic; -{ar-kw} Kuwait Arabic; -{ar-lb} Lebanese Arabic; -{ar-ly} Libyan Arabic; -{ar-ma} Moroccan Arabic; -{ar-om} Omani Arabic; -{ar-qa} Qatari Arabic; -{ar-sa} Sauda Arabic; -{ar-sy} Syrian Arabic; -{ar-tn} Tunisian Arabic; -{ar-ye} Yemen Arabic. - -=item {arc} : Aramaic - -NOT Amharic! NOT Samaritan Aramaic! - -=item {arp} : Arapaho - -=item {arn} : Araucanian - -=item {arw} : Arawak - -=item {hy} : Armenian - -=item {an} : Aragonese - -=item [{art} : Artificial (Other)] - -=item {ast} : Asturian - -eq Bable. - -=item {as} : Assamese - -=item [{ath} : Athapascan languages] - -eq Athabaskan. eq Athapaskan. eq Athabascan. - -=item [{aus} : Australian languages] - -=item [{map} : Austronesian (Other)] - -=item {av} : Avaric - -(Formerly "ava".) - -=item {ae} : Avestan - -eq Zend - -=item {awa} : Awadhi - -=item {ay} : Aymara - -=item {az} : Azerbaijani - -eq Azeri - -Notable forms: -{az-Arab} Azerbaijani in Arabic script; -{az-Cyrl} Azerbaijani in Cyrillic script; -{az-Latn} Azerbaijani in Latin script. - -=item {ban} : Balinese - -=item [{bat} : Baltic (Other)] - -=item {bal} : Baluchi - -=item {bm} : Bambara - -(Formerly "bam".) - -=item [{bai} : Bamileke languages] - -=item {bad} : Banda - -=item [{bnt} : Bantu (Other)] - -=item {bas} : Basa - -=item {ba} : Bashkir - -=item {eu} : Basque - -=item {btk} : Batak (Indonesia) - -=item {bej} : Beja - -=item {be} : Belarusian - -eq Belarussian. eq Byelarussian. -eq Belorussian. eq Byelorussian. -eq White Russian. eq White Ruthenian. -NOT Ruthenian! - -=item {bem} : Bemba - -=item {bn} : Bengali - -eq Bangla. - -=item [{ber} : Berber (Other)] - -=item {bho} : Bhojpuri - -=item {bh} : Bihari - -=item {bik} : Bikol - -=item {bin} : Bini - -=item {bi} : Bislama - -eq Bichelamar. - -=item {bs} : Bosnian - -=item {bra} : Braj - -=item {br} : Breton - -=item {bug} : Buginese - -=item {bg} : Bulgarian - -=item {i-bnn} : Bunun - -=item {bua} : Buriat - -=item {my} : Burmese - -=item {cad} : Caddo - -=item {car} : Carib - -=item {ca} : Catalan - -eq CatalEn. eq Catalonian. - -=item [{cau} : Caucasian (Other)] - -=item {ceb} : Cebuano - -=item [{cel} : Celtic (Other)] - -Notable forms: -{cel-gaulish} Gaulish (Historical) - -=item [{cai} : Central American Indian (Other)] - -=item {chg} : Chagatai - -(Historical?) - -=item [{cmc} : Chamic languages] - -=item {ch} : Chamorro - -=item {ce} : Chechen - -=item {chr} : Cherokee - -eq Tsalagi - -=item {chy} : Cheyenne - -=item {chb} : Chibcha - -(Historical) NOT Chibchan (which is a language family). - -=item {ny} : Chichewa - -eq Nyanja. eq Chinyanja. - -=item {zh} : Chinese - -Many forms are mutually un-intelligible in spoken media. -Notable forms: -{zh-Hans} Chinese, in simplified script; -{zh-Hant} Chinese, in traditional script; -{zh-tw} Taiwan Chinese; -{zh-cn} PRC Chinese; -{zh-sg} Singapore Chinese; -{zh-mo} Macau Chinese; -{zh-hk} Hong Kong Chinese; -{zh-guoyu} Mandarin [Putonghua/Guoyu]; -{zh-hakka} Hakka [formerly "i-hakka"]; -{zh-min} Hokkien; -{zh-min-nan} Southern Hokkien; -{zh-wuu} Shanghaiese; -{zh-xiang} Hunanese; -{zh-gan} Gan; -{zh-yue} Cantonese. - -=for etc -{i-hakka} Hakka (old tag) - -=item {chn} : Chinook Jargon - -eq Chinook Wawa. - -=item {chp} : Chipewyan - -=item {cho} : Choctaw - -=item {cu} : Church Slavic - -eq Old Church Slavonic. - -=item {chk} : Chuukese - -eq Trukese. eq Chuuk. eq Truk. eq Ruk. - -=item {cv} : Chuvash - -=item {cop} : Coptic - -=item {kw} : Cornish - -=item {co} : Corsican - -eq Corse. - -=item {cr} : Cree - -NOT Creek! (Formerly "cre".) - -=item {mus} : Creek - -NOT Cree! - -=item [{cpe} : English-based Creoles and pidgins (Other)] - -=item [{cpf} : French-based Creoles and pidgins (Other)] - -=item [{cpp} : Portuguese-based Creoles and pidgins (Other)] - -=item [{crp} : Creoles and pidgins (Other)] - -=item {hr} : Croatian - -eq Croat. - -=item [{cus} : Cushitic (Other)] - -=item {cs} : Czech - -=item {dak} : Dakota - -eq Nakota. eq Latoka. - -=item {da} : Danish - -=item {dar} : Dargwa - -=item {day} : Dayak - -=item {i-default} : Default (Fallthru) Language - -Defined in RFC 2277, this is for tagging text -(which must include English text, and might/should include text -in other appropriate languages) that is emitted in a context -where language-negotiation wasn't possible -- in SMTP mail failure -messages, for example. - -=item {del} : Delaware - -=item {din} : Dinka - -=item {dv} : Divehi - -eq Maldivian. (Formerly "div".) - -=item {doi} : Dogri - -NOT Dogrib! - -=item {dgr} : Dogrib - -NOT Dogri! - -=item [{dra} : Dravidian (Other)] - -=item {dua} : Duala - -=item {nl} : Dutch - -eq Netherlander. Notable forms: -{nl-nl} Netherlands Dutch; -{nl-be} Belgian Dutch. - -=item {dum} : Middle Dutch (ca.1050-1350) - -(Historical) - -=item {dyu} : Dyula - -=item {dz} : Dzongkha - -=item {efi} : Efik - -=item {egy} : Ancient Egyptian - -(Historical) - -=item {eka} : Ekajuk - -=item {elx} : Elamite - -(Historical) - -=item {en} : English - -Notable forms: -{en-au} Australian English; -{en-bz} Belize English; -{en-ca} Canadian English; -{en-gb} UK English; -{en-ie} Irish English; -{en-jm} Jamaican English; -{en-nz} New Zealand English; -{en-ph} Philippine English; -{en-tt} Trinidad English; -{en-us} US English; -{en-za} South African English; -{en-zw} Zimbabwe English. - -=item {enm} : Old English (1100-1500) - -(Historical) - -=item {ang} : Old English (ca.450-1100) - -eq Anglo-Saxon. (Historical) - -=item {i-enochian} : Enochian (Artificial) - -=item {myv} : Erzya - -=item {eo} : Esperanto - -(Artificial) - -=item {et} : Estonian - -=item {ee} : Ewe - -(Formerly "ewe".) - -=item {ewo} : Ewondo - -=item {fan} : Fang - -=item {fat} : Fanti - -=item {fo} : Faroese - -=item {fj} : Fijian - -=item {fi} : Finnish - -=item [{fiu} : Finno-Ugrian (Other)] - -eq Finno-Ugric. NOT Ugaritic! - -=item {fon} : Fon - -=item {fr} : French - -Notable forms: -{fr-fr} France French; -{fr-be} Belgian French; -{fr-ca} Canadian French; -{fr-ch} Swiss French; -{fr-lu} Luxembourg French; -{fr-mc} Monaco French. - -=item {frm} : Middle French (ca.1400-1600) - -(Historical) - -=item {fro} : Old French (842-ca.1400) - -(Historical) - -=item {fy} : Frisian - -=item {fur} : Friulian - -=item {ff} : Fulah - -(Formerly "ful".) - -=item {gaa} : Ga - -=item {gd} : Scots Gaelic - -NOT Scots! - -=item {gl} : Gallegan - -eq Galician - -=item {lg} : Ganda - -(Formerly "lug".) - -=item {gay} : Gayo - -=item {gba} : Gbaya - -=item {gez} : Geez - -eq Ge'ez - -=item {ka} : Georgian - -=item {de} : German - -Notable forms: -{de-at} Austrian German; -{de-be} Belgian German; -{de-ch} Swiss German; -{de-de} Germany German; -{de-li} Liechtenstein German; -{de-lu} Luxembourg German. - -=item {gmh} : Middle High German (ca.1050-1500) - -(Historical) - -=item {goh} : Old High German (ca.750-1050) - -(Historical) - -=item [{gem} : Germanic (Other)] - -=item {gil} : Gilbertese - -=item {gon} : Gondi - -=item {gor} : Gorontalo - -=item {got} : Gothic - -(Historical) - -=item {grb} : Grebo - -=item {grc} : Ancient Greek - -(Historical) (Until 15th century or so.) - -=item {el} : Modern Greek - -(Since 15th century or so.) - -=item {gn} : Guarani - -GuaranE - -=item {gu} : Gujarati - -=item {gwi} : Gwich'in - -eq Gwichin - -=item {hai} : Haida - -=item {ht} : Haitian - -eq Haitian Creole - -=item {ha} : Hausa - -=item {haw} : Hawaiian - -Hawai'ian - -=item {he} : Hebrew - -(Formerly "iw".) - -=for etc -{iw} Hebrew (old tag) - -=item {hz} : Herero - -=item {hil} : Hiligaynon - -=item {him} : Himachali - -=item {hi} : Hindi - -=item {ho} : Hiri Motu - -=item {hit} : Hittite - -(Historical) - -=item {hmn} : Hmong - -=item {hu} : Hungarian - -=item {hup} : Hupa - -=item {iba} : Iban - -=item {is} : Icelandic - -=item {io} : Ido - -(Artificial) - -=item {ig} : Igbo - -(Formerly "ibo".) - -=item {ijo} : Ijo - -=item {ilo} : Iloko - -=item [{inc} : Indic (Other)] - -=item [{ine} : Indo-European (Other)] - -=item {id} : Indonesian - -(Formerly "in".) - -=for etc -{in} Indonesian (old tag) - -=item {inh} : Ingush - -=item {ia} : Interlingua (International Auxiliary Language Association) - -(Artificial) NOT Interlingue! - -=item {ie} : Interlingue - -(Artificial) NOT Interlingua! - -=item {iu} : Inuktitut - -A subform of "Eskimo". - -=item {ik} : Inupiaq - -A subform of "Eskimo". - -=item [{ira} : Iranian (Other)] - -=item {ga} : Irish - -=item {mga} : Middle Irish (900-1200) - -(Historical) - -=item {sga} : Old Irish (to 900) - -(Historical) - -=item [{iro} : Iroquoian languages] - -=item {it} : Italian - -Notable forms: -{it-it} Italy Italian; -{it-ch} Swiss Italian. - -=item {ja} : Japanese - -(NOT "jp"!) - -=item {jv} : Javanese - -(Formerly "jw" because of a typo.) - -=item {jrb} : Judeo-Arabic - -=item {jpr} : Judeo-Persian - -=item {kbd} : Kabardian - -=item {kab} : Kabyle - -=item {kac} : Kachin - -=item {kl} : Kalaallisut - -eq Greenlandic "Eskimo" - -=item {xal} : Kalmyk - -=item {kam} : Kamba - -=item {kn} : Kannada - -eq Kanarese. NOT Canadian! - -=item {kr} : Kanuri - -(Formerly "kau".) - -=item {krc} : Karachay-Balkar - -=item {kaa} : Kara-Kalpak - -=item {kar} : Karen - -=item {ks} : Kashmiri - -=item {csb} : Kashubian - -eq Kashub - -=item {kaw} : Kawi - -=item {kk} : Kazakh - -=item {kha} : Khasi - -=item {km} : Khmer - -eq Cambodian. eq Kampuchean. - -=item [{khi} : Khoisan (Other)] - -=item {kho} : Khotanese - -=item {ki} : Kikuyu - -eq Gikuyu. - -=item {kmb} : Kimbundu - -=item {rw} : Kinyarwanda - -=item {ky} : Kirghiz - -=item {i-klingon} : Klingon - -=item {kv} : Komi - -=item {kg} : Kongo - -(Formerly "kon".) - -=item {kok} : Konkani - -=item {ko} : Korean - -=item {kos} : Kosraean - -=item {kpe} : Kpelle - -=item {kro} : Kru - -=item {kj} : Kuanyama - -=item {kum} : Kumyk - -=item {ku} : Kurdish - -=item {kru} : Kurukh - -=item {kut} : Kutenai - -=item {lad} : Ladino - -eq Judeo-Spanish. NOT Ladin (a minority language in Italy). - -=item {lah} : Lahnda - -NOT Lamba! - -=item {lam} : Lamba - -NOT Lahnda! - -=item {lo} : Lao - -eq Laotian. - -=item {la} : Latin - -(Historical) NOT Ladin! NOT Ladino! - -=item {lv} : Latvian - -eq Lettish. - -=item {lb} : Letzeburgesch - -eq Luxemburgian, eq Luxemburger. (Formerly "i-lux".) - -=for etc -{i-lux} Letzeburgesch (old tag) - -=item {lez} : Lezghian - -=item {li} : Limburgish - -eq Limburger, eq Limburgan. NOT Letzeburgesch! - -=item {ln} : Lingala - -=item {lt} : Lithuanian - -=item {nds} : Low German - -eq Low Saxon. eq Low German. eq Low Saxon. - -=item {art-lojban} : Lojban (Artificial) - -=item {loz} : Lozi - -=item {lu} : Luba-Katanga - -(Formerly "lub".) - -=item {lua} : Luba-Lulua - -=item {lui} : Luiseno - -eq LuiseEo. - -=item {lun} : Lunda - -=item {luo} : Luo (Kenya and Tanzania) - -=item {lus} : Lushai - -=item {mk} : Macedonian - -eq the modern Slavic language spoken in what was Yugoslavia. -NOT the form of Greek spoken in Greek Macedonia! - -=item {mad} : Madurese - -=item {mag} : Magahi - -=item {mai} : Maithili - -=item {mak} : Makasar - -=item {mg} : Malagasy - -=item {ms} : Malay - -NOT Malayalam! - -=item {ml} : Malayalam - -NOT Malay! - -=item {mt} : Maltese - -=item {mnc} : Manchu - -=item {mdr} : Mandar - -NOT Mandarin! - -=item {man} : Mandingo - -=item {mni} : Manipuri - -eq Meithei. - -=item [{mno} : Manobo languages] - -=item {gv} : Manx - -=item {mi} : Maori - -NOT Mari! - -=item {mr} : Marathi - -=item {chm} : Mari - -NOT Maori! - -=item {mh} : Marshall - -eq Marshallese. - -=item {mwr} : Marwari - -=item {mas} : Masai - -=item [{myn} : Mayan languages] - -=item {men} : Mende - -=item {mic} : Micmac - -=item {min} : Minangkabau - -=item {i-mingo} : Mingo - -eq the Irquoian language West Virginia Seneca. NOT New York Seneca! - -=item [{mis} : Miscellaneous languages] - -Don't use this. - -=item {moh} : Mohawk - -=item {mdf} : Moksha - -=item {mo} : Moldavian - -eq Moldovan. - -=item [{mkh} : Mon-Khmer (Other)] - -=item {lol} : Mongo - -=item {mn} : Mongolian - -eq Mongol. - -=item {mos} : Mossi - -=item [{mul} : Multiple languages] - -Not for normal use. - -=item [{mun} : Munda languages] - -=item {nah} : Nahuatl - -=item {nap} : Neapolitan - -=item {na} : Nauru - -=item {nv} : Navajo - -eq Navaho. (Formerly "i-navajo".) - -=for etc -{i-navajo} Navajo (old tag) - -=item {nd} : North Ndebele - -=item {nr} : South Ndebele - -=item {ng} : Ndonga - -=item {ne} : Nepali - -eq Nepalese. Notable forms: -{ne-np} Nepal Nepali; -{ne-in} India Nepali. - -=item {new} : Newari - -=item {nia} : Nias - -=item [{nic} : Niger-Kordofanian (Other)] - -=item [{ssa} : Nilo-Saharan (Other)] - -=item {niu} : Niuean - -=item {nog} : Nogai - -=item {non} : Old Norse - -(Historical) - -=item [{nai} : North American Indian] - -Do not use this. - -=item {no} : Norwegian - -Note the two following forms: - -=item {nb} : Norwegian Bokmal - -eq BokmEl, (A form of Norwegian.) (Formerly "no-bok".) - -=for etc -{no-bok} Norwegian Bokmal (old tag) - -=item {nn} : Norwegian Nynorsk - -(A form of Norwegian.) (Formerly "no-nyn".) - -=for etc -{no-nyn} Norwegian Nynorsk (old tag) - -=item [{nub} : Nubian languages] - -=item {nym} : Nyamwezi - -=item {nyn} : Nyankole - -=item {nyo} : Nyoro - -=item {nzi} : Nzima - -=item {oc} : Occitan (post 1500) - -eq ProvenEal, eq Provencal - -=item {oj} : Ojibwa - -eq Ojibwe. (Formerly "oji".) - -=item {or} : Oriya - -=item {om} : Oromo - -=item {osa} : Osage - -=item {os} : Ossetian; Ossetic - -=item [{oto} : Otomian languages] - -Group of languages collectively called "OtomE". - -=item {pal} : Pahlavi - -eq Pahlevi - -=item {i-pwn} : Paiwan - -eq Pariwan - -=item {pau} : Palauan - -=item {pi} : Pali - -(Historical?) - -=item {pam} : Pampanga - -=item {pag} : Pangasinan - -=item {pa} : Panjabi - -eq Punjabi - -=item {pap} : Papiamento - -eq Papiamentu. - -=item [{paa} : Papuan (Other)] - -=item {fa} : Persian - -eq Farsi. eq Iranian. - -=item {peo} : Old Persian (ca.600-400 B.C.) - -=item [{phi} : Philippine (Other)] - -=item {phn} : Phoenician - -(Historical) - -=item {pon} : Pohnpeian - -NOT Pompeiian! - -=item {pl} : Polish - -=item {pt} : Portuguese - -eq Portugese. Notable forms: -{pt-pt} Portugal Portuguese; -{pt-br} Brazilian Portuguese. - -=item [{pra} : Prakrit languages] - -=item {pro} : Old Provencal (to 1500) - -eq Old ProvenEal. (Historical.) - -=item {ps} : Pushto - -eq Pashto. eq Pushtu. - -=item {qu} : Quechua - -eq Quecha. - -=item {rm} : Raeto-Romance - -eq Romansh. - -=item {raj} : Rajasthani - -=item {rap} : Rapanui - -=item {rar} : Rarotongan - -=item [{qaa - qtz} : Reserved for local use.] - -=item [{roa} : Romance (Other)] - -NOT Romanian! NOT Romany! NOT Romansh! - -=item {ro} : Romanian - -eq Rumanian. NOT Romany! - -=item {rom} : Romany - -eq Rom. NOT Romanian! - -=item {rn} : Rundi - -=item {ru} : Russian - -NOT White Russian! NOT Rusyn! - -=item [{sal} : Salishan languages] - -Large language group. - -=item {sam} : Samaritan Aramaic - -NOT Aramaic! - -=item {se} : Northern Sami - -eq Lappish. eq Lapp. eq (Northern) Saami. - -=item {sma} : Southern Sami - -=item {smn} : Inari Sami - -=item {smj} : Lule Sami - -=item {sms} : Skolt Sami - -=item [{smi} : Sami languages (Other)] - -=item {sm} : Samoan - -=item {sad} : Sandawe - -=item {sg} : Sango - -=item {sa} : Sanskrit - -(Historical) - -=item {sat} : Santali - -=item {sc} : Sardinian - -eq Sard. - -=item {sas} : Sasak - -=item {sco} : Scots - -NOT Scots Gaelic! - -=item {sel} : Selkup - -=item [{sem} : Semitic (Other)] - -=item {sr} : Serbian - -eq Serb. NOT Sorbian. - -Notable forms: -{sr-Cyrl} : Serbian in Cyrillic script; -{sr-Latn} : Serbian in Latin script. - -=item {srr} : Serer - -=item {shn} : Shan - -=item {sn} : Shona - -=item {sid} : Sidamo - -=item {sgn-...} : Sign Languages - -Always use with a subtag. Notable forms: -{sgn-gb} British Sign Language (BSL); -{sgn-ie} Irish Sign Language (ESL); -{sgn-ni} Nicaraguan Sign Language (ISN); -{sgn-us} American Sign Language (ASL). - -(And so on with other country codes as the subtag.) - -=item {bla} : Siksika - -eq Blackfoot. eq Pikanii. - -=item {sd} : Sindhi - -=item {si} : Sinhalese - -eq Sinhala. - -=item [{sit} : Sino-Tibetan (Other)] - -=item [{sio} : Siouan languages] - -=item {den} : Slave (Athapascan) - -("Slavey" is a subform.) - -=item [{sla} : Slavic (Other)] - -=item {sk} : Slovak - -eq Slovakian. - -=item {sl} : Slovenian - -eq Slovene. - -=item {sog} : Sogdian - -=item {so} : Somali - -=item {son} : Songhai - -=item {snk} : Soninke - -=item {wen} : Sorbian languages - -eq Wendish. eq Sorb. eq Lusatian. eq Wend. NOT Venda! NOT Serbian! - -=item {nso} : Northern Sotho - -=item {st} : Southern Sotho - -eq Sutu. eq Sesotho. - -=item [{sai} : South American Indian (Other)] - -=item {es} : Spanish - -Notable forms: -{es-ar} Argentine Spanish; -{es-bo} Bolivian Spanish; -{es-cl} Chilean Spanish; -{es-co} Colombian Spanish; -{es-do} Dominican Spanish; -{es-ec} Ecuadorian Spanish; -{es-es} Spain Spanish; -{es-gt} Guatemalan Spanish; -{es-hn} Honduran Spanish; -{es-mx} Mexican Spanish; -{es-pa} Panamanian Spanish; -{es-pe} Peruvian Spanish; -{es-pr} Puerto Rican Spanish; -{es-py} Paraguay Spanish; -{es-sv} Salvadoran Spanish; -{es-us} US Spanish; -{es-uy} Uruguayan Spanish; -{es-ve} Venezuelan Spanish. - -=item {suk} : Sukuma - -=item {sux} : Sumerian - -(Historical) - -=item {su} : Sundanese - -=item {sus} : Susu - -=item {sw} : Swahili - -eq Kiswahili - -=item {ss} : Swati - -=item {sv} : Swedish - -Notable forms: -{sv-se} Sweden Swedish; -{sv-fi} Finland Swedish. - -=item {syr} : Syriac - -=item {tl} : Tagalog - -=item {ty} : Tahitian - -=item [{tai} : Tai (Other)] - -NOT Thai! - -=item {tg} : Tajik - -=item {tmh} : Tamashek - -=item {ta} : Tamil - -=item {i-tao} : Tao - -eq Yami. - -=item {tt} : Tatar - -=item {i-tay} : Tayal - -eq Atayal. eq Atayan. - -=item {te} : Telugu - -=item {ter} : Tereno - -=item {tet} : Tetum - -=item {th} : Thai - -NOT Tai! - -=item {bo} : Tibetan - -=item {tig} : Tigre - -=item {ti} : Tigrinya - -=item {tem} : Timne - -eq Themne. eq Timene. - -=item {tiv} : Tiv - -=item {tli} : Tlingit - -=item {tpi} : Tok Pisin - -=item {tkl} : Tokelau - -=item {tog} : Tonga (Nyasa) - -NOT Tsonga! - -=item {to} : Tonga (Tonga Islands) - -(Pronounced "Tong-a", not "Tong-ga") - -NOT Tsonga! - -=item {tsi} : Tsimshian - -eq Sm'algyax - -=item {ts} : Tsonga - -NOT Tonga! - -=item {i-tsu} : Tsou - -=item {tn} : Tswana - -Same as Setswana. - -=item {tum} : Tumbuka - -=item [{tup} : Tupi languages] - -=item {tr} : Turkish - -(Typically in Roman script) - -=item {ota} : Ottoman Turkish (1500-1928) - -(Typically in Arabic script) (Historical) - -=item {crh} : Crimean Turkish - -eq Crimean Tatar - -=item {tk} : Turkmen - -eq Turkmeni. - -=item {tvl} : Tuvalu - -=item {tyv} : Tuvinian - -eq Tuvan. eq Tuvin. - -=item {tw} : Twi - -=item {udm} : Udmurt - -=item {uga} : Ugaritic - -NOT Ugric! - -=item {ug} : Uighur - -=item {uk} : Ukrainian - -=item {umb} : Umbundu - -=item {und} : Undetermined - -Not a tag for normal use. - -=item {ur} : Urdu - -=item {uz} : Uzbek - -eq Ezbek - -Notable forms: -{uz-Cyrl} Uzbek in Cyrillic script; -{uz-Latn} Uzbek in Latin script. - -=item {vai} : Vai - -=item {ve} : Venda - -NOT Wendish! NOT Wend! NOT Avestan! (Formerly "ven".) - -=item {vi} : Vietnamese - -eq Viet. - -=item {vo} : Volapuk - -eq VolapEk. (Artificial) - -=item {vot} : Votic - -eq Votian. eq Vod. - -=item [{wak} : Wakashan languages] - -=item {wa} : Walloon - -=item {wal} : Walamo - -eq Wolaytta. - -=item {war} : Waray - -Presumably the Philippine language Waray-Waray (SamareEo), -not the smaller Philippine language Waray Sorsogon, nor the extinct -Australian language Waray. - -=item {was} : Washo - -eq Washoe - -=item {cy} : Welsh - -=item {wo} : Wolof - -=item {x-...} : Unregistered (Semi-Private Use) - -"x-" is a prefix for language tags that are not registered with ISO -or IANA. Example, x-double-dutch - -=item {xh} : Xhosa - -=item {sah} : Yakut - -=item {yao} : Yao - -(The Yao in Malawi?) - -=item {yap} : Yapese - -eq Yap - -=item {ii} : Sichuan Yi - -=item {yi} : Yiddish - -Formerly "ji". Usually in Hebrew script. - -Notable forms: -{yi-latn} Yiddish in Latin script - -=item {yo} : Yoruba - -=item [{ypk} : Yupik languages] - -Several "Eskimo" languages. - -=item {znd} : Zande - -=item [{zap} : Zapotec] - -(A group of languages.) - -=item {zen} : Zenaga - -NOT Zend. - -=item {za} : Zhuang - -=item {zu} : Zulu - -=item {zun} : Zuni - -eq ZuEi - -=back - -=for woohah END - -=head1 SEE ALSO - -L and its "See Also" section. - -=head1 COPYRIGHT AND DISCLAIMER - -Copyright (c) 2001+ Sean M. Burke. All rights reserved. - -You can redistribute and/or -modify this document under the same terms as Perl itself. - -This document is provided in the hope that it will be -useful, but without any warranty; -without even the implied warranty of accuracy, authoritativeness, -completeness, merchantability, or fitness for a particular purpose. - -Email any corrections or questions to me. - -=head1 AUTHOR - -Sean M. Burke, sburkeE<64>cpan.org - -=cut - - -# To generate a list of just the two and three-letter codes: - -#!/usr/local/bin/perl -w - -require 5; # Time-stamp: "2001-03-13 21:53:39 MST" - # Sean M. Burke, sburke@cpan.org - # This program is for generating the language_codes.txt file -use strict; -use LWP::Simple; -use HTML::TreeBuilder 3.10; -my $root = HTML::TreeBuilder->new(); -my $url = 'http://lcweb.loc.gov/standards/iso639-2/bibcodes.html'; -$root->parse(get($url) || die "Can't get $url"); -$root->eof(); - -my @codes; - -foreach my $tr ($root->find_by_tag_name('tr')) { - my @f = map $_->as_text(), $tr->content_list(); - #print map("<$_> ", @f), "\n"; - next unless @f == 5; - pop @f; # nix the French name - next if $f[-1] eq 'Language Name (English)'; # it's a header line - my $xx = splice(@f, 2,1); # pull out the two-letter code - $f[-1] =~ s/^\s+//; - $f[-1] =~ s/\s+$//; - if($xx =~ m/[a-zA-Z]/) { # there's a two-letter code for it - push @codes, [ lc($f[-1]), "$xx\t$f[-1]\n" ]; - } else { # print the three-letter codes. - if($f[0] eq $f[1]) { - push @codes, [ lc($f[-1]), "$f[1]\t$f[2]\n" ]; - } else { # shouldn't happen - push @codes, [ lc($f[-1]), "@f !!!!!!!!!!\n" ]; - } - } -} - -print map $_->[1], sort {; $a->[0] cmp $b->[0] } @codes; -print "[ based on $url\n at ", scalar(localtime), "]\n", - "[Note: doesn't include IANA-registered codes.]\n"; -exit; -__END__ - diff --git a/ext/I18N-LangTags/t/01_about_verbose.t b/ext/I18N-LangTags/t/01_about_verbose.t deleted file mode 100644 index 3abc68d537..0000000000 --- a/ext/I18N-LangTags/t/01_about_verbose.t +++ /dev/null @@ -1,89 +0,0 @@ - -require 5; -# Time-stamp: "2004-03-30 17:02:59 AST" - -# Summary of, well, things. - -use Test; -BEGIN {plan tests => 2}; - -ok 1; - -use I18N::LangTags; -use I18N::LangTags::List; -use I18N::LangTags::Detect; - -#chdir "t" if -e "t"; - -{ - my @out; - push @out, - "\n\nPerl v", - defined($^V) ? sprintf('%vd', $^V) : $], - " under $^O ", - (defined(&Win32::BuildNumber) and defined &Win32::BuildNumber()) - ? ("(Win32::BuildNumber ", &Win32::BuildNumber(), ")") : (), - (defined $MacPerl::Version) - ? ("(MacPerl version $MacPerl::Version)") : (), - "\n" - ; - - # Ugly code to walk the symbol tables: - my %v; - my @stack = (''); # start out in %:: - my $this; - my $count = 0; - my $pref; - while(@stack) { - $this = shift @stack; - die "Too many packages?" if ++$count > 1000; - next if exists $v{$this}; - next if $this eq 'main'; # %main:: is %:: - - #print "Peeking at $this => ${$this . '::VERSION'}\n"; - - if(defined ${$this . '::VERSION'} ) { - $v{$this} = ${$this . '::VERSION'} - } elsif( - defined *{$this . '::ISA'} or defined &{$this . '::import'} - or ($this ne '' and grep defined *{$_}{'CODE'}, values %{$this . "::"}) - # If it has an ISA, an import, or any subs... - ) { - # It's a class/module with no version. - $v{$this} = undef; - } else { - # It's probably an unpopulated package. - ## $v{$this} = '...'; - } - - $pref = length($this) ? "$this\::" : ''; - push @stack, map m/^(.+)::$/ ? "$pref$1" : (), keys %{$this . '::'}; - #print "Stack: @stack\n"; - } - push @out, " Modules in memory:\n"; - delete @v{'', '[none]'}; - foreach my $p (sort {lc($a) cmp lc($b)} keys %v) { - $indent = ' ' x (2 + ($p =~ tr/:/:/)); - push @out, ' ', $indent, $p, defined($v{$p}) ? " v$v{$p};\n" : ";\n"; - } - push @out, sprintf "[at %s (local) / %s (GMT)]\n", - scalar(gmtime), scalar(localtime); - my $x = join '', @out; - $x =~ s/^/#/mg; - print $x; -} - -print "# Running", - (chr(65) eq 'A') ? " in an ASCII world.\n" : " in a non-ASCII world.\n", - "#\n", -; - -print "# \@INC:\n", map("# [$_]\n", @INC), "#\n#\n"; - -print "# \%INC:\n"; -foreach my $x (sort {lc($a) cmp lc($b)} keys %INC) { - print "# [$x] = [", $INC{$x} || '', "]\n"; -} - -ok 1; - diff --git a/ext/I18N-LangTags/t/05_main.t b/ext/I18N-LangTags/t/05_main.t deleted file mode 100644 index 056baafc55..0000000000 --- a/ext/I18N-LangTags/t/05_main.t +++ /dev/null @@ -1,98 +0,0 @@ - -require 5; - # Time-stamp: "2004-03-30 17:52:14 AST" -use strict; -use Test; -BEGIN { plan tests => 64 }; -BEGIN { ok 1 } -use I18N::LangTags (':ALL'); - -print "# Perl v$], I18N::LangTags v$I18N::LangTags::VERSION\n"; - -ok !is_language_tag(''); -ok is_language_tag('fr'); -ok is_language_tag('fr-ca'); -ok is_language_tag('fr-CA'); -ok !is_language_tag('fr-CA-'); -ok !is_language_tag('fr_CA'); -ok is_language_tag('fr-ca-joual'); -ok !is_language_tag('frca'); -ok is_language_tag('nav'); # (not actual tag) -ok is_language_tag('nav-shiprock'); # (not actual tag) -ok !is_language_tag('nav-ceremonial'); # subtag too long -ok !is_language_tag('x'); -ok !is_language_tag('i'); -ok is_language_tag('i-borg'); # NB: fictitious tag -ok is_language_tag('x-borg'); -ok is_language_tag('x-borg-prot5123'); -ok same_language_tag('x-borg-prot5123', 'i-BORG-Prot5123' ); -ok !same_language_tag('en', 'en-us' ); - -ok 0 == similarity_language_tag('en-ca', 'fr-ca'); -ok 1 == similarity_language_tag('en-ca', 'en-us'); -ok 2 == similarity_language_tag('en-us-southern', 'en-us-western'); -ok 2 == similarity_language_tag('en-us-southern', 'en-us'); - -ok grep $_ eq 'hi', panic_languages('kok'); -ok grep $_ eq 'en', panic_languages('x-woozle-wuzzle'); -ok ! grep $_ eq 'mr', panic_languages('it'); -ok grep $_ eq 'es', panic_languages('it'); -ok grep $_ eq 'it', panic_languages('es'); - - -print "# Now the ::List tests...\n"; -print "# Perl v$], I18N::LangTags::List v$I18N::LangTags::List::VERSION\n"; - -use I18N::LangTags::List; -foreach my $lt (qw( - en - en-us - en-kr - el - elx - i-mingo - i-mingo-tom - x-mingo-tom - it - it-it - it-IT - it-FR - ak - aka - jv - jw - no - no-nyn - nn - i-lux - lb - wa - yi - ji - den-syllabic - den-syllabic-western - den-western - den-latin - cre-syllabic - cre-syllabic-western - cre-western - cre-latin - cr-syllabic - cr-syllabic-western - cr-western - cr-latin -)) { - my $name = I18N::LangTags::List::name($lt); - if($name) { - ok(1); - print "# $lt -> $name\n"; - } else { - ok(0); - print "# Failed lookup on $lt\n"; - } -} - - - -print "# So there!\n"; - diff --git a/ext/I18N-LangTags/t/07_listy.t b/ext/I18N-LangTags/t/07_listy.t deleted file mode 100644 index a56a798a33..0000000000 --- a/ext/I18N-LangTags/t/07_listy.t +++ /dev/null @@ -1,30 +0,0 @@ - -require 5; - # Time-stamp: "2003-10-10 17:37:34 ADT" -use strict; -use Test; -BEGIN { plan tests => 17 }; -BEGIN { ok 1 } -use I18N::LangTags::List; - -print "# Perl v$], I18N::LangTags::List v$I18N::LangTags::List::VERSION\n"; - -ok I18N::LangTags::List::name('fr'), 'French'; -ok I18N::LangTags::List::name('fr-fr'); -ok !I18N::LangTags::List::name('El Zorcho'); -ok !I18N::LangTags::List::name(); - - -ok !I18N::LangTags::List::is_decent(); -ok I18N::LangTags::List::is_decent('fr'); -ok I18N::LangTags::List::is_decent('fr-blorch'); -ok !I18N::LangTags::List::is_decent('El Zorcho'); -ok !I18N::LangTags::List::is_decent('sgn'); -ok I18N::LangTags::List::is_decent('sgn-us'); -ok !I18N::LangTags::List::is_decent('i'); -ok I18N::LangTags::List::is_decent('i-mingo'); -ok I18N::LangTags::List::is_decent('i-mingo-tom'); -ok !I18N::LangTags::List::is_decent('cel'); -ok I18N::LangTags::List::is_decent('cel-gaulish'); - -ok 1; # one for the road diff --git a/ext/I18N-LangTags/t/10_http.t b/ext/I18N-LangTags/t/10_http.t deleted file mode 100644 index 36341f77cf..0000000000 --- a/ext/I18N-LangTags/t/10_http.t +++ /dev/null @@ -1,104 +0,0 @@ - -# Time-stamp: "2004-06-17 23:06:22 PDT" - -use I18N::LangTags::Detect; - -use Test; -BEGIN { plan tests => 87 }; - -my @in = grep m/\S/, split /\n/, q{ - -[ sv ] sv -[ en ] en -[ en fi ] en, fi -[ en-us ] en-us -[ en-us ] en-US -[ en-us ] EN-US - -[ en-au en i-klingon x-klingon en-gb en-us mt-mt mt ja ] EN-au, JA;q=0.14, i-klingon;q=0.83, en-gb;q=0.71, en-us;q=0.57, mt-mt;q=0.43, mt;q=0.29, en;q=0.86 -[ en-au en i-klingon x-klingon en-gb en-us mt-mt mt tli ja ] EN-au, tli;q=0.201, JA;q=0.14, i-klingon;q=0.83, en-gb;q=0.71, en-us;q=0.57, mt-mt;q=0.43, mt;q=0.29, en;q=0.86 -[ en-au en en-gb en-us ja ] en-au, ja;q=0.20, en-gb;q=0.60, en-us;q=0.40, en;q=0.80 - -[ en-au en en-gb en-us mt-mt mt ja ] EN-au, JA;q=0.14, en-gb;q=0.71, en-us;q=0.57, mt-mt;q=0.43, mt;q=0.29, en;q=0.86 -[ en-au en en-gb en-us ja ] en-au, ja;q=0.20, en-gb;q=0.60, en-us;q=0.40, en;q=0.80 -[ en fr ] en;q=1,fr;q=.5 -[ en fr ] en;q=1,fr;q=.99 -[ en ru ko ] en, ru;q=0.7, ko;q=0.3 -[ en ru ko ] en, ru;q=0.7, KO;q=0.3 -[ en-us en ] en-us, en;q=0.50 -[ en fr ] fr ; q = 0.9, en -[ en fr ] en,fr;q=.90 -[ ru en-uk en fr ] ru, en-UK;q=0.5, en;q=0.3, fr;q=0.1 -[ en-us fr es-mx ] en-us,fr;q=0.7,es-mx;q=0.3 -[ en-us en ] en-us, en;q=0.50 - -[ da en-gb en ] da, en-gb;q=0.8, en;q=0.7 -[ da en-gb en ] da, en;q=0.7, en-gb;q=0.8 -[ da en-gb en ] da, en-gb;q=0.8, en;q=0.7 -[ da en-gb en ] da,en;q=0.7,en-gb;q=0.8 -[ da en-gb en ] da, en-gb ; q=0.8, en ; q=0.7 -[ da en-gb en ] da , en-gb ; q = 0.8 , en ; q =0.7 -[ da en-gb en ] da (yup, Danish) , en-gb ; q = 0.8 , en ; q =0.7 - -[ no dk en-uk en-us ] en-UK;q=0.7, en-US;q=0.6, no;q=1.0, dk;q=0.8 -[ no dk en-uk en-us ] en-US;q=0.6, en-UK;q=0.7, no;q=1.0, dk;q=0.8 -[ no dk en-uk en-us ] en-UK;q=0.7, no;q=1.0, en-US;q=0.6, dk;q=0.8 -[ no dk en-uk en-us ] en-UK;q=0.7, no;q=1.0, dk;q=0.8, en-US;q=0.6 - -[ fi en ] fi;q=1, en;q=0.2 -[ de-de de en en-us en-gb ] de-DE, de;q=0.80, en;q=0.60, en-US;q=0.40, en-GB;q=0.20 -[ ru ] ru; q=1, *; q=0.1 -[ ru en ] ru, en; q=0.1 -[ ja en ] ja,en;q=0.5 -[ en ] en; q=1.0 -[ ja ] ja; q=1.0 -[ ja ] ja; q=1.0 -[ en ja ] en; q=0.5, ja; q=0.5 -[ fr-ca fr en ] fr-ca, fr;q=0.8, en;q=0.7 -[ NIX ] NIX -}; - -foreach my $in (@in) { - $in =~ s/^\s*\[([^\]]+)\]\s*//s or die "Bad input: $in"; - my @should = do { my $x = $1; $x =~ m/(\S+)/g }; - - if($in eq 'NIX') { $in = ''; @should = (); } - - local $ENV{'HTTP_ACCEPT_LANGUAGE'}; - - foreach my $modus ( - sub { - print "# Testing with arg...\n"; - $ENV{'HTTP_ACCEPT_LANGUAGE'} = 'PLORK'; - return $_[0]; - }, - sub { - print "# Testing wath HTTP_ACCEPT_LANGUAGE...\n"; - $ENV{'HTTP_ACCEPT_LANGUAGE'} = $_[0]; - return(); - }, - ) { - my @args = &$modus($in); - - # //////////////////////////////////////////////////// - my @out = I18N::LangTags::Detect->http_accept_langs(@args); - # \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\ - - if( - @out == @should - and lc( join "\e", @out ) eq lc( join "\e", @should ) - ) { - print "# Happily got [@out] from [$in]\n"; - ok 1; - } else { - ok 0; - print "#Got: [@out]\n", - "# but wanted: [@should]\n", - "# < \"$in\"\n#\n"; - } - } -} - -print "#\n#\n# Bye-bye!\n"; -ok 1; - diff --git a/ext/I18N-LangTags/t/20_locales.t b/ext/I18N-LangTags/t/20_locales.t deleted file mode 100644 index ae04812ff2..0000000000 --- a/ext/I18N-LangTags/t/20_locales.t +++ /dev/null @@ -1,38 +0,0 @@ -require 5; - # Time-stamp: "2004-10-06 23:07:06 ADT" -use strict; -use Test; -BEGIN { plan tests => 22 }; -BEGIN { ok 1 } -use I18N::LangTags (':ALL'); - -print "# Perl v$], I18N::LangTags v$I18N::LangTags::VERSION\n"; -print "# Loaded from ", $INC{'I18N/LangTags.pm'} || "??", "\n"; - -ok lc locale2language_tag('en'), 'en'; -ok lc locale2language_tag('en_US'), 'en-us'; -ok lc locale2language_tag('en_US.ISO8859-1'), 'en-us'; -ok lc(locale2language_tag('C')||''), ''; -ok lc(locale2language_tag('POSIX')||''), ''; - - -ok lc locale2language_tag('eu_mt'), 'eu-mt'; -ok lc locale2language_tag('eu'), 'eu'; -ok lc locale2language_tag('it'), 'it'; -ok lc locale2language_tag('it_IT'), 'it-it'; -ok lc locale2language_tag('it_IT.utf8'), 'it-it'; -ok lc locale2language_tag('it_IT.utf8@euro'), 'it-it'; -ok lc locale2language_tag('it_IT@euro'), 'it-it'; - - -ok lc locale2language_tag('zh_CN.gb18030'), 'zh-cn'; -ok lc locale2language_tag('zh_CN.gbk'), 'zh-cn'; -ok lc locale2language_tag('zh_CN.utf8'), 'zh-cn'; -ok lc locale2language_tag('zh_HK'), 'zh-hk'; -ok lc locale2language_tag('zh_HK.utf8'), 'zh-hk'; -ok lc locale2language_tag('zh_TW'), 'zh-tw'; -ok lc locale2language_tag('zh_TW.euctw'), 'zh-tw'; -ok lc locale2language_tag('zh_TW.utf8'), 'zh-tw'; - -print "# So there!\n"; -ok 1; diff --git a/ext/I18N-LangTags/t/50_super.t b/ext/I18N-LangTags/t/50_super.t deleted file mode 100644 index 9923c84068..0000000000 --- a/ext/I18N-LangTags/t/50_super.t +++ /dev/null @@ -1,88 +0,0 @@ - -# Time-stamp: "2004-03-30 17:46:17 AST" - -use Test; -BEGIN { plan tests => 26 }; -print "#\n# Testing normal (tight) insertion of super-ordinate language tags...\n#\n"; - -use I18N::LangTags qw(implicate_supers); - -my @in = grep m/\S/, split /[\n\r]/, q{ - NIX => NIX - sv => sv - en => en - hai => hai - - pt-br => pt-br pt - pt-br fr => pt-br pt fr - pt-br fr pt => pt-br fr pt - - pt-br fr pt de => pt-br fr pt de - de pt-br fr pt => de pt-br fr pt - de pt-br fr => de pt-br pt fr - hai pt-br fr => hai pt-br pt fr - - # Now test multi-part complicateds: - pt-br-janeiro => pt-br-janeiro pt-br pt - pt-br-janeiro fr => pt-br-janeiro pt-br pt fr - pt-br-janeiro de fr => pt-br-janeiro pt-br pt de fr - pt-br-janeiro de pt fr => pt-br-janeiro pt-br de pt fr - - pt-br-janeiro pt-br-saopaolo => pt-br-janeiro pt-br pt pt-br-saopaolo - pt-br-janeiro fr pt-br-saopaolo => pt-br-janeiro pt-br pt fr pt-br-saopaolo - pt-br-janeiro de pt-br-saopaolo fr => pt-br-janeiro pt-br pt de pt-br-saopaolo fr - pt-br-janeiro de pt-br fr pt-br-saopaolo => pt-br-janeiro de pt-br pt fr pt-br-saopaolo - - pt-br de en fr pt-br-janeiro => pt-br pt de en fr pt-br-janeiro - pt-br de en fr => pt-br pt de en fr - - ja pt-br-janeiro fr => ja pt-br-janeiro pt-br pt fr - ja pt-br-janeiro de fr => ja pt-br-janeiro pt-br pt de fr - ja pt-br-janeiro de pt fr => ja pt-br-janeiro pt-br de pt fr - - pt-br-janeiro de pt-br fr => pt-br-janeiro de pt-br pt fr -# an odd case, since we don't filter for uniqueness in this sub - -}; - -sub uniq { my %seen; return grep(!($seen{$_}++), @_); } - -foreach my $in (@in) { - $in =~ s/^\s+//s; - $in =~ s/\s+$//s; - $in =~ s/#.+//s; - next unless $in =~ m/\S/; - - my(@in, @should); - { - die "What kind of line is <$in>?!" - unless $in =~ m/^(.+)=>(.+)$/s; - - my($i,$s) = ($1, $2); - @in = ($i =~ m/(\S+)/g); - @should = ($s =~ m/(\S+)/g); - #print "{@in}{@should}\n"; - } - my @out = implicate_supers( - ("@in" eq 'NIX') ? () : @in - ); - #print "O: ", join(' ', map "<$_>", @out), "\n"; - @out = 'NIX' unless @out; - - - if( @out == @should - and lc( join "\e", @out ) eq lc( join "\e", @should ) - ) { - print "# Happily got [@out] from [$in]\n"; - ok 1; - } else { - ok 0; - print "#!!Got: [@out]\n", - "#!! but wanted: [@should]\n", - "#!! from \"$in\"\n#\n"; - } -} - -print "#\n#\n# Bye-bye!\n"; -ok 1; - diff --git a/ext/I18N-LangTags/t/55_supers_strict.t b/ext/I18N-LangTags/t/55_supers_strict.t deleted file mode 100644 index 3b285157a1..0000000000 --- a/ext/I18N-LangTags/t/55_supers_strict.t +++ /dev/null @@ -1,78 +0,0 @@ - -# Time-stamp: "2004-03-30 17:49:58 AST" -#sub I18N::LangTags::Detect::DEBUG () {10} -use I18N::LangTags qw(implicate_supers_strictly); - -use Test; -BEGIN { plan tests => 19 }; - -print "#\n# Testing strict (non-tight) insertion of super-ordinate language tags...\n#\n"; - -my @in = grep m/\S/, split /[\n\r]/, q{ - NIX => NIX - sv => sv - en => en - hai => hai - - pt-br => pt-br pt - pt-br fr => pt-br fr pt - pt-br fr pt => pt-br fr pt - pt-br fr pt de => pt-br fr pt de - de pt-br fr pt => de pt-br fr pt - de pt-br fr => de pt-br fr pt - hai pt-br fr => hai pt-br fr pt - -# Now test multi-part complicateds: - pt-br-janeiro fr => pt-br-janeiro fr pt-br pt -pt-br-janeiro de fr => pt-br-janeiro de fr pt-br pt -pt-br-janeiro de pt fr => pt-br-janeiro de pt fr pt-br - -ja pt-br-janeiro fr => ja pt-br-janeiro fr pt-br pt -ja pt-br-janeiro de fr => ja pt-br-janeiro de fr pt-br pt -ja pt-br-janeiro de pt fr => ja pt-br-janeiro de pt fr pt-br - -pt-br-janeiro de pt-br fr => pt-br-janeiro de pt-br fr pt - # an odd case, since we don't filter for uniqueness in this sub - -}; - - -foreach my $in (@in) { - $in =~ s/^\s+//s; - $in =~ s/\s+$//s; - $in =~ s/#.+//s; - next unless $in =~ m/\S/; - - my(@in, @should); - { - die "What kind of line is <$in>?!" - unless $in =~ m/^(.+)=>(.+)$/s; - - my($i,$s) = ($1, $2); - @in = ($i =~ m/(\S+)/g); - @should = ($s =~ m/(\S+)/g); - #print "{@in}{@should}\n"; - } - my @out = I18N::LangTags::implicate_supers_strictly( - ("@in" eq 'NIX') ? () : @in - ); - #print "O: ", join(' ', map "<$_>", @out), "\n"; - @out = 'NIX' unless @out; - - - if( @out == @should - and lc( join "\e", @out ) eq lc( join "\e", @should ) - ) { - print "# Happily got [@out] from [$in]\n"; - ok 1; - } else { - ok 0; - print "#!!Got: [@out]\n", - "#!! but wanted: [@should]\n", - "#!! from \"$in\"\n#\n"; - } -} - -print "#\n#\n# Bye-bye!\n"; -ok 1; - diff --git a/ext/I18N-LangTags/t/80_all_env.t b/ext/I18N-LangTags/t/80_all_env.t deleted file mode 100644 index 1362d5ebf5..0000000000 --- a/ext/I18N-LangTags/t/80_all_env.t +++ /dev/null @@ -1,115 +0,0 @@ - -require 5; -use Test; -# Time-stamp: "2004-07-01 14:33:50 ADT" -BEGIN { plan tests => 20; } -use I18N::LangTags::Detect 1.01; -print "# Hi there...\n"; -ok 1; - -print "# Using I18N::LangTags::Detect v$I18N::LangTags::Detect::VERSION\n"; - -print "# Make sure we can assign to ENV entries\n", - "# (Otherwise we can't run the subsequent tests)...\n"; -$ENV{'MYORP'} = 'Zing'; ok $ENV{'MYORP'}, 'Zing'; -$ENV{'SWUZ'} = 'KLORTHO HOOBOY'; ok $ENV{'SWUZ'}, 'KLORTHO HOOBOY'; - -delete $ENV{'MYORP'}; -delete $ENV{'SWUZ'}; - -sub j { "[" . join(' ', map "\"$_\"", @_) . "]" ;} - -sub show { - print "# (Seeing {", join(' ', - map(qq{<$_>}, @_)), "} at line ", (caller)[2], ")\n"; - printenv(); - return $_[0] || ''; -} -sub printenv { - print "# ENV:\n"; - foreach my $k (sort keys %ENV) { - my $p = $ENV{$k}; $p =~ s/\n/\n#/g; - print "# [$k] = [$p]\n"; } - print "# [end of ENV]\n#\n"; -} - -$ENV{'IGNORE_WIN32_LOCALE'} = 1; # a hack, just for testing's sake. - - -print "# Test LANGUAGE...\n"; -$ENV{'REQUEST_METHOD'} = ''; -$ENV{'LANGUAGE'} = 'Eu-MT'; -$ENV{'LC_ALL'} = ''; -$ENV{'LC_MESSAGES'} = ''; -$ENV{'LANG'} = ''; -ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; -ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; - - -print "# Test LC_ALL...\n"; -$ENV{'REQUEST_METHOD'} = ''; -$ENV{'LANGUAGE'} = ''; -$ENV{'LC_ALL'} = 'Eu-MT'; -$ENV{'LC_MESSAGES'} = ''; -$ENV{'LANG'} = ''; - -ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; -ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; - -print "# Test LC_MESSAGES...\n"; -$ENV{'REQUEST_METHOD'} = ''; -$ENV{'LANGUAGE'} = ''; -$ENV{'LC_ALL'} = ''; -$ENV{'LC_MESSAGES'} = 'Eu-MT'; -$ENV{'LANG'} = ''; - -ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; -ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; - - -print "# Test LANG...\n"; -$ENV{'REQUEST_METHOD'} = ''; -$ENV{'LANGUAGE'} = ''; -$ENV{'LC_ALL'} = ''; -$ENV{'LC_MESSAGES'} = ''; -$ENV{'LANG'} = 'Eu_MT'; - -ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; -ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; - - - -print "# Test LANG...\n"; -$ENV{'LANGUAGE'} = ''; -$ENV{'REQUEST_METHOD'} = ''; -$ENV{'LC_ALL'} = ''; -$ENV{'LC_MESSAGES'} = ''; -$ENV{'LANG'} = 'Eu_MT'; - -ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; -ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; - - - - -print "# Test HTTP_ACCEPT_LANGUAGE...\n"; -$ENV{'REQUEST_METHOD'} = 'GET'; -$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'eu-MT'; -ok show( scalar I18N::LangTags::Detect::detect()), "eu-mt"; -ok show( j I18N::LangTags::Detect::detect()), q{["eu-mt"]}; - - -$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'x-plorp, zaz, eu-MT, i-klung'; -ok show( scalar I18N::LangTags::Detect::detect()), "x-plorp"; -ok show( j I18N::LangTags::Detect::detect()), qq{["x-plorp" "i-plorp" "zaz" "eu-mt" "i-klung" "x-klung"]}; - -$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'x-plorp, zaz, eU-Mt, i-klung'; -ok show( scalar I18N::LangTags::Detect::detect()), "x-plorp"; -ok show( j I18N::LangTags::Detect::detect()), qq{["x-plorp" "i-plorp" "zaz" "eu-mt" "i-klung" "x-klung"]}; - - - - -print "# Byebye!\n"; -ok 1; - -- cgit v1.2.1