From 4b053158ffba5bda82094dc0b0cd80c9d2867b97 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Sun, 27 May 2001 13:43:38 +0000 Subject: Integrate I18N::LangTags from Sean Burke. TODO: the language list from RFC 3066 needs to be integrated and made available somehow. The list is included in the I18N-LangTags 0.21 distribution, but it is undocumented and unconnected to the module. p4raw-id: //depot/perl@10228 --- lib/I18N/LangTags.pm | 618 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 618 insertions(+) create mode 100644 lib/I18N/LangTags.pm (limited to 'lib') diff --git a/lib/I18N/LangTags.pm b/lib/I18N/LangTags.pm new file mode 100644 index 0000000000..c8a64d39f8 --- /dev/null +++ b/lib/I18N/LangTags.pm @@ -0,0 +1,618 @@ + +# Time-stamp: "2001-05-25 07:36:55 MDT" +# Sean M. Burke + +require 5.000; +package I18N::LangTags; +use strict; +use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); # $Debug +require Exporter; +# $Debug = 0; +@ISA = qw(Exporter); +@EXPORT = qw(); +@EXPORT_OK = qw(is_language_tag same_language_tag + extract_language_tags super_languages + similarity_language_tag is_dialect_of + locale2language_tag alternate_language_tags + encode_language_tag + ); + +$VERSION = "0.21"; + +=head1 NAME + +I18N::LangTags - functions for dealing with RFC3066-style language tags + +=head1 SYNOPSIS + + use I18N::LangTags qw(is_language_tag same_language_tag + extract_language_tags super_languages + similarity_language_tag is_dialect_of + locale2language_tag alternate_language_tags + encode_language_tag + ); + +...or whatever of those functions you want to import. Those are +all the exportable functions -- you're free to import only some, +or none at all. By default, none are imported. + +If you don't import any of these functions, assume a C<&I18N::LangTags::> +in front of all the function names in the following examples. + +=head1 DESCRIPTION + +Language tags are a formalism, described in RFC 3066 (obsoleting +1766), for declaring what language form (language and possibly +dialect) a given chunk of information is in. + +This library provides functions for common tasks involving language +tags as they are needed in a variety of protocols and applications. + +Please see the "See Also" references for a thorough explanation +of how to correctly use language tags. + +=over + +=cut + +########################################################################### + +=item * the function is_language_tag($lang1) + +Returns true iff $lang1 is a formally valid language tag. + + is_language_tag("fr") is TRUE + is_language_tag("x-jicarilla") is FALSE + (Subtags can be 8 chars long at most -- 'jicarilla' is 9) + + is_language_tag("sgn-US") is TRUE + (That's American Sign Language) + + is_language_tag("i-Klikitat") is TRUE + (True without regard to the fact noone has actually + registered Klikitat -- it's a formally valid tag) + + is_language_tag("fr-patois") is TRUE + (Formally valid -- altho descriptively weak!) + + is_language_tag("Spanish") is FALSE + is_language_tag("french-patois") is FALSE + (No good -- first subtag has to match + /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) + + is_language_tag("x-borg-prot2532") is TRUE + (Yes, subtags can contain digits, as of RFC3066) + +=cut + +sub is_language_tag { + + ## Changes in the language tagging standards may have to be reflected here. + + my($tag) = lc($_[0]); + + return 0 if $tag eq "i" or $tag eq "x"; + # Bad degenerate cases the following + # regexp would erroneously let pass + + return $tag =~ + /^(?: # First subtag + [xi] | [a-z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-z0-9]{1,8} # subtag + )* + $/xs ? 1 : 0; +} + +########################################################################### + +=item * the function extract_language_tags($whatever) + +Returns a list of whatever looks like formally valid language tags +in $whatever. Not very smart, so don't get too creative with +what you want to feed it. + + extract_language_tags("fr, fr-ca, i-mingo") + returns: ('fr', 'fr-ca', 'i-mingo') + + extract_language_tags("It's like this: I'm in fr -- French!") + returns: ('It', 'in', 'fr') + (So don't just feed it any old thing.) + +The output is untainted. If you don't know what tainting is, +don't worry about it. + +=cut + +sub extract_language_tags { + + ## Changes in the language tagging standards may have to be reflected here. + + my($text) = + $_[0] =~ m/(.+)/ # to make for an untainted result + ? $1 : '' + ; + + return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags + $text =~ + m/ + \b + (?: # First subtag + [iIxX] | [a-zA-Z]{2,3} + ) + (?: # Subtags thereafter + - # separator + [a-zA-Z0-9]{1,8} # subtag + )* + \b + /xsg + ); +} + +########################################################################### + +=item * the function same_language_tag($lang1, $lang2) + +Returns true iff $lang1 and $lang2 are acceptable variant tags +representing the same language-form. + + same_language_tag('x-kadara', 'i-kadara') is TRUE + (The x/i- alternation doesn't matter) + same_language_tag('X-KADARA', 'i-kadara') is TRUE + (...and neither does case) + same_language_tag('en', 'en-US') is FALSE + (all-English is not the SAME as US English) + same_language_tag('x-kadara', 'x-kadar') is FALSE + (these are totally unrelated tags) + +C works by just seeing whether +C is the same as +C. + +(Yes, I know this function is named a bit oddly. Call it historic +reasons.) + +=cut + +sub same_language_tag { + my $el1 = &encode_language_tag($_[0]); + return 0 unless defined $el1; + # this avoids the problem of + # encode_language_tag($lang1) eq and encode_language_tag($lang2) + # being true if $lang1 and $lang2 are both undef + + return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; +} + +########################################################################### + +=item * the function similarity_language_tag($lang1, $lang2) + +Returns an integer representing the degree of similarity between +tags $lang1 and $lang2 (the order of which does not matter), where +similarity is the number of common elements on the left, +without regard to case and to x/i- alternation. + + similarity_language_tag('fr', 'fr-ca') is 1 + (one element in common) + similarity_language_tag('fr-ca', 'fr-FR') is 1 + (one element in common) + + similarity_language_tag('fr-CA-joual', + 'fr-CA-PEI') is 2 + similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 + (two elements in common) + + similarity_language_tag('x-kadara', 'i-kadara') is 1 + (x/i- doesn't matter) + + similarity_language_tag('en', 'x-kadar') is 0 + similarity_language_tag('x-kadara', 'x-kadar') is 0 + (unrelated tags -- no similarity) + + similarity_language_tag('i-cree-syllabic', + 'i-cherokee-syllabic') is 0 + (no B elements in common!) + +=cut + +sub similarity_language_tag { + my $lang1 = &encode_language_tag($_[0]); + my $lang2 = &encode_language_tag($_[1]); + + # NB: (i-sil-...)? (i-sgn-...)? + + return undef if !defined($lang1) and !defined($lang2); + return 0 if !defined($lang1) or !defined($lang2); + + my @l1_subtags = split('-', $lang1); + my @l2_subtags = split('-', $lang2); + my $similarity = 0; + + while(@l1_subtags and @l2_subtags) { + if(shift(@l1_subtags) eq shift(@l2_subtags)) { + ++$similarity; + } else { + last; + } + } + return $similarity; +} + +########################################################################### + +=item * the function is_dialect_of($lang1, $lang2) + +Returns true iff language tag $lang1 represents a subdialect of +language tag $lang2. + +B + + is_dialect_of('en-US', 'en') is TRUE + (American English IS a dialect of all-English) + + is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE + is_dialect_of('fr-CA-joual', 'fr') is TRUE + (Joual is a dialect of (a dialect of) French) + + is_dialect_of('en', 'en-US') is FALSE + (all-English is a NOT dialect of American English) + + is_dialect_of('fr', 'en-CA') is FALSE + + is_dialect_of('en', 'en' ) is TRUE + is_dialect_of('en-US', 'en-US') is TRUE + (B these are degenerate cases) + + is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE + (the x/i thing doesn't matter, nor does case) + +=cut + +sub is_dialect_of { + + my $lang1 = &encode_language_tag($_[0]); + my $lang2 = &encode_language_tag($_[1]); + + return undef if !defined($lang1) and !defined($lang2); + return 0 if !defined($lang1) or !defined($lang2); + + return 1 if $lang1 eq $lang2; + return 0 if length($lang1) < length($lang2); + + $lang1 .= '-'; + $lang2 .= '-'; + return + (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; +} + +########################################################################### + +=item * the function super_languages($lang1) + +Returns a list of language tags that are superordinate tags to $lang1 +-- it gets this by removing subtags from the end of $lang1 until +nothing (or just "i" or "x") is left. + + super_languages("fr-CA-joual") is ("fr-CA", "fr") + + super_languages("en-AU") is ("en") + + super_languages("en") is empty-list, () + + super_languages("i-cherokee") is empty-list, () + ...not ("i"), which would be illegal as well as pointless. + +If $lang1 is not a valid language tag, returns empty-list in +a list context, undef in a scalar context. + +A notable and rather unavoidable problem with this method: +"x-mingo-tom" has an "x" because the whole tag isn't an +IANA-registered tag -- but super_languages('x-mingo-tom') is +('x-mingo') -- which isn't really right, since 'i-mingo' is +registered. But this module has no way of knowing that. (But note +that same_language_tag('x-mingo', 'i-mingo') is TRUE.) + +More importantly, you assume I that superordinates of +$lang1 are mutually intelligible with $lang1. Consider this +carefully. + +=cut + +sub super_languages { + my $lang1 = $_[0]; + return() unless defined($lang1) && &is_language_tag($lang1); + my @l1_subtags = split('-', $lang1); + + ## Changes in the language tagging standards may have to be reflected here. + + # NB: (i-sil-...)? + + my @supers = (); + foreach my $bit (@l1_subtags) { + push @supers, + scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; + } + pop @supers if @supers; + shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; + return reverse @supers; +} + +########################################################################### + +=item * the function locale2language_tag($locale_identifier) + +This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") +and maps it to a language tag. If it's not mappable (as with, +notably, "C" and "POSIX"), this returns empty-list in a list context, +or undef in a scalar context. + + locale2language_tag("en") is "en" + + locale2language_tag("en_US") is "en-US" + + locale2language_tag("en_US.ISO8859-1") is "en-US" + + locale2language_tag("C") is undef or () + + locale2language_tag("POSIX") is undef or () + + locale2language_tag("POSIX") is undef or () + +I'm not totally sure that locale names map satisfactorily to language +tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. + +The output is untainted. If you don't know what tainting is, +don't worry about it. + +=cut + +sub locale2language_tag { + my $lang = + $_[0] =~ m/(.+)/ # to make for an untainted result + ? $1 : '' + ; + + return $lang if &is_language_tag($lang); # like "en" + + $lang =~ tr<_><->; # "en_US" -> en-US + $lang =~ s<\.[-_a-zA-Z0-9\.]*><>s; # "en_US.ISO8859-1" -> en-US + + return $lang if &is_language_tag($lang); + + return; +} + +########################################################################### + +=item * the function encode_language_tag($lang1) + +This function, if given a language tag, returns an encoding of it such +that: + +* tags representing different languages never get the same encoding. + +* tags representing the same language always get the same encoding. + +* an encoding of a formally valid language tag always is a string +value that is defined, has length, and is true if considered as a +boolean. + +Note that the encoding itself is B a formally valid language tag. +Note also that you cannot, currently, go from an encoding back to a +language tag that it's an encoding of. + +Note also that you B consider the encoded value as atomic; i.e., +you should not consider it as anything but an opaque, unanalysable +string value. (The internals of the encoding method may change in +future versions, as the language tagging standard changes over time.) + +C returns undef if given anything other than a +formally valid language tag. + +The reason C exists is because different language +tags may represent the same language; this is normally treatable with +C, but consider this situation: + +You have a data file that expresses greetings in different languages. +Its format is "[language tag]=[how to say 'Hello']", like: + + en-US=Hiho + fr=Bonjour + i-mingo=Hau' + +And suppose you write a program that reads that file and then runs as +a daemon, answering client requests that specify a language tag and +then expect the string that says how to greet in that language. So an +interaction looks like: + + greeting-client asks: fr + greeting-server answers: Bonjour + +So far so good. But suppose the way you're implementing this is: + + my %greetings; + die unless open(IN, ") { + chomp; + next unless /^([^=]+)=(.+)/s; + my($lang, $expr) = ($1, $2); + $greetings{$lang} = $expr; + } + close(IN); + +at which point %greetings has the contents: + + "en-US" => "Hiho" + "fr" => "Bonjour" + "i-mingo" => "Hau'" + +And suppose then that you answer client requests for language $wanted +by just looking up $greetings{$wanted}. + +If the client asks for "fr", that will look up successfully in +%greetings, to the value "Bonjour". And if the client asks for +"i-mingo", that will look up successfully in %greetings, to the value +"Hau'". + +But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the +lookup in %greetings fails. That's the Wrong Thing. + +You could instead do lookups on $wanted with: + + use I18N::LangTags qw(same_language_tag); + my $repsonse = ''; + foreach my $l2 (keys %greetings) { + if(same_language_tag($wanted, $l2)) { + $response = $greetings{$l2}; + last; + } + } + +But that's rather inefficient. A better way to do it is to start your +program with: + + use I18N::LangTags qw(encode_language_tag); + my %greetings; + die unless open(IN, ") { + chomp; + next unless /^([^=]+)=(.+)/s; + my($lang, $expr) = ($1, $2); + $greetings{ + encode_language_tag($lang) + } = $expr; + } + close(IN); + +and then just answer client requests for language $wanted by just +looking up + + $greetings{encode_language_tag($wanted)} + +And that does the Right Thing. + +=cut + +sub encode_language_tag { + # Only similarity_language_tag() is allowed to analyse encodings! + + ## Changes in the language tagging standards may have to be reflected here. + + my($tag) = uc($_[0]); # smash case + return undef unless &is_language_tag($tag); + # If it's not a language tag, its encoding is undef + + $tag =~ s/^[xiXI]-//s; + # Just lop off any leading "x/i-" + # Or I suppose I could do s/^[xiXI]-/_/s or something. + + return "~$tag"; +} + +#-------------------------------------------------------------------------- + +=item * the function alternate_language_tags($lang1) + +This function, if given a language tag, returns all language tags that +are alternate forms of this language tag. (There is little +alternation in the C language tagging formalism, but +extensions to the formalism are under consideration which could add a +great deal of alternation.) + +Examples from the current formalism: + + alternate_language_tags('en') is () + alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') + alternate_language_tags('x-klikitat') is ('i-klikitat') + alternate_language_tags('i-klikitat') is ('x-klikitat') + +This function returns undef if given anything other than a formally +valid language tag. + +=cut + +my %alt = qw( i x x i I X X I ); +sub alternate_language_tags { + ## Changes in the language tagging standards may have to be reflected here. + my $tag = $_[0]; + return() unless &is_language_tag($tag); + + # might as well preserve case + + if($tag =~ /^([XIxi])(-.+)/) { + # This handles all the alternation that exists CURRENTLY + return($alt{$1} . $2); + } + return(); +} + +########################################################################### + +=back + +=head1 ABOUT LOWERCASING + +I've considered making all the above functions that output language +tags return all those tags strictly in lowercase. Having all your +language tags in lowercase does make some things easier. But you +might as well just lowercase as you like, or call +C where appropriate. + +=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS + +In some future version of I18N::LangTags, I plan to include support +for RFC2482-style language tags -- which are basically just normal +language tags with their ASCII characters shifted into Plane 14. + +=head1 SEE ALSO + +* RFC 3066, C, "Tags for the +Identification of Languages". (Obsoletes RFC 1766) + +* RFC 2277, C, "IETF Policy on +Character Sets and Languages". + +* RFC 2231, C, "MIME Parameter +Value and Encoded Word Extensions: Character Sets, Languages, and +Continuations". + +* RFC 2482, C, +"Language Tagging in Unicode Plain Text". + +* Locale::Codes, in +C + +* ISO 639, "Code for the representation of names of languages", +C + +* ISO 639-2, "Codes for the representation of names of languages", +including three-letter codes, +C + +* The IANA list of registered languages (hopefully up-to-date), +C + +=head1 COPYRIGHT + +Copyright (c) 1998-2001 Sean M. Burke. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +The programs and documentation in this dist are distributed in +the hope that they will be useful, but without any warranty; without +even the implied warranty of merchantability or fitness for a +particular purpose. + +=head1 AUTHOR + +Sean M. Burke C + +=cut + +1; + +__END__ -- cgit v1.2.1