diff options
author | Jarkko Hietaniemi <jhi@iki.fi> | 2001-05-27 13:50:57 +0000 |
---|---|---|
committer | Jarkko Hietaniemi <jhi@iki.fi> | 2001-05-27 13:50:57 +0000 |
commit | 9378c5814a1c38be33358baa5cfd56712c3b71d4 (patch) | |
tree | 712d3a8e3142e76139998b3f83f430343ce1e173 /lib/Locale | |
parent | 4b053158ffba5bda82094dc0b0cd80c9d2867b97 (diff) | |
download | perl-9378c5814a1c38be33358baa5cfd56712c3b71d4.tar.gz |
Integrate Locale::Maketext 1.01 from Sean Burke.
p4raw-id: //depot/perl@10229
Diffstat (limited to 'lib/Locale')
-rw-r--r-- | lib/Locale/Maketext.pm | 646 | ||||
-rw-r--r-- | lib/Locale/Maketext.pod | 1302 | ||||
-rw-r--r-- | lib/Locale/Maketext/TPJ13.pod | 776 |
3 files changed, 2724 insertions, 0 deletions
# Origin (diff header of this hunk, kept for reference):
#   diff --git a/lib/Locale/Maketext.pm b/lib/Locale/Maketext.pm
#   new file mode 100644
#   index 0000000000..a39383fc30
#   --- /dev/null
#   +++ b/lib/Locale/Maketext.pm
#   @@ -0,0 +1,646 @@

# Time-stamp: "2001-05-25 07:49:06 MDT"

require 5;
package Locale::Maketext;
use strict;
use vars qw( @ISA $VERSION $MATCH_SUPERS $USING_LANGUAGE_TAGS
             $USE_LITERALS);
use Carp ();
use I18N::LangTags 0.21 ();

#--------------------------------------------------------------------------

BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } }
 # define the constant 'DEBUG' at compile-time

$VERSION = "1.01";
@ISA = ();

$MATCH_SUPERS = 1;
$USING_LANGUAGE_TAGS = 1;
 # Turning this off is somewhat of a security risk in that little or no
 # checking will be done on the legality of tokens passed to the
 # eval("require $module_name") in _try_use.  If you turn this off, you have
 # to do your own taint checking.

$USE_LITERALS = 1 unless defined $USE_LITERALS;
 # a hint for compiling bracket-notation things.

my %isa_scan = ();
 # class name => arrayref of %Lexicon hashrefs (memo for _lex_refs)

###########################################################################

# $handle->quant($num, $singular [, $plural [, $zero_form]])
# Returns "<formatted number> <noun form>", or the third form verbatim
# when one was given and $num is zero.  With no forms, returns $num as is.
sub quant {
  my($handle, $num, @forms) = @_;

  return $num if @forms == 0; # what should this mean?
  return $forms[2] if @forms > 2 and $num == 0; # special zeroth case

  # Normal case:
  # Note that the formatting of $num is preserved.
  return( $handle->numf($num) . ' ' . $handle->numerate($num, @forms) );
   # Most human languages put the number phrase before the qualified phrase.
}


# $handle->numerate($num, $singular [, $plural])
# Returns the noun form appropriate to $num: the singular when $num == 1,
# otherwise the given plural, or singular . 's' if no plural was given.
sub numerate {
  # return this lexical item in a form appropriate to this number
  my($handle, $num, @forms) = @_;
  my $s = ($num == 1);

  return '' unless @forms;
  if(@forms == 1) { # only the headword form specified
    return $s ? $forms[0] : ($forms[0] . 's'); # very cheap hack.
  } else { # sing and plural were specified
    return $s ? $forms[0] : $forms[1];
  }
}

#--------------------------------------------------------------------------

# $handle->numf($num)
# Returns $num as a string with thousands separators inserted.  If the
# handle is an object with a true 'numf_comma' entry, "." and "," are
# swapped afterwards.
sub numf {
  my($handle, $num) = @_[0,1];
  if($num < 10_000_000_000 and $num > -10_000_000_000 and $num == int($num)) {
    $num += 0;  # Just use normal integer stringification.
         # Specifically, don't let %G turn ten million into 1E+007
  } else {
    $num = CORE::sprintf("%G", $num);
     # "CORE::" is there to avoid confusion with the sub sprintf
     # that this package defines below.
  }
  while( $num =~ s/^([-+]?\d+)(\d{3})/$1,$2/s ) {1}  # right from perlfaq5
   # The initial \d+ gobbles as many digits as it can, and then we
   #  backtrack so it un-eats the rightmost three, and then we
   #  insert the comma there.

  $num =~ tr<.,><,.> if ref($handle) and $handle->{'numf_comma'};
   # This is just a lame hack instead of using Number::Format
  return $num;
}

# $handle->sprintf($format, @params)
# Thin wrapper around the builtin, so "[sprintf,...]" works in
# bracket notation.
sub sprintf {
  no integer;
  my($handle, $format, @params) = @_;
  return CORE::sprintf($format, @params);
    # "CORE::" is there to avoid confusion with myself!
}

#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#

use integer; # vroom vroom... applies to the whole rest of the module

# Returns the last component of the handle's class name, lowercased and
# with "_" turned into "-" (e.g. Foo::en_us => "en-us").
sub language_tag {
  my $it = ref($_[0]) || $_[0];
  return undef unless $it =~ m/([^':]+)(?:::)?$/s;
  $it = lc($1);
  $it =~ tr<_><->;
  return $it;
}

# Returns the handle's 'encoding' entry if it is an object with a true
# one, else "iso-8859-1".
sub encoding {
  my $it = $_[0];
  return(
   (ref($it) && $it->{'encoding'})
   || "iso-8859-1"   # Latin-1
  );
}

#--------------------------------------------------------------------------

# Language tags that get_handle appends to whatever it was given.
sub fallback_languages { return('i-default', 'en', 'en-US') }

# Extra names that get_handle appends to its list of candidates.
sub fallback_language_classes { return () }

#--------------------------------------------------------------------------

# $handle->fail_with()          -- returns the current failure handler
# $handle->fail_with($handler)  -- sets it (coderef or method name) first
# Returns nothing when called on a class rather than an object.
sub fail_with { # an actual attribute method!
  my($handle, @params) = @_;
  return unless ref($handle);
  $handle->{'fail'} = $params[0] if @params;
  return $handle->{'fail'};
}

#--------------------------------------------------------------------------

# Failure handler that treats the missing key itself as the phrase:
# compiles it (caching the result in $handle->{'failure_lex'}) and
# evaluates it with the remaining parameters.
sub failure_handler_auto {
  # Meant to be used like:
  #  $handle->fail_with('failure_handler_auto')

  # Shift these off so that @_ holds only the phrase parameters; the
  # compiled sub expects ($handle, @parameters), so [_1] is $_[1].
  my $handle = shift;
  my $phrase = shift;

  $handle->{'failure_lex'} ||= {};
  my $lex = $handle->{'failure_lex'};

  # Fetch the cached compilation, or compile and cache it.  Either way
  # $value ends up set, including on the second and later lookups.
  my $value = ($lex->{$phrase} ||= $handle->_compile($phrase));

  # Same dispatch as in sub maketext: _compile returns a SCALAR ref for
  # bracket-free phrases, and a CODE ref otherwise.
  return $$value if ref($value) eq 'SCALAR';
  return $value unless ref($value) eq 'CODE';

  {
    local $SIG{'__DIE__'};
    eval { $value = &$value($handle, @_) };
  }
  # If $@ is set, there was an exception thrown in the
  #  call to $value, and so scream:
  if($@) {
    my $err = $@;
    # pretty up the error message
    $err =~ s<\s+at\s+\(eval\s+\d+\)\s+line\s+(\d+)\.?\n?>
             <\n in bracket code [compiled line $1],>s;
    #$err =~ s/\n?$/\n/s;
    Carp::croak "Error in maketexting \"$phrase\":\n$err as used";
    # Rather unexpected, but suppose that the sub tried calling
    # a method that didn't exist.
  } else {
    return $value;
  }
}

#==========================================================================

# Class->new or $obj->new: returns a fresh handle (a blessed empty hash)
# after calling ->init on it.
sub new {
  # Nothing fancy!
  my $class = ref($_[0]) || $_[0];
  my $handle = bless {}, $class;
  $handle->init;
  return $handle;
}

sub init { return } # no-op

###########################################################################

# $handle->maketext($phrase, @params)
# Looks $phrase up in the lexicons of the handle's class and its
# superclasses, compiling string entries on first use, and returns the
# resulting text.  On lookup failure, defers to the handle's 'fail'
# handler if there is one, and croaks otherwise.
sub maketext {
  # Remember, this can fail.  Failure is controllable many ways.
  Carp::croak "maketext requires at least one parameter" unless @_ > 1;

  my($handle, $phrase) = splice(@_,0,2);

  # Look up the value:

  my $value;
  foreach my $h_r (
    @{  $isa_scan{ref($handle) || $handle} || $handle->_lex_refs  }
  ) {
    print "* Looking up \"$phrase\" in $h_r\n" if DEBUG;
    if(exists $h_r->{$phrase}) {
      print " Found \"$phrase\" in $h_r\n" if DEBUG;
      unless(ref($value = $h_r->{$phrase})) {
        # Nonref means it's not yet compiled.  Compile and replace.
        $value = $h_r->{$phrase} = $handle->_compile($value);
      }
      last;
    } elsif($phrase !~ m/^_/s and $h_r->{'_AUTO'}) {
      # it's an auto lex, and this is an autoable key!
      print " Automaking \"$phrase\" into $h_r\n" if DEBUG;

      $value = $h_r->{$phrase} = $handle->_compile($phrase);
      last;
    }
    print " Not found in $h_r, nor automakable\n" if DEBUG > 1;
    # else keep looking
  }

  unless(defined($value)) {
    print "! Lookup of \"$phrase\" in/under ", ref($handle) || $handle,
      " fails.\n" if DEBUG;
    if(ref($handle) and $handle->{'fail'}) {
      print "WARNING0: maketext fails looking for <$phrase>\n" if DEBUG;
      my $fail;
      if(ref($fail = $handle->{'fail'}) eq 'CODE') { # it's a sub reference
        return &{$fail}($handle, $phrase, @_);
         # If it ever returns, it should return a good value.
      } else { # It's a method name
        return $handle->$fail($phrase, @_);
         # If it ever returns, it should return a good value.
      }
    } else {
      # All we know how to do is this;
      Carp::croak("maketext doesn't know how to say:\n$phrase\nas needed");
    }
  }

  return $$value if ref($value) eq 'SCALAR';
  return $value unless ref($value) eq 'CODE';

  {
    local $SIG{'__DIE__'};
    eval { $value = &$value($handle, @_) };
  }
  # If $@ is set, there was an exception thrown in the
  #  call to $value, and so scream:
  if($@) {
    my $err = $@;
    # pretty up the error message
    $err =~ s<\s+at\s+\(eval\s+\d+\)\s+line\s+(\d+)\.?\n?>
             <\n in bracket code [compiled line $1],>s;
    #$err =~ s/\n?$/\n/s;
    Carp::croak "Error in maketexting \"$phrase\":\n$err as used";
    # Rather unexpected, but suppose that the sub tried calling
    # a method that didn't exist.
  } else {
    return $value;
  }
}

###########################################################################

# BaseClass->get_handle(@language_tags)
# Tries, in order, to load BaseClass::<tag> for each requested language
# (plus derived and fallback candidates) and returns ->new of the first
# class that loads.  With no arguments, the languages are taken from the
# environment.  Returns undef if nothing could be loaded.
sub get_handle {  # This is a constructor and, yes, it CAN FAIL.
  # Its class argument has to be the base class for the current
  # application's l10n files.
  my($base_class, @languages) = @_;
  $base_class = ref($base_class) || $base_class;
   # Complain if they use __PACKAGE__ as a project base class?

  unless(@languages) {  # Calling with no args is magical!  wooo, magic!
    if(length( $ENV{'REQUEST_METHOD'} || '' )) { # I'm a CGI
      my $in = $ENV{'HTTP_ACCEPT_LANGUAGE'} || '';
        # supposedly that works under mod_perl, too.
      $in =~ s<\([^\)]*\)><>g; # Kill parens'd things -- just a hack.
        # (The class must be negated: "(", then anything up to the
        #  first ")", then ")".)
      @languages = &I18N::LangTags::extract_language_tags($in) if length $in;
        # ...which untaints, incidentally.

    } else { # Not running as a CGI: try to puzzle out from the environment
      if(length( $ENV{'LANG'} || '' )) {
        push @languages, split m/[,:]/, $ENV{'LANG'};
         # LANG can be only /one/ locale as far as I know, but what the hey.
      }
      if(length( $ENV{'LANGUAGE'} || '' )) {
        push @languages, split m/[,:]/, $ENV{'LANGUAGE'};
      }
      print "Noting ENV LANG ", join(',', @languages),"\n" if DEBUG;
      # Those are really locale IDs, but they get xlated a few lines down.

      if(&_try_use('Win32::Locale')) {
        # If we have that module installed...
        push @languages, Win32::Locale::get_language()
         if defined &Win32::Locale::get_language;
      }
    }
  }

  #------------------------------------------------------------------------
  print "Lgs1: ", map("<$_>", @languages), "\n" if DEBUG;

  if($USING_LANGUAGE_TAGS) {
    @languages = map &I18N::LangTags::locale2language_tag($_), @languages;
     # if it's a lg tag, fine, pass thru (untainted)
     # if it's a locale ID, try converting to a lg tag (untainted),
     # otherwise nix it.

    push @languages, map &I18N::LangTags::super_languages($_), @languages
     if $MATCH_SUPERS;

    @languages =  map { $_, &I18N::LangTags::alternate_language_tags($_) }
                      @languages;    # catch alternation

    push @languages, $base_class->fallback_languages;
     # You are free to override fallback_languages to return empty-list!

    @languages =  # final bit of processing:
      map {
        my $it = $_;  # copy
        $it =~ tr<-A-Z><_a-z>; # lc, and turn - to _
        $it =~ tr<_a-z0-9><>cd;  # remove all but a-z0-9_
        $it;
      } @languages
    ;
  }
  print "Lgs2: ", map("<$_>", @languages), "\n" if DEBUG > 1;

  push @languages, $base_class->fallback_language_classes;
   # You are free to override that to return whatever.


  my %seen = ();
  foreach my $module_name ( map { $base_class . "::" . $_ }  @languages )
  {
    next unless length $module_name; # sanity
    next if $seen{$module_name}++        # Already been here, and it was no-go
            || !&_try_use($module_name); # Try to use() it, but can't it.
    return($module_name->new); # Make it!
  }

  return undef; # Fail!
}

###########################################################################
#
# This is where most people should stop reading.
#
###########################################################################

# Class->_compile($phrase)
# Compiles a bracket-notation phrase.  Returns a SCALAR ref to the text
# when the phrase contains no bracket groups that call anything, else a
# CODE ref to be called as $code->($handle, @params).  Dies (via
# _die_pointing) on malformed bracket notation.
sub _compile {
  # This big scarp routine compiles an entry.
  # It returns either a coderef if there's brackety bits in this, or
  #  otherwise a ref to a scalar.

  my $target = ref($_[0]) || $_[0];

  my(@code);
  my(@c) = (''); # "chunks" -- scratch.
  my $call_count = 0;
  my $big_pile = '';
  {
    my $in_group = 0; # start out outside a group
    my($m, @params); # scratch

    while($_[1] =~  # Iterate over chunks.
     m<\G(
       [^\~\[\]]+  # non-~[] stuff
       |
       ~.       # ~[, ~], ~~, ~other
       |
       \x5B     # [
       |
       \x5D     # ]
       |
       ~        # terminal ~?
       |
       $
     )>xgs
    ) {
      print " \"$1\"\n" if DEBUG > 2;

      if($1 eq '[' or $1 eq '') {       # "[" or end
        # Whether this is "[" or end, force processing of any
        #  preceding literal.
        if($in_group) {
          if($1 eq '') {
            $target->_die_pointing($_[1], "Unterminated bracket group");
          } else {
            $target->_die_pointing($_[1], "You can't nest bracket groups");
          }
        } else {
          if($1 eq '') {
            print " [end-string]\n" if DEBUG > 2;
          } else {
            $in_group = 1;
          }
          die "How come \@c is empty?? in <$_[1]>" unless @c; # sanity
          if(length $c[-1]) {
            # Now actually processing the preceding literal
            $big_pile .= $c[-1];
            if($USE_LITERALS and $c[-1] !~ m<[^\x20-\x7E]>s) {
              # normal case -- all very safe chars
              # Escape both backslash and single-quote (no capture group,
              # so $1 from the chunk match is left alone).  Escaping only
              # the quote lets a trailing "\" swallow the closing quote
              # of the generated literal.
              $c[-1] =~ s/(?=[\\'])/\\/g;
              push @code, q{ '} . $c[-1] . "',\n";
              $c[-1] = ''; # reuse this slot
            } else {
              push @code, ' $c[' . $#c . "],\n";
              push @c, ''; # new chunk
            }
          }
           # else just ignore the empty string.
        }

      } elsif($1 eq ']') {  # "]"
        # close group -- go back in-band
        if($in_group) {
          $in_group = 0;

          print " --Closing group [$c[-1]]\n" if DEBUG > 2;

          # And now process the group...

          if(!length($c[-1]) or $c[-1] =~ m/^\s+$/s) {
            DEBUG > 2 and print " -- (Ignoring)\n";
            $c[-1] = ''; # reset out chink
            next;
          }

           #$c[-1] =~ s/^\s+//s;
           #$c[-1] =~ s/\s+$//s;
          ($m,@params) = split(",", $c[-1], -1);  # was /\s*,\s*/

          foreach($m, @params) { tr/\x7F/,/ }
           # A bit of a hack -- we've turned "~,"'s into \x7F's, so turn
           #  'em into real commas here.

          if($m eq '_*' or $m =~ m<^_(-?\d+)$>s) {
            # Treat [_1,...] as [,_1,...], etc.
            unshift @params, $m;
            $m = '';
          }

          # Most common case: a simple, legal-looking method name
          if($m eq '') {
            # 0-length method name means to just interpolate:
            push @code, ' (';
          } elsif($m =~ m<^\w+(?:\:\:\w+)*$>s
            and $m !~ m<(?:^|\:)\d>s
             # exclude starting a (sub)package or symbol with a digit
          ) {
            # Yes, it even supports the demented (and undocumented?)
            #  $obj->Foo::bar(...) syntax.
            # NOTE(review): that means a lexicon entry can name any
            #  loaded Package::sub; treat lexicon text as trusted code.
            $target->_die_pointing(
              $_[1], "Can't (yet?) use \"SUPER::\" in a bracket-group method",
              2 + length($c[-1])
            )
             if $m =~ m/^SUPER::/s;
              # Because for SUPER:: to work, we'd have to compile this into
              #  the right package, and that seems just not worth the bother,
              #  unless someone convinces me otherwise.

            push @code, ' $_[0]->' . $m . '(';
          } else {
            # TODO: implement something?  or just too icky to consider?
            $target->_die_pointing(
             $_[1],
             "Can't use \"$m\" as a method name in bracket group",
             2 + length($c[-1])
            );
          }

          pop @c; # we don't need that chunk anymore
          ++$call_count;

          foreach my $p (@params) {
            if($p eq '_*') {
              # Meaning: all parameters except $_[0]
              $code[-1] .= ' @_[1 .. $#_], ';
               # and yes, that does the right thing for all @_ < 3
            } elsif($p =~ m<^_(-?\d+)$>s) {
              # _3 meaning $_[3]
              $code[-1] .= '$_[' . (0 + $1) . '], ';
            } elsif($USE_LITERALS and $p !~ m<[^\x20-\x7E]>s) {
              # Normal case: a literal containing only safe characters
              # (backslash and quote both escaped, as for chunks above)
              $p =~ s/(?=[\\'])/\\/g;
              $code[-1] .= q{'} . $p . q{', };
            } else {
              # Stow it on the chunk-stack, and just refer to that.
              push @c, $p;
              push @code, ' $c[' . $#c . "], ";
            }
          }
          $code[-1] .= "),\n";

          push @c, '';
        } else {
          $target->_die_pointing($_[1], "Unbalanced ']'");
        }

      } elsif(substr($1,0,1) ne '~') {
        # it's stuff not containing "~" or "[" or "]"
        # i.e., a literal blob
        $c[-1] .= $1;

      } elsif($1 eq '~~') { # "~~"
        $c[-1] .= '~';

      } elsif($1 eq '~[') { # "~["
        $c[-1] .= '[';

      } elsif($1 eq '~]') { # "~]"
        $c[-1] .= ']';

      } elsif($1 eq '~,') { # "~,"
        if($in_group) {
          $c[-1] .= "\x7F";
           # This is a hack, based on the assumption that no-one will actually
           # want a \x7f inside a bracket group.  Let's hope that's it's true.
        } else {
          $c[-1] .= '~,';
        }

      } elsif($1 eq '~') { # possible only at string-end, it seems.
        $c[-1] .= '~';

      } else {
        # It's a "~X" where X is not a special character.
        # Consider it a literal ~ and X.
        $c[-1] .= $1;
      }
    }
  }

  if($call_count) {
    undef $big_pile; # Well, nevermind that.
  } else {
    # It's all literals!  Ahwell, that can happen.
    # So don't bother with the eval.  Return a SCALAR reference.
    return \$big_pile;
  }

  die "Last chunk isn't null??" if @c and length $c[-1]; # sanity
  print scalar(@c), " chunks under closure\n" if DEBUG;
  if(@code == 0) { # not possible?
    print "Empty code\n" if DEBUG;
    return \'';
  } elsif(@code > 1) { # most cases, presumably!
    unshift @code, "join '',\n";
  }
  unshift @code, "use strict; sub {\n";
  push @code, "}\n";

  print @code if DEBUG;
  # The generated source closes over @c (the chunk stack) above.
  my $sub = eval(join '', @code);
  die "$@ while evalling" . join('', @code) if $@; # Should be impossible.
  return $sub;
}

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Class->_die_pointing($text, $message [, $back_off])
# Croaks with $message plus $text and, where practical, a marker line
# pointing near pos($text) minus $back_off.  $text is read through @_
# (not copied) so that its pos() is still available.
sub _die_pointing {
  # This is used by _compile to throw a fatal error
  my $target = shift; # class name
  # ...leaving $_[0] the error-causing text, and $_[1] the error message

  my $i = index($_[0], "\n");

  my $pointy;
  my $pos = pos($_[0]) - (defined($_[2]) ? $_[2] : 0) - 1;
  if($pos < 1) {
    $pointy = "^=== near there\n";
  } else { # we need to space over
    my $first_tab = index($_[0], "\t");
    if($pos > 2 and ( -1 == $first_tab  or  $first_tab > pos($_[0]))) {
      # No tabs, or the first tab is harmlessly after where we will point to,
      # AND we're far enough from the margin that we can draw a proper arrow.
      $pointy = ('=' x $pos) . "^ near there\n";
    } else {
      # tabs screw everything up!
      $pointy = substr($_[0],0,$pos);
      $pointy =~ tr/\t //cd;
       # make everything into whitespace, but preserving tabs
      $pointy .= "^=== near there\n";
    }
  }

  my $errmsg = "$_[1], in\:\n$_[0]";

  if($i == -1) {
    # No newline.
    $errmsg .= "\n" . $pointy;
  } elsif($i == (length($_[0]) - 1)  ) {
    # Already has a newline at end.
    $errmsg .= $pointy;
  } else {
    # don't bother with the pointy bit, I guess.
  }
  Carp::croak( "$errmsg via $target, as used" );
}

###########################################################################

my %tried = ();
  # memoization of whether we've used this module, or found it unusable.

# _try_use($module_name)
# Returns 1 if the named package already has a %Lexicon or @ISA, or can
# be loaded with require; 0 otherwise.  Results are memoized in %tried.
sub _try_use {   # Basically a wrapper around "require Modulename"
  # "Many men have tried..."  "They tried and failed?"  "They tried and died."
  return $tried{$_[0]} if exists $tried{$_[0]};  # memoization

  my $module = $_[0];   # ASSUME sane module name!
  { no strict 'refs';
    # Plain truth tests here: defined(%hash) / defined(@array) no longer
    # compile on current perls.
    return($tried{$module} = 1)
     if %{$module . "::Lexicon"} or @{$module . "::ISA"};
    # weird case: we never use'd it, but there it is!
  }

  print " About to use $module ...\n" if DEBUG;
  {
    local $SIG{'__DIE__'};
    # SECURITY: string eval on a caller-supplied name.  get_handle only
    # passes names built from sanitised language tags while
    # $USING_LANGUAGE_TAGS is on; any other caller must validate $module.
    eval "require $module"; # used to be "use $module", but no point in that.
  }
  if($@) {
    print "Error using $module \: $@\n" if DEBUG > 1;
    return $tried{$module} = 0;
  } else {
    print " OK, $module is used\n" if DEBUG;
    return $tried{$module} = 1;
  }
}

#--------------------------------------------------------------------------

# $handle_or_class->_lex_refs([$seen_hashref])
# Returns an arrayref of the %Lexicon hashes of the class and, searched
# depth-first through @ISA, of its superclasses.  Memoized per class in
# %isa_scan.
sub _lex_refs {  # report the lexicon references for this handle's class
  # returns an arrayREF!
  no strict 'refs';
  my $class = ref($_[0]) || $_[0];
  print "Lex refs lookup on $class\n" if DEBUG > 1;
  return $isa_scan{$class} if exists $isa_scan{$class};  # memoization!

  my @lex_refs;
  my $seen_r = ref($_[1]) ? $_[1] : {};

  if( defined( *{$class . '::Lexicon'}{'HASH'} )) {
    push @lex_refs, *{$class . '::Lexicon'}{'HASH'};
    print "%" . $class . "::Lexicon contains ",
         scalar(keys %{$class . '::Lexicon'}), " entries\n" if DEBUG;
  }

  # Implements depth(height?)-first recursive searching of superclasses
  foreach my $superclass (@{$class . "::ISA"}) {
    print " Super-class search into $superclass\n" if DEBUG;
    next if $seen_r->{$superclass}++;
    push @lex_refs, @{&_lex_refs($superclass, $seen_r)};  # call myself
  }

  $isa_scan{$class} = \@lex_refs; # save for next time
  return \@lex_refs;
}

# Empties the _lex_refs memo.
sub clear_isa_scan { %isa_scan = (); return; } # end on a note of simplicity!

1;  # true value, so this block can be loaded on its own
+ +########################################################################### +1; + + diff --git a/lib/Locale/Maketext.pod b/lib/Locale/Maketext.pod new file mode 100644 index 0000000000..b28a9d83c8 --- /dev/null +++ b/lib/Locale/Maketext.pod @@ -0,0 +1,1302 @@ + +# Time-stamp: "2001-05-25 07:50:08 MDT" + +=head1 NAME + +Locale::Maketext -- framework for localization + +=head1 SYNOPSIS + + package MyProgram; + use strict; + use MyProgram::L10N; + # ...which inherits from Locale::Maketext + my $lh = MyProgram::L10N->get_handle() || die "What language?"; + ... + # And then any messages your program emits, like: + warn $lh->maketext( "Can't open file [_1]: [_2]\n", $f, $! ); + ... + +=head1 DESCRIPTION + +It is a common feature of applications (whether run directly, +or via the Web) for them to be "localized" -- i.e., for them +to present an English interface to an English-speaker, a German +interface to a German-speaker, and so on for all languages it's +programmed with. Locale::Maketext +is a framework for software localization; it provides you with the +tools for organizing and accessing the bits of text and text-processing +code that you need for producing localized applications. + +In order to make sense of Maketext and how all its +components fit together, you should probably +go read L<Locale::Maketext::TPJ13|Locale::Maketext::TPJ13>, and +I<then> read the following documentation. + +You may also want to read over the source for C<File::Findgrep> +and its constituent modules -- they are a complete (if small) +example application that uses Maketext. + +=head1 QUICK OVERVIEW + +The basic design of Locale::Maketext is object-oriented, and +Locale::Maketext is an abstract base class, from which you +derive a "project class". 
+The project class (with a name like "TkBocciBall::Localize", +which you then use in your module) is in turn the base class +for all the "language classes" for your project +(with names "TkBocciBall::Localize::it", +"TkBocciBall::Localize::en", +"TkBocciBall::Localize::fr", etc.). + +A language class is +a class containing a lexicon of phrases as class data, +and possibly also some methods that are of use in interpreting +phrases in the lexicon, or otherwise dealing with text in that +language. + +An object belonging to a language class is called a "language +handle"; it's typically a flyweight object. + +The normal course of action is to call: + + use TkBocciBall::Localize; # the localization project class + $lh = TkBocciBall::Localize->get_handle(); + # Depending on the user's locale, etc., this will + # make a language handle from among the classes available, + # and any defaults that you declare. + die "Couldn't make a language handle??" unless $lh; + +From then on, you use the C<maketext> function to access +entries in whatever lexicon(s) belong to the language handle +you got. So, this: + + print $lh->maketext("You won!"), "\n"; + +...emits the right text for this language. If the object +in C<$lh> belongs to class "TkBocciBall::Localize::fr" and +%TkBocciBall::Localize::fr::Lexicon contains C<("You won!" +=E<gt> "Tu as gagnE<eacute>!")>, then the above +code happily tells the user "Tu as gagnE<eacute>!". + +=head1 METHODS + +Locale::Maketext offers a variety of methods, which fall +into three categories: + +=over + +=item * + +Methods to do with constructing language handles. + +=item * + +C<maketext> and other methods to do with accessing %Lexicon data +for a given language handle. + +=item * + +Methods that you may find it handy to use, from routines of +yours that you put in %Lexicon entries. + +=back + +These are covered in the following section. 
+ +=head2 Construction Methods + +These are to do with constructing a language handle: + +=over + +=item $lh = YourProjClass->get_handle( ...langtags... ) || die "lg-handle?"; + +This tries loading classes based on the language-tags you give (like +C<("en-US", "sk", "kon", "es-MX", "ja", "i-klingon")>), and for the first class +that succeeds, returns YourProjClass::I<language>->new(). + +If it runs thru the entire given list of language-tags, and finds no classes +for those exact terms, it then tries "superordinate" language classes. +So if no "en-US" class (i.e., YourProjClass::en_us) +was found, nor classes for anything else in that list, we then try +its superordinate, "en" (i.e., YourProjClass::en), and so on thru +the other language-tags in the given list: "es". +(The other language-tags in our example list +happen to have no superordinates.) + +If none of those language-tags leads to loadable classes, we then +try classes derived from YourProjClass->fallback_languages() and +then if nothing comes of that, we use classes named by +YourProjClass->fallback_language_classes(). Then in the (probably +quite unlikely) event that that fails, we just return undef. + +=item $lh = YourProjClass->get_handleB<()> || die "lg-handle?"; + +When C<get_handle> is called with an empty parameter list, magic happens: + +If C<get_handle> senses that it's running in a program that was +invoked as a CGI, then it tries to get language-tags out of the +environment variable "HTTP_ACCEPT_LANGUAGE", and it pretends that +those were the languages passed as parameters to C<get_handle>. + +Otherwise (i.e., if not a CGI), this tries various OS-specific ways +to get the language-tags for the current locale/language, and then +pretends that those were the value(s) passed to C<get_handle>. 
+ +Currently this OS-specific stuff consists of looking in the environment +variables "LANG" and "LANGUAGE"; and on MSWin machines (where those +variables are typically unused), this also tries using +the module Win32::Locale to get a language-tag for whatever language/locale +is currently selected in the "Regional Settings" (or "International"?) +Control Panel. I welcome further +suggestions for making this do the Right Thing under other operating +systems that support localization. + +If you're using localization in an application that keeps a configuration +file, you might consider something like this in your project class: + + sub get_handle_via_config { + my $class = $_[0]; + my $preferred_language = $Config_settings{'language'}; + my $lh; + if($preferred_language) { + $lh = $class->get_handle($preferred_language) + || die "No language handle for \"$preferred_language\" or the like"; + } else { + # Config file missing, maybe? + $lh = $class->get_handle() + || die "Can't get a language handle"; + } + return $lh; + } + +=item $lh = YourProjClass::langname->new(); + +This constructs a language handle. You usually B<don't> call this +directly, but instead let C<get_handle> find a language class to C<use> +and to then call ->new on. + +=item $lh->init(); + +This is called by ->new to initialize newly-constructed language handles. +If you define an init method in your class, remember that it's usually +considered a good idea to call $lh->SUPER::init in it (presumably at the +beginning), so that all classes get a chance to initialize a new object +however they see fit. + +=item YourProjClass->fallback_languages() + +C<get_handle> appends the return value of this to the end of +whatever list of languages you pass C<get_handle>. Unless +you override this method, your project class +will inherit Locale::Maketext's C<fallback_languages>, which +currently returns C<('i-default', 'en', 'en-US')>. +("i-default" is defined in RFC 2277). 
+ +This method (by having it return the name +of a language-tag that has an existing language class) +can be used for making sure that +C<get_handle> will always manage to construct a language +handle (assuming your language classes are in an appropriate +@INC directory). Or you can use the next method: + +=item YourProjClass->fallback_language_classes() + +C<get_handle> appends the return value of this to the end +of the list of classes it will try using. Unless +you override this method, your project class +will inherit Locale::Maketext's C<fallback_language_classes>, +which currently returns an empty list, C<()>. +By setting this to some value (namely, the name of a loadable +language class), you can be sure that +C<get_handle> will always manage to construct a language +handle. + +=back + +=head2 The "maketext" Method + +This is the most important method in Locale::Maketext: + +$text = $lh->maketext(I<key>, ...parameters for this phrase...); + +This looks in the %Lexicon of the language handle +$lh and all its superclasses, looking +for an entry whose key is the string I<key>. Assuming such +an entry is found, various things then happen, depending on the +value found: + +If the value is a scalarref, the scalar is dereferenced and returned +(and any parameters are ignored). +If the value is a coderef, we return &$value($lh, ...parameters...). +If the value is a string that I<doesn't> look like it's in Bracket Notation, +we return it (after replacing it with a scalarref, in its %Lexicon). +If the value I<does> look like it's in Bracket Notation, then we compile +it into a sub, replace the string in the %Lexicon with the new coderef, +and then we return &$new_sub($lh, ...parameters...). + +Bracket Notation is discussed in a later section. Note +that trying to compile a string into Bracket Notation can throw +an exception if the string is not syntactically valid (say, by not +balancing brackets right.) + +Also, calling &$coderef($lh, ...parameters...) 
can throw any sort of +exception (if, say, code in that sub tries to divide by zero). But +a very common exception occurs when you have Bracket +Notation text that says to call a method "foo", but there is no such +method. (E.g., "You have [quaB<tn>,_1,ball]." will throw an exception +on trying to call $lh->quaB<tn>($_[1],'ball') -- you presumably meant +"quant".) C<maketext> catches these exceptions, but only to make the +error message more readable, at which point it rethrows the exception. + +An exception I<may> be thrown if I<key> is not found in any +of $lh's %Lexicon hashes. What happens if a key is not found, +is discussed in a later section, "Controlling Lookup Failure". + +Note that you might find it useful in some cases to override +the C<maketext> method with an "after method", if you want to +translate encodings, or even scripts: + + package YrProj::zh_cn; # Chinese with PRC-style glyphs + use base ('YrProj::zh_tw'); # Taiwan-style + sub maketext { + my $self = shift(@_); + my $value = $self->SUPER::maketext(@_); + return Chineeze::taiwan2mainland($value); + } + +Or you may want to override it with something that traps +any exceptions, if that's critical to your program: + + sub maketext { + my($lh, @stuff) = @_; + my $out; + eval { $out = $lh->SUPER::maketext(@stuff) }; + return $out unless $@; + ...otherwise deal with the exception... + } + +Other than those two situations, I don't imagine that +it's useful to override the C<maketext> method. (If +you run into a situation where it is useful, I'd be +interested in hearing about it.) + +=over + +=item $lh->fail_with I<or> $lh->fail_with(I<PARAM>) + +=item $lh->failure_handler_auto + +These two methods are discussed in the section "Controlling +Lookup Failure". + +=back + +=head2 Utility Methods + +These are methods that you may find it handy to use, generally +from %Lexicon routines of yours (whether expressed as +Bracket Notation or not). 
+ +=over + +=item $language->quant($number, $singular) + +=item $language->quant($number, $singular, $plural) + +=item $language->quant($number, $singular, $plural, $negative) + +This is generally meant to be called from inside Bracket Notation +(which is discussed later), as in + + "Your search matched [quant,_1,document]!" + +It's for I<quantifying> a noun (i.e., saying how much of it there is, +while giving the correct form of it). The behavior of this method is +handy for English and a few other Western European languages, and you +should override it for languages where it's not suitable. You can feel +free to read the source, but the current implementation is basically +as this pseudocode describes: + + if $number is 0 and there's a $negative, + return $negative; + elsif $number is 1, + return "1 $singular"; + elsif there's a $plural, + return "$number $plural"; + else + return "$number " . $singular . "s"; + # + # ...except that we actually call numf to + # stringify $number before returning it. + +So for English (with Bracket Notation) +C<"...[quant,_1,file]..."> is fine (for 0 it returns "0 files", +for 1 it returns "1 file", and for more it returns "2 files", etc.) + +But for "directory", you'd want C<"[quant,_1,directory,directories]"> +so that our elementary C<quant> method doesn't think that the +plural of "directory" is "directorys". And you might find that the +output may sound better if you specify a negative form, as in: + + "[quant,_1,file,files,No files] matched your query.\n" + +Remember to keep in mind verb agreement (or adjectives too, in +other languages), as in: + + "[quant,_1,document] were matched.\n" + +Because if _1 is one, you get "1 document B<were> matched". +An acceptable hack here is to do something like this: + + "[quant,_1,document was, documents were] matched.\n" + +=item $language->numf($number) + +This returns the given number formatted nicely according to +this language's conventions. 
Maketext's default method is +mostly to just take the normal string form of the number +(applying sprintf "%G" for only very large numbers), and then +to add commas as necessary. (Except that +we apply C<tr/,./.,/> if $language->{'numf_comma'} is true; +that's a bit of a hack that's useful for languages that express +two million as "2.000.000" and not as "2,000,000"). + +If you want anything fancier, consider overriding this with something +that uses L<Number::Format|Number::Format>, or does something else +entirely. + +Note that numf is called by quant for stringifying all quantifying +numbers. + +=item $language->sprintf($format, @items) + +This is just a wrapper around Perl's normal C<sprintf> function. +It's provided so that you can use "sprintf" in Bracket Notation: + + "Couldn't access datanode [sprintf,%10x=~[%s~],_1,_2]!\n" + +returning... + + Couldn't access datanode Stuff=[thangamabob]! + +=item $language->language_tag() + +Currently this just takes the last bit of C<ref($language)>, turns +underscores to dashes, and returns it. So if $language is +an object of class Hee::HOO::Haw::en_us, $language->language_tag() +returns "en-us". (Yes, the usual representation for that language +tag is "en-US", but case is I<never> considered meaningful in +language-tag comparison.) + +You may override this as you like; Maketext doesn't use it for +anything. + +=item $language->encoding() + +Currently this isn't used for anything, but it's provided +(with default value of +C<(ref($language) && $language-E<gt>{'encoding'})) or "iso-8859-1"> +) as a sort of suggestion that it may be useful/necessary to +associate encodings with your language handles (whether on a +per-class or even per-handle basis.) + +=back + +=head2 Language Handle Attributes and Internals + +A language handle is a flyweight object -- i.e., it doesn't (necessarily) +carry any data of interest, other than just being a member of +whatever class it belongs to. 
+ +A language handle is implemented as a blessed hash. Subclasses of yours +can store whatever data you want in the hash. Currently the only hash +entry used by any crucial Maketext method is "fail", so feel free to +use anything else as you like. + +B<Remember: Don't be afraid to read the Maketext source if there's +any point on which this documentation is unclear.> This documentation +is vastly longer than the module source itself. + +=over + +=back + +=head1 LANGUAGE CLASS HIERARCHIES + +These are Locale::Maketext's assumptions about the class +hierarchy formed by all your language classes: + +=over + +=item * + +You must have a project base class, which you load, and +which you then use as the first argument in +the call to YourProjClass->get_handle(...). It should derive +(whether directly or indirectly) from Locale::Maketext. +It B<doesn't matter> how you name this class, altho assuming this +is the localization component of your Super Mega Program, +good names for your project class might be +SuperMegaProgram::Localization, SuperMegaProgram::L10N, +SuperMegaProgram::I18N, SuperMegaProgram::International, +or even SuperMegaProgram::Languages or SuperMegaProgram::Messages. + +=item * + +Language classes are what YourProjClass->get_handle will try to load. +It will look for them by taking each language-tag (B<skipping> it +if it doesn't look like a language-tag or locale-tag!), turning it to +all lowercase, turning any dashes to underscores, and appending it +to YourProjClass . "::". So this: + + $lh = YourProjClass->get_handle( + 'en-US', 'fr', 'kon', 'i-klingon', 'i-klingon-romanized' + ); + +will try loading the classes +YourProjClass::en_us (note lowercase!), YourProjClass::fr, +YourProjClass::kon, +YourProjClass::i_klingon +and YourProjClass::i_klingon_romanized. (And it'll stop at the +first one that actually loads.)
+ +=item * + +I assume that each language class derives (directly or indirectly) +from your project class, and also defines its @ISA, its %Lexicon, +or both. But I anticipate no dire consequences if these assumptions +do not hold. + +=item * + +Language classes may derive from other language classes (altho they +should have "use I<Thatclassname>" or "use base qw(I<...classes...>)"). +They may derive from the project +class. They may derive from some other class altogether. Or via +multiple inheritance, they may derive from any mixture of these. + +=item * + +I foresee no problems with having multiple inheritance in +your hierarchy of language classes. (As usual, however, Perl will +complain bitterly if you have a cycle in the hierarchy: i.e., if +any class is its own ancestor.) + +=back + +=head1 ENTRIES IN EACH LEXICON + +A typical %Lexicon entry is meant to signify a phrase, +taking some number (0 or more) of parameters. An entry +is meant to be accessed via +a string I<key> in $lh->maketext(I<key>, ...parameters...), +which should return a string that is generally meant to +be used for "output" to the user -- regardless of whether +this actually means printing to STDOUT, writing to a file, +or putting into a GUI widget. + +While the key must be a string value (since that's a basic +restriction that Perl places on hash keys), the value in +the lexicon can currently be of several types: +a defined scalar, scalarref, or coderef. The use of these is +explained above, in the section 'The "maketext" Method', and +Bracket Notation for strings is discussed in the next section. + +While you can use arbitrary unique IDs for lexicon keys +(like "_min_larger_max_error"), it is often +useful if an entry's key is itself a valid value, like +this example error message: + + "Minimum ([_1]) is larger than maximum ([_2])!\n", + +Compare this code that uses an arbitrary ID...
+ + die $lh->maketext( "_min_larger_max_error", $min, $max ) + if $min > $max; + +...to this code that uses a key-as-value: + + die $lh->maketext( + "Minimum ([_1]) is larger than maximum ([_2])!\n", + $min, $max + ) if $min > $max; + +The second is, in short, more readable. In particular, it's obvious +that the number of parameters you're feeding to that phrase (two) is +the number of parameters that it I<wants> to be fed. (Since you see +_1 and a _2 being used in the key there.) + +Also, once a project is otherwise +complete and you start to localize it, you can scrape together +all the various keys you use, and pass it to a translator; and then +the translator's work will go faster if what he's presented is this: + + "Minimum ([_1]) is larger than maximum ([_2])!\n", + => "", # fill in something here, Jacques! + +rather than this more cryptic mess: + + "_min_larger_max_error" + => "", # fill in something here, Jacques + +I think that keys as lexicon values makes the completed lexicon +entries more readable: + + "Minimum ([_1]) is larger than maximum ([_2])!\n", + => "Le minimum ([_1]) est plus grand que le maximum ([_2])!\n", + +Also, having valid values as keys becomes very useful if you set +up an _AUTO lexicon. _AUTO lexicons are discussed in a later +section. + +I almost always use keys that are themselves +valid lexicon values. One notable exception is when the value is +quite long. For example, to get the screenful of data that +a command-line program might return when given an unknown switch, +I often just use a key "_USAGE_MESSAGE". At that point I then go +and immediately define that lexicon entry in the +ProjectClass::L10N::en lexicon (since English is always my "project +language"): + + '_USAGE_MESSAGE' => <<'EOSTUFF', + ...long long message...
+ EOSTUFF + +and then I can use it as: + + getopt('oDI', \%opts) or die $lh->maketext('_USAGE_MESSAGE'); + +Incidentally, +note that each class's C<%Lexicon> inherits-and-extends +the lexicons in its superclasses. This is not because these are +special hashes I<per se>, but because you access them via the +C<maketext> method, which looks for entries across all the +C<%Lexicon>'s in a language class I<and> all its ancestor classes. +(This is because the idea of "class data" isn't directly implemented +in Perl, but is instead left to individual class-systems to implement +as they see fit..) + +Note that you may have things stored in a lexicon +besides just phrases for output: for example, if your program +takes input from the keyboard, asking a "(Y/N)" question, +you probably need to know what equivalent of "Y[es]/N[o]" is +in whatever language. You probably also need to know what +the equivalents of the answers "y" and "n" are. You can +store that information in the lexicon (say, under the keys +"~answer_y" and "~answer_n", and the long forms as +"~answer_yes" and "~answer_no", where "~" is just an ad-hoc +character meant to indicate to programmers/translators that +these are not phrases for output). + +Or instead of storing this in the language class's lexicon, +you can (and, in some cases, really should) represent the same bit +of knowledge as code is a method in the language class. (That +leaves a tidy distinction between the lexicon as the things we +know how to I<say>, and the rest of the things in the lexicon class +as things that we know how to I<do>.) 
Consider +this example of a processor for responses to French "oui/non" +questions: + + sub y_or_n { + return undef unless defined $_[1] and length $_[1]; + my $answer = lc $_[1]; # smash case + return 1 if $answer eq 'o' or $answer eq 'oui'; + return 0 if $answer eq 'n' or $answer eq 'non'; + return undef; + } + +...which you'd then call in a construct like this: + + my $response; + until(defined $response) { + print $lh->maketext("Open the pod bay door (y/n)? "); + $response = $lh->y_or_n( get_input_from_keyboard_somehow() ); + } + if($response) { $pod_bay_door->open() } + else { $pod_bay_door->leave_closed() } + +Other data worth storing in a lexicon might be things like +filenames for language-targeted resources: + + ... + "_main_splash_png" + => "/styles/en_us/main_splash.png", + "_main_splash_imagemap" + => "/styles/en_us/main_splash.incl", + "_general_graphics_path" + => "/styles/en_us/", + "_alert_sound" + => "/styles/en_us/hey_there.wav", + "_forward_icon" + => "left_arrow.png", + "_backward_icon" + => "right_arrow.png", + # In some other languages, left equals + # BACKwards, and right is FOREwards. + ... + +You might want to do the same thing for expressing key bindings +or the like (since hardwiring "q" as the binding for the function +that quits a screen/menu/program is useful only if your language +happens to associate "q" with "quit"!) + +=head1 BRACKET NOTATION + +Bracket Notation is a crucial feature of Locale::Maketext. I mean +Bracket Notation to provide a replacement for sprintf formatting. +Everything you do with Bracket Notation could be done with a sub block, +but bracket notation is meant to be much more concise. + +Bracket Notation is like a miniature "template" system (in the sense +of L<Text::Template|Text::Template>, not in the sense of C++ templates), +where normal text is passed thru basically as is, but text in special +regions is specially interpreted. In Bracket Notation, you use brackets +("[...]" -- not "{...}"!)
to note sections that are specially interpreted. + +For example, here all the areas that are taken literally are underlined with +a "^", and all the in-bracket special regions are underlined with an X: + + "Minimum ([_1]) is larger than maximum ([_2])!\n", + ^^^^^^^^^ XX ^^^^^^^^^^^^^^^^^^^^^^^^^^ XX ^^^^ + +When that string is compiled from bracket notation into a real Perl sub, +it's basically turned into: + + sub { + my $lh = $_[0]; + my @params = @_; + return join '', + "Minimum (", + ...some code here... + ") is larger than maximum (", + ...some code here... + ")!\n", + } + # to be called by $lh->maketext(KEY, params...) + +In other words, text outside bracket groups is turned into string +literals. Text in brackets is rather more complex, and currently follows +these rules: + +=over + +=item * + +Bracket groups that are empty, or which consist only of whitespace, +are ignored. (Examples: "[]", "[ ]", or a [ and a ] with returns +and/or tabs and/or spaces between them.) + +Otherwise, each group is taken to be a comma-separated group of items, +and each item is interpreted as follows: + +=item * + +An item that is "_I<digits>" or "_-I<digits>" is interpreted as +$_[I<value>]. I.e., "_1" is replaced with $_[1], and "_-3" is interpreted +as $_[-3] (in which case @_ should have at least three elements in it). +Note that $_[0] is the language handle, and is typically not named +directly. + +=item * + +An item "_*" is interpreted to mean "all of @_ except $_[0]". +I.e., C<@_[1..$#_]>. Note that this is an empty list in the case +of calls like $lh->maketext(I<key>) where there are no +parameters (except $_[0], the language handle). + +=item * + +Otherwise, each item is interpreted as a string literal. + +=back + +The group as a whole is interpreted as follows: + +=over + +=item * + +If the first item in a bracket group looks like a method name, +then that group is interpreted like this: + + $lh->that_method_name( + ...rest of items in this group...
+ ), + +=item * + +If the first item in a bracket group is empty-string, or "_*" +or "_I<digits>" or "_-I<digits>", then that group is interpreted +as just the interpolation of all its items: + + join('', + ...rest of items in this group... + ), + +Examples: "[_1]" and "[,_1]", which are synonymous; and +"[,ID-(,_4,-,_2,)]", which compiles as +C<join "", "ID-(", $_[4], "-", $_[2], ")">. + +=item * + +Otherwise this bracket group is invalid. For example, in the group +"[!@#,whatever]", the first item C<"!@#"> is neither empty-string, +"_I<number>", "_-I<number>", "_*", nor a valid method name; and so +Locale::Maketext will throw an exception if you try compiling an +expression containing this bracket group. + +=back + +Note, incidentally, that items in each group are comma-separated, +not C</\s*,\s*/>-separated. That is, you might expect that this +bracket group: + + "Hoohah [foo, _1 , bar ,baz]!" + +would compile to this: + + sub { + my $lh = $_[0]; + return join '', + "Hoohah ", + $lh->foo( $_[1], "bar", "baz"), + "!", + } + +But it actually compiles as this: + + sub { + my $lh = $_[0]; + return join '', + "Hoohah ", + $lh->foo(" _1 ", " bar ", "baz"), #!!! + "!", + } + +In the notation discussed so far, the characters "[" and "]" are given +special meaning, for opening and closing bracket groups, and "," has +a special meaning inside bracket groups, where it separates items in the +group. This begs the question of how you'd express a literal "[" or +"]" in a Bracket Notation string, and how you'd express a literal +comma inside a bracket group. For this purpose I've adopted "~" (tilde) +as an escape character: "~[" means a literal '[' character anywhere +in Bracket Notation (i.e., regardless of whether you're in a bracket +group or not), and ditto for "~]" meaning a literal ']', and "~," meaning +a literal comma. (Altho "," means a literal comma outside of +bracket groups -- it's only inside bracket groups that commas are special.)
+ +And on the off chance you need a literal tilde in a bracket expression, +you get it with "~~". + +Currently, an unescaped "~" before a character +other than a bracket or a comma is taken to mean just a "~" and that +character. I.e., "~X" means the same as "~~X" -- i.e., one literal tilde, +and then one literal "X". However, by using "~X", you are assuming that +no future version of Maketext will use "~X" as a magic escape sequence. +In practice this is not a great problem, since first off you can just +write "~~X" and not worry about it; second off, I doubt I'll add lots +of new magic characters to bracket notation; and third off, you +aren't likely to want literal "~" characters in your messages anyway, +since it's not a character with wide use in natural language text. + +Brackets must be balanced -- every openbracket must have +one matching closebracket, and vice versa. So these are all B<invalid>: + + "I ate [quant,_1,rhubarb pie." + "I ate [quant,_1,rhubarb pie[." + "I ate quant,_1,rhubarb pie]." + "I ate quant,_1,rhubarb pie[." + +Currently, bracket groups do not nest. That is, you B<cannot> say: + + "Foo [bar,baz,[quux,quuux]]\n"; + +If you need a notation that's that powerful, use normal Perl: + + %Lexicon = ( + ... + "some_key" => sub { + my $lh = $_[0]; + join '', + "Foo ", + $lh->bar('baz', $lh->quux('quuux')), + "\n", + }, + ... + ); + +Or write the "bar" method so you don't need to pass it the +output from calling quux. + +I do not anticipate that you will need (or particularly want) +to nest bracket groups, but you are welcome to email me with +convincing (real-life) arguments to the contrary. + +=head1 AUTO LEXICONS + +If maketext goes to look in an individual %Lexicon for an entry +for I<key> (where I<key> does not start with an underscore), and +sees none, B<but does see> an entry of "_AUTO" => I<some_true_value>, +then we actually define $Lexicon{I<key>} = I<key> right then and there, +and then use that value as if it had been there all +along.
This happens before we even look in any superclass %Lexicons! + +(This is meant to be somewhat like the AUTOLOAD mechanism in +Perl's function call system -- or, looked at another way, +like the L<AutoLoader|AutoLoader> module.) + +I can picture all sorts of circumstances where you just +do not want lookup to be able to fail (since failing +normally means that maketext throws a C<die>, altho +see the next section for greater control over that). But +here's one circumstance where _AUTO lexicons are meant to +be I<especially> useful: + +As you're writing an application, you decide as you go what messages +you need to emit. Normally you'd go to write this: + + if(-e $filename) { + go_process_file($filename) + } else { + print "Couldn't find file \"$filename\"!\n"; + } + +but since you anticipate localizing this, you write: + + use ThisProject::I18N; + my $lh = ThisProject::I18N->get_handle(); + # For the moment, assume that things are set up so + # that we load class ThisProject::I18N::en + # and that that's the class that $lh belongs to. + ... + if(-e $filename) { + go_process_file($filename) + } else { + print $lh->maketext( + "Couldn't find file \"[_1]\"!\n", $filename + ); + } + +Now, right after you've just written the above lines, you'd +normally have to go open the file +ThisProject/I18N/en.pm, and immediately add an entry: + + "Couldn't find file \"[_1]\"!\n" + => "Couldn't find file \"[_1]\"!\n", + +But I consider that somewhat of a distraction from the work +of getting the main code working -- to say nothing of the fact +that I often have to play with the program a few times before +I can decide exactly what wording I want in the messages (which +in this case would require me to go changing three lines of code: +the call to maketext with that key, and then the two lines in +ThisProject/I18N/en.pm). 
+ +However, if you set "_AUTO => 1" in the %Lexicon in, +ThisProject/I18N/en.pm (assuming that English (en) is +the language that all your programmers will be using for this +project's internal message keys), then you don't ever have to +go adding lines like this + + "Couldn't find file \"[_1]\"!\n" + => "Couldn't find file \"[_1]\"!\n", + +to ThisProject/I18N/en.pm, because if _AUTO is true there, +then just looking for an entry with the key "Couldn't find +file \"[_1]\"!\n" in that lexicon will cause it to be added, +with that value! + +Note that the reason that keys that start with "_" +are immune to _AUTO isn't anything generally magical about +the underscore character -- I just wanted a way to have most +lexicon keys be autoable, except for possibly a few, and I +arbitrarily decided to use a leading underscore as a signal +to distinguish those few. + +=head1 CONTROLLING LOOKUP FAILURE + +If you call $lh->maketext(I<key>, ...parameters...), +and there's no entry I<key> in $lh's class's %Lexicon, nor +in the superclass %Lexicon hash, I<and> if we can't auto-make +I<key> (because either it starts with a "_", or because none +of its lexicons have C<_AUTO =E<gt> 1,>), then we have +failed to find a normal way to maketext I<key>. What then +happens in these failure conditions, depends on the $lh object +"fail" attribute. + +If the language handle has no "fail" attribute, maketext +will simply throw an exception (i.e., it calls C<die>, mentioning +the I<key> whose lookup failed, and naming the line number where +the calling $lh->maketext(I<key>,...) was. + +If the language handle has a "fail" attribute whose value is a +coderef, then $lh->maketext(I<key>,...params...) gives up and calls: + + return &{$that_subref}($lh, $key, @params); + +Otherwise, the "fail" attribute's value should be a string denoting +a method name, so that $lh->maketext(I<key>,...params...) 
can +give up with: + + return $lh->$that_method_name($key, @params); + +The "fail" attribute can be accessed with the C<fail_with> method: + + # Set to a coderef: + $lh->fail_with( \&failure_handler ); + + # Set to a method name: + $lh->fail_with( 'failure_method' ); + + # Set to nothing (i.e., so failure throws a plain exception) + $lh->fail_with( undef ); + + # Simply read: + $handler = $lh->fail_with(); + +Now, as to what you may want to do with these handlers: Maybe you'd +want to log what key failed for what class, and then die. Maybe +you don't like C<die> and instead you want to send the error message +to STDOUT (or wherever) and then merely C<exit()>. + +Or maybe you don't want to C<die> at all! Maybe you could use a +handler like this: + + # Make all lookups fall back onto an English value, + # but after we log it for later fingerpointing. + my $lh_backup = ThisProject->get_handle('en'); + open(LEX_FAIL_LOG, ">>wherever/lex.log") || die "GNAARGH $!"; + sub lex_fail { + my($failing_lh, $key, @params) = @_; + print LEX_FAIL_LOG scalar(localtime), "\t", + ref($failing_lh), "\t", $key, "\n"; + return $lh_backup->maketext($key,@params); + } + +Some users have expressed that they think this whole mechanism of +having a "fail" attribute at all, seems a rather pointless complication. +But I want Locale::Maketext to be usable for software projects of I<any> +scale and type; and different software projects have different ideas +of what the right thing is to do in failure conditions. I could simply +say that failure always throws an exception, and that if you want to be +careful, you'll just have to wrap every call to $lh->maketext in an +S<eval { }>. However, I want programmers to reserve the right (via +the "fail" attribute) to treat lookup failure as something other than +an exception of the same level of severity as a config file being +unreadable, or some essential resource being inaccessible.
+ +One possibly useful value for the "fail" attribute is the method name +"failure_handler_auto". This is a method defined in class +Locale::Maketext itself. You set it with: + + $lh->fail_with('failure_handler_auto'); + +Then when you call $lh->maketext(I<key>, ...parameters...) and +there's no I<key> in any of those lexicons, maketext gives up with + + return $lh->failure_handler_auto($key, @params); + +But failure_handler_auto, instead of dying or anything, compiles +$key, caching it in $lh->{'failure_lex'}{$key} = $compiled, +and then calls the compiled value, and returns that. (I.e., if +$key looks like bracket notation, $compiled is a sub, and we return +&{$compiled}(@params); but if $key is just a plain string, we just +return that.) + +The effect of using "failure_handler_auto" +is like an AUTO lexicon, except that it 1) compiles $key even if +it starts with "_", and 2) you have a record in the new hashref +$lh->{'failure_lex'} of all the keys that have failed for +this object. This should avoid your program dying -- as long +as your keys aren't actually invalid as bracket code, and as +long as they don't try calling methods that don't exist. + +"failure_handler_auto" may not be exactly what you want, but I +hope it at least shows you that maketext failure can be mitigated +in any number of very flexible ways. If you can formalize exactly +what you want, you should be able to express that as a failure +handler. You can even make it default for every object of a given +class, by setting it in that class's init: + + sub init { + my $lh = $_[0]; # a newborn handle + $lh->SUPER::init(); + $lh->fail_with('my_clever_failure_handler'); + return; + } + sub my_clever_failure_handler { + ...your clever things here... + } + +=head1 HOW TO USE MAKETEXT + +Here is a brief checklist on how to use Maketext to localize +applications: + +=over + +=item * + +Decide what system you'll use for lexicon keys.
If you insist, +you can use opaque IDs (if you're nostalgic for C<catgets>), +but I have better suggestions in the +section "Entries in Each Lexicon", above. Assuming you opt for +meaningful keys that double as values (like "Minimum ([_1]) is +larger than maximum ([_2])!\n"), you'll have to settle on what +language those should be in. For the sake of argument, I'll +call this English, specifically American English, "en-US". + +=item * + +Create a class for your localization project. This is +the name of the class that you'll use in the idiom: + + use Projname::L10N; + my $lh = Projname::L10N->get_handle(...) || die "Language?"; + +Assuming your call your class Projname::L10N, create a class +consisting minimally of: + + package Projname::L10N; + use base qw(Locale::Maketext); + ...any methods you might want all your languages to share... + + # And, assuming you want the base class to be an _AUTO lexicon, + # as is discussed a few sections up: + + 1; + +=item * + +Create a class for the language your internal keys are in. Name +the class after the language-tag for that language, in lowercase, +with dashes changed to underscores. Assuming your project's first +language is US English, you should call this Projname::L10N::en_us. +It should consist minimally of: + + package Projname::L10N::en_us; + use base qw(Projname::L10N); + %Lexicon = ( + '_AUTO' => 1, + ); + 1; + +(For the rest of this section, I'll assume that this "first +language class" of Projname::L10N::en_us has +_AUTO lexicon.) + +=item * + +Go and write your program. Everywhere in your program where +you would say: + + print "Foobar $thing stuff\n"; + +instead do it thru maketext, using no variable interpolation in +the key: + + print $lh->maketext("Foobar [_1] stuff\n", $thing); + +If you get tired of constantly saying C<print $lh-E<gt>maketext>, +consider making a functional wrapper for it, like so: + + use Projname::L10N; + use vars qw($lh); + $lh = Projname::L10N->get_handle(...) 
|| die "Language?"; + sub pmt (@) { print( $lh->maketext(@_)) } + # "pmt" is short for "Print MakeText" + $Carp::Verbose = 1; + # so if maketext fails, we see what made the call to pmt + +Besides whole phrases meant for output, anything language-dependent +should be put into the class Projname::L10N::en_us, +whether as methods, or as lexicon entries -- this is discussed +in the section "Entries in Each Lexicon", above. + +=item * + +Once the program is otherwise done, and once its localization for +the first language works right (via the data and methods in +Projname::L10N::en_us), you can get together the data for translation. +If your first language lexicon isn't an _AUTO lexicon, then you already +have all the messages explicitly in the lexicon (or else you'd be +getting exceptions thrown when you call $lh->maketext to get +messages that aren't in there). But if you were (advisedly) lazy and are +using an _AUTO lexicon, then you've got to make a list of all the phrases +that you've so far been letting _AUTO generate for you. There are very +many ways to assemble such a list. The most straightforward is to simply +grep the source for every occurrence of "maketext" (or calls +to wrappers around it, like the above C<pmt> function), and to log the +following phrase. + +=item * + +You may at this point want to consider whether your base class +(Projname::L10N) that all lexicons inherit from (Projname::L10N::en, +Projname::L10N::es, etc.) should be an _AUTO lexicon. It may be true +that in theory, all needed messages will be in each language class; +but in the presumably unlikely or "impossible" case of lookup failure, +you should consider whether your program should throw an exception, +emit text in English (or whatever your project's first language is), +or some more complex solution as described in the section +"Controlling Lookup Failure", above. + +=item * + +Submit all messages/phrases/etc. to translators.
+ +(You may, in fact, want to start with localizing to I<one> other language +at first, if you're not sure that you've properly abstracted the +language-dependent parts of your code.) + +Translators may request clarification of the situation in which a +particular phrase is found. For example, in English we are entirely happy +saying "I<n> files found", regardless of whether we mean "I looked for files, +and found I<n> of them" or the rather distinct situation of "I looked for +something else (like lines in files), and along the way I saw I<n> +files." This may involve rethinking things that you thought quite clear: +should "Edit" on a toolbar be a noun ("editing") or a verb ("to edit")? Is +there already a conventionalized way to express that menu option, separate +from the target language's normal word for "to edit"? + +In all cases where the very common phenomenon of quantification +(saying "I<N> files", for B<any> value of N) +is involved, each translator should make clear what dependencies the +number causes in the sentence. In many cases, dependency is +limited to words adjacent to the number, in places where you might +expect them ("I found the-?PLURAL I<N> +empty-?PLURAL directory-?PLURAL"), but in some cases there are +unexpected dependencies ("I found-?PLURAL ..."!) as well as long-distance +dependencies ("The I<N> directory-?PLURAL could not be deleted-?PLURAL"!). + +Remind the translators to consider the case where N is 0: +"0 files found" isn't exactly natural-sounding in any language, but it +may be unacceptable in many -- or it may condition special +kinds of agreement (similar to English "I didN'T find ANY files"). + +Remember to ask your translators about numeral formatting in their +language, so that you can override the C<numf> method as +appropriate. Typical variables in number formatting are: what to +use as a decimal point (comma? period?); what to use as a thousands +separator (space? nonbreaking space? comma? period? small +middot? prime?
apostrophe?); and even whether the so-called "thousands +separator" is actually for every third digit -- I've heard reports of +two hundred thousand being expressible as "2,00,000" for some Indian +(Subcontinental) languages, besides the less surprising "S<200 000>", +"200.000", "200,000", and "200'000". Also, using a set of numeral +glyphs other than the usual ASCII "0"-"9" might be appreciated, as via +C<tr/0-9/\x{0966}-\x{096F}/> for getting digits in Devanagari script +(for Hindi, Konkani, others). + +The basic C<quant> method that Locale::Maketext provides should be +good for many languages. For some languages, it might be useful +to modify it (or its constituent C<numerate> method) +to take a plural form in the two-argument call to C<quant> +(as in "[quant,_1,files]") if +it's all-around easier to infer the singular form from the plural, than +to infer the plural form from the singular. + +But for other languages (as is discussed at length +in L<Locale::Maketext::TPJ13|Locale::Maketext::TPJ13>), simple +C<quant>/C<numerate> is not enough. For the particularly problematic +Slavic languages, what you may need is a method which you provide +with the number, the citation form of the noun to quantify, and +the case and gender that the sentence's syntax projects onto that +noun slot. The method would then be responsible for determining +what grammatical number that numeral projects onto its noun phrase, +and what case and gender it may override the normal case and gender +with; and then it would look up the noun in a lexicon providing +all needed inflected forms. + +=item * + +You may also wish to discuss with the translators the question of +how to relate different subforms of the same language tag, +considering how this reacts with C<get_handle>'s treatment of +these. For example, if a user accepts interfaces in "en, fr", and +you have interfaces available in "en-US" and "fr", what should +they get?
You may wish to resolve this by establishing that "en" +and "en-US" are effectively synonymous, by having one class +zero-derive from the other. + +For some languages this issue may never come up (Danish is rarely +expressed as "da-DK", but instead is just "da"). And for other +languages, the whole concept of a "generic" form may verge on +being uselessly vague, particularly for interfaces involving voice +media in forms of Arabic or Chinese. + +=item * + +Once you've localized your program/site/etc. for all desired +languages, be sure to show the result (whether live, or via +screenshots) to the translators. Once they approve, make every +effort to have it then checked by at least one other speaker of +that language. This holds true even when (or especially when) the +translation is done by one of your own programmers. Some +kinds of systems may be harder to find testers for than others, +depending on the amount of domain-specific jargon and concepts +involved -- it's easier to find people who can tell you whether +they approve of your translation for "delete this message" in an +email-via-Web interface, than to find people who can give you +an informed opinion on your translation for "attribute value" +in an XML query tool's interface. + +=back + +=head1 SEE ALSO + +I recommend reading all of these: + +L<Locale::Maketext::TPJ13|Locale::Maketext::TPJ13> -- my I<The Perl +Journal> article about Maketext. It explains many important concepts +underlying Locale::Maketext's design, and some insight into why +Maketext is better than the plain old approach of just having +message catalogs that are just databases of sprintf formats. + +L<File::Findgrep|File::Findgrep> is a sample application/module +that uses Locale::Maketext to localize its messages. + +L<I18N::LangTags|I18N::LangTags>. + +L<Win32::Locale|Win32::Locale>. 
+ +RFC 3066, I<Tags for the Identification of Languages>, +is at http://sunsite.dk/RFC/rfc/rfc3066.html + +RFC 2277, I<IETF Policy on Character Sets and Languages> +is at http://sunsite.dk/RFC/rfc/rfc2277.html -- much of it is +just things of interest to protocol designers, but it explains +some basic concepts, like the distinction between locales and +language-tags. + +The manual for GNU C<gettext>. The gettext dist is available in +C<ftp://prep.ai.mit.edu/pub/gnu/> -- get +a recent gettext tarball and look in its "doc/" directory, there's +an easily browsable HTML version in there. The +gettext documentation asks lots of questions worth thinking +about, even if some of their answers are sometimes wonky, +particularly where they start talking about pluralization. + +The Locale/Maketext.pm source. Observe that the module is much +shorter than its documentation! + +=head1 COPYRIGHT AND DISCLAIMER + +Copyright (c) 1999-2001 Sean M. Burke. All rights reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +This program is distributed in the hope that it will be useful, but +without any warranty; without even the implied warranty of +merchantability or fitness for a particular purpose. + +=head1 AUTHOR + +Sean M. Burke C<sburke@cpan.org> + +=cut + +# Zing! diff --git a/lib/Locale/Maketext/TPJ13.pod b/lib/Locale/Maketext/TPJ13.pod new file mode 100644 index 0000000000..db22478215 --- /dev/null +++ b/lib/Locale/Maketext/TPJ13.pod @@ -0,0 +1,776 @@ + +# This document contains text in Perl "POD" format. +# Use a POD viewer like perldoc or perlman to render it. + +=head1 NAME + +Locale::Maketext::TPJ13 -- article about software localization + +=head1 SYNOPSIS + + # This is an article, not a module. + +=head1 DESCRIPTION + +The following article by Sean M. Burke and Jordan Lachler +first appeared in I<The Perl +Journal> #13 and is copyright 1999 The Perl Journal. 
It appears +courtesy of Jon Orwant and The Perl Journal. This document may be +distributed under the same terms as Perl itself. + +=head1 Localization and Perl: gettext breaks, Maketext fixes + +by Sean M. Burke and Jordan Lachler + +This article points out cases where gettext (a common system for +localizing software interfaces -- i.e., making them work in the user's +language of choice) fails because of basic differences between human +languages. This article then describes Maketext, a new system capable +of correctly treating these differences. + +=head2 A Localization Horror Story: It Could Happen To You + +=over + +"There are a number of languages spoken by human beings in this +world." + +-- Harald Tveit Alvestrand, in RFC 1766, "Tags for the +Identification of Languages" + +=back + +Imagine that your task for the day is to localize a piece of software +-- and luckily for you, the only output the program emits is two +messages, like this: + + I scanned 12 directories. + + Your query matched 10 files in 4 directories. + +So how hard could that be? You look at the code that produces +the first item, and it reads: + + printf("I scanned %g directories.", + $directory_count); + +You think about that, and realize that it doesn't even work right for +English, as it can produce this output: + + I scanned 1 directories. + +So you rewrite it to read: + + printf("I scanned %g %s.", + $directory_count, + $directory_count == 1 ? + "directory" : "directories", + ); + +...which does the Right Thing. (In case you don't recall, "%g" is for +locale-specific number interpolation, and "%s" is for string +interpolation.) + +But you still have to localize it for all the languages you're +producing this software for, so you pull Locale::gettext off of CPAN +so you can access the C<gettext> C functions you've heard are standard +for localization tasks. + +And you write: + + printf(gettext("I scanned %g %s."), + $dir_scan_count, + $dir_scan_count == 1 ? 
+ gettext("directory") : gettext("directories"), + ); + +But you then read in the gettext manual (Drepper, Miller, and Pinard 1995) +that this is not a good idea, since how a single word like "directory" +or "directories" is translated may depend on context -- and this is +true, since in a case language like German or Russian, you may need +these words with a different case ending in the first instance (where the +word is the object of a verb) than in the second instance, which you haven't even +gotten to yet (where the word is the object of a preposition, "in %g +directories") -- assuming these keep the same syntax when translated +into those languages. + +So, on the advice of the gettext manual, you rewrite: + + printf( $dir_scan_count == 1 ? + gettext("I scanned %g directory.") : + gettext("I scanned %g directories."), + $dir_scan_count ); + +So, you email your various translators (the boss decides that the +languages du jour are Chinese, Arabic, Russian, and Italian, so you +have one translator for each), asking for translations for "I scanned +%g directory." and "I scanned %g directories.". When they reply, +you'll put that in the lexicons for gettext to use when it localizes +your software, so that when the user is running under the "zh" +(Chinese) locale, gettext("I scanned %g directory.") will return the +appropriate Chinese text, with a "%g" in there where printf can then +interpolate $dir_scan_count. + +Your Chinese translator emails right back -- he says both of these +phrases translate to the same thing in Chinese, because, in linguistic +jargon, Chinese "doesn't have number as a grammatical category" -- +whereas English does. That is, English has grammatical rules that +refer to "number", i.e., whether something is grammatically singular +or plural; and one of these rules is the one that forces nouns to take +a plural suffix (generally "s") when in a plural context, as they are when +they follow a number other than "one" (including, oddly enough, "zero"). 
+Chinese has no such rules, and so has just the one phrase where English +has two. But, no problem, you can have this one Chinese phrase appear +as the translation for the two English phrases in the "zh" gettext +lexicon for your program. + +Emboldened by this, you dive into the second phrase that your software +needs to output: "Your query matched 10 files in 4 directories.". You notice +that if you want to treat phrases as indivisible, as the gettext +manual wisely advises, you need four cases now, instead of two, to +cover the permutations of singular and plural on the two items, +$dir_count and $file_count. So you try this: + + printf( $file_count == 1 ? + ( $directory_count == 1 ? + gettext("Your query matched %g file in %g directory.") : + gettext("Your query matched %g file in %g directories.") ) : + ( $directory_count == 1 ? + gettext("Your query matched %g files in %g directory.") : + gettext("Your query matched %g files in %g directories.") ), + $file_count, $directory_count, + ); + +(The case of "1 file in 2 [or more] directories" could, I suppose, +occur in the case of symlinking or something of the sort.) + +It occurs to you that this is not the prettiest code you've ever +written, but this seems the way to go. You mail off to the +translators asking for translations for these four cases. The +Chinese guy replies with the one phrase that these all translate to in +Chinese, and that phrase has two "%g"s in it, as it should -- but +there's a problem. He translates it word-for-word back: "To your +question, in %g directories you would find %g answers." The "%g" +slots are in an order reverse to what they are in English. You wonder +how you'll get gettext to handle that. + +But you put it aside for the moment, and optimistically hope that the +other translators won't have this problem, and that their languages +will be better behaved -- i.e., that they will be just like English. + +But the Arabic translator is the next to write back. 
First off, your +code for "I scanned %g directory." or "I scanned %g directories." +assumes there's only singular or plural. But, to use linguistic +jargon again, Arabic has grammatical number, like English (but unlike +Chinese), but it's a three-term category: singular, dual, and plural. +In other words, the way you say "directory" depends on whether there's +one directory, or I<two> of them, or I<more than two> of them. Your +test of C<($directory_count == 1)> no longer does the job. And it means +that where English's grammatical category of number necessitates +only the two permutations of the first sentence based on "directory +[singular]" and "directories [plural]", Arabic has three -- and, +worse, in the second sentence ("Your query matched %g file in %g +directory."), where English has four, Arabic has nine. You sense +an unwelcome, exponential trend taking shape. + +Your Italian translator emails you back and says that "I scanned 0 +directories" (a possible English output of your program) is stilted, +and if you think that's fine English, that's your problem, but that +I<just will not do> in the language of Dante. He insists that where +$directory_count is 0, your program should produce the Italian text +for "I I<didn't> scan I<any> directories.". And ditto for "I didn't +match any files in any directories", although he says the last part +about "in any directories" should probably just be left off. + +You wonder how you'll get gettext to handle this; to accommodate the +ways Arabic, Chinese, and Italian deal with numbers in just these few +very simple phrases, you need to write code that will ask gettext for +different queries depending on whether the numerical values in +question are 1, 2, more than 2, or in some cases 0, and you still haven't +figured out the problem with the different word order in Chinese. 
+ +Then your Russian translator calls on the phone, to I<personally> tell +you the bad news about how really unpleasant your life is about to +become: + +Russian, like German or Latin, is an inflectional language; that is, nouns +and adjectives have to take endings that depend on their case +(i.e., nominative, accusative, genitive, etc...) -- which is roughly a matter of +what role they have in the syntax of the sentence -- +as well as on the grammatical gender (i.e., masculine, feminine, neuter) +and number (i.e., singular or plural) of the noun, as well as on the +declension class of the noun. But unlike with most other inflected languages, +putting a number-phrase (like "ten" or "forty-three", or their Arabic +numeral equivalents) in front of a noun in Russian can change the case and +number that noun is, and therefore the endings you have to put on it. + +He elaborates: In "I scanned %g directories", you'd I<expect> +"directories" to be in the accusative case (since it is the direct +object in the sentence) and the plural number, +except where $directory_count is 1, then you'd expect the singular, of +course. Just like Latin or German. I<But!> Where $directory_count % +10 is 1 ("%" for modulo, remember), assuming $directory_count is an +integer, and except where $directory_count % 100 is 11, "directories" +is forced to become grammatically singular, which means it gets the +ending for the accusative singular... You begin to visualize the code +it'd take to test for the problem so far, I<and still work for Chinese +and Arabic and Italian>, and how many gettext items that'd take, but +he keeps going... But where $directory_count % 10 is 2, 3, or 4 +(except where $directory_count % 100 is 12, 13, or 14), the word for +"directories" is forced to be genitive singular -- which means another +ending... The room begins to spin around you, slowly at first... 
But +with I<all other> integer values, since "directory" is an inanimate +noun, when preceded by a number and in the nominative or accusative +cases (as it is here, just your luck!), it does stay plural, but it is +forced into the genitive case -- yet another ending... And +you never hear him get to the part about how you're going to run into +similar (but maybe subtly different) problems with other Slavic +languages like Polish, because the floor comes up to meet you, and you +fade into unconsciousness. + + +The above cautionary tale relates how an attempt at localization can +lead from programmer consternation, to program obfuscation, to a need +for sedation. But careful evaluation shows that your choice of tools +merely needed further consideration. + +=head2 The Linguistic View + +=over + +"It is more complicated than you think." + +-- The Eighth Networking Truth, from RFC 1925 + +=back + +The field of Linguistics has expended a great deal of effort over the +past century trying to find grammatical patterns which hold across +languages; it's been a constant process +of people making generalizations that should apply to all languages, +only to find out that, all too often, these generalizations fail -- +sometimes failing for just a few languages, sometimes whole classes of +languages, and sometimes nearly every language in the world except +English. Broad statistical trends are evident in what the "average +language" is like as far as what its rules can look like, must look +like, and cannot look like. But the "average language" is just as +unreal a concept as the "average person" -- it runs up against the +fact that no language (or person) is, in fact, average. 
The wisdom of past +experience leads us to believe that any given language can do whatever +it wants, in any order, with appeal to any kind of grammatical +categories it wants -- case, number, tense, real or metaphoric +characteristics of the things that words refer to, arbitrary or +predictable classifications of words based on what endings or prefixes +they can take, degree or means of certainty about the truth of +statements expressed, and so on, ad infinitum. + +Mercifully, most localization tasks are a matter of finding ways to +translate whole phrases, generally sentences, where the context is +relatively set, and where the only variation in content is I<usually> +in a number being expressed -- as in the example sentences above. +Translating specific, fully-formed sentences is, in practice, fairly +foolproof -- which is good, because that's what's in the phrasebooks +that so many tourists rely on. Now, a given phrase (whether in a +phrasebook or in a gettext lexicon) in one language I<might> have a +greater or lesser applicability than that phrase's translation into +another language -- for example, strictly speaking, in Arabic, the +"your" in "Your query matched..." would take a different form +depending on whether the user is male or female; so the Arabic +translation "your[feminine] query" is applicable in fewer cases than +the corresponding English phrase, which doesn't distinguish the user's +gender. (In practice, it's not feasible to have a program know the +user's gender, so the masculine "you" in Arabic is usually used, by +default.) + +But in general, such surprises are rare when entire sentences are +being translated, especially when the functional context is restricted +to that of a computer interacting with a user either to convey a fact +or to prompt for a piece of information. So, for purposes of +localization, translation by phrase (generally by sentence) is both the +simplest and the least problematic. 
+ +=head2 Breaking gettext + +=over + +"It Has To Work." + +-- First Networking Truth, RFC 1925 + +=back + +Consider that sentences in a tourist phrasebook are of two types: ones +like "How do I get to the marketplace?" that don't have any blanks to +fill in, and ones like "How much do these ___ cost?", where there's +one or more blanks to fill in (and these are usually linked to a +list of words that you can put in that blank: "fish", "potatoes", +"tomatoes", etc.) The ones with no blanks are no problem, but the +fill-in-the-blank ones may not be really straightforward. If it's a +Swahili phrasebook, for example, the authors probably didn't bother to +tell you the complicated ways that the verb "cost" changes its +inflectional prefix depending on the noun you're putting in the blank. +The trader in the marketplace will still understand what you're saying if +you say "how much do these potatoes cost?" with the wrong +inflectional prefix on "cost". After all, I<you> can't speak proper Swahili, +I<you're> just a tourist. But while tourists can be stupid, computers +are supposed to be smart; the computer should be able to fill in the +blank, and still have the results be grammatical. + +In other words, a phrasebook entry takes some values as parameters +(the things that you fill in the blank or blanks), and provides a value +based on these parameters, where the way you get that final value from +the given values can, properly speaking, involve an arbitrarily +complex series of operations. (In the case of Chinese, it'd be not at +all complex, at least in cases like the examples at the beginning of +this article; whereas in the case of Russian it'd be a rather complex +series of operations. And in some languages, the +complexity could be spread around differently: while the act of +putting a number-expression in front of a noun phrase might not be +complex by itself, it may change how you have to, for example, inflect +a verb elsewhere in the sentence. 
This is what in syntax is called +"long-distance dependencies".) + +This talk of parameters and arbitrary complexity is just another way +to say that an entry in a phrasebook is what in a programming language +would be called a "function". Just so you don't miss it, this is the +crux of this article: I<A phrase is a function; a phrasebook is a +bunch of functions.> + +The reason that using gettext runs into walls (as in the above +second-person horror story) is that you're trying to use a string (or +worse, a choice among a bunch of strings) to do what you really need a +function for -- which is futile. Performing (s)printf interpolation +on the strings which you get back from gettext does allow you to do I<some> +common things passably well... sometimes... sort of; but, to paraphrase +what some people say about C<csh> script programming, "it fools you +into thinking you can use it for real things, but you can't, and you +don't discover this until you've already spent too much time trying, +and by then it's too late." + +=head2 Replacing gettext + +So, what needs to replace gettext is a system that supports lexicons +of functions instead of lexicons of strings. An entry in a lexicon +from such a system should I<not> look like this: + + "J'ai trouv\xE9 %g fichiers dans %g r\xE9pertoires" + +[\xE9 is e-acute in Latin-1. Some pod renderers would +scream if I used the actual character here. -- SB] + +but instead like this, bearing in mind that this is just a first stab: + + sub I_found_X1_files_in_X2_directories { + my( $files, $dirs ) = @_[0,1]; + $files = sprintf("%g %s", $files, + $files == 1 ? 'fichier' : 'fichiers'); + $dirs = sprintf("%g %s", $dirs, + $dirs == 1 ? "r\xE9pertoire" : "r\xE9pertoires"); + return "J'ai trouv\xE9 $files dans $dirs."; + } + +Now, there's no particularly obvious way to store anything but strings +in a gettext lexicon; so it looks like we just have to start over and +make something better, from scratch. 
I call my shot at a +gettext-replacement system "Maketext", or, in CPAN terms, +Locale::Maketext. + +When designing Maketext, I chose to plan its main features in terms of +"buzzword compliance". And here are the buzzwords: + +=head2 Buzzwords: Abstraction and Encapsulation + +The complexity of the language you're trying to output a phrase in is +entirely abstracted inside (and encapsulated within) the Maketext module +for that interface. When you call: + + print $lang->maketext("You have [quant,_1,piece] of new mail.", + scalar(@messages)); + +you don't know (and in fact can't easily find out) whether this will +involve lots of figuring, as in Russian (if $lang is a handle to the +Russian module), or relatively little, as in Chinese. That kind of +abstraction and encapsulation may encourage other pleasant buzzwords +like modularization and stratification, depending on what design +decisions you make. + +=head2 Buzzword: Isomorphism + +"Isomorphism" means "having the same structure or form"; in discussions +of program design, the word takes on the special, specific meaning that +your implementation of a solution to a problem I<has the same +structure> as, say, an informal verbal description of the solution, or +maybe of the problem itself. Isomorphism is, all things considered, +a good thing -- it's what problem-solving (and solution-implementing) +should look like. + +What's wrong with the gettext-using code like this... + + printf( $file_count == 1 ? + ( $directory_count == 1 ? + "Your query matched %g file in %g directory." : + "Your query matched %g file in %g directories." ) : + ( $directory_count == 1 ? + "Your query matched %g files in %g directory." : + "Your query matched %g files in %g directories." ), + $file_count, $directory_count, + ); + +is first off that it's not well abstracted -- these ways of testing +for grammatical number (as in the expressions like C<foo == 1 ? 
+singular_form : plural_form>) should be abstracted to each language +module, since how you get grammatical number is language-specific. + +But second off, it's not isomorphic -- the "solution" (i.e., the +phrasebook entries) for Chinese maps from these four English phrases to +the one Chinese phrase that fits for all of them. In other words, the +informal solution would be "The way to say what you want in Chinese is +with the one phrase 'For your question, in Y directories you would +find X files'" -- and so the implemented solution should be, +isomorphically, just a straightforward way to spit out that one +phrase, with numerals properly interpolated. It shouldn't have to map +from the complexity of other languages to the simplicity of this one. + +=head2 Buzzword: Inheritance + +There's a great deal of reuse possible for sharing of phrases between +modules for related dialects, or for sharing of auxiliary functions +between related languages. (By "auxiliary functions", I mean +functions that don't produce phrase-text, but which, say, return an +answer to "does this number require a plural noun after it?". Such +auxiliary functions would be used in the internal logic of functions +that actually do produce phrase-text.) + +In the case of sharing phrases, consider that you have an interface +already localized for American English (probably by having been +written with that as the native locale, but that's incidental). +Localizing it for UK English should, in practical terms, be just a +matter of running it past a British person with the instructions to +indicate what few phrases would benefit from a change in spelling or +possibly minor rewording. In that case, you should be able to put in +the UK English localization module I<only> those phrases that are +UK-specific, and for all the rest, I<inherit> from the American +English module. 
(And I expect this same situation would apply with +Brazilian and Continental Portuguese, possibly with some I<very> +closely related languages like Czech and Slovak, and possibly with the +slightly different "versions" of written Mandarin Chinese, as I hear exist in +Taiwan and mainland China.) + +As to sharing of auxiliary functions, consider the problem of Russian +numbers from the beginning of this article; obviously, you'd want to +write only once the hairy code that, given a numeric value, would +return some specification of which case and number a given quantified +noun should use. But suppose that you discover, while localizing an +interface for, say, Ukrainian (a Slavic language related to Russian, +spoken by several million people, many of whom would be relieved to +find that your Web site's or software's interface is available in +their language), that the rules in Ukrainian are the same as in Russian +for quantification, and probably for many other grammatical functions. +While there may well be no phrases in common between Russian and +Ukrainian, you could still choose to have the Ukrainian module inherit +from the Russian module, just for the sake of inheriting all the +various grammatical methods. Or, probably better organizationally, +you could move those functions to a module called C<_E_Slavic> or +something, which Russian and Ukrainian could inherit useful functions +from, but which would (presumably) provide no lexicon. + +=head2 Buzzword: Concision + +Okay, concision isn't a buzzword. But it should be, so I decree that +as a new buzzword, "concision" means that simple common things should +be expressible in very few lines (or maybe even just a few characters) +of code -- call it a special case of "making simple things easy and +hard things possible", and see also the role it played in the +MIDI::Simple language, discussed elsewhere in this issue [TPJ#13]. 
+ +Consider our first stab at an entry in our "phrasebook of functions": + + sub I_found_X1_files_in_X2_directories { + my( $files, $dirs ) = @_[0,1]; + $files = sprintf("%g %s", $files, + $files == 1 ? 'fichier' : 'fichiers'); + $dirs = sprintf("%g %s", $dirs, + $dirs == 1 ? "r\xE9pertoire" : "r\xE9pertoires"); + return "J'ai trouv\xE9 $files dans $dirs."; + } + +You may sense that a lexicon (to use a non-committal catch-all term for a +collection of things you know how to say, regardless of whether they're +phrases or words) consisting of functions I<expressed> as above would +make for rather long-winded and repetitive code -- even if you wisely +rewrote this to have quantification (as we call adding a number +expression to a noun phrase) be a function called like: + + sub I_found_X1_files_in_X2_directories { + my( $files, $dirs ) = @_[0,1]; + $files = quant($files, "fichier"); + $dirs = quant($dirs, "r\xE9pertoire"); + return "J'ai trouv\xE9 $files dans $dirs."; + } + +And you may also sense that you do not want to bother your translators +with having to write Perl code -- you'd much rather that they spend +their I<very costly time> on just translation. And this is to say +nothing of the near impossibility of finding a commercial translator +who would know even simple Perl. + +In a first-hack implementation of Maketext, each language-module's +lexicon looked like this: + + %Lexicon = ( + "I found %g files in %g directories" + => sub { + my( $files, $dirs ) = @_[0,1]; + $files = quant($files, "fichier"); + $dirs = quant($dirs, "r\xE9pertoire"); + return "J'ai trouv\xE9 $files dans $dirs."; + }, + ... and so on with other phrase => sub mappings ... + ); + +but I immediately went looking for some more concise way to basically +denote the same phrase-function -- a way that would also serve to +concisely denote I<most> phrase-functions in the lexicon for I<most> +languages. 
After much time and even some actual thought, I decided on +this system: + +* Where a value in a %Lexicon hash is a contentful string instead of +an anonymous sub (or, conceivably, a coderef), it would be interpreted +as a sort of shorthand expression of what the sub does. When accessed +for the first time in a session, it is parsed, turned into Perl code, +and then eval'd into an anonymous sub; then that sub replaces the +original string in that lexicon. (That way, the work of parsing and +evaling the shorthand form for a given phrase is done no more than +once per session.) + +* Calls to C<maketext> (as Maketext's main function is called) happen +thru a "language session handle", notionally very much like an IO +handle, in that you open one at the start of the session, and use it +for "sending signals" to an object in order to have it return the text +you want. + +So, this: + + $lang->maketext("You have [quant,_1,piece] of new mail.", + scalar(@messages)); + +basically means this: look in the lexicon for $lang (which may inherit +from any number of other lexicons), and find the function that we +happen to associate with the string "You have [quant,_1,piece] of new +mail" (which is, and should be, a functioning "shorthand" for this +function in the native locale -- English in this case). If you find +such a function, call it with $lang as its first parameter (as if it +were a method), and then a copy of scalar(@messages) as its second, +and then return that value. If that function was found, but was in +string shorthand instead of being a fully specified function, parse it +and make it into a function before calling it the first time. + +* The shorthand uses code in brackets to indicate method calls that +should be performed. A full explanation is not in order here, but a +few examples will suffice: + + "You have [quant,_1,piece] of new mail." 
+ +The above code is shorthand for, and will be interpreted as, +this: + + sub { + my $handle = $_[0]; + my(@params) = @_; + return join '', + "You have ", + $handle->quant($params[1], 'piece'), + " of new mail."; + } + +where "quant" is the name of a method you're using to quantify the +noun "piece" with the number $params[1]. + +A string with no brackety calls, like this: + + "Your search expression was malformed." + +is somewhat of a degenerate case, and just gets turned into: + + sub { return "Your search expression was malformed." } + +However, not everything you can write in Perl code can be written in +the above shorthand system -- not by a long shot. For example, consider +the Italian translator from the beginning of this article, who wanted +the Italian for "I didn't find any files" as a special case, instead +of "I found 0 files". That couldn't be specified (at least not easily +or simply) in our shorthand system, and it would have to be written +out in full, like this: + + sub { # pretend the English strings are in Italian + my($handle, $files, $dirs) = @_[0,1,2]; + return "I didn't find any files" unless $files; + return join '', + "I found ", + $handle->quant($files, 'file'), + " in ", + $handle->quant($dirs, 'directory'), + "."; + } + +Next to a lexicon full of shorthand code, that sort of sticks out like a +sore thumb -- but this I<is> a special case, after all; and at least +it's possible, if not as concise as usual. + +As to how you'd implement the Russian example from the beginning of +the article, well, There's More Than One Way To Do It, but it could be +something like this (using English words for Russian, just so you know +what's going on): + + "I [quant,_1,directory,accusative] scanned." + +This shifts the burden of complexity off to the quant method. 
That +method's parameters are: the numeric value it's going to use to +quantify something; the Russian word it's going to quantify; and the +parameter "accusative", which you're using to mean that this +sentence's syntax wants a noun in the accusative case there, although +that quantification method may have to overrule, for grammatical +reasons you may recall from the beginning of this article. + +Now, the Russian quant method here is responsible not only for +implementing the strange logic necessary for figuring out how Russian +number-phrases impose case and number on their noun-phrases, but also +for inflecting the Russian word for "directory". How that inflection +is to be carried out is no small issue, and among the solutions I've +seen, some (like variations on a simple lookup in a hash where all +possible forms are provided for all necessary words) are +straightforward but I<can> become cumbersome when you need to inflect +more than a few dozen words; and other solutions (like using +algorithms to model the inflections, storing only root forms and +irregularities) I<can> involve more overhead than is justifiable for +all but the largest lexicons. + +Mercifully, this design decision becomes crucial only in the hairiest +of inflected languages, of which Russian is by no means the I<worst> case +scenario, but is worse than most. Most languages have simpler +inflection systems; for example, in English or Swahili, there are +generally no more than two possible inflected forms for a given noun +("error/errors"; "kosa/makosa"), and the +rules for producing these forms are fairly simple -- or at least, +simple rules can be formulated that work for most words, and you can +then treat the exceptions as just "irregular", at least relative to +your ad hoc rules. 
A simpler inflection system (simpler rules, fewer +forms) means that design decisions are less crucial to maintaining +sanity, whereas the same decisions could incur +overhead-versus-scalability problems in languages like Russian. It +may I<also> be likely that code (possibly in Perl, as with +Lingua::EN::Inflect, for English nouns) has already +been written for the language in question, whether simple or complex. + +Moreover, a third possibility may even be simpler than anything +discussed above: "Just require that all possible (or at least +applicable) forms be provided in the call to the given language's quant +method, as in:" + + "I found [quant,_1,file,files]." + +That way, quant just has to choose which form it needs, without having +to look up or generate anything. While possibly not optimal for +Russian, this should work well for most other languages, where +quantification is not as complicated an operation. + +=head2 The Devil in the Details + +There's plenty more to Maketext than described above -- for example, +there's the details of how language tags ("en-US", "x-cree", "fi", +etc.) or locale IDs ("en_US") interact with actual module naming +("BogoQuery/Locale/en_us.pm"), and what magic can ensue; there's the +details of how to record (and possibly negotiate) what character +encoding Maketext will return text in (UTF8? Latin-1? KOI8?). There's +the interesting fact that Maketext is for localization, but nowhere +actually has a "C<use locale;>" anywhere in it. For the curious, +there's the somewhat frightening details of how I actually +implement something like data inheritance so that searches across +modules' %Lexicon hashes can parallel how Perl implements method +inheritance. + +And, most importantly, there's all the practical details of how to +actually go about deriving from Maketext so you can use it for your +interfaces, and the various tools and conventions for starting out and +maintaining individual language modules.
+ +That is all covered in the documentation for Locale::Maketext and the +modules that come with it, available in CPAN. After having read this +article, which covers the why's of Maketext, the documentation, +which covers the how's of it, should be quite straightforward. + +=head2 The Proof in the Pudding: Localizing Web Sites + +Maketext and gettext have a notable difference: gettext is in C, +accessible thru C library calls, whereas Maketext is in Perl, and +really can't work without a Perl interpreter (although I suppose +something like it could be written for C). Accidents of history (and +not necessarily lucky ones) have made C++ the most common language for +the implementation of applications like word processors, Web browsers, +and even many in-house applications like custom query systems. Current +conditions make it somewhat unlikely that the next one of any of these +kinds of applications will be written in Perl, albeit clearly more for +reasons of custom and inertia than out of consideration of what is the +right tool for the job. + +However, other accidents of history have made Perl a well-accepted +language for design of server-side programs (generally in CGI form) +for Web site interfaces. Localization of static pages in Web sites is +trivial, feasible either with simple language-negotiation features in +servers like Apache, or with some kind of server-side inclusions of +language-appropriate text into layout templates. However, I think +that the localization of Perl-based search systems (or other kinds of +dynamic content) in Web sites, be they public or access-restricted, +is where Maketext will see the greatest use.
+ +I presume that it would be only the exceptional Web site that gets +localized for English I<and> Chinese I<and> Italian I<and> Arabic +I<and> Russian, to recall the languages from the beginning of this +article -- to say nothing of German, Spanish, French, Japanese, +Finnish, and Hindi, to name a few languages that benefit from large +numbers of programmers or Web viewers or both. + +However, the ever-increasing internationalization of the Web (whether +measured in terms of amount of content, of numbers of content writers +or programmers, or of size of content audiences) makes it increasingly +likely that the interface to the average Web-based dynamic content +service will be localized for two or maybe three languages. It is my +hope that Maketext will make that task as simple as possible, and will +remove previous barriers to localization for languages dissimilar to +English. + + __END__ + +Sean M. Burke (sburkeE<64>cpan.org) has a Master's in linguistics +from Northwestern University; he specializes in language technology. +Jordan Lachler (lachlerE<64>unm.edu) is a PhD student in the Department of +Linguistics at the University of New Mexico; he specializes in +morphology and pedagogy of North American native languages. + +=head2 References + +Alvestrand, Harald Tveit. 1995. I<RFC 1766: Tags for the +Identification of Languages.> +C<ftp://ftp.isi.edu/in-notes/rfc1766.txt> +[Now see RFC 3066.] + +Callon, Ross, editor. 1996. I<RFC 1925: The Twelve +Networking Truths.> +C<ftp://ftp.isi.edu/in-notes/rfc1925.txt> + +Drepper, Ulrich, Peter Miller, +and FranE<ccedil>ois Pinard. 1995-2001. GNU +C<gettext>. Available in C<ftp://prep.ai.mit.edu/pub/gnu/>, with +extensive docs in the distribution tarball. [Since +I wrote this article in 1998, I now see that the +gettext docs are now trying more to come to terms with +plurality. Whether useful conclusions have come from it +is another question altogether. -- SMB, May 2001] + +Forbes, Nevill. 1964. 
I<Russian Grammar.> Third Edition, revised +by J. C. Dumbreck. Oxford University Press. + +=cut + +#End + |