diff options
author | Nicholas Clark <nick@ccl4.org> | 2009-09-30 18:54:05 +0100 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2009-09-30 18:54:05 +0100 |
commit | b667d99b3752a0d79b4c2306a3e6e925f4f26d9f (patch) | |
tree | f31bbe3f189d2479a7eab4722e9e08ff15a588fc /dist/Locale-Maketext | |
parent | c0fceebed3d9fb5d5f03486c745053b710f98fc7 (diff) | |
download | perl-b667d99b3752a0d79b4c2306a3e6e925f4f26d9f.tar.gz |
Move Locale::Maketext from ext/ to dist/
Diffstat (limited to 'dist/Locale-Maketext')
-rw-r--r-- | dist/Locale-Maketext/ChangeLog | 131 | ||||
-rw-r--r-- | dist/Locale-Maketext/README | 67 | ||||
-rw-r--r-- | dist/Locale-Maketext/lib/Locale/Maketext.pm | 456 | ||||
-rw-r--r-- | dist/Locale-Maketext/lib/Locale/Maketext.pod | 1326 | ||||
-rw-r--r-- | dist/Locale-Maketext/lib/Locale/Maketext/Guts.pm | 328 | ||||
-rw-r--r-- | dist/Locale-Maketext/lib/Locale/Maketext/GutsLoader.pm | 49 | ||||
-rw-r--r-- | dist/Locale-Maketext/lib/Locale/Maketext/TPJ13.pod | 775 | ||||
-rw-r--r-- | dist/Locale-Maketext/t/01_about_verbose.t | 76 | ||||
-rw-r--r-- | dist/Locale-Maketext/t/10_make.t | 33 | ||||
-rw-r--r-- | dist/Locale-Maketext/t/20_get.t | 72 | ||||
-rw-r--r-- | dist/Locale-Maketext/t/30_local.t | 32 | ||||
-rw-r--r-- | dist/Locale-Maketext/t/40_super.t | 61 | ||||
-rw-r--r-- | dist/Locale-Maketext/t/50_super.t | 71 | ||||
-rw-r--r-- | dist/Locale-Maketext/t/60_super.t | 37 | ||||
-rw-r--r-- | dist/Locale-Maketext/t/90_utf8.t | 33 |
15 files changed, 3547 insertions, 0 deletions
diff --git a/dist/Locale-Maketext/ChangeLog b/dist/Locale-Maketext/ChangeLog new file mode 100644 index 0000000000..284d9717f4 --- /dev/null +++ b/dist/Locale-Maketext/ChangeLog @@ -0,0 +1,131 @@ +Revision history for Perl suite Locale::Maketext + +2008-05-28 Adriano Ferreira + * Release 1.13 + + New maintainer. No noteworthy changes. + + When debugging is turned on, "DEBUG and warn" is now + used instead of "DEBUG and print". It makes web applications + happier. CPAN RT #36238 + +2007-11-17 + * Release 1.12 + + Many doc changes from RT. + + Silenced some "used only once" warnings under Perl 5.10. + + $@ is now localized in case it gets interpolated. This was added + a while ago, but now there's a test for it, too. + + Added warnings and strict to tests. + + Cleaning up some Perl::Critic gripes. + +2007-05-07 Andy Lester + * Release 1.11_01 + + Fixed perlbug #33938 + http://rt.perl.org/rt3//Public/Bug/Display.html?id=3393 + + Started cleaning up source per Perl::Critic. + + +2005-11-10 Andy Lester + * Release 1.10: + + New maintainer. No changes at all. Bumped up the version number + and released it so that I can get the RT queue and any future mail. + + +2004-03-30 Sean M. Burke sburke@cpan.org + * Release 1.09: + + * Moved the language-preference-detecting code into new module + I18N::LangTags::Detect. + + Thanks to Autrijus Tang for catching some errors in the dist! + + +2004-01-19 Sean M. Burke sburke@cpan.org + + * Release 1.08: + + * Corrected a one-line code bug in v1.07 that accidentally demoted + all en-* tags in cases of lexicon-groups that had an en.pm but no + en_*.pm. Thanks to Robert Spier for spotting this. Test added. + So don't use v1.07! + + * Autrijus found some typoes in the TPJ article. Fixed. + +2004-01-11 Sean M. Burke sburke@cpan.org + + * Release 1.07: Now uses a new and different rule for implicating + superordinate language tags in accept-language lists. Previously, + superordinates were just tacked onto the, so "en-US, ja", turned + into "en-US, ja, en". However, this turned out to be suboptimal + for many users of RT, a popular system using Maketext. The new + rule is that a tag implicates superordinate forms right after it, + unless those tags are explicitly stated elsewhere in the + accept-languages list. So "en-US ja" becomes "en-US en ja". If + you want "en" to be really lower, you have to actually state it + there: "en-US ja en" is left as-is. + + The 04super.t and 05super.t tests in t/ have many many examples of + this, including some strange corner cases. + + (In implementing this change, I also refactored some code in + Maketext.pm, for hopefully improved readability. However, + the above is the only actual change in behavior.) + +2003-06-21 Sean M. Burke sburke@cpan.org + * Release 1.06: Now has "use utf8" to make the things work + happily. Some fancy footwork is required to make this work under + pre-utf8 perl versions. + +2003-04-18 Sean M. Burke sburke@cpan.org + * Release 1.05: Different Makefile.PL, same .pm code. + + Jesse Vincent, Hugo van der Sanden, and Jarkko Hietaniemi + encourage me to add this to the makefile: + ($] < 5.008) ? () : ( INSTALLDIRS => 'perl'), + so that when you install this on a recent version of perl (5.8 or + later), the installation will overwrite the Maketext.pm in your + core library directory. Email me if this produces trouble for any + of you folks out there, okay? + +2003-04-02 Sean M. Burke sburke@cpan.org + * Release 1.04: Implementing proper HTTP "tag;q=rank" parsing for + get_handle. This should make all the difference for users/victims + of the current version of Safari, which uses that syntax as well + as inserts random languages with low q numbers. + Thanks to Jesse Vincent and the whole RT junta for finding this. + + * Added more tests, now in t/ + + * Lots of typo fixes to Maketext.pm. Thanks to Evan A. Zacks for + patient help in finding them all. + +2001-06-21 Sean M. Burke sburke@cpan.org + * Release 1.03: basically cosmetic tweaks to the docs and the + test.pl. + +2001-06-20 Sean M. Burke sburke@cpan.org + * Release 1.02: EBCDIC-compatability changes courtesy of Peter + Prymmer. Added [*,...] as alias for [quant,...] and [#,...] as an + alias for [numf,...]. Added some more things to test.pl + +2001-05-25 Sean M. Burke sburke@cpan.org + * Release 1.01: total rewrite. Docs are massive now. + Including TPJ13 article now. + +2000-05-14 Sean M. Burke sburke@cpan.org + + * Release 0.18: only change, regrettably, is a better makefile, + and it my email address has changed. + +1999-03-15 Sean M. Burke sburke@netadventure.net + + * Release 0.17: Public alpha release + Underdocumented. diff --git a/dist/Locale-Maketext/README b/dist/Locale-Maketext/README new file mode 100644 index 0000000000..55a2ff73e8 --- /dev/null +++ b/dist/Locale-Maketext/README @@ -0,0 +1,67 @@ +README for Locale::Maketext + Time-stamp: "2004-03-30 16:02:27 AST" + + Locale::Maketext + +Locale::Maketext is a base class providing a framework for +localization and inheritance-based lexicons, as described in my +article in The Perl Journal #13 (a corrected version of which appears +in this dist). + + +PREREQUISITES + +This suite requires Perl 5. It also requires a recent version +of I18N::LangTags. MSWin users should also get Win32::Locale. +File::Findgrep is also useful example code, as is the rather +larger Apache::MP3 source (even if you don't run Apache). + + +INSTALLATION + +You install Locale::Maketext, as you would install any Perl module +distribution, by running these commands: + + perl Makefile.PL + make + make test + make install + +If you want to install a private copy of Maketext in your home directory, +then you should try to produce the initial Makefile with something +like this command: + + perl Makefile.PL LIB=~/perl + +See perldoc perlmodinstall for more information. + + +DOCUMENTATION + +See the pod in Locale::Maketext and Locale::Maketext::TPJ13, +and see also File::Findgrep. + + +SUPPORT + +Questions, bug reports, useful code bits, and suggestions for +Worms should be sent to me at sburke@cpan.org + + +AVAILABILITY + +The latest version of Locale::Maketext is available from the +Comprehensive Perl Archive Network (CPAN). Visit +<http://www.perl.com/CPAN/> to find a CPAN site near you. + + +COPYRIGHT + +Copyright 1999-2004, Sean M. Burke <sburke@cpan.org>, all rights +reserved. This program is free software; you can redistribute it +and/or modify it under the same terms as Perl itself. + + +AUTHOR + +Sean M. Burke <sburke@cpan.org> diff --git a/dist/Locale-Maketext/lib/Locale/Maketext.pm b/dist/Locale-Maketext/lib/Locale/Maketext.pm new file mode 100644 index 0000000000..eda9e97b1b --- /dev/null +++ b/dist/Locale-Maketext/lib/Locale/Maketext.pm @@ -0,0 +1,456 @@ +package Locale::Maketext; +use strict; +use vars qw( @ISA $VERSION $MATCH_SUPERS $USING_LANGUAGE_TAGS +$USE_LITERALS $MATCH_SUPERS_TIGHTLY); +use Carp (); +use I18N::LangTags 0.30 (); + +#-------------------------------------------------------------------------- + +BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } } +# define the constant 'DEBUG' at compile-time + +$VERSION = '1.13'; +@ISA = (); + +$MATCH_SUPERS = 1; +$MATCH_SUPERS_TIGHTLY = 1; +$USING_LANGUAGE_TAGS = 1; +# Turning this off is somewhat of a security risk in that little or no +# checking will be done on the legality of tokens passed to the +# eval("use $module_name") in _try_use. If you turn this off, you have +# to do your own taint checking. + +$USE_LITERALS = 1 unless defined $USE_LITERALS; +# a hint for compiling bracket-notation things. + +my %isa_scan = (); + +########################################################################### + +sub quant { + my($handle, $num, @forms) = @_; + + return $num if @forms == 0; # what should this mean? + return $forms[2] if @forms > 2 and $num == 0; # special zeroth case + + # Normal case: + # Note that the formatting of $num is preserved. + return( $handle->numf($num) . ' ' . $handle->numerate($num, @forms) ); + # Most human languages put the number phrase before the qualified phrase. +} + + +sub numerate { + # return this lexical item in a form appropriate to this number + my($handle, $num, @forms) = @_; + my $s = ($num == 1); + + return '' unless @forms; + if(@forms == 1) { # only the headword form specified + return $s ? $forms[0] : ($forms[0] . 's'); # very cheap hack. + } + else { # sing and plural were specified + return $s ? $forms[0] : $forms[1]; + } +} + +#-------------------------------------------------------------------------- + +sub numf { + my($handle, $num) = @_[0,1]; + if($num < 10_000_000_000 and $num > -10_000_000_000 and $num == int($num)) { + $num += 0; # Just use normal integer stringification. + # Specifically, don't let %G turn ten million into 1E+007 + } + else { + $num = CORE::sprintf('%G', $num); + # "CORE::" is there to avoid confusion with the above sub sprintf. + } + while( $num =~ s/^([-+]?\d+)(\d{3})/$1,$2/s ) {1} # right from perlfaq5 + # The initial \d+ gobbles as many digits as it can, and then we + # backtrack so it un-eats the rightmost three, and then we + # insert the comma there. + + $num =~ tr<.,><,.> if ref($handle) and $handle->{'numf_comma'}; + # This is just a lame hack instead of using Number::Format + return $num; +} + +sub sprintf { + no integer; + my($handle, $format, @params) = @_; + return CORE::sprintf($format, @params); + # "CORE::" is there to avoid confusion with myself! +} + +#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=# + +use integer; # vroom vroom... applies to the whole rest of the module + +sub language_tag { + my $it = ref($_[0]) || $_[0]; + return undef unless $it =~ m/([^':]+)(?:::)?$/s; + $it = lc($1); + $it =~ tr<_><->; + return $it; +} + +sub encoding { + my $it = $_[0]; + return( + (ref($it) && $it->{'encoding'}) + || 'iso-8859-1' # Latin-1 + ); +} + +#-------------------------------------------------------------------------- + +sub fallback_languages { return('i-default', 'en', 'en-US') } + +sub fallback_language_classes { return () } + +#-------------------------------------------------------------------------- + +sub fail_with { # an actual attribute method! + my($handle, @params) = @_; + return unless ref($handle); + $handle->{'fail'} = $params[0] if @params; + return $handle->{'fail'}; +} + +#-------------------------------------------------------------------------- + +sub failure_handler_auto { + # Meant to be used like: + # $handle->fail_with('failure_handler_auto') + + my $handle = shift; + my $phrase = shift; + + $handle->{'failure_lex'} ||= {}; + my $lex = $handle->{'failure_lex'}; + + my $value; + $lex->{$phrase} ||= ($value = $handle->_compile($phrase)); + + # Dumbly copied from sub maketext: + return ${$value} if ref($value) eq 'SCALAR'; + return $value if ref($value) ne 'CODE'; + { + local $SIG{'__DIE__'}; + eval { $value = &$value($handle, @_) }; + } + # If we make it here, there was an exception thrown in the + # call to $value, and so scream: + if($@) { + my $err = $@; + # pretty up the error message + $err =~ s{\s+at\s+\(eval\s+\d+\)\s+line\s+(\d+)\.?\n?} + {\n in bracket code [compiled line $1],}s; + #$err =~ s/\n?$/\n/s; + Carp::croak "Error in maketexting \"$phrase\":\n$err as used"; + # Rather unexpected, but suppose that the sub tried calling + # a method that didn't exist. + } + else { + return $value; + } +} + +#========================================================================== + +sub new { + # Nothing fancy! + my $class = ref($_[0]) || $_[0]; + my $handle = bless {}, $class; + $handle->init; + return $handle; +} + +sub init { return } # no-op + +########################################################################### + +sub maketext { + # Remember, this can fail. Failure is controllable many ways. + Carp::croak 'maketext requires at least one parameter' unless @_ > 1; + + my($handle, $phrase) = splice(@_,0,2); + Carp::confess('No handle/phrase') unless (defined($handle) && defined($phrase)); + + + # Don't interefere with $@ in case that's being interpolated into the msg. + local $@; + + # Look up the value: + + my $value; + foreach my $h_r ( + @{ $isa_scan{ref($handle) || $handle} || $handle->_lex_refs } + ) { + DEBUG and warn "* Looking up \"$phrase\" in $h_r\n"; + if(exists $h_r->{$phrase}) { + DEBUG and warn " Found \"$phrase\" in $h_r\n"; + unless(ref($value = $h_r->{$phrase})) { + # Nonref means it's not yet compiled. Compile and replace. + $value = $h_r->{$phrase} = $handle->_compile($value); + } + last; + } + elsif($phrase !~ m/^_/s and $h_r->{'_AUTO'}) { + # it's an auto lex, and this is an autoable key! + DEBUG and warn " Automaking \"$phrase\" into $h_r\n"; + + $value = $h_r->{$phrase} = $handle->_compile($phrase); + last; + } + DEBUG>1 and print " Not found in $h_r, nor automakable\n"; + # else keep looking + } + + unless(defined($value)) { + DEBUG and warn "! Lookup of \"$phrase\" in/under ", ref($handle) || $handle, " fails.\n"; + if(ref($handle) and $handle->{'fail'}) { + DEBUG and warn "WARNING0: maketext fails looking for <$phrase>\n"; + my $fail; + if(ref($fail = $handle->{'fail'}) eq 'CODE') { # it's a sub reference + return &{$fail}($handle, $phrase, @_); + # If it ever returns, it should return a good value. + } + else { # It's a method name + return $handle->$fail($phrase, @_); + # If it ever returns, it should return a good value. + } + } + else { + # All we know how to do is this; + Carp::croak("maketext doesn't know how to say:\n$phrase\nas needed"); + } + } + + return $$value if ref($value) eq 'SCALAR'; + return $value unless ref($value) eq 'CODE'; + + { + local $SIG{'__DIE__'}; + eval { $value = &$value($handle, @_) }; + } + # If we make it here, there was an exception thrown in the + # call to $value, and so scream: + if ($@) { + my $err = $@; + # pretty up the error message + $err =~ s{\s+at\s+\(eval\s+\d+\)\s+line\s+(\d+)\.?\n?} + {\n in bracket code [compiled line $1],}s; + #$err =~ s/\n?$/\n/s; + Carp::croak "Error in maketexting \"$phrase\":\n$err as used"; + # Rather unexpected, but suppose that the sub tried calling + # a method that didn't exist. + } + else { + return $value; + } +} + +########################################################################### + +sub get_handle { # This is a constructor and, yes, it CAN FAIL. + # Its class argument has to be the base class for the current + # application's l10n files. + + my($base_class, @languages) = @_; + $base_class = ref($base_class) || $base_class; + # Complain if they use __PACKAGE__ as a project base class? + + if( @languages ) { + DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + if($USING_LANGUAGE_TAGS) { # An explicit language-list was given! + @languages = + map {; $_, I18N::LangTags::alternate_language_tags($_) } + # Catch alternation + map I18N::LangTags::locale2language_tag($_), + # If it's a lg tag, fine, pass thru (untainted) + # If it's a locale ID, try converting to a lg tag (untainted), + # otherwise nix it. + @languages; + DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + } + } + else { + @languages = $base_class->_ambient_langprefs; + } + + @languages = $base_class->_langtag_munging(@languages); + + my %seen; + foreach my $module_name ( map { $base_class . '::' . $_ } @languages ) { + next unless length $module_name; # sanity + next if $seen{$module_name}++ # Already been here, and it was no-go + || !&_try_use($module_name); # Try to use() it, but can't it. + return($module_name->new); # Make it! + } + + return undef; # Fail! +} + +########################################################################### + +sub _langtag_munging { + my($base_class, @languages) = @_; + + # We have all these DEBUG statements because otherwise it's hard as hell + # to diagnose ifwhen something goes wrong. + + DEBUG and warn 'Lgs1: ', map("<$_>", @languages), "\n"; + + if($USING_LANGUAGE_TAGS) { + DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + @languages = $base_class->_add_supers( @languages ); + + push @languages, I18N::LangTags::panic_languages(@languages); + DEBUG and warn "After adding panic languages:\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + + push @languages, $base_class->fallback_languages; + # You are free to override fallback_languages to return empty-list! + DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + + @languages = # final bit of processing to turn them into classname things + map { + my $it = $_; # copy + $it =~ tr<-A-Z><_a-z>; # lc, and turn - to _ + $it =~ tr<_a-z0-9><>cd; # remove all but a-z0-9_ + $it; + } @languages + ; + DEBUG and warn "Nearing end of munging:\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + } + else { + DEBUG and warn "Bypassing language-tags.\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + } + + DEBUG and warn "Before adding fallback classes:\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + + push @languages, $base_class->fallback_language_classes; + # You are free to override that to return whatever. + + DEBUG and warn "Finally:\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + + return @languages; +} + +########################################################################### + +sub _ambient_langprefs { + require I18N::LangTags::Detect; + return I18N::LangTags::Detect::detect(); +} + +########################################################################### + +sub _add_supers { + my($base_class, @languages) = @_; + + if (!$MATCH_SUPERS) { + # Nothing + DEBUG and warn "Bypassing any super-matching.\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + + } + elsif( $MATCH_SUPERS_TIGHTLY ) { + DEBUG and warn "Before adding new supers tightly:\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + @languages = I18N::LangTags::implicate_supers( @languages ); + DEBUG and warn "After adding new supers tightly:\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + + } + else { + DEBUG and warn "Before adding supers to end:\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + @languages = I18N::LangTags::implicate_supers_strictly( @languages ); + DEBUG and warn "After adding supers to end:\n", + ' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n"; + } + + return @languages; +} + +########################################################################### +# +# This is where most people should stop reading. +# +########################################################################### + +use Locale::Maketext::GutsLoader; + +########################################################################### + +my %tried = (); +# memoization of whether we've used this module, or found it unusable. + +sub _try_use { # Basically a wrapper around "require Modulename" + # "Many men have tried..." "They tried and failed?" "They tried and died." + return $tried{$_[0]} if exists $tried{$_[0]}; # memoization + + my $module = $_[0]; # ASSUME sane module name! + { no strict 'refs'; + return($tried{$module} = 1) + if defined(%{$module . '::Lexicon'}) or defined(@{$module . '::ISA'}); + # weird case: we never use'd it, but there it is! + } + + DEBUG and warn " About to use $module ...\n"; + { + local $SIG{'__DIE__'}; + eval "require $module"; # used to be "use $module", but no point in that. + } + if($@) { + DEBUG and warn "Error using $module \: $@\n"; + return $tried{$module} = 0; + } + else { + DEBUG and warn " OK, $module is used\n"; + return $tried{$module} = 1; + } +} + +#-------------------------------------------------------------------------- + +sub _lex_refs { # report the lexicon references for this handle's class + # returns an arrayREF! + no strict 'refs'; + no warnings 'once'; + my $class = ref($_[0]) || $_[0]; + DEBUG and warn "Lex refs lookup on $class\n"; + return $isa_scan{$class} if exists $isa_scan{$class}; # memoization! + + my @lex_refs; + my $seen_r = ref($_[1]) ? $_[1] : {}; + + if( defined( *{$class . '::Lexicon'}{'HASH'} )) { + push @lex_refs, *{$class . '::Lexicon'}{'HASH'}; + DEBUG and warn '%' . $class . '::Lexicon contains ', + scalar(keys %{$class . '::Lexicon'}), " entries\n"; + } + + # Implements depth(height?)-first recursive searching of superclasses. + # In hindsight, I suppose I could have just used Class::ISA! + foreach my $superclass (@{$class . '::ISA'}) { + DEBUG and warn " Super-class search into $superclass\n"; + next if $seen_r->{$superclass}++; + push @lex_refs, @{&_lex_refs($superclass, $seen_r)}; # call myself + } + + $isa_scan{$class} = \@lex_refs; # save for next time + return \@lex_refs; +} + +sub clear_isa_scan { %isa_scan = (); return; } # end on a note of simplicity! + +1; diff --git a/dist/Locale-Maketext/lib/Locale/Maketext.pod b/dist/Locale-Maketext/lib/Locale/Maketext.pod new file mode 100644 index 0000000000..15533e4e87 --- /dev/null +++ b/dist/Locale-Maketext/lib/Locale/Maketext.pod @@ -0,0 +1,1326 @@ + +# Time-stamp: "2004-01-11 18:35:34 AST" + +=head1 NAME + +Locale::Maketext - framework for localization + +=head1 SYNOPSIS + + package MyProgram; + use strict; + use MyProgram::L10N; + # ...which inherits from Locale::Maketext + my $lh = MyProgram::L10N->get_handle() || die "What language?"; + ... + # And then any messages your program emits, like: + warn $lh->maketext( "Can't open file [_1]: [_2]\n", $f, $! ); + ... + +=head1 DESCRIPTION + +It is a common feature of applications (whether run directly, +or via the Web) for them to be "localized" -- i.e., for them +to a present an English interface to an English-speaker, a German +interface to a German-speaker, and so on for all languages it's +programmed with. Locale::Maketext +is a framework for software localization; it provides you with the +tools for organizing and accessing the bits of text and text-processing +code that you need for producing localized applications. + +In order to make sense of Maketext and how all its +components fit together, you should probably +go read L<Locale::Maketext::TPJ13|Locale::Maketext::TPJ13>, and +I<then> read the following documentation. + +You may also want to read over the source for C<File::Findgrep> +and its constituent modules -- they are a complete (if small) +example application that uses Maketext. + +=head1 QUICK OVERVIEW + +The basic design of Locale::Maketext is object-oriented, and +Locale::Maketext is an abstract base class, from which you +derive a "project class". +The project class (with a name like "TkBocciBall::Localize", +which you then use in your module) is in turn the base class +for all the "language classes" for your project +(with names "TkBocciBall::Localize::it", +"TkBocciBall::Localize::en", +"TkBocciBall::Localize::fr", etc.). + +A language class is +a class containing a lexicon of phrases as class data, +and possibly also some methods that are of use in interpreting +phrases in the lexicon, or otherwise dealing with text in that +language. + +An object belonging to a language class is called a "language +handle"; it's typically a flyweight object. + +The normal course of action is to call: + + use TkBocciBall::Localize; # the localization project class + $lh = TkBocciBall::Localize->get_handle(); + # Depending on the user's locale, etc., this will + # make a language handle from among the classes available, + # and any defaults that you declare. + die "Couldn't make a language handle??" unless $lh; + +From then on, you use the C<maketext> function to access +entries in whatever lexicon(s) belong to the language handle +you got. So, this: + + print $lh->maketext("You won!"), "\n"; + +...emits the right text for this language. If the object +in C<$lh> belongs to class "TkBocciBall::Localize::fr" and +%TkBocciBall::Localize::fr::Lexicon contains C<("You won!" +=E<gt> "Tu as gagnE<eacute>!")>, then the above +code happily tells the user "Tu as gagnE<eacute>!". + +=head1 METHODS + +Locale::Maketext offers a variety of methods, which fall +into three categories: + +=over + +=item * + +Methods to do with constructing language handles. + +=item * + +C<maketext> and other methods to do with accessing %Lexicon data +for a given language handle. + +=item * + +Methods that you may find it handy to use, from routines of +yours that you put in %Lexicon entries. + +=back + +These are covered in the following section. + +=head2 Construction Methods + +These are to do with constructing a language handle: + +=over + +=item * + +$lh = YourProjClass->get_handle( ...langtags... ) || die "lg-handle?"; + +This tries loading classes based on the language-tags you give (like +C<("en-US", "sk", "kon", "es-MX", "ja", "i-klingon")>, and for the first class +that succeeds, returns YourProjClass::I<language>->new(). + +If it runs thru the entire given list of language-tags, and finds no classes +for those exact terms, it then tries "superordinate" language classes. +So if no "en-US" class (i.e., YourProjClass::en_us) +was found, nor classes for anything else in that list, we then try +its superordinate, "en" (i.e., YourProjClass::en), and so on thru +the other language-tags in the given list: "es". +(The other language-tags in our example list: +happen to have no superordinates.) + +If none of those language-tags leads to loadable classes, we then +try classes derived from YourProjClass->fallback_languages() and +then if nothing comes of that, we use classes named by +YourProjClass->fallback_language_classes(). Then in the (probably +quite unlikely) event that that fails, we just return undef. + +=item * + +$lh = YourProjClass->get_handleB<()> || die "lg-handle?"; + +When C<get_handle> is called with an empty parameter list, magic happens: + +If C<get_handle> senses that it's running in program that was +invoked as a CGI, then it tries to get language-tags out of the +environment variable "HTTP_ACCEPT_LANGUAGE", and it pretends that +those were the languages passed as parameters to C<get_handle>. + +Otherwise (i.e., if not a CGI), this tries various OS-specific ways +to get the language-tags for the current locale/language, and then +pretends that those were the value(s) passed to C<get_handle>. + +Currently this OS-specific stuff consists of looking in the environment +variables "LANG" and "LANGUAGE"; and on MSWin machines (where those +variables are typically unused), this also tries using +the module Win32::Locale to get a language-tag for whatever language/locale +is currently selected in the "Regional Settings" (or "International"?) +Control Panel. I welcome further +suggestions for making this do the Right Thing under other operating +systems that support localization. + +If you're using localization in an application that keeps a configuration +file, you might consider something like this in your project class: + + sub get_handle_via_config { + my $class = $_[0]; + my $chosen_language = $Config_settings{'language'}; + my $lh; + if($chosen_language) { + $lh = $class->get_handle($chosen_language) + || die "No language handle for \"$chosen_language\" or the like"; + } else { + # Config file missing, maybe? + $lh = $class->get_handle() + || die "Can't get a language handle"; + } + return $lh; + } + +=item * + +$lh = YourProjClass::langname->new(); + +This constructs a language handle. You usually B<don't> call this +directly, but instead let C<get_handle> find a language class to C<use> +and to then call ->new on. + +=item * + +$lh->init(); + +This is called by ->new to initialize newly-constructed language handles. +If you define an init method in your class, remember that it's usually +considered a good idea to call $lh->SUPER::init in it (presumably at the +beginning), so that all classes get a chance to initialize a new object +however they see fit. + +=item * + +YourProjClass->fallback_languages() + +C<get_handle> appends the return value of this to the end of +whatever list of languages you pass C<get_handle>. Unless +you override this method, your project class +will inherit Locale::Maketext's C<fallback_languages>, which +currently returns C<('i-default', 'en', 'en-US')>. +("i-default" is defined in RFC 2277). + +This method (by having it return the name +of a language-tag that has an existing language class) +can be used for making sure that +C<get_handle> will always manage to construct a language +handle (assuming your language classes are in an appropriate +@INC directory). Or you can use the next method: + +=item * + +YourProjClass->fallback_language_classes() + +C<get_handle> appends the return value of this to the end +of the list of classes it will try using. Unless +you override this method, your project class +will inherit Locale::Maketext's C<fallback_language_classes>, +which currently returns an empty list, C<()>. +By setting this to some value (namely, the name of a loadable +language class), you can be sure that +C<get_handle> will always manage to construct a language +handle. + +=back + +=head2 The "maketext" Method + +This is the most important method in Locale::Maketext: + + $text = $lh->maketext(I<key>, ...parameters for this phrase...); + +This looks in the %Lexicon of the language handle +$lh and all its superclasses, looking +for an entry whose key is the string I<key>. Assuming such +an entry is found, various things then happen, depending on the +value found: + +If the value is a scalarref, the scalar is dereferenced and returned +(and any parameters are ignored). + +If the value is a coderef, we return &$value($lh, ...parameters...). + +If the value is a string that I<doesn't> look like it's in Bracket Notation, +we return it (after replacing it with a scalarref, in its %Lexicon). + +If the value I<does> look like it's in Bracket Notation, then we compile +it into a sub, replace the string in the %Lexicon with the new coderef, +and then we return &$new_sub($lh, ...parameters...). + +Bracket Notation is discussed in a later section. Note +that trying to compile a string into Bracket Notation can throw +an exception if the string is not syntactically valid (say, by not +balancing brackets right.) + +Also, calling &$coderef($lh, ...parameters...) can throw any sort of +exception (if, say, code in that sub tries to divide by zero). But +a very common exception occurs when you have Bracket +Notation text that says to call a method "foo", but there is no such +method. (E.g., "You have [quaB<tn>,_1,ball]." will throw an exception +on trying to call $lh->quaB<tn>($_[1],'ball') -- you presumably meant +"quant".) C<maketext> catches these exceptions, but only to make the +error message more readable, at which point it rethrows the exception. + +An exception I<may> be thrown if I<key> is not found in any +of $lh's %Lexicon hashes. What happens if a key is not found, +is discussed in a later section, "Controlling Lookup Failure". + +Note that you might find it useful in some cases to override +the C<maketext> method with an "after method", if you want to +translate encodings, or even scripts: + + package YrProj::zh_cn; # Chinese with PRC-style glyphs + use base ('YrProj::zh_tw'); # Taiwan-style + sub maketext { + my $self = shift(@_); + my $value = $self->maketext(@_); + return Chineeze::taiwan2mainland($value); + } + +Or you may want to override it with something that traps +any exceptions, if that's critical to your program: + + sub maketext { + my($lh, @stuff) = @_; + my $out; + eval { $out = $lh->SUPER::maketext(@stuff) }; + return $out unless $@; + ...otherwise deal with the exception... + } + +Other than those two situations, I don't imagine that +it's useful to override the C<maketext> method. (If +you run into a situation where it is useful, I'd be +interested in hearing about it.) + +=over + +=item $lh->fail_with I<or> $lh->fail_with(I<PARAM>) + +=item $lh->failure_handler_auto + +These two methods are discussed in the section "Controlling +Lookup Failure". + +=back + +=head2 Utility Methods + +These are methods that you may find it handy to use, generally +from %Lexicon routines of yours (whether expressed as +Bracket Notation or not). + +=over + +=item $language->quant($number, $singular) + +=item $language->quant($number, $singular, $plural) + +=item $language->quant($number, $singular, $plural, $negative) + +This is generally meant to be called from inside Bracket Notation +(which is discussed later), as in + + "Your search matched [quant,_1,document]!" + +It's for I<quantifying> a noun (i.e., saying how much of it there is, +while giving the correct form of it). The behavior of this method is +handy for English and a few other Western European languages, and you +should override it for languages where it's not suitable. You can feel +free to read the source, but the current implementation is basically +as this pseudocode describes: + + if $number is 0 and there's a $negative, + return $negative; + elsif $number is 1, + return "1 $singular"; + elsif there's a $plural, + return "$number $plural"; + else + return "$number " . $singular . "s"; + # + # ...except that we actually call numf to + # stringify $number before returning it. + +So for English (with Bracket Notation) +C<"...[quant,_1,file]..."> is fine (for 0 it returns "0 files", +for 1 it returns "1 file", and for more it returns "2 files", etc.) + +But for "directory", you'd want C<"[quant,_1,directory,directories]"> +so that our elementary C<quant> method doesn't think that the +plural of "directory" is "directorys". And you might find that the +output may sound better if you specify a negative form, as in: + + "[quant,_1,file,files,No files] matched your query.\n" + +Remember to keep in mind verb agreement (or adjectives too, in +other languages), as in: + + "[quant,_1,document] were matched.\n" + +Because if _1 is one, you get "1 document B<were> matched". +An acceptable hack here is to do something like this: + + "[quant,_1,document was, documents were] matched.\n" + +=item $language->numf($number) + +This returns the given number formatted nicely according to +this language's conventions. Maketext's default method is +mostly to just take the normal string form of the number +(applying sprintf "%G" for only very large numbers), and then +to add commas as necessary. (Except that +we apply C<tr/,./.,/> if $language->{'numf_comma'} is true; +that's a bit of a hack that's useful for languages that express +two million as "2.000.000" and not as "2,000,000"). + +If you want anything fancier, consider overriding this with something +that uses L<Number::Format|Number::Format>, or does something else +entirely. + +Note that numf is called by quant for stringifying all quantifying +numbers. + +=item $language->sprintf($format, @items) + +This is just a wrapper around Perl's normal C<sprintf> function. +It's provided so that you can use "sprintf" in Bracket Notation: + + "Couldn't access datanode [sprintf,%10x=~[%s~],_1,_2]!\n" + +returning... + + Couldn't access datanode Stuff=[thangamabob]! + +=item $language->language_tag() + +Currently this just takes the last bit of C<ref($language)>, turns +underscores to dashes, and returns it. So if $language is +an object of class Hee::HOO::Haw::en_us, $language->language_tag() +returns "en-us". (Yes, the usual representation for that language +tag is "en-US", but case is I<never> considered meaningful in +language-tag comparison.) + +You may override this as you like; Maketext doesn't use it for +anything. + +=item $language->encoding() + +Currently this isn't used for anything, but it's provided +(with default value of +C<(ref($language) && $language-E<gt>{'encoding'})) or "iso-8859-1"> +) as a sort of suggestion that it may be useful/necessary to +associate encodings with your language handles (whether on a +per-class or even per-handle basis.) + +=back + +=head2 Language Handle Attributes and Internals + +A language handle is a flyweight object -- i.e., it doesn't (necessarily) +carry any data of interest, other than just being a member of +whatever class it belongs to. + +A language handle is implemented as a blessed hash. Subclasses of yours +can store whatever data you want in the hash. Currently the only hash +entry used by any crucial Maketext method is "fail", so feel free to +use anything else as you like. + +B<Remember: Don't be afraid to read the Maketext source if there's +any point on which this documentation is unclear.> This documentation +is vastly longer than the module source itself. + +=over + +=back + +=head1 LANGUAGE CLASS HIERARCHIES + +These are Locale::Maketext's assumptions about the class +hierarchy formed by all your language classes: + +=over + +=item * + +You must have a project base class, which you load, and +which you then use as the first argument in +the call to YourProjClass->get_handle(...). It should derive +(whether directly or indirectly) from Locale::Maketext. +It B<doesn't matter> how you name this class, although assuming this +is the localization component of your Super Mega Program, +good names for your project class might be +SuperMegaProgram::Localization, SuperMegaProgram::L10N, +SuperMegaProgram::I18N, SuperMegaProgram::International, +or even SuperMegaProgram::Languages or SuperMegaProgram::Messages. + +=item * + +Language classes are what YourProjClass->get_handle will try to load. +It will look for them by taking each language-tag (B<skipping> it +if it doesn't look like a language-tag or locale-tag!), turning it to +all lowercase, turning dashes to underscores, and appending it +to YourProjClass . "::". So this: + + $lh = YourProjClass->get_handle( + 'en-US', 'fr', 'kon', 'i-klingon', 'i-klingon-romanized' + ); + +will try loading the classes +YourProjClass::en_us (note lowercase!), YourProjClass::fr, +YourProjClass::kon, +YourProjClass::i_klingon +and YourProjClass::i_klingon_romanized. (And it'll stop at the +first one that actually loads.) + +=item * + +I assume that each language class derives (directly or indirectly) +from your project class, and also defines its @ISA, its %Lexicon, +or both. But I anticipate no dire consequences if these assumptions +do not hold. + +=item * + +Language classes may derive from other language classes (although they +should have "use I<Thatclassname>" or "use base qw(I<...classes...>)"). +They may derive from the project +class. They may derive from some other class altogether. Or via +multiple inheritance, it may derive from any mixture of these. + +=item * + +I foresee no problems with having multiple inheritance in +your hierarchy of language classes. (As usual, however, Perl will +complain bitterly if you have a cycle in the hierarchy: i.e., if +any class is its own ancestor.) + +=back + +=head1 ENTRIES IN EACH LEXICON + +A typical %Lexicon entry is meant to signify a phrase, +taking some number (0 or more) of parameters. An entry +is meant to be accessed by via +a string I<key> in $lh->maketext(I<key>, ...parameters...), +which should return a string that is generally meant for +be used for "output" to the user -- regardless of whether +this actually means printing to STDOUT, writing to a file, +or putting into a GUI widget. + +While the key must be a string value (since that's a basic +restriction that Perl places on hash keys), the value in +the lexicon can currently be of several types: +a defined scalar, scalarref, or coderef. The use of these is +explained above, in the section 'The "maketext" Method', and +Bracket Notation for strings is discussed in the next section. + +While you can use arbitrary unique IDs for lexicon keys +(like "_min_larger_max_error"), it is often +useful for if an entry's key is itself a valid value, like +this example error message: + + "Minimum ([_1]) is larger than maximum ([_2])!\n", + +Compare this code that uses an arbitrary ID... + + die $lh->maketext( "_min_larger_max_error", $min, $max ) + if $min > $max; + +...to this code that uses a key-as-value: + + die $lh->maketext( + "Minimum ([_1]) is larger than maximum ([_2])!\n", + $min, $max + ) if $min > $max; + +The second is, in short, more readable. In particular, it's obvious +that the number of parameters you're feeding to that phrase (two) is +the number of parameters that it I<wants> to be fed. (Since you see +_1 and a _2 being used in the key there.) + +Also, once a project is otherwise +complete and you start to localize it, you can scrape together +all the various keys you use, and pass it to a translator; and then +the translator's work will go faster if what he's presented is this: + + "Minimum ([_1]) is larger than maximum ([_2])!\n", + => "", # fill in something here, Jacques! + +rather than this more cryptic mess: + + "_min_larger_max_error" + => "", # fill in something here, Jacques + +I think that keys as lexicon values makes the completed lexicon +entries more readable: + + "Minimum ([_1]) is larger than maximum ([_2])!\n", + => "Le minimum ([_1]) est plus grand que le maximum ([_2])!\n", + +Also, having valid values as keys becomes very useful if you set +up an _AUTO lexicon. _AUTO lexicons are discussed in a later +section. + +I almost always use keys that are themselves +valid lexicon values. One notable exception is when the value is +quite long. For example, to get the screenful of data that +a command-line program might return when given an unknown switch, +I often just use a brief, self-explanatory key such as "_USAGE_MESSAGE". At that point I then go +and immediately to define that lexicon entry in the +ProjectClass::L10N::en lexicon (since English is always my "project +language"): + + '_USAGE_MESSAGE' => <<'EOSTUFF', + ...long long message... + EOSTUFF + +and then I can use it as: + + getopt('oDI', \%opts) or die $lh->maketext('_USAGE_MESSAGE'); + +Incidentally, +note that each class's C<%Lexicon> inherits-and-extends +the lexicons in its superclasses. This is not because these are +special hashes I<per se>, but because you access them via the +C<maketext> method, which looks for entries across all the +C<%Lexicon> hashes in a language class I<and> all its ancestor classes. +(This is because the idea of "class data" isn't directly implemented +in Perl, but is instead left to individual class-systems to implement +as they see fit..) + +Note that you may have things stored in a lexicon +besides just phrases for output: for example, if your program +takes input from the keyboard, asking a "(Y/N)" question, +you probably need to know what the equivalent of "Y[es]/N[o]" is +in whatever language. You probably also need to know what +the equivalents of the answers "y" and "n" are. You can +store that information in the lexicon (say, under the keys +"~answer_y" and "~answer_n", and the long forms as +"~answer_yes" and "~answer_no", where "~" is just an ad-hoc +character meant to indicate to programmers/translators that +these are not phrases for output). + +Or instead of storing this in the language class's lexicon, +you can (and, in some cases, really should) represent the same bit +of knowledge as code in a method in the language class. (That +leaves a tidy distinction between the lexicon as the things we +know how to I<say>, and the rest of the things in the lexicon class +as things that we know how to I<do>.) Consider +this example of a processor for responses to French "oui/non" +questions: + + sub y_or_n { + return undef unless defined $_[1] and length $_[1]; + my $answer = lc $_[1]; # smash case + return 1 if $answer eq 'o' or $answer eq 'oui'; + return 0 if $answer eq 'n' or $answer eq 'non'; + return undef; + } + +...which you'd then call in a construct like this: + + my $response; + until(defined $response) { + print $lh->maketext("Open the pod bay door (y/n)? "); + $response = $lh->y_or_n( get_input_from_keyboard_somehow() ); + } + if($response) { $pod_bay_door->open() } + else { $pod_bay_door->leave_closed() } + +Other data worth storing in a lexicon might be things like +filenames for language-targetted resources: + + ... + "_main_splash_png" + => "/styles/en_us/main_splash.png", + "_main_splash_imagemap" + => "/styles/en_us/main_splash.incl", + "_general_graphics_path" + => "/styles/en_us/", + "_alert_sound" + => "/styles/en_us/hey_there.wav", + "_forward_icon" + => "left_arrow.png", + "_backward_icon" + => "right_arrow.png", + # In some other languages, left equals + # BACKwards, and right is FOREwards. + ... + +You might want to do the same thing for expressing key bindings +or the like (since hardwiring "q" as the binding for the function +that quits a screen/menu/program is useful only if your language +happens to associate "q" with "quit"!) + +=head1 BRACKET NOTATION + +Bracket Notation is a crucial feature of Locale::Maketext. I mean +Bracket Notation to provide a replacement for the use of sprintf formatting. +Everything you do with Bracket Notation could be done with a sub block, +but bracket notation is meant to be much more concise. + +Bracket Notation is a like a miniature "template" system (in the sense +of L<Text::Template|Text::Template>, not in the sense of C++ templates), +where normal text is passed thru basically as is, but text in special +regions is specially interpreted. In Bracket Notation, you use square brackets ("[...]"), +not curly braces ("{...}") to note sections that are specially interpreted. + +For example, here all the areas that are taken literally are underlined with +a "^", and all the in-bracket special regions are underlined with an X: + + "Minimum ([_1]) is larger than maximum ([_2])!\n", + ^^^^^^^^^ XX ^^^^^^^^^^^^^^^^^^^^^^^^^^ XX ^^^^ + +When that string is compiled from bracket notation into a real Perl sub, +it's basically turned into: + + sub { + my $lh = $_[0]; + my @params = @_; + return join '', + "Minimum (", + ...some code here... + ") is larger than maximum (", + ...some code here... + ")!\n", + } + # to be called by $lh->maketext(KEY, params...) + +In other words, text outside bracket groups is turned into string +literals. Text in brackets is rather more complex, and currently follows +these rules: + +=over + +=item * + +Bracket groups that are empty, or which consist only of whitespace, +are ignored. (Examples: "[]", "[ ]", or a [ and a ] with returns +and/or tabs and/or spaces between them. + +Otherwise, each group is taken to be a comma-separated group of items, +and each item is interpreted as follows: + +=item * + +An item that is "_I<digits>" or "_-I<digits>" is interpreted as +$_[I<value>]. I.e., "_1" becomes with $_[1], and "_-3" is interpreted +as $_[-3] (in which case @_ should have at least three elements in it). +Note that $_[0] is the language handle, and is typically not named +directly. + +=item * + +An item "_*" is interpreted to mean "all of @_ except $_[0]". +I.e., C<@_[1..$#_]>. Note that this is an empty list in the case +of calls like $lh->maketext(I<key>) where there are no +parameters (except $_[0], the language handle). + +=item * + +Otherwise, each item is interpreted as a string literal. + +=back + +The group as a whole is interpreted as follows: + +=over + +=item * + +If the first item in a bracket group looks like a method name, +then that group is interpreted like this: + + $lh->that_method_name( + ...rest of items in this group... + ), + +=item * + +If the first item in a bracket group is "*", it's taken as shorthand +for the so commonly called "quant" method. Similarly, if the first +item in a bracket group is "#", it's taken to be shorthand for +"numf". + +=item * + +If the first item in a bracket group is the empty-string, or "_*" +or "_I<digits>" or "_-I<digits>", then that group is interpreted +as just the interpolation of all its items: + + join('', + ...rest of items in this group... + ), + +Examples: "[_1]" and "[,_1]", which are synonymous; and +"C<[,ID-(,_4,-,_2,)]>", which compiles as +C<join "", "ID-(", $_[4], "-", $_[2], ")">. + +=item * + +Otherwise this bracket group is invalid. For example, in the group +"[!@#,whatever]", the first item C<"!@#"> is neither the empty-string, +"_I<number>", "_-I<number>", "_*", nor a valid method name; and so +Locale::Maketext will throw an exception of you try compiling an +expression containing this bracket group. + +=back + +Note, incidentally, that items in each group are comma-separated, +not C</\s*,\s*/>-separated. That is, you might expect that this +bracket group: + + "Hoohah [foo, _1 , bar ,baz]!" + +would compile to this: + + sub { + my $lh = $_[0]; + return join '', + "Hoohah ", + $lh->foo( $_[1], "bar", "baz"), + "!", + } + +But it actually compiles as this: + + sub { + my $lh = $_[0]; + return join '', + "Hoohah ", + $lh->foo(" _1 ", " bar ", "baz"), # note the <space> in " bar " + "!", + } + +In the notation discussed so far, the characters "[" and "]" are given +special meaning, for opening and closing bracket groups, and "," has +a special meaning inside bracket groups, where it separates items in the +group. This begs the question of how you'd express a literal "[" or +"]" in a Bracket Notation string, and how you'd express a literal +comma inside a bracket group. For this purpose I've adopted "~" (tilde) +as an escape character: "~[" means a literal '[' character anywhere +in Bracket Notation (i.e., regardless of whether you're in a bracket +group or not), and ditto for "~]" meaning a literal ']', and "~," meaning +a literal comma. (Altho "," means a literal comma outside of +bracket groups -- it's only inside bracket groups that commas are special.) + +And on the off chance you need a literal tilde in a bracket expression, +you get it with "~~". + +Currently, an unescaped "~" before a character +other than a bracket or a comma is taken to mean just a "~" and that +character. I.e., "~X" means the same as "~~X" -- i.e., one literal tilde, +and then one literal "X". However, by using "~X", you are assuming that +no future version of Maketext will use "~X" as a magic escape sequence. +In practice this is not a great problem, since first off you can just +write "~~X" and not worry about it; second off, I doubt I'll add lots +of new magic characters to bracket notation; and third off, you +aren't likely to want literal "~" characters in your messages anyway, +since it's not a character with wide use in natural language text. + +Brackets must be balanced -- every openbracket must have +one matching closebracket, and vice versa. So these are all B<invalid>: + + "I ate [quant,_1,rhubarb pie." + "I ate [quant,_1,rhubarb pie[." + "I ate quant,_1,rhubarb pie]." + "I ate quant,_1,rhubarb pie[." + +Currently, bracket groups do not nest. That is, you B<cannot> say: + + "Foo [bar,baz,[quux,quuux]]\n"; + +If you need a notation that's that powerful, use normal Perl: + + %Lexicon = ( + ... + "some_key" => sub { + my $lh = $_[0]; + join '', + "Foo ", + $lh->bar('baz', $lh->quux('quuux')), + "\n", + }, + ... + ); + +Or write the "bar" method so you don't need to pass it the +output from calling quux. + +I do not anticipate that you will need (or particularly want) +to nest bracket groups, but you are welcome to email me with +convincing (real-life) arguments to the contrary. + +=head1 AUTO LEXICONS + +If maketext goes to look in an individual %Lexicon for an entry +for I<key> (where I<key> does not start with an underscore), and +sees none, B<but does see> an entry of "_AUTO" => I<some_true_value>, +then we actually define $Lexicon{I<key>} = I<key> right then and there, +and then use that value as if it had been there all +along. This happens before we even look in any superclass %Lexicons! + +(This is meant to be somewhat like the AUTOLOAD mechanism in +Perl's function call system -- or, looked at another way, +like the L<AutoLoader|AutoLoader> module.) + +I can picture all sorts of circumstances where you just +do not want lookup to be able to fail (since failing +normally means that maketext throws a C<die>, although +see the next section for greater control over that). But +here's one circumstance where _AUTO lexicons are meant to +be I<especially> useful: + +As you're writing an application, you decide as you go what messages +you need to emit. Normally you'd go to write this: + + if(-e $filename) { + go_process_file($filename) + } else { + print qq{Couldn't find file "$filename"!\n}; + } + +but since you anticipate localizing this, you write: + + use ThisProject::I18N; + my $lh = ThisProject::I18N->get_handle(); + # For the moment, assume that things are set up so + # that we load class ThisProject::I18N::en + # and that that's the class that $lh belongs to. + ... + if(-e $filename) { + go_process_file($filename) + } else { + print $lh->maketext( + qq{Couldn't find file "[_1]"!\n}, $filename + ); + } + +Now, right after you've just written the above lines, you'd +normally have to go open the file +ThisProject/I18N/en.pm, and immediately add an entry: + + "Couldn't find file \"[_1]\"!\n" + => "Couldn't find file \"[_1]\"!\n", + +But I consider that somewhat of a distraction from the work +of getting the main code working -- to say nothing of the fact +that I often have to play with the program a few times before +I can decide exactly what wording I want in the messages (which +in this case would require me to go changing three lines of code: +the call to maketext with that key, and then the two lines in +ThisProject/I18N/en.pm). + +However, if you set "_AUTO => 1" in the %Lexicon in, +ThisProject/I18N/en.pm (assuming that English (en) is +the language that all your programmers will be using for this +project's internal message keys), then you don't ever have to +go adding lines like this + + "Couldn't find file \"[_1]\"!\n" + => "Couldn't find file \"[_1]\"!\n", + +to ThisProject/I18N/en.pm, because if _AUTO is true there, +then just looking for an entry with the key "Couldn't find +file \"[_1]\"!\n" in that lexicon will cause it to be added, +with that value! + +Note that the reason that keys that start with "_" +are immune to _AUTO isn't anything generally magical about +the underscore character -- I just wanted a way to have most +lexicon keys be autoable, except for possibly a few, and I +arbitrarily decided to use a leading underscore as a signal +to distinguish those few. + +=head1 CONTROLLING LOOKUP FAILURE + +If you call $lh->maketext(I<key>, ...parameters...), +and there's no entry I<key> in $lh's class's %Lexicon, nor +in the superclass %Lexicon hash, I<and> if we can't auto-make +I<key> (because either it starts with a "_", or because none +of its lexicons have C<_AUTO =E<gt> 1,>), then we have +failed to find a normal way to maketext I<key>. What then +happens in these failure conditions, depends on the $lh object's +"fail" attribute. + +If the language handle has no "fail" attribute, maketext +will simply throw an exception (i.e., it calls C<die>, mentioning +the I<key> whose lookup failed, and naming the line number where +the calling $lh->maketext(I<key>,...) was. + +If the language handle has a "fail" attribute whose value is a +coderef, then $lh->maketext(I<key>,...params...) gives up and calls: + + return $that_subref->($lh, $key, @params); + +Otherwise, the "fail" attribute's value should be a string denoting +a method name, so that $lh->maketext(I<key>,...params...) can +give up with: + + return $lh->$that_method_name($phrase, @params); + +The "fail" attribute can be accessed with the C<fail_with> method: + + # Set to a coderef: + $lh->fail_with( \&failure_handler ); + + # Set to a method name: + $lh->fail_with( 'failure_method' ); + + # Set to nothing (i.e., so failure throws a plain exception) + $lh->fail_with( undef ); + + # Get the current value + $handler = $lh->fail_with(); + +Now, as to what you may want to do with these handlers: Maybe you'd +want to log what key failed for what class, and then die. Maybe +you don't like C<die> and instead you want to send the error message +to STDOUT (or wherever) and then merely C<exit()>. + +Or maybe you don't want to C<die> at all! Maybe you could use a +handler like this: + + # Make all lookups fall back onto an English value, + # but only after we log it for later fingerpointing. + my $lh_backup = ThisProject->get_handle('en'); + open(LEX_FAIL_LOG, ">>wherever/lex.log") || die "GNAARGH $!"; + sub lex_fail { + my($failing_lh, $key, $params) = @_; + print LEX_FAIL_LOG scalar(localtime), "\t", + ref($failing_lh), "\t", $key, "\n"; + return $lh_backup->maketext($key,@params); + } + +Some users have expressed that they think this whole mechanism of +having a "fail" attribute at all, seems a rather pointless complication. +But I want Locale::Maketext to be usable for software projects of I<any> +scale and type; and different software projects have different ideas +of what the right thing is to do in failure conditions. I could simply +say that failure always throws an exception, and that if you want to be +careful, you'll just have to wrap every call to $lh->maketext in an +S<eval { }>. However, I want programmers to reserve the right (via +the "fail" attribute) to treat lookup failure as something other than +an exception of the same level of severity as a config file being +unreadable, or some essential resource being inaccessible. + +One possibly useful value for the "fail" attribute is the method name +"failure_handler_auto". This is a method defined in the class +Locale::Maketext itself. You set it with: + + $lh->fail_with('failure_handler_auto'); + +Then when you call $lh->maketext(I<key>, ...parameters...) and +there's no I<key> in any of those lexicons, maketext gives up with + + return $lh->failure_handler_auto($key, @params); + +But failure_handler_auto, instead of dying or anything, compiles +$key, caching it in + + $lh->{'failure_lex'}{$key} = $complied + +and then calls the compiled value, and returns that. (I.e., if +$key looks like bracket notation, $compiled is a sub, and we return +&{$compiled}(@params); but if $key is just a plain string, we just +return that.) + +The effect of using "failure_auto_handler" +is like an AUTO lexicon, except that it 1) compiles $key even if +it starts with "_", and 2) you have a record in the new hashref +$lh->{'failure_lex'} of all the keys that have failed for +this object. This should avoid your program dying -- as long +as your keys aren't actually invalid as bracket code, and as +long as they don't try calling methods that don't exist. + +"failure_auto_handler" may not be exactly what you want, but I +hope it at least shows you that maketext failure can be mitigated +in any number of very flexible ways. If you can formalize exactly +what you want, you should be able to express that as a failure +handler. You can even make it default for every object of a given +class, by setting it in that class's init: + + sub init { + my $lh = $_[0]; # a newborn handle + $lh->SUPER::init(); + $lh->fail_with('my_clever_failure_handler'); + return; + } + sub my_clever_failure_handler { + ...you clever things here... + } + +=head1 HOW TO USE MAKETEXT + +Here is a brief checklist on how to use Maketext to localize +applications: + +=over + +=item * + +Decide what system you'll use for lexicon keys. If you insist, +you can use opaque IDs (if you're nostalgic for C<catgets>), +but I have better suggestions in the +section "Entries in Each Lexicon", above. Assuming you opt for +meaningful keys that double as values (like "Minimum ([_1]) is +larger than maximum ([_2])!\n"), you'll have to settle on what +language those should be in. For the sake of argument, I'll +call this English, specifically American English, "en-US". + +=item * + +Create a class for your localization project. This is +the name of the class that you'll use in the idiom: + + use Projname::L10N; + my $lh = Projname::L10N->get_handle(...) || die "Language?"; + +Assuming you call your class Projname::L10N, create a class +consisting minimally of: + + package Projname::L10N; + use base qw(Locale::Maketext); + ...any methods you might want all your languages to share... + + # And, assuming you want the base class to be an _AUTO lexicon, + # as is discussed a few sections up: + + 1; + +=item * + +Create a class for the language your internal keys are in. Name +the class after the language-tag for that language, in lowercase, +with dashes changed to underscores. Assuming your project's first +language is US English, you should call this Projname::L10N::en_us. +It should consist minimally of: + + package Projname::L10N::en_us; + use base qw(Projname::L10N); + %Lexicon = ( + '_AUTO' => 1, + ); + 1; + +(For the rest of this section, I'll assume that this "first +language class" of Projname::L10N::en_us has +_AUTO lexicon.) + +=item * + +Go and write your program. Everywhere in your program where +you would say: + + print "Foobar $thing stuff\n"; + +instead do it thru maketext, using no variable interpolation in +the key: + + print $lh->maketext("Foobar [_1] stuff\n", $thing); + +If you get tired of constantly saying C<print $lh-E<gt>maketext>, +consider making a functional wrapper for it, like so: + + use Projname::L10N; + use vars qw($lh); + $lh = Projname::L10N->get_handle(...) || die "Language?"; + sub pmt (@) { print( $lh->maketext(@_)) } + # "pmt" is short for "Print MakeText" + $Carp::Verbose = 1; + # so if maketext fails, we see made the call to pmt + +Besides whole phrases meant for output, anything language-dependent +should be put into the class Projname::L10N::en_us, +whether as methods, or as lexicon entries -- this is discussed +in the section "Entries in Each Lexicon", above. + +=item * + +Once the program is otherwise done, and once its localization for +the first language works right (via the data and methods in +Projname::L10N::en_us), you can get together the data for translation. +If your first language lexicon isn't an _AUTO lexicon, then you already +have all the messages explicitly in the lexicon (or else you'd be +getting exceptions thrown when you call $lh->maketext to get +messages that aren't in there). But if you were (advisedly) lazy and are +using an _AUTO lexicon, then you've got to make a list of all the phrases +that you've so far been letting _AUTO generate for you. There are very +many ways to assemble such a list. The most straightforward is to simply +grep the source for every occurrence of "maketext" (or calls +to wrappers around it, like the above C<pmt> function), and to log the +following phrase. + +=item * + +You may at this point want to consider whether your base class +(Projname::L10N), from which all lexicons inherit from (Projname::L10N::en, +Projname::L10N::es, etc.), should be an _AUTO lexicon. It may be true +that in theory, all needed messages will be in each language class; +but in the presumably unlikely or "impossible" case of lookup failure, +you should consider whether your program should throw an exception, +emit text in English (or whatever your project's first language is), +or some more complex solution as described in the section +"Controlling Lookup Failure", above. + +=item * + +Submit all messages/phrases/etc. to translators. + +(You may, in fact, want to start with localizing to I<one> other language +at first, if you're not sure that you've properly abstracted the +language-dependent parts of your code.) + +Translators may request clarification of the situation in which a +particular phrase is found. For example, in English we are entirely happy +saying "I<n> files found", regardless of whether we mean "I looked for files, +and found I<n> of them" or the rather distinct situation of "I looked for +something else (like lines in files), and along the way I saw I<n> +files." This may involve rethinking things that you thought quite clear: +should "Edit" on a toolbar be a noun ("editing") or a verb ("to edit")? Is +there already a conventionalized way to express that menu option, separate +from the target language's normal word for "to edit"? + +In all cases where the very common phenomenon of quantification +(saying "I<N> files", for B<any> value of N) +is involved, each translator should make clear what dependencies the +number causes in the sentence. In many cases, dependency is +limited to words adjacent to the number, in places where you might +expect them ("I found the-?PLURAL I<N> +empty-?PLURAL directory-?PLURAL"), but in some cases there are +unexpected dependencies ("I found-?PLURAL ..."!) as well as long-distance +dependencies "The I<N> directory-?PLURAL could not be deleted-?PLURAL"!). + +Remind the translators to consider the case where N is 0: +"0 files found" isn't exactly natural-sounding in any language, but it +may be unacceptable in many -- or it may condition special +kinds of agreement (similar to English "I didN'T find ANY files"). + +Remember to ask your translators about numeral formatting in their +language, so that you can override the C<numf> method as +appropriate. Typical variables in number formatting are: what to +use as a decimal point (comma? period?); what to use as a thousands +separator (space? nonbreaking space? comma? period? small +middot? prime? apostrophe?); and even whether the so-called "thousands +separator" is actually for every third digit -- I've heard reports of +two hundred thousand being expressible as "2,00,000" for some Indian +(Subcontinental) languages, besides the less surprising "S<200 000>", +"200.000", "200,000", and "200'000". Also, using a set of numeral +glyphs other than the usual ASCII "0"-"9" might be appreciated, as via +C<tr/0-9/\x{0966}-\x{096F}/> for getting digits in Devanagari script +(for Hindi, Konkani, others). + +The basic C<quant> method that Locale::Maketext provides should be +good for many languages. For some languages, it might be useful +to modify it (or its constituent C<numerate> method) +to take a plural form in the two-argument call to C<quant> +(as in "[quant,_1,files]") if +it's all-around easier to infer the singular form from the plural, than +to infer the plural form from the singular. + +But for other languages (as is discussed at length +in L<Locale::Maketext::TPJ13|Locale::Maketext::TPJ13>), simple +C<quant>/C<numerify> is not enough. For the particularly problematic +Slavic languages, what you may need is a method which you provide +with the number, the citation form of the noun to quantify, and +the case and gender that the sentence's syntax projects onto that +noun slot. The method would then be responsible for determining +what grammatical number that numeral projects onto its noun phrase, +and what case and gender it may override the normal case and gender +with; and then it would look up the noun in a lexicon providing +all needed inflected forms. + +=item * + +You may also wish to discuss with the translators the question of +how to relate different subforms of the same language tag, +considering how this reacts with C<get_handle>'s treatment of +these. For example, if a user accepts interfaces in "en, fr", and +you have interfaces available in "en-US" and "fr", what should +they get? You may wish to resolve this by establishing that "en" +and "en-US" are effectively synonymous, by having one class +zero-derive from the other. + +For some languages this issue may never come up (Danish is rarely +expressed as "da-DK", but instead is just "da"). And for other +languages, the whole concept of a "generic" form may verge on +being uselessly vague, particularly for interfaces involving voice +media in forms of Arabic or Chinese. + +=item * + +Once you've localized your program/site/etc. for all desired +languages, be sure to show the result (whether live, or via +screenshots) to the translators. Once they approve, make every +effort to have it then checked by at least one other speaker of +that language. This holds true even when (or especially when) the +translation is done by one of your own programmers. Some +kinds of systems may be harder to find testers for than others, +depending on the amount of domain-specific jargon and concepts +involved -- it's easier to find people who can tell you whether +they approve of your translation for "delete this message" in an +email-via-Web interface, than to find people who can give you +an informed opinion on your translation for "attribute value" +in an XML query tool's interface. + +=back + +=head1 SEE ALSO + +I recommend reading all of these: + +L<Locale::Maketext::TPJ13|Locale::Maketext::TPJ13> -- my I<The Perl +Journal> article about Maketext. It explains many important concepts +underlying Locale::Maketext's design, and some insight into why +Maketext is better than the plain old approach of having +message catalogs that are just databases of sprintf formats. + +L<File::Findgrep|File::Findgrep> is a sample application/module +that uses Locale::Maketext to localize its messages. For a larger +internationalized system, see also L<Apache::MP3>. + +L<I18N::LangTags|I18N::LangTags>. + +L<Win32::Locale|Win32::Locale>. + +RFC 3066, I<Tags for the Identification of Languages>, +as at http://sunsite.dk/RFC/rfc/rfc3066.html + +RFC 2277, I<IETF Policy on Character Sets and Languages> +is at http://sunsite.dk/RFC/rfc/rfc2277.html -- much of it is +just things of interest to protocol designers, but it explains +some basic concepts, like the distinction between locales and +language-tags. + +The manual for GNU C<gettext>. The gettext dist is available in +C<ftp://prep.ai.mit.edu/pub/gnu/> -- get +a recent gettext tarball and look in its "doc/" directory, there's +an easily browsable HTML version in there. The +gettext documentation asks lots of questions worth thinking +about, even if some of their answers are sometimes wonky, +particularly where they start talking about pluralization. + +The Locale/Maketext.pm source. Obverse that the module is much +shorter than its documentation! + +=head1 COPYRIGHT AND DISCLAIMER + +Copyright (c) 1999-2004 Sean M. Burke. All rights reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. + +This program is distributed in the hope that it will be useful, but +without any warranty; without even the implied warranty of +merchantability or fitness for a particular purpose. + +=head1 AUTHOR + +Sean M. Burke C<sburke@cpan.org> + +=cut diff --git a/dist/Locale-Maketext/lib/Locale/Maketext/Guts.pm b/dist/Locale-Maketext/lib/Locale/Maketext/Guts.pm new file mode 100644 index 0000000000..9af292c61c --- /dev/null +++ b/dist/Locale-Maketext/lib/Locale/Maketext/Guts.pm @@ -0,0 +1,328 @@ +package Locale::Maketext::Guts; + +$VERSION = '1.13'; + +BEGIN { + # Just so we're nice and define SOMETHING in "our" package. + *zorp = sub { return scalar @_ } unless defined &zorp; +} + +package Locale::Maketext; +use strict; +use vars qw($USE_LITERALS $GUTSPATH); + +BEGIN { + $GUTSPATH = __FILE__; + *DEBUG = sub () {0} unless defined &DEBUG; +} + +use utf8; + +sub _compile { + # This big scary routine compiles an entry. + # It returns either a coderef if there's brackety bits in this, or + # otherwise a ref to a scalar. + + my $target = ref($_[0]) || $_[0]; + + my(@code); + my(@c) = (''); # "chunks" -- scratch. + my $call_count = 0; + my $big_pile = ''; + { + my $in_group = 0; # start out outside a group + my($m, @params); # scratch + + while($_[1] =~ # Iterate over chunks. + m/\G( + [^\~\[\]]+ # non-~[] stuff + | + ~. # ~[, ~], ~~, ~other + | + \[ # [ presumably opening a group + | + \] # ] presumably closing a group + | + ~ # terminal ~ ? + | + $ + )/xgs + ) { + DEBUG>2 and print qq{ "$1"\n}; + + if($1 eq '[' or $1 eq '') { # "[" or end + # Whether this is "[" or end, force processing of any + # preceding literal. + if($in_group) { + if($1 eq '') { + $target->_die_pointing($_[1], 'Unterminated bracket group'); + } + else { + $target->_die_pointing($_[1], 'You can\'t nest bracket groups'); + } + } + else { + if ($1 eq '') { + DEBUG>2 and print " [end-string]\n"; + } + else { + $in_group = 1; + } + die "How come \@c is empty?? in <$_[1]>" unless @c; # sanity + if(length $c[-1]) { + # Now actually processing the preceding literal + $big_pile .= $c[-1]; + if($USE_LITERALS and ( + (ord('A') == 65) + ? $c[-1] !~ m/[^\x20-\x7E]/s + # ASCII very safe chars + : $c[-1] !~ m/[^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~\x07]/s + # EBCDIC very safe chars + )) { + # normal case -- all very safe chars + $c[-1] =~ s/'/\\'/g; + push @code, q{ '} . $c[-1] . "',\n"; + $c[-1] = ''; # reuse this slot + } + else { + push @code, ' $c[' . $#c . "],\n"; + push @c, ''; # new chunk + } + } + # else just ignore the empty string. + } + + } + elsif($1 eq ']') { # "]" + # close group -- go back in-band + if($in_group) { + $in_group = 0; + + DEBUG>2 and print " --Closing group [$c[-1]]\n"; + + # And now process the group... + + if(!length($c[-1]) or $c[-1] =~ m/^\s+$/s) { + DEBUG > 2 and print " -- (Ignoring)\n"; + $c[-1] = ''; # reset out chink + next; + } + + #$c[-1] =~ s/^\s+//s; + #$c[-1] =~ s/\s+$//s; + ($m,@params) = split(/,/, $c[-1], -1); # was /\s*,\s*/ + + # A bit of a hack -- we've turned "~,"'s into DELs, so turn + # 'em into real commas here. + if (ord('A') == 65) { # ASCII, etc + foreach($m, @params) { tr/\x7F/,/ } + } + else { # EBCDIC (1047, 0037, POSIX-BC) + # Thanks to Peter Prymmer for the EBCDIC handling + foreach($m, @params) { tr/\x07/,/ } + } + + # Special-case handling of some method names: + if($m eq '_*' or $m =~ m/^_(-?\d+)$/s) { + # Treat [_1,...] as [,_1,...], etc. + unshift @params, $m; + $m = ''; + } + elsif($m eq '*') { + $m = 'quant'; # "*" for "times": "4 cars" is 4 times "cars" + } + elsif($m eq '#') { + $m = 'numf'; # "#" for "number": [#,_1] for "the number _1" + } + + # Most common case: a simple, legal-looking method name + if($m eq '') { + # 0-length method name means to just interpolate: + push @code, ' ('; + } + elsif($m =~ /^\w+(?:\:\:\w+)*$/s + and $m !~ m/(?:^|\:)\d/s + # exclude starting a (sub)package or symbol with a digit + ) { + # Yes, it even supports the demented (and undocumented?) + # $obj->Foo::bar(...) syntax. + $target->_die_pointing( + $_[1], q{Can't use "SUPER::" in a bracket-group method}, + 2 + length($c[-1]) + ) + if $m =~ m/^SUPER::/s; + # Because for SUPER:: to work, we'd have to compile this into + # the right package, and that seems just not worth the bother, + # unless someone convinces me otherwise. + + push @code, ' $_[0]->' . $m . '('; + } + else { + # TODO: implement something? or just too icky to consider? + $target->_die_pointing( + $_[1], + "Can't use \"$m\" as a method name in bracket group", + 2 + length($c[-1]) + ); + } + + pop @c; # we don't need that chunk anymore + ++$call_count; + + foreach my $p (@params) { + if($p eq '_*') { + # Meaning: all parameters except $_[0] + $code[-1] .= ' @_[1 .. $#_], '; + # and yes, that does the right thing for all @_ < 3 + } + elsif($p =~ m/^_(-?\d+)$/s) { + # _3 meaning $_[3] + $code[-1] .= '$_[' . (0 + $1) . '], '; + } + elsif($USE_LITERALS and ( + (ord('A') == 65) + ? $p !~ m/[^\x20-\x7E]/s + # ASCII very safe chars + : $p !~ m/[^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~\x07]/s + # EBCDIC very safe chars + )) { + # Normal case: a literal containing only safe characters + $p =~ s/'/\\'/g; + $code[-1] .= q{'} . $p . q{', }; + } + else { + # Stow it on the chunk-stack, and just refer to that. + push @c, $p; + push @code, ' $c[' . $#c . '], '; + } + } + $code[-1] .= "),\n"; + + push @c, ''; + } + else { + $target->_die_pointing($_[1], q{Unbalanced ']'}); + } + + } + elsif(substr($1,0,1) ne '~') { + # it's stuff not containing "~" or "[" or "]" + # i.e., a literal blob + $c[-1] .= $1; + + } + elsif($1 eq '~~') { # "~~" + $c[-1] .= '~'; + + } + elsif($1 eq '~[') { # "~[" + $c[-1] .= '['; + + } + elsif($1 eq '~]') { # "~]" + $c[-1] .= ']'; + + } + elsif($1 eq '~,') { # "~," + if($in_group) { + # This is a hack, based on the assumption that no-one will actually + # want a DEL inside a bracket group. Let's hope that's it's true. + if (ord('A') == 65) { # ASCII etc + $c[-1] .= "\x7F"; + } + else { # EBCDIC (cp 1047, 0037, POSIX-BC) + $c[-1] .= "\x07"; + } + } + else { + $c[-1] .= '~,'; + } + + } + elsif($1 eq '~') { # possible only at string-end, it seems. + $c[-1] .= '~'; + + } + else { + # It's a "~X" where X is not a special character. + # Consider it a literal ~ and X. + $c[-1] .= $1; + } + } + } + + if($call_count) { + undef $big_pile; # Well, nevermind that. + } + else { + # It's all literals! Ahwell, that can happen. + # So don't bother with the eval. Return a SCALAR reference. + return \$big_pile; + } + + die q{Last chunk isn't null??} if @c and length $c[-1]; # sanity + DEBUG and warn scalar(@c), " chunks under closure\n"; + if(@code == 0) { # not possible? + DEBUG and warn "Empty code\n"; + return \''; + } + elsif(@code > 1) { # most cases, presumably! + unshift @code, "join '',\n"; + } + unshift @code, "use strict; sub {\n"; + push @code, "}\n"; + + DEBUG and warn @code; + my $sub = eval(join '', @code); + die "$@ while evalling" . join('', @code) if $@; # Should be impossible. + return $sub; +} + +# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +sub _die_pointing { + # This is used by _compile to throw a fatal error + my $target = shift; # class name + # ...leaving $_[0] the error-causing text, and $_[1] the error message + + my $i = index($_[0], "\n"); + + my $pointy; + my $pos = pos($_[0]) - (defined($_[2]) ? $_[2] : 0) - 1; + if($pos < 1) { + $pointy = "^=== near there\n"; + } + else { # we need to space over + my $first_tab = index($_[0], "\t"); + if($pos > 2 and ( -1 == $first_tab or $first_tab > pos($_[0]))) { + # No tabs, or the first tab is harmlessly after where we will point to, + # AND we're far enough from the margin that we can draw a proper arrow. + $pointy = ('=' x $pos) . "^ near there\n"; + } + else { + # tabs screw everything up! + $pointy = substr($_[0],0,$pos); + $pointy =~ tr/\t //cd; + # make everything into whitespace, but preseving tabs + $pointy .= "^=== near there\n"; + } + } + + my $errmsg = "$_[1], in\:\n$_[0]"; + + if($i == -1) { + # No newline. + $errmsg .= "\n" . $pointy; + } + elsif($i == (length($_[0]) - 1) ) { + # Already has a newline at end. + $errmsg .= $pointy; + } + else { + # don't bother with the pointy bit, I guess. + } + Carp::croak( "$errmsg via $target, as used" ); +} + +1; + diff --git a/dist/Locale-Maketext/lib/Locale/Maketext/GutsLoader.pm b/dist/Locale-Maketext/lib/Locale/Maketext/GutsLoader.pm new file mode 100644 index 0000000000..daa9840260 --- /dev/null +++ b/dist/Locale-Maketext/lib/Locale/Maketext/GutsLoader.pm @@ -0,0 +1,49 @@ +package Locale::Maketext::GutsLoader; + +$VERSION = '1.13'; + +use strict; +sub zorp { return scalar @_ } + +BEGIN { + $Locale::Maketext::GutsLoader::GUTSPATH = __FILE__; + *Locale::Maketext::DEBUG = sub () {0} + unless defined &Locale::Maketext::DEBUG; +} + +# +# This whole drama is so that we can load the utf8'd code +# in Locale::Maketext::Guts, but if that fails, snip the +# utf8 and then try THAT. +# + +$Locale::Maketext::GUTSPATH = ''; +Locale::Maketext::DEBUG and warn "Requiring Locale::Maketext::Guts...\n"; +eval 'require Locale::Maketext::Guts'; + +if ($@) { + my $path = $Locale::Maketext::GUTSPATH; + + die "Can't load Locale::Maketext::Guts\nAborting" unless $path; + + die "No readable file $Locale::Maketext::GutsLoader::GUTSPATH\nAborting" + unless -e $path and -f _ and -r _; + + open(IN, $path) or die "Can't read-open $path\nAborting"; + + my $source; + { local $/; $source = <IN>; } + close(IN); + unless( $source =~ s/\b(use utf8)/# $1/ ) { + Locale::Maketext::DEBUG and + print "I didn't see 'use utf8' in $path\n"; + } + eval $source; + die "Can't compile $path\n...The error I got was:\n$@\nAborting" if $@; + Locale::Maketext::DEBUG and warn "Non-utf8'd Locale::Maketext::Guts fine\n"; +} +else { + Locale::Maketext::DEBUG and warn "Loaded Locale::Maketext::Guts fine\n"; +} + +1; diff --git a/dist/Locale-Maketext/lib/Locale/Maketext/TPJ13.pod b/dist/Locale-Maketext/lib/Locale/Maketext/TPJ13.pod new file mode 100644 index 0000000000..b5e2c0b972 --- /dev/null +++ b/dist/Locale-Maketext/lib/Locale/Maketext/TPJ13.pod @@ -0,0 +1,775 @@ +# This document contains text in Perl "POD" format. +# Use a POD viewer like perldoc or perlman to render it. + +=head1 NAME + +Locale::Maketext::TPJ13 -- article about software localization + +=head1 SYNOPSIS + + # This an article, not a module. + +=head1 DESCRIPTION + +The following article by Sean M. Burke and Jordan Lachler +first appeared in I<The Perl Journal> #13 +and is copyright 1999 The Perl Journal. It appears +courtesy of Jon Orwant and The Perl Journal. This document may be +distributed under the same terms as Perl itself. + +=head1 Localization and Perl: gettext breaks, Maketext fixes + +by Sean M. Burke and Jordan Lachler + +This article points out cases where gettext (a common system for +localizing software interfaces -- i.e., making them work in the user's +language of choice) fails because of basic differences between human +languages. This article then describes Maketext, a new system capable +of correctly treating these differences. + +=head2 A Localization Horror Story: It Could Happen To You + +=over + +"There are a number of languages spoken by human beings in this +world." + +-- Harald Tveit Alvestrand, in RFC 1766, "Tags for the +Identification of Languages" + +=back + +Imagine that your task for the day is to localize a piece of software +-- and luckily for you, the only output the program emits is two +messages, like this: + + I scanned 12 directories. + + Your query matched 10 files in 4 directories. + +So how hard could that be? You look at the code that +produces the first item, and it reads: + + printf("I scanned %g directories.", + $directory_count); + +You think about that, and realize that it doesn't even work right for +English, as it can produce this output: + + I scanned 1 directories. + +So you rewrite it to read: + + printf("I scanned %g %s.", + $directory_count, + $directory_count == 1 ? + "directory" : "directories", + ); + +...which does the Right Thing. (In case you don't recall, "%g" is for +locale-specific number interpolation, and "%s" is for string +interpolation.) + +But you still have to localize it for all the languages you're +producing this software for, so you pull Locale::gettext off of CPAN +so you can access the C<gettext> C functions you've heard are standard +for localization tasks. + +And you write: + + printf(gettext("I scanned %g %s."), + $dir_scan_count, + $dir_scan_count == 1 ? + gettext("directory") : gettext("directories"), + ); + +But you then read in the gettext manual (Drepper, Miller, and Pinard 1995) +that this is not a good idea, since how a single word like "directory" +or "directories" is translated may depend on context -- and this is +true, since in a case language like German or Russian, you'd may need +these words with a different case ending in the first instance (where the +word is the object of a verb) than in the second instance, which you haven't even +gotten to yet (where the word is the object of a preposition, "in %g +directories") -- assuming these keep the same syntax when translated +into those languages. + +So, on the advice of the gettext manual, you rewrite: + + printf( $dir_scan_count == 1 ? + gettext("I scanned %g directory.") : + gettext("I scanned %g directories."), + $dir_scan_count ); + +So, you email your various translators (the boss decides that the +languages du jour are Chinese, Arabic, Russian, and Italian, so you +have one translator for each), asking for translations for "I scanned +%g directory." and "I scanned %g directories.". When they reply, +you'll put that in the lexicons for gettext to use when it localizes +your software, so that when the user is running under the "zh" +(Chinese) locale, gettext("I scanned %g directory.") will return the +appropriate Chinese text, with a "%g" in there where printf can then +interpolate $dir_scan. + +Your Chinese translator emails right back -- he says both of these +phrases translate to the same thing in Chinese, because, in linguistic +jargon, Chinese "doesn't have number as a grammatical category" -- +whereas English does. That is, English has grammatical rules that +refer to "number", i.e., whether something is grammatically singular +or plural; and one of these rules is the one that forces nouns to take +a plural suffix (generally "s") when in a plural context, as they are when +they follow a number other than "one" (including, oddly enough, "zero"). +Chinese has no such rules, and so has just the one phrase where English +has two. But, no problem, you can have this one Chinese phrase appear +as the translation for the two English phrases in the "zh" gettext +lexicon for your program. + +Emboldened by this, you dive into the second phrase that your software +needs to output: "Your query matched 10 files in 4 directories.". You notice +that if you want to treat phrases as indivisible, as the gettext +manual wisely advises, you need four cases now, instead of two, to +cover the permutations of singular and plural on the two items, +$dir_count and $file_count. So you try this: + + printf( $file_count == 1 ? + ( $directory_count == 1 ? + gettext("Your query matched %g file in %g directory.") : + gettext("Your query matched %g file in %g directories.") ) : + ( $directory_count == 1 ? + gettext("Your query matched %g files in %g directory.") : + gettext("Your query matched %g files in %g directories.") ), + $file_count, $directory_count, + ); + +(The case of "1 file in 2 [or more] directories" could, I suppose, +occur in the case of symlinking or something of the sort.) + +It occurs to you that this is not the prettiest code you've ever +written, but this seems the way to go. You mail off to the +translators asking for translations for these four cases. The +Chinese guy replies with the one phrase that these all translate to in +Chinese, and that phrase has two "%g"s in it, as it should -- but +there's a problem. He translates it word-for-word back: "In %g +directories contains %g files match your query." The %g +slots are in an order reverse to what they are in English. You wonder +how you'll get gettext to handle that. + +But you put it aside for the moment, and optimistically hope that the +other translators won't have this problem, and that their languages +will be better behaved -- i.e., that they will be just like English. + +But the Arabic translator is the next to write back. First off, your +code for "I scanned %g directory." or "I scanned %g directories." +assumes there's only singular or plural. But, to use linguistic +jargon again, Arabic has grammatical number, like English (but unlike +Chinese), but it's a three-term category: singular, dual, and plural. +In other words, the way you say "directory" depends on whether there's +one directory, or I<two> of them, or I<more than two> of them. Your +test of C<($directory == 1)> no longer does the job. And it means +that where English's grammatical category of number necessitates +only the two permutations of the first sentence based on "directory +[singular]" and "directories [plural]", Arabic has three -- and, +worse, in the second sentence ("Your query matched %g file in %g +directory."), where English has four, Arabic has nine. You sense +an unwelcome, exponential trend taking shape. + +Your Italian translator emails you back and says that "I searched 0 +directories" (a possible English output of your program) is stilted, +and if you think that's fine English, that's your problem, but that +I<just will not do> in the language of Dante. He insists that where +$directory_count is 0, your program should produce the Italian text +for "I I<didn't> scan I<any> directories.". And ditto for "I didn't +match any files in any directories", although he says the last part +about "in any directories" should probably just be left off. + +You wonder how you'll get gettext to handle this; to accomodate the +ways Arabic, Chinese, and Italian deal with numbers in just these few +very simple phrases, you need to write code that will ask gettext for +different queries depending on whether the numerical values in +question are 1, 2, more than 2, or in some cases 0, and you still haven't +figured out the problem with the different word order in Chinese. + +Then your Russian translator calls on the phone, to I<personally> tell +you the bad news about how really unpleasant your life is about to +become: + +Russian, like German or Latin, is an inflectional language; that is, nouns +and adjectives have to take endings that depend on their case +(i.e., nominative, accusative, genitive, etc...) -- which is roughly a matter of +what role they have in syntax of the sentence -- +as well as on the grammatical gender (i.e., masculine, feminine, neuter) +and number (i.e., singular or plural) of the noun, as well as on the +declension class of the noun. But unlike with most other inflected languages, +putting a number-phrase (like "ten" or "forty-three", or their Arabic +numeral equivalents) in front of noun in Russian can change the case and +number that noun is, and therefore the endings you have to put on it. + +He elaborates: In "I scanned %g directories", you'd I<expect> +"directories" to be in the accusative case (since it is the direct +object in the sentence) and the plural number, +except where $directory_count is 1, then you'd expect the singular, of +course. Just like Latin or German. I<But!> Where $directory_count % +10 is 1 ("%" for modulo, remember), assuming $directory count is an +integer, and except where $directory_count % 100 is 11, "directories" +is forced to become grammatically singular, which means it gets the +ending for the accusative singular... You begin to visualize the code +it'd take to test for the problem so far, I<and still work for Chinese +and Arabic and Italian>, and how many gettext items that'd take, but +he keeps going... But where $directory_count % 10 is 2, 3, or 4 +(except where $directory_count % 100 is 12, 13, or 14), the word for +"directories" is forced to be genitive singular -- which means another +ending... The room begins to spin around you, slowly at first... But +with I<all other> integer values, since "directory" is an inanimate +noun, when preceded by a number and in the nominative or accusative +cases (as it is here, just your luck!), it does stay plural, but it is +forced into the genitive case -- yet another ending... And +you never hear him get to the part about how you're going to run into +similar (but maybe subtly different) problems with other Slavic +languages like Polish, because the floor comes up to meet you, and you +fade into unconsciousness. + + +The above cautionary tale relates how an attempt at localization can +lead from programmer consternation, to program obfuscation, to a need +for sedation. But careful evaluation shows that your choice of tools +merely needed further consideration. + +=head2 The Linguistic View + +=over + +"It is more complicated than you think." + +-- The Eighth Networking Truth, from RFC 1925 + +=back + +The field of Linguistics has expended a great deal of effort over the +past century trying to find grammatical patterns which hold across +languages; it's been a constant process +of people making generalizations that should apply to all languages, +only to find out that, all too often, these generalizations fail -- +sometimes failing for just a few languages, sometimes whole classes of +languages, and sometimes nearly every language in the world except +English. Broad statistical trends are evident in what the "average +language" is like as far as what its rules can look like, must look +like, and cannot look like. But the "average language" is just as +unreal a concept as the "average person" -- it runs up against the +fact no language (or person) is, in fact, average. The wisdom of past +experience leads us to believe that any given language can do whatever +it wants, in any order, with appeal to any kind of grammatical +categories wants -- case, number, tense, real or metaphoric +characteristics of the things that words refer to, arbitrary or +predictable classifications of words based on what endings or prefixes +they can take, degree or means of certainty about the truth of +statements expressed, and so on, ad infinitum. + +Mercifully, most localization tasks are a matter of finding ways to +translate whole phrases, generally sentences, where the context is +relatively set, and where the only variation in content is I<usually> +in a number being expressed -- as in the example sentences above. +Translating specific, fully-formed sentences is, in practice, fairly +foolproof -- which is good, because that's what's in the phrasebooks +that so many tourists rely on. Now, a given phrase (whether in a +phrasebook or in a gettext lexicon) in one language I<might> have a +greater or lesser applicability than that phrase's translation into +another language -- for example, strictly speaking, in Arabic, the +"your" in "Your query matched..." would take a different form +depending on whether the user is male or female; so the Arabic +translation "your[feminine] query" is applicable in fewer cases than +the corresponding English phrase, which doesn't distinguish the user's +gender. (In practice, it's not feasable to have a program know the +user's gender, so the masculine "you" in Arabic is usually used, by +default.) + +But in general, such surprises are rare when entire sentences are +being translated, especially when the functional context is restricted +to that of a computer interacting with a user either to convey a fact +or to prompt for a piece of information. So, for purposes of +localization, translation by phrase (generally by sentence) is both the +simplest and the least problematic. + +=head2 Breaking gettext + +=over + +"It Has To Work." + +-- First Networking Truth, RFC 1925 + +=back + +Consider that sentences in a tourist phrasebook are of two types: ones +like "How do I get to the marketplace?" that don't have any blanks to +fill in, and ones like "How much do these ___ cost?", where there's +one or more blanks to fill in (and these are usually linked to a +list of words that you can put in that blank: "fish", "potatoes", +"tomatoes", etc.) The ones with no blanks are no problem, but the +fill-in-the-blank ones may not be really straightforward. If it's a +Swahili phrasebook, for example, the authors probably didn't bother to +tell you the complicated ways that the verb "cost" changes its +inflectional prefix depending on the noun you're putting in the blank. +The trader in the marketplace will still understand what you're saying if +you say "how much do these potatoes cost?" with the wrong +inflectional prefix on "cost". After all, I<you> can't speak proper Swahili, +I<you're> just a tourist. But while tourists can be stupid, computers +are supposed to be smart; the computer should be able to fill in the +blank, and still have the results be grammatical. + +In other words, a phrasebook entry takes some values as parameters +(the things that you fill in the blank or blanks), and provides a value +based on these parameters, where the way you get that final value from +the given values can, properly speaking, involve an arbitrarily +complex series of operations. (In the case of Chinese, it'd be not at +all complex, at least in cases like the examples at the beginning of +this article; whereas in the case of Russian it'd be a rather complex +series of operations. And in some languages, the +complexity could be spread around differently: while the act of +putting a number-expression in front of a noun phrase might not be +complex by itself, it may change how you have to, for example, inflect +a verb elsewhere in the sentence. This is what in syntax is called +"long-distance dependencies".) + +This talk of parameters and arbitrary complexity is just another way +to say that an entry in a phrasebook is what in a programming language +would be called a "function". Just so you don't miss it, this is the +crux of this article: I<A phrase is a function; a phrasebook is a +bunch of functions.> + +The reason that using gettext runs into walls (as in the above +second-person horror story) is that you're trying to use a string (or +worse, a choice among a bunch of strings) to do what you really need a +function for -- which is futile. Preforming (s)printf interpolation +on the strings which you get back from gettext does allow you to do I<some> +common things passably well... sometimes... sort of; but, to paraphrase +what some people say about C<csh> script programming, "it fools you +into thinking you can use it for real things, but you can't, and you +don't discover this until you've already spent too much time trying, +and by then it's too late." + +=head2 Replacing gettext + +So, what needs to replace gettext is a system that supports lexicons +of functions instead of lexicons of strings. An entry in a lexicon +from such a system should I<not> look like this: + + "J'ai trouv\xE9 %g fichiers dans %g r\xE9pertoires" + +[\xE9 is e-acute in Latin-1. Some pod renderers would +scream if I used the actual character here. -- SB] + +but instead like this, bearing in mind that this is just a first stab: + + sub I_found_X1_files_in_X2_directories { + my( $files, $dirs ) = @_[0,1]; + $files = sprintf("%g %s", $files, + $files == 1 ? 'fichier' : 'fichiers'); + $dirs = sprintf("%g %s", $dirs, + $dirs == 1 ? "r\xE9pertoire" : "r\xE9pertoires"); + return "J'ai trouv\xE9 $files dans $dirs."; + } + +Now, there's no particularly obvious way to store anything but strings +in a gettext lexicon; so it looks like we just have to start over and +make something better, from scratch. I call my shot at a +gettext-replacement system "Maketext", or, in CPAN terms, +Locale::Maketext. + +When designing Maketext, I chose to plan its main features in terms of +"buzzword compliance". And here are the buzzwords: + +=head2 Buzzwords: Abstraction and Encapsulation + +The complexity of the language you're trying to output a phrase in is +entirely abstracted inside (and encapsulated within) the Maketext module +for that interface. When you call: + + print $lang->maketext("You have [quant,_1,piece] of new mail.", + scalar(@messages)); + +you don't know (and in fact can't easily find out) whether this will +involve lots of figuring, as in Russian (if $lang is a handle to the +Russian module), or relatively little, as in Chinese. That kind of +abstraction and encapsulation may encourage other pleasant buzzwords +like modularization and stratification, depending on what design +decisions you make. + +=head2 Buzzword: Isomorphism + +"Isomorphism" means "having the same structure or form"; in discussions +of program design, the word takes on the special, specific meaning that +your implementation of a solution to a problem I<has the same +structure> as, say, an informal verbal description of the solution, or +maybe of the problem itself. Isomorphism is, all things considered, +a good thing -- it's what problem-solving (and solution-implementing) +should look like. + +What's wrong the with gettext-using code like this... + + printf( $file_count == 1 ? + ( $directory_count == 1 ? + "Your query matched %g file in %g directory." : + "Your query matched %g file in %g directories." ) : + ( $directory_count == 1 ? + "Your query matched %g files in %g directory." : + "Your query matched %g files in %g directories." ), + $file_count, $directory_count, + ); + +is first off that it's not well abstracted -- these ways of testing +for grammatical number (as in the expressions like C<foo == 1 ? +singular_form : plural_form>) should be abstracted to each language +module, since how you get grammatical number is language-specific. + +But second off, it's not isomorphic -- the "solution" (i.e., the +phrasebook entries) for Chinese maps from these four English phrases to +the one Chinese phrase that fits for all of them. In other words, the +informal solution would be "The way to say what you want in Chinese is +with the one phrase 'For your question, in Y directories you would +find X files'" -- and so the implemented solution should be, +isomorphically, just a straightforward way to spit out that one +phrase, with numerals properly interpolated. It shouldn't have to map +from the complexity of other languages to the simplicity of this one. + +=head2 Buzzword: Inheritance + +There's a great deal of reuse possible for sharing of phrases between +modules for related dialects, or for sharing of auxiliary functions +between related languages. (By "auxiliary functions", I mean +functions that don't produce phrase-text, but which, say, return an +answer to "does this number require a plural noun after it?". Such +auxiliary functions would be used in the internal logic of functions +that actually do produce phrase-text.) + +In the case of sharing phrases, consider that you have an interface +already localized for American English (probably by having been +written with that as the native locale, but that's incidental). +Localizing it for UK English should, in practical terms, be just a +matter of running it past a British person with the instructions to +indicate what few phrases would benefit from a change in spelling or +possibly minor rewording. In that case, you should be able to put in +the UK English localization module I<only> those phrases that are +UK-specific, and for all the rest, I<inherit> from the American +English module. (And I expect this same situation would apply with +Brazilian and Continental Portugese, possbily with some I<very> +closely related languages like Czech and Slovak, and possibly with the +slightly different "versions" of written Mandarin Chinese, as I hear exist in +Taiwan and mainland China.) + +As to sharing of auxiliary functions, consider the problem of Russian +numbers from the beginning of this article; obviously, you'd want to +write only once the hairy code that, given a numeric value, would +return some specification of which case and number a given quanitified +noun should use. But suppose that you discover, while localizing an +interface for, say, Ukranian (a Slavic language related to Russian, +spoken by several million people, many of whom would be relieved to +find that your Web site's or software's interface is available in +their language), that the rules in Ukranian are the same as in Russian +for quantification, and probably for many other grammatical functions. +While there may well be no phrases in common between Russian and +Ukranian, you could still choose to have the Ukranian module inherit +from the Russian module, just for the sake of inheriting all the +various grammatical methods. Or, probably better organizationally, +you could move those functions to a module called C<_E_Slavic> or +something, which Russian and Ukranian could inherit useful functions +from, but which would (presumably) provide no lexicon. + +=head2 Buzzword: Concision + +Okay, concision isn't a buzzword. But it should be, so I decree that +as a new buzzword, "concision" means that simple common things should +be expressible in very few lines (or maybe even just a few characters) +of code -- call it a special case of "making simple things easy and +hard things possible", and see also the role it played in the +MIDI::Simple language, discussed elsewhere in this issue [TPJ#13]. + +Consider our first stab at an entry in our "phrasebook of functions": + + sub I_found_X1_files_in_X2_directories { + my( $files, $dirs ) = @_[0,1]; + $files = sprintf("%g %s", $files, + $files == 1 ? 'fichier' : 'fichiers'); + $dirs = sprintf("%g %s", $dirs, + $dirs == 1 ? "r\xE9pertoire" : "r\xE9pertoires"); + return "J'ai trouv\xE9 $files dans $dirs."; + } + +You may sense that a lexicon (to use a non-committal catch-all term for a +collection of things you know how to say, regardless of whether they're +phrases or words) consisting of functions I<expressed> as above would +make for rather long-winded and repetitive code -- even if you wisely +rewrote this to have quantification (as we call adding a number +expression to a noun phrase) be a function called like: + + sub I_found_X1_files_in_X2_directories { + my( $files, $dirs ) = @_[0,1]; + $files = quant($files, "fichier"); + $dirs = quant($dirs, "r\xE9pertoire"); + return "J'ai trouv\xE9 $files dans $dirs."; + } + +And you may also sense that you do not want to bother your translators +with having to write Perl code -- you'd much rather that they spend +their I<very costly time> on just translation. And this is to say +nothing of the near impossibility of finding a commercial translator +who would know even simple Perl. + +In a first-hack implementation of Maketext, each language-module's +lexicon looked like this: + + %Lexicon = ( + "I found %g files in %g directories" + => sub { + my( $files, $dirs ) = @_[0,1]; + $files = quant($files, "fichier"); + $dirs = quant($dirs, "r\xE9pertoire"); + return "J'ai trouv\xE9 $files dans $dirs."; + }, + ... and so on with other phrase => sub mappings ... + ); + +but I immediately went looking for some more concise way to basically +denote the same phrase-function -- a way that would also serve to +concisely denote I<most> phrase-functions in the lexicon for I<most> +languages. After much time and even some actual thought, I decided on +this system: + +* Where a value in a %Lexicon hash is a contentful string instead of +an anonymous sub (or, conceivably, a coderef), it would be interpreted +as a sort of shorthand expression of what the sub does. When accessed +for the first time in a session, it is parsed, turned into Perl code, +and then eval'd into an anonymous sub; then that sub replaces the +original string in that lexicon. (That way, the work of parsing and +evaling the shorthand form for a given phrase is done no more than +once per session.) + +* Calls to C<maketext> (as Maketext's main function is called) happen +thru a "language session handle", notionally very much like an IO +handle, in that you open one at the start of the session, and use it +for "sending signals" to an object in order to have it return the text +you want. + +So, this: + + $lang->maketext("You have [quant,_1,piece] of new mail.", + scalar(@messages)); + +basically means this: look in the lexicon for $lang (which may inherit +from any number of other lexicons), and find the function that we +happen to associate with the string "You have [quant,_1,piece] of new +mail" (which is, and should be, a functioning "shorthand" for this +function in the native locale -- English in this case). If you find +such a function, call it with $lang as its first parameter (as if it +were a method), and then a copy of scalar(@messages) as its second, +and then return that value. If that function was found, but was in +string shorthand instead of being a fully specified function, parse it +and make it into a function before calling it the first time. + +* The shorthand uses code in brackets to indicate method calls that +should be performed. A full explanation is not in order here, but a +few examples will suffice: + + "You have [quant,_1,piece] of new mail." + +The above code is shorthand for, and will be interpreted as, +this: + + sub { + my $handle = $_[0]; + my(@params) = @_; + return join '', + "You have ", + $handle->quant($params[1], 'piece'), + "of new mail."; + } + +where "quant" is the name of a method you're using to quantify the +noun "piece" with the number $params[0]. + +A string with no brackety calls, like this: + + "Your search expression was malformed." + +is somewhat of a degerate case, and just gets turned into: + + sub { return "Your search expression was malformed." } + +However, not everything you can write in Perl code can be written in +the above shorthand system -- not by a long shot. For example, consider +the Italian translator from the beginning of this article, who wanted +the Italian for "I didn't find any files" as a special case, instead +of "I found 0 files". That couldn't be specified (at least not easily +or simply) in our shorthand system, and it would have to be written +out in full, like this: + + sub { # pretend the English strings are in Italian + my($handle, $files, $dirs) = @_[0,1,2]; + return "I didn't find any files" unless $files; + return join '', + "I found ", + $handle->quant($files, 'file'), + " in ", + $handle->quant($dirs, 'directory'), + "."; + } + +Next to a lexicon full of shorthand code, that sort of sticks out like a +sore thumb -- but this I<is> a special case, after all; and at least +it's possible, if not as concise as usual. + +As to how you'd implement the Russian example from the beginning of +the article, well, There's More Than One Way To Do It, but it could be +something like this (using English words for Russian, just so you know +what's going on): + + "I [quant,_1,directory,accusative] scanned." + +This shifts the burden of complexity off to the quant method. That +method's parameters are: the numeric value it's going to use to +quantify something; the Russian word it's going to quantify; and the +parameter "accusative", which you're using to mean that this +sentence's syntax wants a noun in the accusative case there, although +that quantification method may have to overrule, for grammatical +reasons you may recall from the beginning of this article. + +Now, the Russian quant method here is responsible not only for +implementing the strange logic necessary for figuring out how Russian +number-phrases impose case and number on their noun-phrases, but also +for inflecting the Russian word for "directory". How that inflection +is to be carried out is no small issue, and among the solutions I've +seen, some (like variations on a simple lookup in a hash where all +possible forms are provided for all necessary words) are +straightforward but I<can> become cumbersome when you need to inflect +more than a few dozen words; and other solutions (like using +algorithms to model the inflections, storing only root forms and +irregularities) I<can> involve more overhead than is justifiable for +all but the largest lexicons. + +Mercifully, this design decision becomes crucial only in the hairiest +of inflected languages, of which Russian is by no means the I<worst> case +scenario, but is worse than most. Most languages have simpler +inflection systems; for example, in English or Swahili, there are +generally no more than two possible inflected forms for a given noun +("error/errors"; "kosa/makosa"), and the +rules for producing these forms are fairly simple -- or at least, +simple rules can be formulated that work for most words, and you can +then treat the exceptions as just "irregular", at least relative to +your ad hoc rules. A simpler inflection system (simpler rules, fewer +forms) means that design decisions are less crucial to maintaining +sanity, whereas the same decisions could incur +overhead-versus-scalability problems in languages like Russian. It +may I<also> be likely that code (possibly in Perl, as with +Lingua::EN::Inflect, for English nouns) has already +been written for the language in question, whether simple or complex. + +Moreover, a third possibility may even be simpler than anything +discussed above: "Just require that all possible (or at least +applicable) forms be provided in the call to the given language's quant +method, as in:" + + "I found [quant,_1,file,files]." + +That way, quant just has to chose which form it needs, without having +to look up or generate anything. While possibly not optimal for +Russian, this should work well for most other languages, where +quantification is not as complicated an operation. + +=head2 The Devil in the Details + +There's plenty more to Maketext than described above -- for example, +there's the details of how language tags ("en-US", "i-pwn", "fi", +etc.) or locale IDs ("en_US") interact with actual module naming +("BogoQuery/Locale/en_us.pm"), and what magic can ensue; there's the +details of how to record (and possibly negotiate) what character +encoding Maketext will return text in (UTF8? Latin-1? KOI8?). There's +the interesting fact that Maketext is for localization, but nowhere +actually has a "C<use locale;>" anywhere in it. For the curious, +there's the somewhat frightening details of how I actually +implement something like data inheritance so that searches across +modules' %Lexicon hashes can parallel how Perl implements method +inheritance. + +And, most importantly, there's all the practical details of how to +actually go about deriving from Maketext so you can use it for your +interfaces, and the various tools and conventions for starting out and +maintaining individual language modules. + +That is all covered in the documentation for Locale::Maketext and the +modules that come with it, available in CPAN. After having read this +article, which covers the why's of Maketext, the documentation, +which covers the how's of it, should be quite straightfoward. + +=head2 The Proof in the Pudding: Localizing Web Sites + +Maketext and gettext have a notable difference: gettext is in C, +accessible thru C library calls, whereas Maketext is in Perl, and +really can't work without a Perl interpreter (although I suppose +something like it could be written for C). Accidents of history (and +not necessarily lucky ones) have made C++ the most common language for +the implementation of applications like word processors, Web browsers, +and even many in-house applications like custom query systems. Current +conditions make it somewhat unlikely that the next one of any of these +kinds of applications will be written in Perl, albeit clearly more for +reasons of custom and inertia than out of consideration of what is the +right tool for the job. + +However, other accidents of history have made Perl a well-accepted +language for design of server-side programs (generally in CGI form) +for Web site interfaces. Localization of static pages in Web sites is +trivial, feasable either with simple language-negotiation features in +servers like Apache, or with some kind of server-side inclusions of +language-appropriate text into layout templates. However, I think +that the localization of Perl-based search systems (or other kinds of +dynamic content) in Web sites, be they public or access-restricted, +is where Maketext will see the greatest use. + +I presume that it would be only the exceptional Web site that gets +localized for English I<and> Chinese I<and> Italian I<and> Arabic +I<and> Russian, to recall the languages from the beginning of this +article -- to say nothing of German, Spanish, French, Japanese, +Finnish, and Hindi, to name a few languages that benefit from large +numbers of programmers or Web viewers or both. + +However, the ever-increasing internationalization of the Web (whether +measured in terms of amount of content, of numbers of content writers +or programmers, or of size of content audiences) makes it increasingly +likely that the interface to the average Web-based dynamic content +service will be localized for two or maybe three languages. It is my +hope that Maketext will make that task as simple as possible, and will +remove previous barriers to localization for languages dissimilar to +English. + + __END__ + +Sean M. Burke (sburkeE<64>cpan.org) has a Master's in linguistics +from Northwestern University; he specializes in language technology. +Jordan Lachler (lachlerE<64>unm.edu) is a PhD student in the Department of +Linguistics at the University of New Mexico; he specializes in +morphology and pedagogy of North American native languages. + +=head2 References + +Alvestrand, Harald Tveit. 1995. I<RFC 1766: Tags for the +Identification of Languages.> +C<ftp://ftp.isi.edu/in-notes/rfc1766.txt> +[Now see RFC 3066.] + +Callon, Ross, editor. 1996. I<RFC 1925: The Twelve +Networking Truths.> +C<ftp://ftp.isi.edu/in-notes/rfc1925.txt> + +Drepper, Ulrich, Peter Miller, +and FranE<ccedil>ois Pinard. 1995-2001. GNU +C<gettext>. Available in C<ftp://prep.ai.mit.edu/pub/gnu/>, with +extensive docs in the distribution tarball. [Since +I wrote this article in 1998, I now see that the +gettext docs are now trying more to come to terms with +plurality. Whether useful conclusions have come from it +is another question altogether. -- SMB, May 2001] + +Forbes, Nevill. 1964. I<Russian Grammar.> Third Edition, revised +by J. C. Dumbreck. Oxford University Press. + +=cut + +#End + diff --git a/dist/Locale-Maketext/t/01_about_verbose.t b/dist/Locale-Maketext/t/01_about_verbose.t new file mode 100644 index 0000000000..88a74eb2b6 --- /dev/null +++ b/dist/Locale-Maketext/t/01_about_verbose.t @@ -0,0 +1,76 @@ +#!/usr/bin/perl -Tw + +require 5; + +use strict; + +use Test::More tests => 1; + +BEGIN { + use_ok( 'Locale::Maketext' ); +} + +my @out; +push @out, + "\n\nPerl v", + defined($^V) ? sprintf('%vd', $^V) : $], + " under $^O ", + (defined(&Win32::BuildNumber) and defined &Win32::BuildNumber()) + ? ('(Win32::BuildNumber ', &Win32::BuildNumber(), ')') : (), + (defined $MacPerl::Version) + ? ("(MacPerl version $MacPerl::Version)") : (), + "\n" +; + +# Ugly code to walk the symbol tables: +my %v; +my @stack = (''); # start out in %:: +my $this; +my $count = 0; +my $pref; +while(@stack) { + $this = shift @stack; + die 'Too many packages?' if ++$count > 1000; + next if exists $v{$this}; + next if $this eq 'main'; # %main:: is %:: + + no strict 'refs'; + if ( defined ${$this . '::VERSION'} ) { + $v{$this} = ${$this . '::VERSION'} + } + elsif ( + defined *{$this . '::ISA'} or defined &{$this . '::import'} + or ($this ne '' and grep defined *{$_}{'CODE'}, values %{$this . '::'}) + # If it has an ISA, an import, or any subs... + ) { + # It's a class/module with no version. + $v{$this} = undef; + } + else { + # It's probably an unpopulated package. + ## $v{$this} = '...'; + } + + $pref = length($this) ? "$this\::" : ''; + push @stack, map m/^(.+)::$/ ? "$pref$1" : (), keys %{$this . '::'}; +} +push @out, " Modules in memory:\n"; +delete @v{'', '[none]'}; +foreach my $p (sort {lc($a) cmp lc($b)} keys %v) { + my $indent = ' ' x (2 + ($p =~ tr/:/:/)); + push @out, ' ', $indent, $p, defined($v{$p}) ? " v$v{$p};\n" : ";\n"; +} +push @out, sprintf "[at %s (local) / %s (GMT)]\n", scalar(gmtime), scalar(localtime); +my $x = join '', @out; +$x =~ s/^/#/mg; +print $x; + +my $ascii = (chr(65) eq 'A') ? 'an ASCII' : 'a non-ASCII'; +print "# Running in $ascii world.\n"; + +print "# \@INC:\n", map("# [$_]\n", @INC), "#\n#\n"; + +print "# \%INC:\n"; +foreach my $x (sort {lc($a) cmp lc($b)} keys %INC) { + print "# [$x] = [", $INC{$x} || '', "]\n"; +} diff --git a/dist/Locale-Maketext/t/10_make.t b/dist/Locale-Maketext/t/10_make.t new file mode 100644 index 0000000000..a457c6319d --- /dev/null +++ b/dist/Locale-Maketext/t/10_make.t @@ -0,0 +1,33 @@ +#!/usr/bin/perl -Tw + +use strict; +use Test::More tests => 5; + +BEGIN { + use_ok( 'Locale::Maketext' ); +} + +# declare some classes... +{ + package Woozle; + our @ISA = ('Locale::Maketext'); + sub dubbil { return $_[1] * 2 } + sub numerate { return $_[2] . 'en' } +} +{ + package Woozle::elx; + our @ISA = ('Woozle'); + our %Lexicon = ( + 'd2' => 'hum [dubbil,_1]', + 'd3' => 'hoo [quant,_1,zaz]', + 'd4' => 'hoo [*,_1,zaz]', + ); + keys %Lexicon; # dodges the 'used only once' warning +} + +my $lh = Woozle->get_handle('elx'); +isa_ok( $lh, 'Woozle::elx' ); + +is( $lh->maketext('d2', 7), 'hum 14' ); +is( $lh->maketext('d3', 7), 'hoo 7 zazen' ); +is( $lh->maketext('d4', 7), 'hoo 7 zazen' ); diff --git a/dist/Locale-Maketext/t/20_get.t b/dist/Locale-Maketext/t/20_get.t new file mode 100644 index 0000000000..5f191dbe48 --- /dev/null +++ b/dist/Locale-Maketext/t/20_get.t @@ -0,0 +1,72 @@ +#!/usr/bin/perl -Tw + +use strict; +use Test::More tests => 10; + +BEGIN { + use_ok( 'Locale::Maketext' ); +} + +print "# --- Making sure that get_handle works ---\n"; + +# declare some classes... +{ + package Woozle; + our @ISA = ('Locale::Maketext'); + sub dubbil { return $_[1] * 2 } + sub numerate { return $_[2] . 'en' } +} +{ + package Woozle::eu_mt; + our @ISA = ('Woozle'); + our %Lexicon = ( + 'd2' => 'hum [dubbil,_1]', + 'd3' => 'hoo [quant,_1,zaz]', + 'd4' => 'hoo [*,_1,zaz]', + ); + keys %Lexicon; # dodges the 'used only once' warning +} + +my $lh = Woozle->get_handle('eu-mt'); +isa_ok( $lh, 'Woozle::eu_mt' ); +is( $lh->maketext( 'd2', 7 ), 'hum 14' ); + +print "# Make sure we can assign to ENV entries\n", +"# (Otherwise we can't run the subsequent tests)...\n"; +$ENV{'MYORP'} = 'Zing'; +is( $ENV{'MYORP'}, 'Zing' ); +$ENV{'SWUZ'} = 'KLORTHO HOOBOY'; +is( $ENV{'SWUZ'}, 'KLORTHO HOOBOY' ); + +delete $ENV{'MYORP'}; +delete $ENV{'SWUZ'}; + + +print "# Test LANG...\n"; +$ENV{'LC_ALL'} = ''; +$ENV{'LC_MESSAGES'} = ''; +$ENV{'REQUEST_METHOD'} = ''; +$ENV{'LANG'} = 'Eu_MT'; +$ENV{'LANGUAGE'} = ''; +$lh = Woozle->get_handle(); +isa_ok( $lh, 'Woozle::eu_mt' ); + +print "# Test LANGUAGE...\n"; +$ENV{'LANG'} = ''; +$ENV{'LANGUAGE'} = 'Eu-MT'; +$lh = Woozle->get_handle(); +isa_ok( $lh, 'Woozle::eu_mt' ); + +print "# Test HTTP_ACCEPT_LANGUAGE...\n"; +$ENV{'REQUEST_METHOD'} = 'GET'; +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'eu-MT'; +$lh = Woozle->get_handle(); +isa_ok( $lh, 'Woozle::eu_mt' ); + +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'x-plorp, zaz, eu-MT, i-klung'; +$lh = Woozle->get_handle(); +isa_ok( $lh, 'Woozle::eu_mt' ); + +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'x-plorp, zaz, eU-Mt, i-klung'; +$lh = Woozle->get_handle(); +isa_ok( $lh, 'Woozle::eu_mt' ); diff --git a/dist/Locale-Maketext/t/30_local.t b/dist/Locale-Maketext/t/30_local.t new file mode 100644 index 0000000000..379197ee25 --- /dev/null +++ b/dist/Locale-Maketext/t/30_local.t @@ -0,0 +1,32 @@ +#!/usr/bin/perl -Tw + +use strict; + +use Test; +BEGIN { plan tests => 4; } +use Locale::Maketext; +print "# Hi there...\n"; +ok 1; + +print "# --- Making sure that Perl globals are localized ---\n"; + +# declare a class... +{ + package Woozle; + our @ISA = ('Locale::Maketext'); + our %Lexicon = ( + _AUTO => 1 + ); + keys %Lexicon; # dodges the 'used only once' warning +} + +my $lh; +print "# Basic sanity:\n"; +ok defined( $lh = Woozle->new() ) && ref($lh); + +print "# Make sure \$@ is localized...\n"; +$@ = 'foo'; +ok $lh && $lh->maketext('Eval error: [_1]', $@), 'Eval error: foo'; + +print "# Byebye!\n"; +ok 1; diff --git a/dist/Locale-Maketext/t/40_super.t b/dist/Locale-Maketext/t/40_super.t new file mode 100644 index 0000000000..8f72687d3c --- /dev/null +++ b/dist/Locale-Maketext/t/40_super.t @@ -0,0 +1,61 @@ +#!/usr/bin/perl -Tw + +use strict; +use Test::More tests=>19; + +BEGIN { + use_ok( 'Locale::Maketext' ); +} + +print "#\n# Testing non-tight insertion of super-ordinate language tags...\n#\n"; + +my @in = grep m/\S/, split /[\n\r]/, q{ + NIX => NIX + sv => sv + en => en + hai => hai + + pt-br => pt-br pt + pt-br fr => pt-br fr pt + pt-br fr pt => pt-br fr pt + pt-br fr pt de => pt-br fr pt de + de pt-br fr pt => de pt-br fr pt + de pt-br fr => de pt-br fr pt + hai pt-br fr => hai pt-br fr pt + +# Now test multi-part complicateds: + pt-br-janeiro fr => pt-br-janeiro fr pt-br pt +pt-br-janeiro de fr => pt-br-janeiro de fr pt-br pt +pt-br-janeiro de pt fr => pt-br-janeiro de pt fr pt-br + +ja pt-br-janeiro fr => ja pt-br-janeiro fr pt-br pt +ja pt-br-janeiro de fr => ja pt-br-janeiro de fr pt-br pt +ja pt-br-janeiro de pt fr => ja pt-br-janeiro de pt fr pt-br + +pt-br-janeiro de pt-br fr => pt-br-janeiro de pt-br fr pt + # an odd case, since we don't filter for uniqueness in this sub + +}; + +$Locale::Maketext::MATCH_SUPERS_TIGHTLY = 0; + +foreach my $in ( @in ) { + $in =~ s/^\s+//s; + $in =~ s/\s+$//s; + $in =~ s/#.+//s; + next unless $in =~ m/\S/; + + die "What kind of line is <$in>?!" + unless $in =~ m/^(.+)=>(.+)$/s; + + my ($i,$s) = ($1, $2); + my @in = ($i =~ m/(\S+)/g); + my @should = ($s =~ m/(\S+)/g); + + my @out = Locale::Maketext->_add_supers( + ("@in" eq 'NIX') ? () : @in + ); + @out = 'NIX' unless @out; + + is_deeply( \@out, \@should, "Happily got [@out] from $in" ); +} diff --git a/dist/Locale-Maketext/t/50_super.t b/dist/Locale-Maketext/t/50_super.t new file mode 100644 index 0000000000..d253d78da7 --- /dev/null +++ b/dist/Locale-Maketext/t/50_super.t @@ -0,0 +1,71 @@ +#!/usr/bin/perl -Tw + +use strict; +use Test::More tests => 26; + +BEGIN { + use_ok( 'Locale::Maketext' ); +} + +print "#\n# Testing tight insertion of super-ordinate language tags...\n#\n"; + +my @in = grep m/\S/, split /[\n\r]/, q{ + NIX => NIX + sv => sv + en => en + hai => hai + + pt-br => pt-br pt + pt-br fr => pt-br pt fr + pt-br fr pt => pt-br fr pt + + pt-br fr pt de => pt-br fr pt de + de pt-br fr pt => de pt-br fr pt + de pt-br fr => de pt-br pt fr + hai pt-br fr => hai pt-br pt fr + + # Now test multi-part complicateds: + pt-br-janeiro => pt-br-janeiro pt-br pt + pt-br-janeiro fr => pt-br-janeiro pt-br pt fr + pt-br-janeiro de fr => pt-br-janeiro pt-br pt de fr + pt-br-janeiro de pt fr => pt-br-janeiro pt-br de pt fr + + pt-br-janeiro pt-br-saopaolo => pt-br-janeiro pt-br pt pt-br-saopaolo + pt-br-janeiro fr pt-br-saopaolo => pt-br-janeiro pt-br pt fr pt-br-saopaolo + pt-br-janeiro de pt-br-saopaolo fr => pt-br-janeiro pt-br pt de pt-br-saopaolo fr + pt-br-janeiro de pt-br fr pt-br-saopaolo => pt-br-janeiro de pt-br pt fr pt-br-saopaolo + + pt-br de en fr pt-br-janeiro => pt-br pt de en fr pt-br-janeiro + pt-br de en fr => pt-br pt de en fr + + ja pt-br-janeiro fr => ja pt-br-janeiro pt-br pt fr + ja pt-br-janeiro de fr => ja pt-br-janeiro pt-br pt de fr + ja pt-br-janeiro de pt fr => ja pt-br-janeiro pt-br de pt fr + + pt-br-janeiro de pt-br fr => pt-br-janeiro de pt-br pt fr +# an odd case, since we don't filter for uniqueness in this sub + +}; + +sub uniq { my %seen; return grep(!($seen{$_}++), @_); } + +foreach my $in ( @in ) { + $in =~ s/^\s+//s; + $in =~ s/\s+$//s; + $in =~ s/#.+//s; + next unless $in =~ m/\S/; + + die "What kind of line is <$in>?!" + unless $in =~ m/^(.+)=>(.+)$/s; + + my ($i,$s) = ($1, $2); + my @in = ($i =~ m/(\S+)/g); + my @should = ($s =~ m/(\S+)/g); + + my @out = uniq( Locale::Maketext->_add_supers( + ("@in" eq 'NIX') ? () : @in + ) ); + @out = 'NIX' unless @out; + + is_deeply( \@out, \@should, "Happily got [@out] from $in" ); +} diff --git a/dist/Locale-Maketext/t/60_super.t b/dist/Locale-Maketext/t/60_super.t new file mode 100644 index 0000000000..d54fc33007 --- /dev/null +++ b/dist/Locale-Maketext/t/60_super.t @@ -0,0 +1,37 @@ +#!/usr/bin/perl -Tw + +use strict; +use Test::More tests => 3; + +BEGIN { + use_ok( 'Locale::Maketext' ); +} + +{ + package Whunk::L10N; + use vars qw(@ISA %Lexicon); + @ISA = 'Locale::Maketext'; + %Lexicon = ('hello' => 'SROBLR!'); +} + +{ + package Whunk::L10N::en; + use vars qw(@ISA %Lexicon); + @ISA = 'Whunk::L10N'; + %Lexicon = ('hello' => 'HI AND STUFF!'); +} + +{ + package Whunk::L10N::zh_tw; + use vars qw(@ISA %Lexicon); + @ISA = 'Whunk::L10N'; + %Lexicon = ('hello' => 'NIHAU JOE!'); +} + +$ENV{'REQUEST_METHOD'} = 'GET'; +$ENV{'HTTP_ACCEPT_LANGUAGE'} = 'en-US, zh-TW'; + +my $x = Whunk::L10N->get_handle; +isa_ok( $x, 'Whunk::L10N::en' ); +print "# LH object: $x\n"; +is( $x->maketext('hello'), 'HI AND STUFF!' ); diff --git a/dist/Locale-Maketext/t/90_utf8.t b/dist/Locale-Maketext/t/90_utf8.t new file mode 100644 index 0000000000..a9677bb660 --- /dev/null +++ b/dist/Locale-Maketext/t/90_utf8.t @@ -0,0 +1,33 @@ +#!/usr/bin/perl -Tw + +use strict; +use Test::More tests => 3; + +BEGIN { + use_ok( 'Locale::Maketext', 1.01 ); +} + +use utf8; + +# declare some classes... +{ + package Woozle; + our @ISA = ('Locale::Maketext'); + sub dubbil { return $_[1] * 2 . chr(2000) } + sub numerate { return $_[2] . 'en' } +} +{ + package Woozle::eu_mt; + our @ISA = ('Woozle'); + our %Lexicon = ( + 'd2' => chr(1000) . 'hum [dubbil,_1]', + 'd3' => chr(1000) . 'hoo [quant,_1,zaz]', + 'd4' => chr(1000) . 'hoo [*,_1,zaz]', + ); + keys %Lexicon; # dodges the 'used only once' warning +} + +my $lh = Woozle->get_handle('eu-mt'); +isa_ok( $lh, 'Woozle::eu_mt' ); +is( $lh->maketext('d2', 7), chr(1000).'hum 14'.chr(2000) ); + |