Initial Commit

This commit is contained in:
Riley Schneider
2025-12-03 16:38:10 +01:00
parent c5e26bf594
commit b732d8d4b5
17680 changed files with 5977495 additions and 2 deletions

View File

@@ -0,0 +1,859 @@
package Locale::Maketext;
use strict;
our $USE_LITERALS;
use Carp ();
use I18N::LangTags ();
use I18N::LangTags::Detect ();
#--------------------------------------------------------------------------
BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } }
# define the constant 'DEBUG' at compile-time
# turn on utf8 if we have it (this is what GutsLoader.pm used to do essentially )
# use if (exists $INC{'utf8.pm'} || eval 'use utf8'), 'utf8';
BEGIN {
# if we have it || we can load it
if ( exists $INC{'utf8.pm'} || eval { local $SIG{'__DIE__'};require utf8; } ) {
utf8->import();
DEBUG and warn " utf8 on for _compile()\n";
}
else {
DEBUG and warn " utf8 not available for _compile() ($INC{'utf8.pm'})\n$@\n";
}
}
our $VERSION = '1.29';
our @ISA = ();
our $MATCH_SUPERS = 1;
our $MATCH_SUPERS_TIGHTLY = 1;
our $USING_LANGUAGE_TAGS = 1;
# Turning this off is somewhat of a security risk in that little or no
# checking will be done on the legality of tokens passed to the
# eval("use $module_name") in _try_use. If you turn this off, you have
# to do your own taint checking.
$USE_LITERALS = 1 unless defined $USE_LITERALS;
# a hint for compiling bracket-notation things.
my %isa_scan = ();
###########################################################################
sub quant {
my($handle, $num, @forms) = @_;
return $num if @forms == 0; # what should this mean?
return $forms[2] if @forms > 2 and $num == 0; # special zeroth case
# Normal case:
# Note that the formatting of $num is preserved.
return( $handle->numf($num) . ' ' . $handle->numerate($num, @forms) );
# Most human languages put the number phrase before the qualified phrase.
}
sub numerate {
# return this lexical item in a form appropriate to this number
my($handle, $num, @forms) = @_;
my $s = ($num == 1);
return '' unless @forms;
if(@forms == 1) { # only the headword form specified
return $s ? $forms[0] : ($forms[0] . 's'); # very cheap hack.
}
else { # sing and plural were specified
return $s ? $forms[0] : $forms[1];
}
}
#--------------------------------------------------------------------------
sub numf {
my($handle, $num) = @_[0,1];
if($num < 10_000_000_000 and $num > -10_000_000_000 and $num == int($num)) {
$num += 0; # Just use normal integer stringification.
# Specifically, don't let %G turn ten million into 1E+007
}
else {
$num = CORE::sprintf('%G', $num);
# "CORE::" is there to avoid confusion with the above sub sprintf.
}
while( $num =~ s/^([-+]?\d+)(\d{3})/$1,$2/s ) {1} # right from perlfaq5
# The initial \d+ gobbles as many digits as it can, and then we
# backtrack so it un-eats the rightmost three, and then we
# insert the comma there.
$num =~ tr<.,><,.> if ref($handle) and $handle->{'numf_comma'};
# This is just a lame hack instead of using Number::Format
return $num;
}
sub sprintf {
no integer;
my($handle, $format, @params) = @_;
return CORE::sprintf($format, @params);
# "CORE::" is there to avoid confusion with myself!
}
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
use integer; # vroom vroom... applies to the whole rest of the module
sub language_tag {
my $it = ref($_[0]) || $_[0];
return undef unless $it =~ m/([^':]+)(?:::)?$/s;
$it = lc($1);
$it =~ tr<_><->;
return $it;
}
sub encoding {
my $it = $_[0];
return(
(ref($it) && $it->{'encoding'})
|| 'iso-8859-1' # Latin-1
);
}
#--------------------------------------------------------------------------
sub fallback_languages { return('i-default', 'en', 'en-US') }
sub fallback_language_classes { return () }
#--------------------------------------------------------------------------
sub fail_with { # an actual attribute method!
my($handle, @params) = @_;
return unless ref($handle);
$handle->{'fail'} = $params[0] if @params;
return $handle->{'fail'};
}
#--------------------------------------------------------------------------
sub blacklist {
my ( $handle, @methods ) = @_;
unless ( defined $handle->{'blacklist'} ) {
no strict 'refs';
# Don't let people call methods they're not supposed to from maketext.
# Explicitly exclude all methods in this package that start with an
# underscore on principle.
$handle->{'blacklist'} = {
map { $_ => 1 } (
qw/
blacklist
encoding
fail_with
failure_handler_auto
fallback_language_classes
fallback_languages
get_handle
init
language_tag
maketext
new
whitelist
/, grep { /^_/ } keys %{ __PACKAGE__ . "::" }
),
};
}
if ( scalar @methods ) {
$handle->{'blacklist'} = { %{ $handle->{'blacklist'} }, map { $_ => 1 } @methods };
}
delete $handle->{'_external_lex_cache'};
return;
}
sub whitelist {
my ( $handle, @methods ) = @_;
if ( scalar @methods ) {
$handle->{'whitelist'} = {} unless defined $handle->{'whitelist'};
$handle->{'whitelist'} = { %{ $handle->{'whitelist'} }, map { $_ => 1 } @methods };
}
delete $handle->{'_external_lex_cache'};
return;
}
#--------------------------------------------------------------------------
sub failure_handler_auto {
# Meant to be used like:
# $handle->fail_with('failure_handler_auto')
my $handle = shift;
my $phrase = shift;
$handle->{'failure_lex'} ||= {};
my $lex = $handle->{'failure_lex'};
my $value ||= ($lex->{$phrase} ||= $handle->_compile($phrase));
# Dumbly copied from sub maketext:
return ${$value} if ref($value) eq 'SCALAR';
return $value if ref($value) ne 'CODE';
{
local $SIG{'__DIE__'};
eval { $value = &$value($handle, @_) };
}
# If we make it here, there was an exception thrown in the
# call to $value, and so scream:
if($@) {
# pretty up the error message
$@ =~ s{\s+at\s+\(eval\s+\d+\)\s+line\s+(\d+)\.?\n?}
{\n in bracket code [compiled line $1],}s;
#$err =~ s/\n?$/\n/s;
Carp::croak "Error in maketexting \"$phrase\":\n$@ as used";
# Rather unexpected, but suppose that the sub tried calling
# a method that didn't exist.
}
else {
return $value;
}
}
#==========================================================================
sub new {
# Nothing fancy!
my $class = ref($_[0]) || $_[0];
my $handle = bless {}, $class;
$handle->blacklist;
$handle->init;
return $handle;
}
sub init { return } # no-op
###########################################################################
sub maketext {
# Remember, this can fail. Failure is controllable many ways.
Carp::croak 'maketext requires at least one parameter' unless @_ > 1;
my($handle, $phrase) = splice(@_,0,2);
Carp::confess('No handle/phrase') unless (defined($handle) && defined($phrase));
# backup $@ in case it's still being used in the calling code.
# If no failures, we'll re-set it back to what it was later.
my $at = $@;
# Copy @_ case one of its elements is $@.
@_ = @_;
# Look up the value:
my $value;
if (exists $handle->{'_external_lex_cache'}{$phrase}) {
DEBUG and warn "* Using external lex cache version of \"$phrase\"\n";
$value = $handle->{'_external_lex_cache'}{$phrase};
}
else {
foreach my $h_r (
@{ $isa_scan{ref($handle) || $handle} || $handle->_lex_refs }
) {
DEBUG and warn "* Looking up \"$phrase\" in $h_r\n";
if(exists $h_r->{$phrase}) {
DEBUG and warn " Found \"$phrase\" in $h_r\n";
unless(ref($value = $h_r->{$phrase})) {
# Nonref means it's not yet compiled. Compile and replace.
if ($handle->{'use_external_lex_cache'}) {
$value = $handle->{'_external_lex_cache'}{$phrase} = $handle->_compile($value);
}
else {
$value = $h_r->{$phrase} = $handle->_compile($value);
}
}
last;
}
# extending packages need to be able to localize _AUTO and if readonly can't "local $h_r->{'_AUTO'} = 1;"
# but they can "local $handle->{'_external_lex_cache'}{'_AUTO'} = 1;"
elsif($phrase !~ m/^_/s and ($handle->{'use_external_lex_cache'} ? ( exists $handle->{'_external_lex_cache'}{'_AUTO'} ? $handle->{'_external_lex_cache'}{'_AUTO'} : $h_r->{'_AUTO'} ) : $h_r->{'_AUTO'})) {
# it's an auto lex, and this is an autoable key!
DEBUG and warn " Automaking \"$phrase\" into $h_r\n";
if ($handle->{'use_external_lex_cache'}) {
$value = $handle->{'_external_lex_cache'}{$phrase} = $handle->_compile($phrase);
}
else {
$value = $h_r->{$phrase} = $handle->_compile($phrase);
}
last;
}
DEBUG>1 and print " Not found in $h_r, nor automakable\n";
# else keep looking
}
}
unless(defined($value)) {
DEBUG and warn "! Lookup of \"$phrase\" in/under ", ref($handle) || $handle, " fails.\n";
if(ref($handle) and $handle->{'fail'}) {
DEBUG and warn "WARNING0: maketext fails looking for <$phrase>\n";
my $fail;
if(ref($fail = $handle->{'fail'}) eq 'CODE') { # it's a sub reference
$@ = $at; # Put $@ back in case we altered it along the way.
return &{$fail}($handle, $phrase, @_);
# If it ever returns, it should return a good value.
}
else { # It's a method name
$@ = $at; # Put $@ back in case we altered it along the way.
return $handle->$fail($phrase, @_);
# If it ever returns, it should return a good value.
}
}
else {
# All we know how to do is this;
Carp::croak("maketext doesn't know how to say:\n$phrase\nas needed");
}
}
if(ref($value) eq 'SCALAR'){
$@ = $at; # Put $@ back in case we altered it along the way.
return $$value ;
}
if(ref($value) ne 'CODE'){
$@ = $at; # Put $@ back in case we altered it along the way.
return $value ;
}
{
local $SIG{'__DIE__'};
eval { $value = &$value($handle, @_) };
}
# If we make it here, there was an exception thrown in the
# call to $value, and so scream:
if ($@) {
# pretty up the error message
$@ =~ s{\s+at\s+\(eval\s+\d+\)\s+line\s+(\d+)\.?\n?}
{\n in bracket code [compiled line $1],}s;
#$err =~ s/\n?$/\n/s;
Carp::croak "Error in maketexting \"$phrase\":\n$@ as used";
# Rather unexpected, but suppose that the sub tried calling
# a method that didn't exist.
}
else {
$@ = $at; # Put $@ back in case we altered it along the way.
return $value;
}
$@ = $at; # Put $@ back in case we altered it along the way.
}
###########################################################################
sub get_handle { # This is a constructor and, yes, it CAN FAIL.
# Its class argument has to be the base class for the current
# application's l10n files.
my($base_class, @languages) = @_;
$base_class = ref($base_class) || $base_class;
# Complain if they use __PACKAGE__ as a project base class?
if( @languages ) {
DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
if($USING_LANGUAGE_TAGS) { # An explicit language-list was given!
@languages =
map {; $_, I18N::LangTags::alternate_language_tags($_) }
# Catch alternation
map I18N::LangTags::locale2language_tag($_),
# If it's a lg tag, fine, pass thru (untainted)
# If it's a locale ID, try converting to a lg tag (untainted),
# otherwise nix it.
@languages;
DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
}
}
else {
@languages = $base_class->_ambient_langprefs;
}
@languages = $base_class->_langtag_munging(@languages);
my %seen;
foreach my $module_name ( map { $base_class . '::' . $_ } @languages ) {
next unless length $module_name; # sanity
next if $seen{$module_name}++ # Already been here, and it was no-go
|| !&_try_use($module_name); # Try to use() it, but can't it.
return($module_name->new); # Make it!
}
return undef; # Fail!
}
###########################################################################
sub _langtag_munging {
my($base_class, @languages) = @_;
# We have all these DEBUG statements because otherwise it's hard as hell
# to diagnose if/when something goes wrong.
DEBUG and warn 'Lgs1: ', map("<$_>", @languages), "\n";
if($USING_LANGUAGE_TAGS) {
DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
@languages = $base_class->_add_supers( @languages );
push @languages, I18N::LangTags::panic_languages(@languages);
DEBUG and warn "After adding panic languages:\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
push @languages, $base_class->fallback_languages;
# You are free to override fallback_languages to return empty-list!
DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
@languages = # final bit of processing to turn them into classname things
map {
my $it = $_; # copy
$it =~ tr<-A-Z><_a-z>; # lc, and turn - to _
$it =~ tr<_a-z0-9><>cd; # remove all but a-z0-9_
$it;
} @languages
;
DEBUG and warn "Nearing end of munging:\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
}
else {
DEBUG and warn "Bypassing language-tags.\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
}
DEBUG and warn "Before adding fallback classes:\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
push @languages, $base_class->fallback_language_classes;
# You are free to override that to return whatever.
DEBUG and warn "Finally:\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
return @languages;
}
###########################################################################
sub _ambient_langprefs {
return I18N::LangTags::Detect::detect();
}
###########################################################################
sub _add_supers {
my($base_class, @languages) = @_;
if (!$MATCH_SUPERS) {
# Nothing
DEBUG and warn "Bypassing any super-matching.\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
}
elsif( $MATCH_SUPERS_TIGHTLY ) {
DEBUG and warn "Before adding new supers tightly:\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
@languages = I18N::LangTags::implicate_supers( @languages );
DEBUG and warn "After adding new supers tightly:\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
}
else {
DEBUG and warn "Before adding supers to end:\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
@languages = I18N::LangTags::implicate_supers_strictly( @languages );
DEBUG and warn "After adding supers to end:\n",
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
}
return @languages;
}
###########################################################################
#
# This is where most people should stop reading.
#
###########################################################################
my %tried = ();
# memoization of whether we've used this module, or found it unusable.
sub _try_use { # Basically a wrapper around "require Modulename"
# "Many men have tried..." "They tried and failed?" "They tried and died."
return $tried{$_[0]} if exists $tried{$_[0]}; # memoization
my $module = $_[0]; # ASSUME sane module name!
{ no strict 'refs';
no warnings 'once';
return($tried{$module} = 1)
if %{$module . '::Lexicon'} or @{$module . '::ISA'};
# weird case: we never use'd it, but there it is!
}
DEBUG and warn " About to use $module ...\n";
local $SIG{'__DIE__'};
local $@;
local @INC = @INC;
pop @INC if $INC[-1] eq '.';
eval "require $module"; # used to be "use $module", but no point in that.
if($@) {
DEBUG and warn "Error using $module \: $@\n";
return $tried{$module} = 0;
}
else {
DEBUG and warn " OK, $module is used\n";
return $tried{$module} = 1;
}
}
#--------------------------------------------------------------------------
sub _lex_refs { # report the lexicon references for this handle's class
# returns an arrayREF!
no strict 'refs';
no warnings 'once';
my $class = ref($_[0]) || $_[0];
DEBUG and warn "Lex refs lookup on $class\n";
return $isa_scan{$class} if exists $isa_scan{$class}; # memoization!
my @lex_refs;
my $seen_r = ref($_[1]) ? $_[1] : {};
if( defined( *{$class . '::Lexicon'}{'HASH'} )) {
push @lex_refs, *{$class . '::Lexicon'}{'HASH'};
DEBUG and warn '%' . $class . '::Lexicon contains ',
scalar(keys %{$class . '::Lexicon'}), " entries\n";
}
# Implements depth(height?)-first recursive searching of superclasses.
# In hindsight, I suppose I could have just used Class::ISA!
foreach my $superclass (@{$class . '::ISA'}) {
DEBUG and warn " Super-class search into $superclass\n";
next if $seen_r->{$superclass}++;
push @lex_refs, @{&_lex_refs($superclass, $seen_r)}; # call myself
}
$isa_scan{$class} = \@lex_refs; # save for next time
return \@lex_refs;
}
sub clear_isa_scan { %isa_scan = (); return; } # end on a note of simplicity!
#--------------------------------------------------------------------------
sub _compile {
# This big scary routine compiles an entry.
# It returns either a coderef if there's brackety bits in this, or
# otherwise a ref to a scalar.
my $string_to_compile = $_[1]; # There are taint issues using regex on @_ - perlbug 60378,27344
# The while() regex is more expensive than this check on strings that don't need a compile.
# this op causes a ~2% speed hit for strings that need compile and a 250% speed improvement
# on strings that don't need compiling.
return \"$string_to_compile" if($string_to_compile !~ m/[\[~\]]/ms); # return a string ref if chars [~] are not in the string
my $handle = $_[0];
my(@code);
my(@c) = (''); # "chunks" -- scratch.
my $call_count = 0;
my $big_pile = '';
{
my $in_group = 0; # start out outside a group
my($m, @params); # scratch
while($string_to_compile =~ # Iterate over chunks.
m/(
[^\~\[\]]+ # non-~[] stuff (Capture everything else here)
|
~. # ~[, ~], ~~, ~other
|
\[ # [ presumably opening a group
|
\] # ] presumably closing a group
|
~ # terminal ~ ?
|
$
)/xgs
) {
DEBUG>2 and warn qq{ "$1"\n};
if($1 eq '[' or $1 eq '') { # "[" or end
# Whether this is "[" or end, force processing of any
# preceding literal.
if($in_group) {
if($1 eq '') {
$handle->_die_pointing($string_to_compile, 'Unterminated bracket group');
}
else {
$handle->_die_pointing($string_to_compile, 'You can\'t nest bracket groups');
}
}
else {
if ($1 eq '') {
DEBUG>2 and warn " [end-string]\n";
}
else {
$in_group = 1;
}
die "How come \@c is empty?? in <$string_to_compile>" unless @c; # sanity
if(length $c[-1]) {
# Now actually processing the preceding literal
$big_pile .= $c[-1];
if($USE_LITERALS and (
(ord('A') == 65)
? $c[-1] !~ m/[^\x20-\x7E]/s
# ASCII very safe chars
: $c[-1] !~ m/[^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~\x07]/s
# EBCDIC very safe chars
)) {
# normal case -- all very safe chars
$c[-1] =~ s/'/\\'/g;
push @code, q{ '} . $c[-1] . "',\n";
$c[-1] = ''; # reuse this slot
}
else {
$c[-1] =~ s/\\\\/\\/g;
push @code, ' $c[' . $#c . "],\n";
push @c, ''; # new chunk
}
}
# else just ignore the empty string.
}
}
elsif($1 eq ']') { # "]"
# close group -- go back in-band
if($in_group) {
$in_group = 0;
DEBUG>2 and warn " --Closing group [$c[-1]]\n";
# And now process the group...
if(!length($c[-1]) or $c[-1] =~ m/^\s+$/s) {
DEBUG>2 and warn " -- (Ignoring)\n";
$c[-1] = ''; # reset out chink
next;
}
#$c[-1] =~ s/^\s+//s;
#$c[-1] =~ s/\s+$//s;
($m,@params) = split(/,/, $c[-1], -1); # was /\s*,\s*/
# A bit of a hack -- we've turned "~,"'s into DELs, so turn
# 'em into real commas here.
if (ord('A') == 65) { # ASCII, etc
foreach($m, @params) { tr/\x7F/,/ }
}
else { # EBCDIC (1047, 0037, POSIX-BC)
# Thanks to Peter Prymmer for the EBCDIC handling
foreach($m, @params) { tr/\x07/,/ }
}
# Special-case handling of some method names:
if($m eq '_*' or $m =~ m/^_(-?\d+)$/s) {
# Treat [_1,...] as [,_1,...], etc.
unshift @params, $m;
$m = '';
}
elsif($m eq '*') {
$m = 'quant'; # "*" for "times": "4 cars" is 4 times "cars"
}
elsif($m eq '#') {
$m = 'numf'; # "#" for "number": [#,_1] for "the number _1"
}
# Most common case: a simple, legal-looking method name
if($m eq '') {
# 0-length method name means to just interpolate:
push @code, ' (';
}
elsif($m =~ /^\w+$/s
&& !$handle->{'blacklist'}{$m}
&& ( !defined $handle->{'whitelist'} || $handle->{'whitelist'}{$m} )
# exclude anything fancy and restrict to the whitelist/blacklist.
) {
push @code, ' $_[0]->' . $m . '(';
}
else {
# TODO: implement something? or just too icky to consider?
$handle->_die_pointing(
$string_to_compile,
"Can't use \"$m\" as a method name in bracket group",
2 + length($c[-1])
);
}
pop @c; # we don't need that chunk anymore
++$call_count;
foreach my $p (@params) {
if($p eq '_*') {
# Meaning: all parameters except $_[0]
$code[-1] .= ' @_[1 .. $#_], ';
# and yes, that does the right thing for all @_ < 3
}
elsif($p =~ m/^_(-?\d+)$/s) {
# _3 meaning $_[3]
$code[-1] .= '$_[' . (0 + $1) . '], ';
}
elsif($USE_LITERALS and (
(ord('A') == 65)
? $p !~ m/[^\x20-\x7E]/s
# ASCII very safe chars
: $p !~ m/[^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~\x07]/s
# EBCDIC very safe chars
)) {
# Normal case: a literal containing only safe characters
$p =~ s/'/\\'/g;
$code[-1] .= q{'} . $p . q{', };
}
else {
# Stow it on the chunk-stack, and just refer to that.
push @c, $p;
push @code, ' $c[' . $#c . '], ';
}
}
$code[-1] .= "),\n";
push @c, '';
}
else {
$handle->_die_pointing($string_to_compile, q{Unbalanced ']'});
}
}
elsif(substr($1,0,1) ne '~') {
# it's stuff not containing "~" or "[" or "]"
# i.e., a literal blob
my $text = $1;
$text =~ s/\\/\\\\/g;
$c[-1] .= $text;
}
elsif($1 eq '~~') { # "~~"
$c[-1] .= '~';
}
elsif($1 eq '~[') { # "~["
$c[-1] .= '[';
}
elsif($1 eq '~]') { # "~]"
$c[-1] .= ']';
}
elsif($1 eq '~,') { # "~,"
if($in_group) {
# This is a hack, based on the assumption that no-one will actually
# want a DEL inside a bracket group. Let's hope that's it's true.
if (ord('A') == 65) { # ASCII etc
$c[-1] .= "\x7F";
}
else { # EBCDIC (cp 1047, 0037, POSIX-BC)
$c[-1] .= "\x07";
}
}
else {
$c[-1] .= '~,';
}
}
elsif($1 eq '~') { # possible only at string-end, it seems.
$c[-1] .= '~';
}
else {
# It's a "~X" where X is not a special character.
# Consider it a literal ~ and X.
my $text = $1;
$text =~ s/\\/\\\\/g;
$c[-1] .= $text;
}
}
}
if($call_count) {
undef $big_pile; # Well, nevermind that.
}
else {
# It's all literals! Ahwell, that can happen.
# So don't bother with the eval. Return a SCALAR reference.
return \$big_pile;
}
die q{Last chunk isn't null??} if @c and length $c[-1]; # sanity
DEBUG and warn scalar(@c), " chunks under closure\n";
if(@code == 0) { # not possible?
DEBUG and warn "Empty code\n";
return \'';
}
elsif(@code > 1) { # most cases, presumably!
unshift @code, "join '',\n";
}
unshift @code, "use strict; sub {\n";
push @code, "}\n";
DEBUG and warn @code;
my $sub = eval(join '', @code);
die "$@ while evalling" . join('', @code) if $@; # Should be impossible.
return $sub;
}
#--------------------------------------------------------------------------
sub _die_pointing {
# This is used by _compile to throw a fatal error
my $target = shift;
$target = ref($target) || $target; # class name
# ...leaving $_[0] the error-causing text, and $_[1] the error message
my $i = index($_[0], "\n");
my $pointy;
my $pos = pos($_[0]) - (defined($_[2]) ? $_[2] : 0) - 1;
if($pos < 1) {
$pointy = "^=== near there\n";
}
else { # we need to space over
my $first_tab = index($_[0], "\t");
if($pos > 2 and ( -1 == $first_tab or $first_tab > pos($_[0]))) {
# No tabs, or the first tab is harmlessly after where we will point to,
# AND we're far enough from the margin that we can draw a proper arrow.
$pointy = ('=' x $pos) . "^ near there\n";
}
else {
# tabs screw everything up!
$pointy = substr($_[0],0,$pos);
$pointy =~ tr/\t //cd;
# make everything into whitespace, but preserving tabs
$pointy .= "^=== near there\n";
}
}
my $errmsg = "$_[1], in\:\n$_[0]";
if($i == -1) {
# No newline.
$errmsg .= "\n" . $pointy;
}
elsif($i == (length($_[0]) - 1) ) {
# Already has a newline at end.
$errmsg .= $pointy;
}
else {
# don't bother with the pointy bit, I guess.
}
Carp::croak( "$errmsg via $target, as used" );
}
1;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,150 @@
# This document contains text in Perl "POD" format.
# Use a POD viewer like perldoc or perlman to render it.
=encoding utf-8
=head1 NAME
Locale::Maketext::Cookbook - recipes for using Locale::Maketext
=head1 INTRODUCTION
This is a work in progress. Not much progress by now :-)
=head1 ONESIDED LEXICONS
I<Adapted from a suggestion by Dan Muey>
It may be common (for example at your main lexicon) that
the hash keys and values coincide. Like that
q{Hello, tell me your name}
=> q{Hello, tell me your name}
It would be nice to just write:
q{Hello, tell me your name} => ''
and have this magically inflated to the first form.
Among the advantages of such representation, that would
lead to
smaller files, less prone to mistyping or mispasting,
and handy to someone translating it which can simply
copy the main lexicon and enter the translation
instead of having to remove the value first.
That can be achieved by overriding C<init>
in your class and working on the main lexicon
with code like that:
package My::I18N;
...
sub init {
my $lh = shift; # a newborn handle
$lh->SUPER::init();
inflate_lexicon(\%My::I18N::en::Lexicon);
return;
}
sub inflate_lexicon {
my $lex = shift;
while (my ($k, $v) = each %$lex) {
$v = $k if !defined $v || $v eq '';
}
}
Here we are assuming C<My::I18N::en> to own the
main lexicon.
There are some downsides here: the size economy
will not stand at runtime after this C<init()>
runs. But it should not be that critical, since
if you don't have space for that, you won't have
space for any other language besides the main one
as well. You could do that too with ties,
expanding the value at lookup time which
should be more time expensive as an option.
=head1 DECIMAL PLACES IN NUMBER FORMATTING
I<After CPAN RT #36136 (L<https://rt.cpan.org/Ticket/Display.html?id=36136>)>
The documentation of L<Locale::Maketext> advises that
the standard bracket method C<numf> is limited and that
you must override that for better results. It even
suggests the use of L<Number::Format>.
One such defect of standard C<numf> is to not be
able to use a certain decimal precision.
For example,
$lh->maketext('pi is [numf,_1]', 355/113);
outputs
pi is 3.14159292035398
Since pi ≈ 355/116 is only accurate
to 6 decimal places, you would want to say:
$lh->maketext('pi is [numf,_1,6]', 355/113);
and get "pi is 3.141592".
One solution for that could use C<Number::Format>
like that:
package Wuu;
use base qw(Locale::Maketext);
use Number::Format;
# can be overridden according to language conventions
sub _numf_params {
return (
-thousands_sep => '.',
-decimal_point => ',',
-decimal_digits => 2,
);
}
# builds a Number::Format
sub _numf_formatter {
my ($lh, $scale) = @_;
my @params = $lh->_numf_params;
if ($scale) { # use explicit scale rather than default
push @params, (-decimal_digits => $scale);
}
return Number::Format->new(@params);
}
sub numf {
my ($lh, $n, $scale) = @_;
# get the (cached) formatter
my $nf = $lh->{__nf}{$scale} ||= $lh->_numf_formatter($scale);
# format the number itself
return $nf->format_number($n);
}
package Wuu::pt;
use base qw(Wuu);
and then
my $lh = Wuu->get_handle('pt');
$lh->maketext('A [numf,_1,3] km de distância', 1550.2222);
would return "A 1.550,222 km de distância".
Notice that the standard utility methods of
C<Locale::Maketext> are irremediably limited
because they could not aim to do everything
that could be expected from them in different languages,
cultures and applications. So extending C<numf>,
C<quant>, and C<sprintf> is natural as soon
as your needs exceed what the standard ones do.

View File

@@ -0,0 +1,24 @@
package Locale::Maketext::Guts;
use Locale::Maketext;
our $VERSION = '1.20';
=head1 NAME
Locale::Maketext::Guts - Deprecated module to load Locale::Maketext utf8 code
=head1 SYNOPSIS
# Do this instead please
use Locale::Maketext
=head1 DESCRIPTION
Previously Local::Maketext::GutsLoader performed some magic to load
Locale::Maketext when utf8 was unavailable. The subs this module provided
were merged back into Locale::Maketext
=cut
1;

View File

@@ -0,0 +1,26 @@
package Locale::Maketext::GutsLoader;
use Locale::Maketext;
our $VERSION = '1.20';
sub zorp { return scalar @_ }
=head1 NAME
Locale::Maketext::GutsLoader - Deprecated module to load Locale::Maketext utf8 code
=head1 SYNOPSIS
# Do this instead please
use Locale::Maketext
=head1 DESCRIPTION
Previously Locale::Maketext::Guts performed some magic to load
Locale::Maketext when utf8 was unavailable. The subs this module provided
were merged back into Locale::Maketext.
=cut
1;

View File

@@ -0,0 +1,346 @@
package Locale::Maketext::Simple;
$Locale::Maketext::Simple::VERSION = '0.21_01';
use strict;
use 5.005;
=head1 NAME
Locale::Maketext::Simple - Simple interface to Locale::Maketext::Lexicon
=head1 VERSION
This document describes version 0.18 of Locale::Maketext::Simple,
released Septermber 8, 2006.
=head1 SYNOPSIS
Minimal setup (looks for F<auto/Foo/*.po> and F<auto/Foo/*.mo>):
package Foo;
use Locale::Maketext::Simple; # exports 'loc'
loc_lang('fr'); # set language to French
sub hello {
print loc("Hello, [_1]!", "World");
}
More sophisticated example:
package Foo::Bar;
use Locale::Maketext::Simple (
Class => 'Foo', # search in auto/Foo/
Style => 'gettext', # %1 instead of [_1]
Export => 'maketext', # maketext() instead of loc()
Subclass => 'L10N', # Foo::L10N instead of Foo::I18N
Decode => 1, # decode entries to unicode-strings
Encoding => 'locale', # but encode lexicons in current locale
# (needs Locale::Maketext::Lexicon 0.36)
);
sub japh {
print maketext("Just another %1 hacker", "Perl");
}
=head1 DESCRIPTION
This module is a simple wrapper around B<Locale::Maketext::Lexicon>,
designed to alleviate the need of creating I<Language Classes> for
module authors.
The language used is chosen from the loc_lang call. If a lookup is not
possible, the i-default language will be used. If the lookup is not in the
i-default language, then the key will be returned.
If B<Locale::Maketext::Lexicon> is not present, it implements a
minimal localization function by simply interpolating C<[_1]> with
the first argument, C<[_2]> with the second, etc. Interpolated
function like C<[quant,_1]> are treated as C<[_1]>, with the sole
exception of C<[tense,_1,X]>, which will append C<ing> to C<_1> when
X is C<present>, or appending C<ed> to <_1> otherwise.
=head1 OPTIONS
All options are passed either via the C<use> statement, or via an
explicit C<import>.
=head2 Class
By default, B<Locale::Maketext::Simple> draws its source from the
calling package's F<auto/> directory; you can override this behaviour
by explicitly specifying another package as C<Class>.
=head2 Path
If your PO and MO files are under a path elsewhere than C<auto/>,
you may specify it using the C<Path> option.
=head2 Style
By default, this module uses the C<maketext> style of C<[_1]> and
C<[quant,_1]> for interpolation. Alternatively, you can specify the
C<gettext> style, which uses C<%1> and C<%quant(%1)> for interpolation.
This option is case-insensitive.
=head2 Export
By default, this module exports a single function, C<loc>, into its
caller's namespace. You can set it to another name, or set it to
an empty string to disable exporting.
=head2 Subclass
By default, this module creates an C<::I18N> subclass under the
caller's package (or the package specified by C<Class>), and stores
lexicon data in its subclasses. You can assign a name other than
C<I18N> via this option.
=head2 Decode
If set to a true value, source entries will be converted into
utf8-strings (available in Perl 5.6.1 or later). This feature
needs the B<Encode> or B<Encode::compat> module.
=head2 Encoding
Specifies an encoding to store lexicon entries, instead of
utf8-strings. If set to C<locale>, the encoding from the current
locale setting is used. Implies a true value for C<Decode>.
=cut
sub import {
my ($class, %args) = @_;
$args{Class} ||= caller;
$args{Style} ||= 'maketext';
$args{Export} ||= 'loc';
$args{Subclass} ||= 'I18N';
my ($loc, $loc_lang) = $class->load_loc(%args);
$loc ||= $class->default_loc(%args);
no strict 'refs';
*{caller(0) . "::$args{Export}"} = $loc if $args{Export};
*{caller(0) . "::$args{Export}_lang"} = $loc_lang || sub { 1 };
}
my %Loc;
sub reload_loc { %Loc = () }
sub load_loc {
my ($class, %args) = @_;
my $pkg = join('::', grep { defined and length } $args{Class}, $args{Subclass});
return $Loc{$pkg} if exists $Loc{$pkg};
eval {
local @INC = @INC;
pop @INC if $INC[-1] eq '.';
require Locale::Maketext::Lexicon;
1
} or return;
$Locale::Maketext::Lexicon::VERSION > 0.20 or return;
eval { require File::Spec; 1 } or return;
my $path = $args{Path} || $class->auto_path($args{Class}) or return;
my $pattern = File::Spec->catfile($path, '*.[pm]o');
my $decode = $args{Decode} || 0;
my $encoding = $args{Encoding} || undef;
$decode = 1 if $encoding;
$pattern =~ s{\\}{/}g; # to counter win32 paths
eval "
package $pkg;
use base 'Locale::Maketext';
Locale::Maketext::Lexicon->import({
'i-default' => [ 'Auto' ],
'*' => [ Gettext => \$pattern ],
_decode => \$decode,
_encoding => \$encoding,
});
*${pkg}::Lexicon = \\%${pkg}::i_default::Lexicon;
*tense = sub { \$_[1] . ((\$_[2] eq 'present') ? 'ing' : 'ed') }
unless defined &tense;
1;
" or die $@;
my $lh = eval { $pkg->get_handle } or return;
my $style = lc($args{Style});
if ($style eq 'maketext') {
$Loc{$pkg} = sub {
$lh->maketext(@_)
};
}
elsif ($style eq 'gettext') {
$Loc{$pkg} = sub {
my $str = shift;
$str =~ s{([\~\[\]])}{~$1}g;
$str =~ s{
([%\\]%) # 1 - escaped sequence
|
% (?:
([A-Za-z#*]\w*) # 2 - function call
\(([^\)]*)\) # 3 - arguments
|
([1-9]\d*|\*) # 4 - variable
)
}{
$1 ? $1
: $2 ? "\[$2,"._unescape($3)."]"
: "[_$4]"
}egx;
return $lh->maketext($str, @_);
};
}
else {
die "Unknown Style: $style";
}
return $Loc{$pkg}, sub {
$lh = $pkg->get_handle(@_);
};
}
sub default_loc {
my ($self, %args) = @_;
my $style = lc($args{Style});
if ($style eq 'maketext') {
return sub {
my $str = shift;
$str =~ s{((?<!~)(?:~~)*)\[_([1-9]\d*|\*)\]}
{$1%$2}g;
$str =~ s{((?<!~)(?:~~)*)\[([A-Za-z#*]\w*),([^\]]+)\]}
{"$1%$2(" . _escape($3) . ')'}eg;
_default_gettext($str, @_);
};
}
elsif ($style eq 'gettext') {
return \&_default_gettext;
}
else {
die "Unknown Style: $style";
}
}
sub _default_gettext {
my $str = shift;
$str =~ s{
% # leading symbol
(?: # either one of
\d+ # a digit, like %1
| # or
(\w+)\( # a function call -- 1
(?: # either
%\d+ # an interpolation
| # or
([^,]*) # some string -- 2
) # end either
(?: # maybe followed
, # by a comma
([^),]*) # and a param -- 3
)? # end maybe
(?: # maybe followed
, # by another comma
([^),]*) # and a param -- 4
)? # end maybe
[^)]* # and other ignorable params
\) # closing function call
) # closing either one of
}{
my $digit = $2 || shift;
$digit . (
$1 ? (
($1 eq 'tense') ? (($3 eq 'present') ? 'ing' : 'ed') :
($1 eq 'quant') ? ' ' . (($digit > 1) ? ($4 || "$3s") : $3) :
''
) : ''
);
}egx;
return $str;
};
sub _escape {
my $text = shift;
$text =~ s/\b_([1-9]\d*)/%$1/g;
return $text;
}
sub _unescape {
join(',', map {
/\A(\s*)%([1-9]\d*|\*)(\s*)\z/ ? "$1_$2$3" : $_
} split(/,/, $_[0]));
}
sub auto_path {
my ($self, $calldir) = @_;
$calldir =~ s#::#/#g;
my $path = $INC{$calldir . '.pm'} or return;
# Try absolute path name.
if ($^O eq 'MacOS') {
(my $malldir = $calldir) =~ tr#/#:#;
$path =~ s#^(.*)$malldir\.pm\z#$1auto:$malldir:#s;
} else {
$path =~ s#^(.*)$calldir\.pm\z#$1auto/$calldir/#;
}
return $path if -d $path;
# If that failed, try relative path with normal @INC searching.
$path = "auto/$calldir/";
foreach my $inc (@INC) {
return "$inc/$path" if -d "$inc/$path";
}
return;
}
1;
=head1 ACKNOWLEDGMENTS
Thanks to Jos I. Boumans for suggesting this module to be written.
Thanks to Chia-Liang Kao for suggesting C<Path> and C<loc_lang>.
=head1 SEE ALSO
L<Locale::Maketext>, L<Locale::Maketext::Lexicon>
=head1 AUTHORS
Audrey Tang E<lt>cpan@audreyt.orgE<gt>
=head1 COPYRIGHT
Copyright 2003, 2004, 2005, 2006 by Audrey Tang E<lt>cpan@audreyt.orgE<gt>.
This software is released under the MIT license cited below. Additionally,
when this software is distributed with B<Perl Kit, Version 5>, you may also
redistribute it and/or modify it under the same terms as Perl itself.
=head2 The "MIT" License
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
=cut

View File

@@ -0,0 +1,775 @@
# This document contains text in Perl "POD" format.
# Use a POD viewer like perldoc or perlman to render it.
=head1 NAME
Locale::Maketext::TPJ13 -- article about software localization
=head1 SYNOPSIS
# This an article, not a module.
=head1 DESCRIPTION
The following article by Sean M. Burke and Jordan Lachler
first appeared in I<The Perl Journal> #13
and is copyright 1999 The Perl Journal. It appears
courtesy of Jon Orwant and The Perl Journal. This document may be
distributed under the same terms as Perl itself.
=head1 Localization and Perl: gettext breaks, Maketext fixes
by Sean M. Burke and Jordan Lachler
This article points out cases where gettext (a common system for
localizing software interfaces -- i.e., making them work in the user's
language of choice) fails because of basic differences between human
languages. This article then describes Maketext, a new system capable
of correctly treating these differences.
=head2 A Localization Horror Story: It Could Happen To You
=over
"There are a number of languages spoken by human beings in this
world."
-- Harald Tveit Alvestrand, in RFC 1766, "Tags for the
Identification of Languages"
=back
Imagine that your task for the day is to localize a piece of software
-- and luckily for you, the only output the program emits is two
messages, like this:
I scanned 12 directories.
Your query matched 10 files in 4 directories.
So how hard could that be? You look at the code that
produces the first item, and it reads:
printf("I scanned %g directories.",
$directory_count);
You think about that, and realize that it doesn't even work right for
English, as it can produce this output:
I scanned 1 directories.
So you rewrite it to read:
printf("I scanned %g %s.",
$directory_count,
$directory_count == 1 ?
"directory" : "directories",
);
...which does the Right Thing. (In case you don't recall, "%g" is for
locale-specific number interpolation, and "%s" is for string
interpolation.)
But you still have to localize it for all the languages you're
producing this software for, so you pull Locale::gettext off of CPAN
so you can access the C<gettext> C functions you've heard are standard
for localization tasks.
And you write:
printf(gettext("I scanned %g %s."),
$dir_scan_count,
$dir_scan_count == 1 ?
gettext("directory") : gettext("directories"),
);
But you then read in the gettext manual (Drepper, Miller, and Pinard 1995)
that this is not a good idea, since how a single word like "directory"
or "directories" is translated may depend on context -- and this is
true, since in a case language like German or Russian, you'd may need
these words with a different case ending in the first instance (where the
word is the object of a verb) than in the second instance, which you haven't even
gotten to yet (where the word is the object of a preposition, "in %g
directories") -- assuming these keep the same syntax when translated
into those languages.
So, on the advice of the gettext manual, you rewrite:
printf( $dir_scan_count == 1 ?
gettext("I scanned %g directory.") :
gettext("I scanned %g directories."),
$dir_scan_count );
So, you email your various translators (the boss decides that the
languages du jour are Chinese, Arabic, Russian, and Italian, so you
have one translator for each), asking for translations for "I scanned
%g directory." and "I scanned %g directories.". When they reply,
you'll put that in the lexicons for gettext to use when it localizes
your software, so that when the user is running under the "zh"
(Chinese) locale, gettext("I scanned %g directory.") will return the
appropriate Chinese text, with a "%g" in there where printf can then
interpolate $dir_scan.
Your Chinese translator emails right back -- he says both of these
phrases translate to the same thing in Chinese, because, in linguistic
jargon, Chinese "doesn't have number as a grammatical category" --
whereas English does. That is, English has grammatical rules that
refer to "number", i.e., whether something is grammatically singular
or plural; and one of these rules is the one that forces nouns to take
a plural suffix (generally "s") when in a plural context, as they are when
they follow a number other than "one" (including, oddly enough, "zero").
Chinese has no such rules, and so has just the one phrase where English
has two. But, no problem, you can have this one Chinese phrase appear
as the translation for the two English phrases in the "zh" gettext
lexicon for your program.
Emboldened by this, you dive into the second phrase that your software
needs to output: "Your query matched 10 files in 4 directories.". You notice
that if you want to treat phrases as indivisible, as the gettext
manual wisely advises, you need four cases now, instead of two, to
cover the permutations of singular and plural on the two items,
$dir_count and $file_count. So you try this:
printf( $file_count == 1 ?
( $directory_count == 1 ?
gettext("Your query matched %g file in %g directory.") :
gettext("Your query matched %g file in %g directories.") ) :
( $directory_count == 1 ?
gettext("Your query matched %g files in %g directory.") :
gettext("Your query matched %g files in %g directories.") ),
$file_count, $directory_count,
);
(The case of "1 file in 2 [or more] directories" could, I suppose,
occur in the case of symlinking or something of the sort.)
It occurs to you that this is not the prettiest code you've ever
written, but this seems the way to go. You mail off to the
translators asking for translations for these four cases. The
Chinese guy replies with the one phrase that these all translate to in
Chinese, and that phrase has two "%g"s in it, as it should -- but
there's a problem. He translates it word-for-word back: "In %g
directories contains %g files match your query." The %g
slots are in an order reverse to what they are in English. You wonder
how you'll get gettext to handle that.
But you put it aside for the moment, and optimistically hope that the
other translators won't have this problem, and that their languages
will be better behaved -- i.e., that they will be just like English.
But the Arabic translator is the next to write back. First off, your
code for "I scanned %g directory." or "I scanned %g directories."
assumes there's only singular or plural. But, to use linguistic
jargon again, Arabic has grammatical number, like English (but unlike
Chinese), but it's a three-term category: singular, dual, and plural.
In other words, the way you say "directory" depends on whether there's
one directory, or I<two> of them, or I<more than two> of them. Your
test of C<($directory == 1)> no longer does the job. And it means
that where English's grammatical category of number necessitates
only the two permutations of the first sentence based on "directory
[singular]" and "directories [plural]", Arabic has three -- and,
worse, in the second sentence ("Your query matched %g file in %g
directory."), where English has four, Arabic has nine. You sense
an unwelcome, exponential trend taking shape.
Your Italian translator emails you back and says that "I searched 0
directories" (a possible English output of your program) is stilted,
and if you think that's fine English, that's your problem, but that
I<just will not do> in the language of Dante. He insists that where
$directory_count is 0, your program should produce the Italian text
for "I I<didn't> scan I<any> directories.". And ditto for "I didn't
match any files in any directories", although he says the last part
about "in any directories" should probably just be left off.
You wonder how you'll get gettext to handle this; to accommodate the
ways Arabic, Chinese, and Italian deal with numbers in just these few
very simple phrases, you need to write code that will ask gettext for
different queries depending on whether the numerical values in
question are 1, 2, more than 2, or in some cases 0, and you still haven't
figured out the problem with the different word order in Chinese.
Then your Russian translator calls on the phone, to I<personally> tell
you the bad news about how really unpleasant your life is about to
become:
Russian, like German or Latin, is an inflectional language; that is, nouns
and adjectives have to take endings that depend on their case
(i.e., nominative, accusative, genitive, etc...) -- which is roughly a matter of
what role they have in syntax of the sentence --
as well as on the grammatical gender (i.e., masculine, feminine, neuter)
and number (i.e., singular or plural) of the noun, as well as on the
declension class of the noun. But unlike with most other inflected languages,
putting a number-phrase (like "ten" or "forty-three", or their Arabic
numeral equivalents) in front of noun in Russian can change the case and
number that noun is, and therefore the endings you have to put on it.
He elaborates: In "I scanned %g directories", you'd I<expect>
"directories" to be in the accusative case (since it is the direct
object in the sentence) and the plural number,
except where $directory_count is 1, then you'd expect the singular, of
course. Just like Latin or German. I<But!> Where $directory_count %
10 is 1 ("%" for modulo, remember), assuming $directory count is an
integer, and except where $directory_count % 100 is 11, "directories"
is forced to become grammatically singular, which means it gets the
ending for the accusative singular... You begin to visualize the code
it'd take to test for the problem so far, I<and still work for Chinese
and Arabic and Italian>, and how many gettext items that'd take, but
he keeps going... But where $directory_count % 10 is 2, 3, or 4
(except where $directory_count % 100 is 12, 13, or 14), the word for
"directories" is forced to be genitive singular -- which means another
ending... The room begins to spin around you, slowly at first... But
with I<all other> integer values, since "directory" is an inanimate
noun, when preceded by a number and in the nominative or accusative
cases (as it is here, just your luck!), it does stay plural, but it is
forced into the genitive case -- yet another ending... And
you never hear him get to the part about how you're going to run into
similar (but maybe subtly different) problems with other Slavic
languages like Polish, because the floor comes up to meet you, and you
fade into unconsciousness.
The above cautionary tale relates how an attempt at localization can
lead from programmer consternation, to program obfuscation, to a need
for sedation. But careful evaluation shows that your choice of tools
merely needed further consideration.
=head2 The Linguistic View
=over
"It is more complicated than you think."
-- The Eighth Networking Truth, from RFC 1925
=back
The field of Linguistics has expended a great deal of effort over the
past century trying to find grammatical patterns which hold across
languages; it's been a constant process
of people making generalizations that should apply to all languages,
only to find out that, all too often, these generalizations fail --
sometimes failing for just a few languages, sometimes whole classes of
languages, and sometimes nearly every language in the world except
English. Broad statistical trends are evident in what the "average
language" is like as far as what its rules can look like, must look
like, and cannot look like. But the "average language" is just as
unreal a concept as the "average person" -- it runs up against the
fact no language (or person) is, in fact, average. The wisdom of past
experience leads us to believe that any given language can do whatever
it wants, in any order, with appeal to any kind of grammatical
categories wants -- case, number, tense, real or metaphoric
characteristics of the things that words refer to, arbitrary or
predictable classifications of words based on what endings or prefixes
they can take, degree or means of certainty about the truth of
statements expressed, and so on, ad infinitum.
Mercifully, most localization tasks are a matter of finding ways to
translate whole phrases, generally sentences, where the context is
relatively set, and where the only variation in content is I<usually>
in a number being expressed -- as in the example sentences above.
Translating specific, fully-formed sentences is, in practice, fairly
foolproof -- which is good, because that's what's in the phrasebooks
that so many tourists rely on. Now, a given phrase (whether in a
phrasebook or in a gettext lexicon) in one language I<might> have a
greater or lesser applicability than that phrase's translation into
another language -- for example, strictly speaking, in Arabic, the
"your" in "Your query matched..." would take a different form
depending on whether the user is male or female; so the Arabic
translation "your[feminine] query" is applicable in fewer cases than
the corresponding English phrase, which doesn't distinguish the user's
gender. (In practice, it's not feasible to have a program know the
user's gender, so the masculine "you" in Arabic is usually used, by
default.)
But in general, such surprises are rare when entire sentences are
being translated, especially when the functional context is restricted
to that of a computer interacting with a user either to convey a fact
or to prompt for a piece of information. So, for purposes of
localization, translation by phrase (generally by sentence) is both the
simplest and the least problematic.
=head2 Breaking gettext
=over
"It Has To Work."
-- First Networking Truth, RFC 1925
=back
Consider that sentences in a tourist phrasebook are of two types: ones
like "How do I get to the marketplace?" that don't have any blanks to
fill in, and ones like "How much do these ___ cost?", where there's
one or more blanks to fill in (and these are usually linked to a
list of words that you can put in that blank: "fish", "potatoes",
"tomatoes", etc.). The ones with no blanks are no problem, but the
fill-in-the-blank ones may not be really straightforward. If it's a
Swahili phrasebook, for example, the authors probably didn't bother to
tell you the complicated ways that the verb "cost" changes its
inflectional prefix depending on the noun you're putting in the blank.
The trader in the marketplace will still understand what you're saying if
you say "how much do these potatoes cost?" with the wrong
inflectional prefix on "cost". After all, I<you> can't speak proper Swahili,
I<you're> just a tourist. But while tourists can be stupid, computers
are supposed to be smart; the computer should be able to fill in the
blank, and still have the results be grammatical.
In other words, a phrasebook entry takes some values as parameters
(the things that you fill in the blank or blanks), and provides a value
based on these parameters, where the way you get that final value from
the given values can, properly speaking, involve an arbitrarily
complex series of operations. (In the case of Chinese, it'd be not at
all complex, at least in cases like the examples at the beginning of
this article; whereas in the case of Russian it'd be a rather complex
series of operations. And in some languages, the
complexity could be spread around differently: while the act of
putting a number-expression in front of a noun phrase might not be
complex by itself, it may change how you have to, for example, inflect
a verb elsewhere in the sentence. This is what in syntax is called
"long-distance dependencies".)
This talk of parameters and arbitrary complexity is just another way
to say that an entry in a phrasebook is what in a programming language
would be called a "function". Just so you don't miss it, this is the
crux of this article: I<A phrase is a function; a phrasebook is a
bunch of functions.>
The reason that using gettext runs into walls (as in the above
second-person horror story) is that you're trying to use a string (or
worse, a choice among a bunch of strings) to do what you really need a
function for -- which is futile. Preforming (s)printf interpolation
on the strings which you get back from gettext does allow you to do I<some>
common things passably well... sometimes... sort of; but, to paraphrase
what some people say about C<csh> script programming, "it fools you
into thinking you can use it for real things, but you can't, and you
don't discover this until you've already spent too much time trying,
and by then it's too late."
=head2 Replacing gettext
So, what needs to replace gettext is a system that supports lexicons
of functions instead of lexicons of strings. An entry in a lexicon
from such a system should I<not> look like this:
"J'ai trouv\xE9 %g fichiers dans %g r\xE9pertoires"
[\xE9 is e-acute in Latin-1. Some pod renderers would
scream if I used the actual character here. -- SB]
but instead like this, bearing in mind that this is just a first stab:
sub I_found_X1_files_in_X2_directories {
my( $files, $dirs ) = @_[0,1];
$files = sprintf("%g %s", $files,
$files == 1 ? 'fichier' : 'fichiers');
$dirs = sprintf("%g %s", $dirs,
$dirs == 1 ? "r\xE9pertoire" : "r\xE9pertoires");
return "J'ai trouv\xE9 $files dans $dirs.";
}
Now, there's no particularly obvious way to store anything but strings
in a gettext lexicon; so it looks like we just have to start over and
make something better, from scratch. I call my shot at a
gettext-replacement system "Maketext", or, in CPAN terms,
Locale::Maketext.
When designing Maketext, I chose to plan its main features in terms of
"buzzword compliance". And here are the buzzwords:
=head2 Buzzwords: Abstraction and Encapsulation
The complexity of the language you're trying to output a phrase in is
entirely abstracted inside (and encapsulated within) the Maketext module
for that interface. When you call:
print $lang->maketext("You have [quant,_1,piece] of new mail.",
scalar(@messages));
you don't know (and in fact can't easily find out) whether this will
involve lots of figuring, as in Russian (if $lang is a handle to the
Russian module), or relatively little, as in Chinese. That kind of
abstraction and encapsulation may encourage other pleasant buzzwords
like modularization and stratification, depending on what design
decisions you make.
=head2 Buzzword: Isomorphism
"Isomorphism" means "having the same structure or form"; in discussions
of program design, the word takes on the special, specific meaning that
your implementation of a solution to a problem I<has the same
structure> as, say, an informal verbal description of the solution, or
maybe of the problem itself. Isomorphism is, all things considered,
a good thing -- it's what problem-solving (and solution-implementing)
should look like.
What's wrong the with gettext-using code like this...
printf( $file_count == 1 ?
( $directory_count == 1 ?
"Your query matched %g file in %g directory." :
"Your query matched %g file in %g directories." ) :
( $directory_count == 1 ?
"Your query matched %g files in %g directory." :
"Your query matched %g files in %g directories." ),
$file_count, $directory_count,
);
is first off that it's not well abstracted -- these ways of testing
for grammatical number (as in the expressions like C<foo == 1 ?
singular_form : plural_form>) should be abstracted to each language
module, since how you get grammatical number is language-specific.
But second off, it's not isomorphic -- the "solution" (i.e., the
phrasebook entries) for Chinese maps from these four English phrases to
the one Chinese phrase that fits for all of them. In other words, the
informal solution would be "The way to say what you want in Chinese is
with the one phrase 'For your question, in Y directories you would
find X files'" -- and so the implemented solution should be,
isomorphically, just a straightforward way to spit out that one
phrase, with numerals properly interpolated. It shouldn't have to map
from the complexity of other languages to the simplicity of this one.
=head2 Buzzword: Inheritance
There's a great deal of reuse possible for sharing of phrases between
modules for related dialects, or for sharing of auxiliary functions
between related languages. (By "auxiliary functions", I mean
functions that don't produce phrase-text, but which, say, return an
answer to "does this number require a plural noun after it?". Such
auxiliary functions would be used in the internal logic of functions
that actually do produce phrase-text.)
In the case of sharing phrases, consider that you have an interface
already localized for American English (probably by having been
written with that as the native locale, but that's incidental).
Localizing it for UK English should, in practical terms, be just a
matter of running it past a British person with the instructions to
indicate what few phrases would benefit from a change in spelling or
possibly minor rewording. In that case, you should be able to put in
the UK English localization module I<only> those phrases that are
UK-specific, and for all the rest, I<inherit> from the American
English module. (And I expect this same situation would apply with
Brazilian and Continental Portugese, possibly with some I<very>
closely related languages like Czech and Slovak, and possibly with the
slightly different "versions" of written Mandarin Chinese, as I hear exist in
Taiwan and mainland China.)
As to sharing of auxiliary functions, consider the problem of Russian
numbers from the beginning of this article; obviously, you'd want to
write only once the hairy code that, given a numeric value, would
return some specification of which case and number a given quantified
noun should use. But suppose that you discover, while localizing an
interface for, say, Ukranian (a Slavic language related to Russian,
spoken by several million people, many of whom would be relieved to
find that your Web site's or software's interface is available in
their language), that the rules in Ukranian are the same as in Russian
for quantification, and probably for many other grammatical functions.
While there may well be no phrases in common between Russian and
Ukranian, you could still choose to have the Ukranian module inherit
from the Russian module, just for the sake of inheriting all the
various grammatical methods. Or, probably better organizationally,
you could move those functions to a module called C<_E_Slavic> or
something, which Russian and Ukrainian could inherit useful functions
from, but which would (presumably) provide no lexicon.
=head2 Buzzword: Concision
Okay, concision isn't a buzzword. But it should be, so I decree that
as a new buzzword, "concision" means that simple common things should
be expressible in very few lines (or maybe even just a few characters)
of code -- call it a special case of "making simple things easy and
hard things possible", and see also the role it played in the
MIDI::Simple language, discussed elsewhere in this issue [TPJ#13].
Consider our first stab at an entry in our "phrasebook of functions":
sub I_found_X1_files_in_X2_directories {
my( $files, $dirs ) = @_[0,1];
$files = sprintf("%g %s", $files,
$files == 1 ? 'fichier' : 'fichiers');
$dirs = sprintf("%g %s", $dirs,
$dirs == 1 ? "r\xE9pertoire" : "r\xE9pertoires");
return "J'ai trouv\xE9 $files dans $dirs.";
}
You may sense that a lexicon (to use a non-committal catch-all term for a
collection of things you know how to say, regardless of whether they're
phrases or words) consisting of functions I<expressed> as above would
make for rather long-winded and repetitive code -- even if you wisely
rewrote this to have quantification (as we call adding a number
expression to a noun phrase) be a function called like:
sub I_found_X1_files_in_X2_directories {
my( $files, $dirs ) = @_[0,1];
$files = quant($files, "fichier");
$dirs = quant($dirs, "r\xE9pertoire");
return "J'ai trouv\xE9 $files dans $dirs.";
}
And you may also sense that you do not want to bother your translators
with having to write Perl code -- you'd much rather that they spend
their I<very costly time> on just translation. And this is to say
nothing of the near impossibility of finding a commercial translator
who would know even simple Perl.
In a first-hack implementation of Maketext, each language-module's
lexicon looked like this:
%Lexicon = (
"I found %g files in %g directories"
=> sub {
my( $files, $dirs ) = @_[0,1];
$files = quant($files, "fichier");
$dirs = quant($dirs, "r\xE9pertoire");
return "J'ai trouv\xE9 $files dans $dirs.";
},
... and so on with other phrase => sub mappings ...
);
but I immediately went looking for some more concise way to basically
denote the same phrase-function -- a way that would also serve to
concisely denote I<most> phrase-functions in the lexicon for I<most>
languages. After much time and even some actual thought, I decided on
this system:
* Where a value in a %Lexicon hash is a contentful string instead of
an anonymous sub (or, conceivably, a coderef), it would be interpreted
as a sort of shorthand expression of what the sub does. When accessed
for the first time in a session, it is parsed, turned into Perl code,
and then eval'd into an anonymous sub; then that sub replaces the
original string in that lexicon. (That way, the work of parsing and
evaling the shorthand form for a given phrase is done no more than
once per session.)
* Calls to C<maketext> (as Maketext's main function is called) happen
thru a "language session handle", notionally very much like an IO
handle, in that you open one at the start of the session, and use it
for "sending signals" to an object in order to have it return the text
you want.
So, this:
$lang->maketext("You have [quant,_1,piece] of new mail.",
scalar(@messages));
basically means this: look in the lexicon for $lang (which may inherit
from any number of other lexicons), and find the function that we
happen to associate with the string "You have [quant,_1,piece] of new
mail" (which is, and should be, a functioning "shorthand" for this
function in the native locale -- English in this case). If you find
such a function, call it with $lang as its first parameter (as if it
were a method), and then a copy of scalar(@messages) as its second,
and then return that value. If that function was found, but was in
string shorthand instead of being a fully specified function, parse it
and make it into a function before calling it the first time.
* The shorthand uses code in brackets to indicate method calls that
should be performed. A full explanation is not in order here, but a
few examples will suffice:
"You have [quant,_1,piece] of new mail."
The above code is shorthand for, and will be interpreted as,
this:
sub {
my $handle = $_[0];
my(@params) = @_;
return join '',
"You have ",
$handle->quant($params[1], 'piece'),
"of new mail.";
}
where "quant" is the name of a method you're using to quantify the
noun "piece" with the number $params[0].
A string with no brackety calls, like this:
"Your search expression was malformed."
is somewhat of a degenerate case, and just gets turned into:
sub { return "Your search expression was malformed." }
However, not everything you can write in Perl code can be written in
the above shorthand system -- not by a long shot. For example, consider
the Italian translator from the beginning of this article, who wanted
the Italian for "I didn't find any files" as a special case, instead
of "I found 0 files". That couldn't be specified (at least not easily
or simply) in our shorthand system, and it would have to be written
out in full, like this:
sub { # pretend the English strings are in Italian
my($handle, $files, $dirs) = @_[0,1,2];
return "I didn't find any files" unless $files;
return join '',
"I found ",
$handle->quant($files, 'file'),
" in ",
$handle->quant($dirs, 'directory'),
".";
}
Next to a lexicon full of shorthand code, that sort of sticks out like a
sore thumb -- but this I<is> a special case, after all; and at least
it's possible, if not as concise as usual.
As to how you'd implement the Russian example from the beginning of
the article, well, There's More Than One Way To Do It, but it could be
something like this (using English words for Russian, just so you know
what's going on):
"I [quant,_1,directory,accusative] scanned."
This shifts the burden of complexity off to the quant method. That
method's parameters are: the numeric value it's going to use to
quantify something; the Russian word it's going to quantify; and the
parameter "accusative", which you're using to mean that this
sentence's syntax wants a noun in the accusative case there, although
that quantification method may have to overrule, for grammatical
reasons you may recall from the beginning of this article.
Now, the Russian quant method here is responsible not only for
implementing the strange logic necessary for figuring out how Russian
number-phrases impose case and number on their noun-phrases, but also
for inflecting the Russian word for "directory". How that inflection
is to be carried out is no small issue, and among the solutions I've
seen, some (like variations on a simple lookup in a hash where all
possible forms are provided for all necessary words) are
straightforward but I<can> become cumbersome when you need to inflect
more than a few dozen words; and other solutions (like using
algorithms to model the inflections, storing only root forms and
irregularities) I<can> involve more overhead than is justifiable for
all but the largest lexicons.
Mercifully, this design decision becomes crucial only in the hairiest
of inflected languages, of which Russian is by no means the I<worst> case
scenario, but is worse than most. Most languages have simpler
inflection systems; for example, in English or Swahili, there are
generally no more than two possible inflected forms for a given noun
("error/errors"; "kosa/makosa"), and the
rules for producing these forms are fairly simple -- or at least,
simple rules can be formulated that work for most words, and you can
then treat the exceptions as just "irregular", at least relative to
your ad hoc rules. A simpler inflection system (simpler rules, fewer
forms) means that design decisions are less crucial to maintaining
sanity, whereas the same decisions could incur
overhead-versus-scalability problems in languages like Russian. It
may I<also> be likely that code (possibly in Perl, as with
Lingua::EN::Inflect, for English nouns) has already
been written for the language in question, whether simple or complex.
Moreover, a third possibility may even be simpler than anything
discussed above: "Just require that all possible (or at least
applicable) forms be provided in the call to the given language's quant
method, as in:"
"I found [quant,_1,file,files]."
That way, quant just has to chose which form it needs, without having
to look up or generate anything. While possibly not optimal for
Russian, this should work well for most other languages, where
quantification is not as complicated an operation.
=head2 The Devil in the Details
There's plenty more to Maketext than described above -- for example,
there's the details of how language tags ("en-US", "i-pwn", "fi",
etc.) or locale IDs ("en_US") interact with actual module naming
("BogoQuery/Locale/en_us.pm"), and what magic can ensue; there's the
details of how to record (and possibly negotiate) what character
encoding Maketext will return text in (UTF8? Latin-1? KOI8?). There's
the interesting fact that Maketext is for localization, but nowhere
actually has a "C<use locale;>" anywhere in it. For the curious,
there's the somewhat frightening details of how I actually
implement something like data inheritance so that searches across
modules' %Lexicon hashes can parallel how Perl implements method
inheritance.
And, most importantly, there's all the practical details of how to
actually go about deriving from Maketext so you can use it for your
interfaces, and the various tools and conventions for starting out and
maintaining individual language modules.
That is all covered in the documentation for Locale::Maketext and the
modules that come with it, available in CPAN. After having read this
article, which covers the why's of Maketext, the documentation,
which covers the how's of it, should be quite straightforward.
=head2 The Proof in the Pudding: Localizing Web Sites
Maketext and gettext have a notable difference: gettext is in C,
accessible thru C library calls, whereas Maketext is in Perl, and
really can't work without a Perl interpreter (although I suppose
something like it could be written for C). Accidents of history (and
not necessarily lucky ones) have made C++ the most common language for
the implementation of applications like word processors, Web browsers,
and even many in-house applications like custom query systems. Current
conditions make it somewhat unlikely that the next one of any of these
kinds of applications will be written in Perl, albeit clearly more for
reasons of custom and inertia than out of consideration of what is the
right tool for the job.
However, other accidents of history have made Perl a well-accepted
language for design of server-side programs (generally in CGI form)
for Web site interfaces. Localization of static pages in Web sites is
trivial, feasible either with simple language-negotiation features in
servers like Apache, or with some kind of server-side inclusions of
language-appropriate text into layout templates. However, I think
that the localization of Perl-based search systems (or other kinds of
dynamic content) in Web sites, be they public or access-restricted,
is where Maketext will see the greatest use.
I presume that it would be only the exceptional Web site that gets
localized for English I<and> Chinese I<and> Italian I<and> Arabic
I<and> Russian, to recall the languages from the beginning of this
article -- to say nothing of German, Spanish, French, Japanese,
Finnish, and Hindi, to name a few languages that benefit from large
numbers of programmers or Web viewers or both.
However, the ever-increasing internationalization of the Web (whether
measured in terms of amount of content, of numbers of content writers
or programmers, or of size of content audiences) makes it increasingly
likely that the interface to the average Web-based dynamic content
service will be localized for two or maybe three languages. It is my
hope that Maketext will make that task as simple as possible, and will
remove previous barriers to localization for languages dissimilar to
English.
__END__
Sean M. Burke (sburkeE<64>cpan.org) has a Master's in linguistics
from Northwestern University; he specializes in language technology.
Jordan Lachler (lachlerE<64>unm.edu) is a PhD student in the Department of
Linguistics at the University of New Mexico; he specializes in
morphology and pedagogy of North American native languages.
=head2 References
Alvestrand, Harald Tveit. 1995. I<RFC 1766: Tags for the
Identification of Languages.>
C<L<http://www.ietf.org/rfc/rfc1766.txt>>
[Now see RFC 3066.]
Callon, Ross, editor. 1996. I<RFC 1925: The Twelve
Networking Truths.>
C<L<http://www.ietf.org/rfc/rfc1925.txt>>
Drepper, Ulrich, Peter Miller,
and FranE<ccedil>ois Pinard. 1995-2001. GNU
C<gettext>. Available in C<L<ftp://prep.ai.mit.edu/pub/gnu/>>, with
extensive docs in the distribution tarball. [Since
I wrote this article in 1998, I now see that the
gettext docs are now trying more to come to terms with
plurality. Whether useful conclusions have come from it
is another question altogether. -- SMB, May 2001]
Forbes, Nevill. 1964. I<Russian Grammar.> Third Edition, revised
by J. C. Dumbreck. Oxford University Press.
=cut
#End