Initial Commit
This commit is contained in:
859
database/perl/lib/Locale/Maketext.pm
Normal file
859
database/perl/lib/Locale/Maketext.pm
Normal file
@@ -0,0 +1,859 @@
|
||||
package Locale::Maketext;
|
||||
use strict;
|
||||
our $USE_LITERALS;
|
||||
use Carp ();
|
||||
use I18N::LangTags ();
|
||||
use I18N::LangTags::Detect ();
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } }
|
||||
# define the constant 'DEBUG' at compile-time
|
||||
|
||||
# turn on utf8 if we have it (this is what GutsLoader.pm used to do essentially )
|
||||
# use if (exists $INC{'utf8.pm'} || eval 'use utf8'), 'utf8';
|
||||
BEGIN {
|
||||
|
||||
# if we have it || we can load it
|
||||
if ( exists $INC{'utf8.pm'} || eval { local $SIG{'__DIE__'};require utf8; } ) {
|
||||
utf8->import();
|
||||
DEBUG and warn " utf8 on for _compile()\n";
|
||||
}
|
||||
else {
|
||||
DEBUG and warn " utf8 not available for _compile() ($INC{'utf8.pm'})\n$@\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
our $VERSION = '1.29';
|
||||
our @ISA = ();
|
||||
|
||||
our $MATCH_SUPERS = 1;
|
||||
our $MATCH_SUPERS_TIGHTLY = 1;
|
||||
our $USING_LANGUAGE_TAGS = 1;
|
||||
# Turning this off is somewhat of a security risk in that little or no
|
||||
# checking will be done on the legality of tokens passed to the
|
||||
# eval("use $module_name") in _try_use. If you turn this off, you have
|
||||
# to do your own taint checking.
|
||||
|
||||
$USE_LITERALS = 1 unless defined $USE_LITERALS;
|
||||
# a hint for compiling bracket-notation things.
|
||||
|
||||
my %isa_scan = ();
|
||||
|
||||
###########################################################################
|
||||
|
||||
sub quant {
|
||||
my($handle, $num, @forms) = @_;
|
||||
|
||||
return $num if @forms == 0; # what should this mean?
|
||||
return $forms[2] if @forms > 2 and $num == 0; # special zeroth case
|
||||
|
||||
# Normal case:
|
||||
# Note that the formatting of $num is preserved.
|
||||
return( $handle->numf($num) . ' ' . $handle->numerate($num, @forms) );
|
||||
# Most human languages put the number phrase before the qualified phrase.
|
||||
}
|
||||
|
||||
|
||||
sub numerate {
|
||||
# return this lexical item in a form appropriate to this number
|
||||
my($handle, $num, @forms) = @_;
|
||||
my $s = ($num == 1);
|
||||
|
||||
return '' unless @forms;
|
||||
if(@forms == 1) { # only the headword form specified
|
||||
return $s ? $forms[0] : ($forms[0] . 's'); # very cheap hack.
|
||||
}
|
||||
else { # sing and plural were specified
|
||||
return $s ? $forms[0] : $forms[1];
|
||||
}
|
||||
}
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
sub numf {
|
||||
my($handle, $num) = @_[0,1];
|
||||
if($num < 10_000_000_000 and $num > -10_000_000_000 and $num == int($num)) {
|
||||
$num += 0; # Just use normal integer stringification.
|
||||
# Specifically, don't let %G turn ten million into 1E+007
|
||||
}
|
||||
else {
|
||||
$num = CORE::sprintf('%G', $num);
|
||||
# "CORE::" is there to avoid confusion with the above sub sprintf.
|
||||
}
|
||||
while( $num =~ s/^([-+]?\d+)(\d{3})/$1,$2/s ) {1} # right from perlfaq5
|
||||
# The initial \d+ gobbles as many digits as it can, and then we
|
||||
# backtrack so it un-eats the rightmost three, and then we
|
||||
# insert the comma there.
|
||||
|
||||
$num =~ tr<.,><,.> if ref($handle) and $handle->{'numf_comma'};
|
||||
# This is just a lame hack instead of using Number::Format
|
||||
return $num;
|
||||
}
|
||||
|
||||
sub sprintf {
|
||||
no integer;
|
||||
my($handle, $format, @params) = @_;
|
||||
return CORE::sprintf($format, @params);
|
||||
# "CORE::" is there to avoid confusion with myself!
|
||||
}
|
||||
|
||||
#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#
|
||||
|
||||
use integer; # vroom vroom... applies to the whole rest of the module
|
||||
|
||||
sub language_tag {
|
||||
my $it = ref($_[0]) || $_[0];
|
||||
return undef unless $it =~ m/([^':]+)(?:::)?$/s;
|
||||
$it = lc($1);
|
||||
$it =~ tr<_><->;
|
||||
return $it;
|
||||
}
|
||||
|
||||
sub encoding {
|
||||
my $it = $_[0];
|
||||
return(
|
||||
(ref($it) && $it->{'encoding'})
|
||||
|| 'iso-8859-1' # Latin-1
|
||||
);
|
||||
}
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
sub fallback_languages { return('i-default', 'en', 'en-US') }
|
||||
|
||||
sub fallback_language_classes { return () }
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
sub fail_with { # an actual attribute method!
|
||||
my($handle, @params) = @_;
|
||||
return unless ref($handle);
|
||||
$handle->{'fail'} = $params[0] if @params;
|
||||
return $handle->{'fail'};
|
||||
}
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
sub blacklist {
|
||||
my ( $handle, @methods ) = @_;
|
||||
|
||||
unless ( defined $handle->{'blacklist'} ) {
|
||||
no strict 'refs';
|
||||
|
||||
# Don't let people call methods they're not supposed to from maketext.
|
||||
# Explicitly exclude all methods in this package that start with an
|
||||
# underscore on principle.
|
||||
$handle->{'blacklist'} = {
|
||||
map { $_ => 1 } (
|
||||
qw/
|
||||
blacklist
|
||||
encoding
|
||||
fail_with
|
||||
failure_handler_auto
|
||||
fallback_language_classes
|
||||
fallback_languages
|
||||
get_handle
|
||||
init
|
||||
language_tag
|
||||
maketext
|
||||
new
|
||||
whitelist
|
||||
/, grep { /^_/ } keys %{ __PACKAGE__ . "::" }
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
if ( scalar @methods ) {
|
||||
$handle->{'blacklist'} = { %{ $handle->{'blacklist'} }, map { $_ => 1 } @methods };
|
||||
}
|
||||
|
||||
delete $handle->{'_external_lex_cache'};
|
||||
return;
|
||||
}
|
||||
|
||||
sub whitelist {
|
||||
my ( $handle, @methods ) = @_;
|
||||
if ( scalar @methods ) {
|
||||
$handle->{'whitelist'} = {} unless defined $handle->{'whitelist'};
|
||||
$handle->{'whitelist'} = { %{ $handle->{'whitelist'} }, map { $_ => 1 } @methods };
|
||||
}
|
||||
|
||||
delete $handle->{'_external_lex_cache'};
|
||||
return;
|
||||
}
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
sub failure_handler_auto {
|
||||
# Meant to be used like:
|
||||
# $handle->fail_with('failure_handler_auto')
|
||||
|
||||
my $handle = shift;
|
||||
my $phrase = shift;
|
||||
|
||||
$handle->{'failure_lex'} ||= {};
|
||||
my $lex = $handle->{'failure_lex'};
|
||||
|
||||
my $value ||= ($lex->{$phrase} ||= $handle->_compile($phrase));
|
||||
|
||||
# Dumbly copied from sub maketext:
|
||||
return ${$value} if ref($value) eq 'SCALAR';
|
||||
return $value if ref($value) ne 'CODE';
|
||||
{
|
||||
local $SIG{'__DIE__'};
|
||||
eval { $value = &$value($handle, @_) };
|
||||
}
|
||||
# If we make it here, there was an exception thrown in the
|
||||
# call to $value, and so scream:
|
||||
if($@) {
|
||||
# pretty up the error message
|
||||
$@ =~ s{\s+at\s+\(eval\s+\d+\)\s+line\s+(\d+)\.?\n?}
|
||||
{\n in bracket code [compiled line $1],}s;
|
||||
#$err =~ s/\n?$/\n/s;
|
||||
Carp::croak "Error in maketexting \"$phrase\":\n$@ as used";
|
||||
# Rather unexpected, but suppose that the sub tried calling
|
||||
# a method that didn't exist.
|
||||
}
|
||||
else {
|
||||
return $value;
|
||||
}
|
||||
}
|
||||
|
||||
#==========================================================================
|
||||
|
||||
sub new {
|
||||
# Nothing fancy!
|
||||
my $class = ref($_[0]) || $_[0];
|
||||
my $handle = bless {}, $class;
|
||||
$handle->blacklist;
|
||||
$handle->init;
|
||||
return $handle;
|
||||
}
|
||||
|
||||
sub init { return } # no-op
|
||||
|
||||
###########################################################################
|
||||
|
||||
sub maketext {
|
||||
# Remember, this can fail. Failure is controllable many ways.
|
||||
Carp::croak 'maketext requires at least one parameter' unless @_ > 1;
|
||||
|
||||
my($handle, $phrase) = splice(@_,0,2);
|
||||
Carp::confess('No handle/phrase') unless (defined($handle) && defined($phrase));
|
||||
|
||||
# backup $@ in case it's still being used in the calling code.
|
||||
# If no failures, we'll re-set it back to what it was later.
|
||||
my $at = $@;
|
||||
|
||||
# Copy @_ case one of its elements is $@.
|
||||
@_ = @_;
|
||||
|
||||
# Look up the value:
|
||||
|
||||
my $value;
|
||||
if (exists $handle->{'_external_lex_cache'}{$phrase}) {
|
||||
DEBUG and warn "* Using external lex cache version of \"$phrase\"\n";
|
||||
$value = $handle->{'_external_lex_cache'}{$phrase};
|
||||
}
|
||||
else {
|
||||
foreach my $h_r (
|
||||
@{ $isa_scan{ref($handle) || $handle} || $handle->_lex_refs }
|
||||
) {
|
||||
DEBUG and warn "* Looking up \"$phrase\" in $h_r\n";
|
||||
if(exists $h_r->{$phrase}) {
|
||||
DEBUG and warn " Found \"$phrase\" in $h_r\n";
|
||||
unless(ref($value = $h_r->{$phrase})) {
|
||||
# Nonref means it's not yet compiled. Compile and replace.
|
||||
if ($handle->{'use_external_lex_cache'}) {
|
||||
$value = $handle->{'_external_lex_cache'}{$phrase} = $handle->_compile($value);
|
||||
}
|
||||
else {
|
||||
$value = $h_r->{$phrase} = $handle->_compile($value);
|
||||
}
|
||||
}
|
||||
last;
|
||||
}
|
||||
# extending packages need to be able to localize _AUTO and if readonly can't "local $h_r->{'_AUTO'} = 1;"
|
||||
# but they can "local $handle->{'_external_lex_cache'}{'_AUTO'} = 1;"
|
||||
elsif($phrase !~ m/^_/s and ($handle->{'use_external_lex_cache'} ? ( exists $handle->{'_external_lex_cache'}{'_AUTO'} ? $handle->{'_external_lex_cache'}{'_AUTO'} : $h_r->{'_AUTO'} ) : $h_r->{'_AUTO'})) {
|
||||
# it's an auto lex, and this is an autoable key!
|
||||
DEBUG and warn " Automaking \"$phrase\" into $h_r\n";
|
||||
if ($handle->{'use_external_lex_cache'}) {
|
||||
$value = $handle->{'_external_lex_cache'}{$phrase} = $handle->_compile($phrase);
|
||||
}
|
||||
else {
|
||||
$value = $h_r->{$phrase} = $handle->_compile($phrase);
|
||||
}
|
||||
last;
|
||||
}
|
||||
DEBUG>1 and print " Not found in $h_r, nor automakable\n";
|
||||
# else keep looking
|
||||
}
|
||||
}
|
||||
|
||||
unless(defined($value)) {
|
||||
DEBUG and warn "! Lookup of \"$phrase\" in/under ", ref($handle) || $handle, " fails.\n";
|
||||
if(ref($handle) and $handle->{'fail'}) {
|
||||
DEBUG and warn "WARNING0: maketext fails looking for <$phrase>\n";
|
||||
my $fail;
|
||||
if(ref($fail = $handle->{'fail'}) eq 'CODE') { # it's a sub reference
|
||||
$@ = $at; # Put $@ back in case we altered it along the way.
|
||||
return &{$fail}($handle, $phrase, @_);
|
||||
# If it ever returns, it should return a good value.
|
||||
}
|
||||
else { # It's a method name
|
||||
$@ = $at; # Put $@ back in case we altered it along the way.
|
||||
return $handle->$fail($phrase, @_);
|
||||
# If it ever returns, it should return a good value.
|
||||
}
|
||||
}
|
||||
else {
|
||||
# All we know how to do is this;
|
||||
Carp::croak("maketext doesn't know how to say:\n$phrase\nas needed");
|
||||
}
|
||||
}
|
||||
|
||||
if(ref($value) eq 'SCALAR'){
|
||||
$@ = $at; # Put $@ back in case we altered it along the way.
|
||||
return $$value ;
|
||||
}
|
||||
if(ref($value) ne 'CODE'){
|
||||
$@ = $at; # Put $@ back in case we altered it along the way.
|
||||
return $value ;
|
||||
}
|
||||
|
||||
{
|
||||
local $SIG{'__DIE__'};
|
||||
eval { $value = &$value($handle, @_) };
|
||||
}
|
||||
# If we make it here, there was an exception thrown in the
|
||||
# call to $value, and so scream:
|
||||
if ($@) {
|
||||
# pretty up the error message
|
||||
$@ =~ s{\s+at\s+\(eval\s+\d+\)\s+line\s+(\d+)\.?\n?}
|
||||
{\n in bracket code [compiled line $1],}s;
|
||||
#$err =~ s/\n?$/\n/s;
|
||||
Carp::croak "Error in maketexting \"$phrase\":\n$@ as used";
|
||||
# Rather unexpected, but suppose that the sub tried calling
|
||||
# a method that didn't exist.
|
||||
}
|
||||
else {
|
||||
$@ = $at; # Put $@ back in case we altered it along the way.
|
||||
return $value;
|
||||
}
|
||||
$@ = $at; # Put $@ back in case we altered it along the way.
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
sub get_handle { # This is a constructor and, yes, it CAN FAIL.
|
||||
# Its class argument has to be the base class for the current
|
||||
# application's l10n files.
|
||||
|
||||
my($base_class, @languages) = @_;
|
||||
$base_class = ref($base_class) || $base_class;
|
||||
# Complain if they use __PACKAGE__ as a project base class?
|
||||
|
||||
if( @languages ) {
|
||||
DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
if($USING_LANGUAGE_TAGS) { # An explicit language-list was given!
|
||||
@languages =
|
||||
map {; $_, I18N::LangTags::alternate_language_tags($_) }
|
||||
# Catch alternation
|
||||
map I18N::LangTags::locale2language_tag($_),
|
||||
# If it's a lg tag, fine, pass thru (untainted)
|
||||
# If it's a locale ID, try converting to a lg tag (untainted),
|
||||
# otherwise nix it.
|
||||
@languages;
|
||||
DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
}
|
||||
}
|
||||
else {
|
||||
@languages = $base_class->_ambient_langprefs;
|
||||
}
|
||||
|
||||
@languages = $base_class->_langtag_munging(@languages);
|
||||
|
||||
my %seen;
|
||||
foreach my $module_name ( map { $base_class . '::' . $_ } @languages ) {
|
||||
next unless length $module_name; # sanity
|
||||
next if $seen{$module_name}++ # Already been here, and it was no-go
|
||||
|| !&_try_use($module_name); # Try to use() it, but can't it.
|
||||
return($module_name->new); # Make it!
|
||||
}
|
||||
|
||||
return undef; # Fail!
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
sub _langtag_munging {
|
||||
my($base_class, @languages) = @_;
|
||||
|
||||
# We have all these DEBUG statements because otherwise it's hard as hell
|
||||
# to diagnose if/when something goes wrong.
|
||||
|
||||
DEBUG and warn 'Lgs1: ', map("<$_>", @languages), "\n";
|
||||
|
||||
if($USING_LANGUAGE_TAGS) {
|
||||
DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
@languages = $base_class->_add_supers( @languages );
|
||||
|
||||
push @languages, I18N::LangTags::panic_languages(@languages);
|
||||
DEBUG and warn "After adding panic languages:\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
|
||||
push @languages, $base_class->fallback_languages;
|
||||
# You are free to override fallback_languages to return empty-list!
|
||||
DEBUG and warn 'Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
|
||||
@languages = # final bit of processing to turn them into classname things
|
||||
map {
|
||||
my $it = $_; # copy
|
||||
$it =~ tr<-A-Z><_a-z>; # lc, and turn - to _
|
||||
$it =~ tr<_a-z0-9><>cd; # remove all but a-z0-9_
|
||||
$it;
|
||||
} @languages
|
||||
;
|
||||
DEBUG and warn "Nearing end of munging:\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
}
|
||||
else {
|
||||
DEBUG and warn "Bypassing language-tags.\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
}
|
||||
|
||||
DEBUG and warn "Before adding fallback classes:\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
|
||||
push @languages, $base_class->fallback_language_classes;
|
||||
# You are free to override that to return whatever.
|
||||
|
||||
DEBUG and warn "Finally:\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
|
||||
return @languages;
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
sub _ambient_langprefs {
|
||||
return I18N::LangTags::Detect::detect();
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
sub _add_supers {
|
||||
my($base_class, @languages) = @_;
|
||||
|
||||
if (!$MATCH_SUPERS) {
|
||||
# Nothing
|
||||
DEBUG and warn "Bypassing any super-matching.\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
|
||||
}
|
||||
elsif( $MATCH_SUPERS_TIGHTLY ) {
|
||||
DEBUG and warn "Before adding new supers tightly:\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
@languages = I18N::LangTags::implicate_supers( @languages );
|
||||
DEBUG and warn "After adding new supers tightly:\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
|
||||
}
|
||||
else {
|
||||
DEBUG and warn "Before adding supers to end:\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
@languages = I18N::LangTags::implicate_supers_strictly( @languages );
|
||||
DEBUG and warn "After adding supers to end:\n",
|
||||
' Lgs@', __LINE__, ': ', map("<$_>", @languages), "\n";
|
||||
}
|
||||
|
||||
return @languages;
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
#
|
||||
# This is where most people should stop reading.
|
||||
#
|
||||
###########################################################################
|
||||
|
||||
my %tried = ();
|
||||
# memoization of whether we've used this module, or found it unusable.
|
||||
|
||||
sub _try_use { # Basically a wrapper around "require Modulename"
|
||||
# "Many men have tried..." "They tried and failed?" "They tried and died."
|
||||
return $tried{$_[0]} if exists $tried{$_[0]}; # memoization
|
||||
|
||||
my $module = $_[0]; # ASSUME sane module name!
|
||||
{ no strict 'refs';
|
||||
no warnings 'once';
|
||||
return($tried{$module} = 1)
|
||||
if %{$module . '::Lexicon'} or @{$module . '::ISA'};
|
||||
# weird case: we never use'd it, but there it is!
|
||||
}
|
||||
|
||||
DEBUG and warn " About to use $module ...\n";
|
||||
|
||||
local $SIG{'__DIE__'};
|
||||
local $@;
|
||||
local @INC = @INC;
|
||||
pop @INC if $INC[-1] eq '.';
|
||||
eval "require $module"; # used to be "use $module", but no point in that.
|
||||
|
||||
if($@) {
|
||||
DEBUG and warn "Error using $module \: $@\n";
|
||||
return $tried{$module} = 0;
|
||||
}
|
||||
else {
|
||||
DEBUG and warn " OK, $module is used\n";
|
||||
return $tried{$module} = 1;
|
||||
}
|
||||
}
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
sub _lex_refs { # report the lexicon references for this handle's class
|
||||
# returns an arrayREF!
|
||||
no strict 'refs';
|
||||
no warnings 'once';
|
||||
my $class = ref($_[0]) || $_[0];
|
||||
DEBUG and warn "Lex refs lookup on $class\n";
|
||||
return $isa_scan{$class} if exists $isa_scan{$class}; # memoization!
|
||||
|
||||
my @lex_refs;
|
||||
my $seen_r = ref($_[1]) ? $_[1] : {};
|
||||
|
||||
if( defined( *{$class . '::Lexicon'}{'HASH'} )) {
|
||||
push @lex_refs, *{$class . '::Lexicon'}{'HASH'};
|
||||
DEBUG and warn '%' . $class . '::Lexicon contains ',
|
||||
scalar(keys %{$class . '::Lexicon'}), " entries\n";
|
||||
}
|
||||
|
||||
# Implements depth(height?)-first recursive searching of superclasses.
|
||||
# In hindsight, I suppose I could have just used Class::ISA!
|
||||
foreach my $superclass (@{$class . '::ISA'}) {
|
||||
DEBUG and warn " Super-class search into $superclass\n";
|
||||
next if $seen_r->{$superclass}++;
|
||||
push @lex_refs, @{&_lex_refs($superclass, $seen_r)}; # call myself
|
||||
}
|
||||
|
||||
$isa_scan{$class} = \@lex_refs; # save for next time
|
||||
return \@lex_refs;
|
||||
}
|
||||
|
||||
sub clear_isa_scan { %isa_scan = (); return; } # end on a note of simplicity!
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
sub _compile {
|
||||
# This big scary routine compiles an entry.
|
||||
# It returns either a coderef if there's brackety bits in this, or
|
||||
# otherwise a ref to a scalar.
|
||||
|
||||
my $string_to_compile = $_[1]; # There are taint issues using regex on @_ - perlbug 60378,27344
|
||||
|
||||
# The while() regex is more expensive than this check on strings that don't need a compile.
|
||||
# this op causes a ~2% speed hit for strings that need compile and a 250% speed improvement
|
||||
# on strings that don't need compiling.
|
||||
return \"$string_to_compile" if($string_to_compile !~ m/[\[~\]]/ms); # return a string ref if chars [~] are not in the string
|
||||
|
||||
my $handle = $_[0];
|
||||
|
||||
my(@code);
|
||||
my(@c) = (''); # "chunks" -- scratch.
|
||||
my $call_count = 0;
|
||||
my $big_pile = '';
|
||||
{
|
||||
my $in_group = 0; # start out outside a group
|
||||
my($m, @params); # scratch
|
||||
|
||||
while($string_to_compile =~ # Iterate over chunks.
|
||||
m/(
|
||||
[^\~\[\]]+ # non-~[] stuff (Capture everything else here)
|
||||
|
|
||||
~. # ~[, ~], ~~, ~other
|
||||
|
|
||||
\[ # [ presumably opening a group
|
||||
|
|
||||
\] # ] presumably closing a group
|
||||
|
|
||||
~ # terminal ~ ?
|
||||
|
|
||||
$
|
||||
)/xgs
|
||||
) {
|
||||
DEBUG>2 and warn qq{ "$1"\n};
|
||||
|
||||
if($1 eq '[' or $1 eq '') { # "[" or end
|
||||
# Whether this is "[" or end, force processing of any
|
||||
# preceding literal.
|
||||
if($in_group) {
|
||||
if($1 eq '') {
|
||||
$handle->_die_pointing($string_to_compile, 'Unterminated bracket group');
|
||||
}
|
||||
else {
|
||||
$handle->_die_pointing($string_to_compile, 'You can\'t nest bracket groups');
|
||||
}
|
||||
}
|
||||
else {
|
||||
if ($1 eq '') {
|
||||
DEBUG>2 and warn " [end-string]\n";
|
||||
}
|
||||
else {
|
||||
$in_group = 1;
|
||||
}
|
||||
die "How come \@c is empty?? in <$string_to_compile>" unless @c; # sanity
|
||||
if(length $c[-1]) {
|
||||
# Now actually processing the preceding literal
|
||||
$big_pile .= $c[-1];
|
||||
if($USE_LITERALS and (
|
||||
(ord('A') == 65)
|
||||
? $c[-1] !~ m/[^\x20-\x7E]/s
|
||||
# ASCII very safe chars
|
||||
: $c[-1] !~ m/[^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~\x07]/s
|
||||
# EBCDIC very safe chars
|
||||
)) {
|
||||
# normal case -- all very safe chars
|
||||
$c[-1] =~ s/'/\\'/g;
|
||||
push @code, q{ '} . $c[-1] . "',\n";
|
||||
$c[-1] = ''; # reuse this slot
|
||||
}
|
||||
else {
|
||||
$c[-1] =~ s/\\\\/\\/g;
|
||||
push @code, ' $c[' . $#c . "],\n";
|
||||
push @c, ''; # new chunk
|
||||
}
|
||||
}
|
||||
# else just ignore the empty string.
|
||||
}
|
||||
|
||||
}
|
||||
elsif($1 eq ']') { # "]"
|
||||
# close group -- go back in-band
|
||||
if($in_group) {
|
||||
$in_group = 0;
|
||||
|
||||
DEBUG>2 and warn " --Closing group [$c[-1]]\n";
|
||||
|
||||
# And now process the group...
|
||||
|
||||
if(!length($c[-1]) or $c[-1] =~ m/^\s+$/s) {
|
||||
DEBUG>2 and warn " -- (Ignoring)\n";
|
||||
$c[-1] = ''; # reset out chink
|
||||
next;
|
||||
}
|
||||
|
||||
#$c[-1] =~ s/^\s+//s;
|
||||
#$c[-1] =~ s/\s+$//s;
|
||||
($m,@params) = split(/,/, $c[-1], -1); # was /\s*,\s*/
|
||||
|
||||
# A bit of a hack -- we've turned "~,"'s into DELs, so turn
|
||||
# 'em into real commas here.
|
||||
if (ord('A') == 65) { # ASCII, etc
|
||||
foreach($m, @params) { tr/\x7F/,/ }
|
||||
}
|
||||
else { # EBCDIC (1047, 0037, POSIX-BC)
|
||||
# Thanks to Peter Prymmer for the EBCDIC handling
|
||||
foreach($m, @params) { tr/\x07/,/ }
|
||||
}
|
||||
|
||||
# Special-case handling of some method names:
|
||||
if($m eq '_*' or $m =~ m/^_(-?\d+)$/s) {
|
||||
# Treat [_1,...] as [,_1,...], etc.
|
||||
unshift @params, $m;
|
||||
$m = '';
|
||||
}
|
||||
elsif($m eq '*') {
|
||||
$m = 'quant'; # "*" for "times": "4 cars" is 4 times "cars"
|
||||
}
|
||||
elsif($m eq '#') {
|
||||
$m = 'numf'; # "#" for "number": [#,_1] for "the number _1"
|
||||
}
|
||||
|
||||
# Most common case: a simple, legal-looking method name
|
||||
if($m eq '') {
|
||||
# 0-length method name means to just interpolate:
|
||||
push @code, ' (';
|
||||
}
|
||||
elsif($m =~ /^\w+$/s
|
||||
&& !$handle->{'blacklist'}{$m}
|
||||
&& ( !defined $handle->{'whitelist'} || $handle->{'whitelist'}{$m} )
|
||||
# exclude anything fancy and restrict to the whitelist/blacklist.
|
||||
) {
|
||||
push @code, ' $_[0]->' . $m . '(';
|
||||
}
|
||||
else {
|
||||
# TODO: implement something? or just too icky to consider?
|
||||
$handle->_die_pointing(
|
||||
$string_to_compile,
|
||||
"Can't use \"$m\" as a method name in bracket group",
|
||||
2 + length($c[-1])
|
||||
);
|
||||
}
|
||||
|
||||
pop @c; # we don't need that chunk anymore
|
||||
++$call_count;
|
||||
|
||||
foreach my $p (@params) {
|
||||
if($p eq '_*') {
|
||||
# Meaning: all parameters except $_[0]
|
||||
$code[-1] .= ' @_[1 .. $#_], ';
|
||||
# and yes, that does the right thing for all @_ < 3
|
||||
}
|
||||
elsif($p =~ m/^_(-?\d+)$/s) {
|
||||
# _3 meaning $_[3]
|
||||
$code[-1] .= '$_[' . (0 + $1) . '], ';
|
||||
}
|
||||
elsif($USE_LITERALS and (
|
||||
(ord('A') == 65)
|
||||
? $p !~ m/[^\x20-\x7E]/s
|
||||
# ASCII very safe chars
|
||||
: $p !~ m/[^ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~\x07]/s
|
||||
# EBCDIC very safe chars
|
||||
)) {
|
||||
# Normal case: a literal containing only safe characters
|
||||
$p =~ s/'/\\'/g;
|
||||
$code[-1] .= q{'} . $p . q{', };
|
||||
}
|
||||
else {
|
||||
# Stow it on the chunk-stack, and just refer to that.
|
||||
push @c, $p;
|
||||
push @code, ' $c[' . $#c . '], ';
|
||||
}
|
||||
}
|
||||
$code[-1] .= "),\n";
|
||||
|
||||
push @c, '';
|
||||
}
|
||||
else {
|
||||
$handle->_die_pointing($string_to_compile, q{Unbalanced ']'});
|
||||
}
|
||||
|
||||
}
|
||||
elsif(substr($1,0,1) ne '~') {
|
||||
# it's stuff not containing "~" or "[" or "]"
|
||||
# i.e., a literal blob
|
||||
my $text = $1;
|
||||
$text =~ s/\\/\\\\/g;
|
||||
$c[-1] .= $text;
|
||||
|
||||
}
|
||||
elsif($1 eq '~~') { # "~~"
|
||||
$c[-1] .= '~';
|
||||
|
||||
}
|
||||
elsif($1 eq '~[') { # "~["
|
||||
$c[-1] .= '[';
|
||||
|
||||
}
|
||||
elsif($1 eq '~]') { # "~]"
|
||||
$c[-1] .= ']';
|
||||
|
||||
}
|
||||
elsif($1 eq '~,') { # "~,"
|
||||
if($in_group) {
|
||||
# This is a hack, based on the assumption that no-one will actually
|
||||
# want a DEL inside a bracket group. Let's hope that's it's true.
|
||||
if (ord('A') == 65) { # ASCII etc
|
||||
$c[-1] .= "\x7F";
|
||||
}
|
||||
else { # EBCDIC (cp 1047, 0037, POSIX-BC)
|
||||
$c[-1] .= "\x07";
|
||||
}
|
||||
}
|
||||
else {
|
||||
$c[-1] .= '~,';
|
||||
}
|
||||
|
||||
}
|
||||
elsif($1 eq '~') { # possible only at string-end, it seems.
|
||||
$c[-1] .= '~';
|
||||
|
||||
}
|
||||
else {
|
||||
# It's a "~X" where X is not a special character.
|
||||
# Consider it a literal ~ and X.
|
||||
my $text = $1;
|
||||
$text =~ s/\\/\\\\/g;
|
||||
$c[-1] .= $text;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if($call_count) {
|
||||
undef $big_pile; # Well, nevermind that.
|
||||
}
|
||||
else {
|
||||
# It's all literals! Ahwell, that can happen.
|
||||
# So don't bother with the eval. Return a SCALAR reference.
|
||||
return \$big_pile;
|
||||
}
|
||||
|
||||
die q{Last chunk isn't null??} if @c and length $c[-1]; # sanity
|
||||
DEBUG and warn scalar(@c), " chunks under closure\n";
|
||||
if(@code == 0) { # not possible?
|
||||
DEBUG and warn "Empty code\n";
|
||||
return \'';
|
||||
}
|
||||
elsif(@code > 1) { # most cases, presumably!
|
||||
unshift @code, "join '',\n";
|
||||
}
|
||||
unshift @code, "use strict; sub {\n";
|
||||
push @code, "}\n";
|
||||
|
||||
DEBUG and warn @code;
|
||||
my $sub = eval(join '', @code);
|
||||
die "$@ while evalling" . join('', @code) if $@; # Should be impossible.
|
||||
return $sub;
|
||||
}
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
sub _die_pointing {
|
||||
# This is used by _compile to throw a fatal error
|
||||
my $target = shift;
|
||||
$target = ref($target) || $target; # class name
|
||||
# ...leaving $_[0] the error-causing text, and $_[1] the error message
|
||||
|
||||
my $i = index($_[0], "\n");
|
||||
|
||||
my $pointy;
|
||||
my $pos = pos($_[0]) - (defined($_[2]) ? $_[2] : 0) - 1;
|
||||
if($pos < 1) {
|
||||
$pointy = "^=== near there\n";
|
||||
}
|
||||
else { # we need to space over
|
||||
my $first_tab = index($_[0], "\t");
|
||||
if($pos > 2 and ( -1 == $first_tab or $first_tab > pos($_[0]))) {
|
||||
# No tabs, or the first tab is harmlessly after where we will point to,
|
||||
# AND we're far enough from the margin that we can draw a proper arrow.
|
||||
$pointy = ('=' x $pos) . "^ near there\n";
|
||||
}
|
||||
else {
|
||||
# tabs screw everything up!
|
||||
$pointy = substr($_[0],0,$pos);
|
||||
$pointy =~ tr/\t //cd;
|
||||
# make everything into whitespace, but preserving tabs
|
||||
$pointy .= "^=== near there\n";
|
||||
}
|
||||
}
|
||||
|
||||
my $errmsg = "$_[1], in\:\n$_[0]";
|
||||
|
||||
if($i == -1) {
|
||||
# No newline.
|
||||
$errmsg .= "\n" . $pointy;
|
||||
}
|
||||
elsif($i == (length($_[0]) - 1) ) {
|
||||
# Already has a newline at end.
|
||||
$errmsg .= $pointy;
|
||||
}
|
||||
else {
|
||||
# don't bother with the pointy bit, I guess.
|
||||
}
|
||||
Carp::croak( "$errmsg via $target, as used" );
|
||||
}
|
||||
|
||||
1;
|
||||
1424
database/perl/lib/Locale/Maketext.pod
Normal file
1424
database/perl/lib/Locale/Maketext.pod
Normal file
File diff suppressed because it is too large
Load Diff
150
database/perl/lib/Locale/Maketext/Cookbook.pod
Normal file
150
database/perl/lib/Locale/Maketext/Cookbook.pod
Normal file
@@ -0,0 +1,150 @@
|
||||
# This document contains text in Perl "POD" format.
|
||||
# Use a POD viewer like perldoc or perlman to render it.
|
||||
|
||||
=encoding utf-8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Locale::Maketext::Cookbook - recipes for using Locale::Maketext
|
||||
|
||||
=head1 INTRODUCTION
|
||||
|
||||
This is a work in progress. Not much progress by now :-)
|
||||
|
||||
=head1 ONESIDED LEXICONS
|
||||
|
||||
I<Adapted from a suggestion by Dan Muey>
|
||||
|
||||
It may be common (for example at your main lexicon) that
|
||||
the hash keys and values coincide. Like that
|
||||
|
||||
q{Hello, tell me your name}
|
||||
=> q{Hello, tell me your name}
|
||||
|
||||
It would be nice to just write:
|
||||
|
||||
q{Hello, tell me your name} => ''
|
||||
|
||||
and have this magically inflated to the first form.
|
||||
Among the advantages of such representation, that would
|
||||
lead to
|
||||
smaller files, less prone to mistyping or mispasting,
|
||||
and handy to someone translating it which can simply
|
||||
copy the main lexicon and enter the translation
|
||||
instead of having to remove the value first.
|
||||
|
||||
That can be achieved by overriding C<init>
|
||||
in your class and working on the main lexicon
|
||||
with code like that:
|
||||
|
||||
package My::I18N;
|
||||
...
|
||||
|
||||
sub init {
|
||||
my $lh = shift; # a newborn handle
|
||||
$lh->SUPER::init();
|
||||
inflate_lexicon(\%My::I18N::en::Lexicon);
|
||||
return;
|
||||
}
|
||||
|
||||
sub inflate_lexicon {
|
||||
my $lex = shift;
|
||||
while (my ($k, $v) = each %$lex) {
|
||||
$v = $k if !defined $v || $v eq '';
|
||||
}
|
||||
}
|
||||
|
||||
Here we are assuming C<My::I18N::en> to own the
|
||||
main lexicon.
|
||||
|
||||
There are some downsides here: the size economy
|
||||
will not stand at runtime after this C<init()>
|
||||
runs. But it should not be that critical, since
|
||||
if you don't have space for that, you won't have
|
||||
space for any other language besides the main one
|
||||
as well. You could do that too with ties,
|
||||
expanding the value at lookup time which
|
||||
should be more time expensive as an option.
|
||||
|
||||
=head1 DECIMAL PLACES IN NUMBER FORMATTING
|
||||
|
||||
I<After CPAN RT #36136 (L<https://rt.cpan.org/Ticket/Display.html?id=36136>)>
|
||||
|
||||
The documentation of L<Locale::Maketext> advises that
|
||||
the standard bracket method C<numf> is limited and that
|
||||
you must override that for better results. It even
|
||||
suggests the use of L<Number::Format>.
|
||||
|
||||
One such defect of standard C<numf> is to not be
|
||||
able to use a certain decimal precision.
|
||||
For example,
|
||||
|
||||
$lh->maketext('pi is [numf,_1]', 355/113);
|
||||
|
||||
outputs
|
||||
|
||||
pi is 3.14159292035398
|
||||
|
||||
Since pi ≈ 355/116 is only accurate
|
||||
to 6 decimal places, you would want to say:
|
||||
|
||||
$lh->maketext('pi is [numf,_1,6]', 355/113);
|
||||
|
||||
and get "pi is 3.141592".
|
||||
|
||||
One solution for that could use C<Number::Format>
|
||||
like that:
|
||||
|
||||
package Wuu;
|
||||
|
||||
use base qw(Locale::Maketext);
|
||||
|
||||
use Number::Format;
|
||||
|
||||
# can be overridden according to language conventions
|
||||
sub _numf_params {
|
||||
return (
|
||||
-thousands_sep => '.',
|
||||
-decimal_point => ',',
|
||||
-decimal_digits => 2,
|
||||
);
|
||||
}
|
||||
|
||||
# builds a Number::Format
|
||||
sub _numf_formatter {
|
||||
my ($lh, $scale) = @_;
|
||||
my @params = $lh->_numf_params;
|
||||
if ($scale) { # use explicit scale rather than default
|
||||
push @params, (-decimal_digits => $scale);
|
||||
}
|
||||
return Number::Format->new(@params);
|
||||
}
|
||||
|
||||
sub numf {
|
||||
my ($lh, $n, $scale) = @_;
|
||||
# get the (cached) formatter
|
||||
my $nf = $lh->{__nf}{$scale} ||= $lh->_numf_formatter($scale);
|
||||
# format the number itself
|
||||
return $nf->format_number($n);
|
||||
}
|
||||
|
||||
package Wuu::pt;
|
||||
|
||||
use base qw(Wuu);
|
||||
|
||||
and then
|
||||
|
||||
my $lh = Wuu->get_handle('pt');
|
||||
$lh->maketext('A [numf,_1,3] km de distância', 1550.2222);
|
||||
|
||||
would return "A 1.550,222 km de distância".
|
||||
|
||||
Notice that the standard utility methods of
|
||||
C<Locale::Maketext> are irremediably limited
|
||||
because they could not aim to do everything
|
||||
that could be expected from them in different languages,
|
||||
cultures and applications. So extending C<numf>,
|
||||
C<quant>, and C<sprintf> is natural as soon
|
||||
as your needs exceed what the standard ones do.
|
||||
|
||||
|
||||
24
database/perl/lib/Locale/Maketext/Guts.pm
Normal file
24
database/perl/lib/Locale/Maketext/Guts.pm
Normal file
@@ -0,0 +1,24 @@
|
||||
package Locale::Maketext::Guts;
|
||||
|
||||
use Locale::Maketext;
|
||||
|
||||
our $VERSION = '1.20';
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Locale::Maketext::Guts - Deprecated module to load Locale::Maketext utf8 code
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
# Do this instead please
|
||||
use Locale::Maketext
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Previously Local::Maketext::GutsLoader performed some magic to load
|
||||
Locale::Maketext when utf8 was unavailable. The subs this module provided
|
||||
were merged back into Locale::Maketext
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
26
database/perl/lib/Locale/Maketext/GutsLoader.pm
Normal file
26
database/perl/lib/Locale/Maketext/GutsLoader.pm
Normal file
@@ -0,0 +1,26 @@
|
||||
package Locale::Maketext::GutsLoader;
|
||||
|
||||
use Locale::Maketext;
|
||||
|
||||
our $VERSION = '1.20';
|
||||
|
||||
sub zorp { return scalar @_ }
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Locale::Maketext::GutsLoader - Deprecated module to load Locale::Maketext utf8 code
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
# Do this instead please
|
||||
use Locale::Maketext
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Previously Locale::Maketext::Guts performed some magic to load
|
||||
Locale::Maketext when utf8 was unavailable. The subs this module provided
|
||||
were merged back into Locale::Maketext.
|
||||
|
||||
=cut
|
||||
|
||||
1;
|
||||
346
database/perl/lib/Locale/Maketext/Simple.pm
Normal file
346
database/perl/lib/Locale/Maketext/Simple.pm
Normal file
@@ -0,0 +1,346 @@
|
||||
package Locale::Maketext::Simple;
|
||||
$Locale::Maketext::Simple::VERSION = '0.21_01';
|
||||
|
||||
use strict;
|
||||
use 5.005;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Locale::Maketext::Simple - Simple interface to Locale::Maketext::Lexicon
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
This document describes version 0.18 of Locale::Maketext::Simple,
|
||||
released Septermber 8, 2006.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
Minimal setup (looks for F<auto/Foo/*.po> and F<auto/Foo/*.mo>):
|
||||
|
||||
package Foo;
|
||||
use Locale::Maketext::Simple; # exports 'loc'
|
||||
loc_lang('fr'); # set language to French
|
||||
sub hello {
|
||||
print loc("Hello, [_1]!", "World");
|
||||
}
|
||||
|
||||
More sophisticated example:
|
||||
|
||||
package Foo::Bar;
|
||||
use Locale::Maketext::Simple (
|
||||
Class => 'Foo', # search in auto/Foo/
|
||||
Style => 'gettext', # %1 instead of [_1]
|
||||
Export => 'maketext', # maketext() instead of loc()
|
||||
Subclass => 'L10N', # Foo::L10N instead of Foo::I18N
|
||||
Decode => 1, # decode entries to unicode-strings
|
||||
Encoding => 'locale', # but encode lexicons in current locale
|
||||
# (needs Locale::Maketext::Lexicon 0.36)
|
||||
);
|
||||
sub japh {
|
||||
print maketext("Just another %1 hacker", "Perl");
|
||||
}
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module is a simple wrapper around B<Locale::Maketext::Lexicon>,
|
||||
designed to alleviate the need of creating I<Language Classes> for
|
||||
module authors.
|
||||
|
||||
The language used is chosen from the loc_lang call. If a lookup is not
|
||||
possible, the i-default language will be used. If the lookup is not in the
|
||||
i-default language, then the key will be returned.
|
||||
|
||||
If B<Locale::Maketext::Lexicon> is not present, it implements a
|
||||
minimal localization function by simply interpolating C<[_1]> with
|
||||
the first argument, C<[_2]> with the second, etc. Interpolated
|
||||
function like C<[quant,_1]> are treated as C<[_1]>, with the sole
|
||||
exception of C<[tense,_1,X]>, which will append C<ing> to C<_1> when
|
||||
X is C<present>, or appending C<ed> to <_1> otherwise.
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
All options are passed either via the C<use> statement, or via an
|
||||
explicit C<import>.
|
||||
|
||||
=head2 Class
|
||||
|
||||
By default, B<Locale::Maketext::Simple> draws its source from the
|
||||
calling package's F<auto/> directory; you can override this behaviour
|
||||
by explicitly specifying another package as C<Class>.
|
||||
|
||||
=head2 Path
|
||||
|
||||
If your PO and MO files are under a path elsewhere than C<auto/>,
|
||||
you may specify it using the C<Path> option.
|
||||
|
||||
=head2 Style
|
||||
|
||||
By default, this module uses the C<maketext> style of C<[_1]> and
|
||||
C<[quant,_1]> for interpolation. Alternatively, you can specify the
|
||||
C<gettext> style, which uses C<%1> and C<%quant(%1)> for interpolation.
|
||||
|
||||
This option is case-insensitive.
|
||||
|
||||
=head2 Export
|
||||
|
||||
By default, this module exports a single function, C<loc>, into its
|
||||
caller's namespace. You can set it to another name, or set it to
|
||||
an empty string to disable exporting.
|
||||
|
||||
=head2 Subclass
|
||||
|
||||
By default, this module creates an C<::I18N> subclass under the
|
||||
caller's package (or the package specified by C<Class>), and stores
|
||||
lexicon data in its subclasses. You can assign a name other than
|
||||
C<I18N> via this option.
|
||||
|
||||
=head2 Decode
|
||||
|
||||
If set to a true value, source entries will be converted into
|
||||
utf8-strings (available in Perl 5.6.1 or later). This feature
|
||||
needs the B<Encode> or B<Encode::compat> module.
|
||||
|
||||
=head2 Encoding
|
||||
|
||||
Specifies an encoding to store lexicon entries, instead of
|
||||
utf8-strings. If set to C<locale>, the encoding from the current
|
||||
locale setting is used. Implies a true value for C<Decode>.
|
||||
|
||||
=cut
|
||||
|
||||
sub import {
|
||||
my ($class, %args) = @_;
|
||||
|
||||
$args{Class} ||= caller;
|
||||
$args{Style} ||= 'maketext';
|
||||
$args{Export} ||= 'loc';
|
||||
$args{Subclass} ||= 'I18N';
|
||||
|
||||
my ($loc, $loc_lang) = $class->load_loc(%args);
|
||||
$loc ||= $class->default_loc(%args);
|
||||
|
||||
no strict 'refs';
|
||||
*{caller(0) . "::$args{Export}"} = $loc if $args{Export};
|
||||
*{caller(0) . "::$args{Export}_lang"} = $loc_lang || sub { 1 };
|
||||
}
|
||||
|
||||
my %Loc;
|
||||
|
||||
sub reload_loc { %Loc = () }
|
||||
|
||||
sub load_loc {
|
||||
my ($class, %args) = @_;
|
||||
|
||||
my $pkg = join('::', grep { defined and length } $args{Class}, $args{Subclass});
|
||||
return $Loc{$pkg} if exists $Loc{$pkg};
|
||||
|
||||
eval {
|
||||
local @INC = @INC;
|
||||
pop @INC if $INC[-1] eq '.';
|
||||
require Locale::Maketext::Lexicon;
|
||||
1
|
||||
} or return;
|
||||
$Locale::Maketext::Lexicon::VERSION > 0.20 or return;
|
||||
eval { require File::Spec; 1 } or return;
|
||||
|
||||
my $path = $args{Path} || $class->auto_path($args{Class}) or return;
|
||||
my $pattern = File::Spec->catfile($path, '*.[pm]o');
|
||||
my $decode = $args{Decode} || 0;
|
||||
my $encoding = $args{Encoding} || undef;
|
||||
|
||||
$decode = 1 if $encoding;
|
||||
|
||||
$pattern =~ s{\\}{/}g; # to counter win32 paths
|
||||
|
||||
eval "
|
||||
package $pkg;
|
||||
use base 'Locale::Maketext';
|
||||
Locale::Maketext::Lexicon->import({
|
||||
'i-default' => [ 'Auto' ],
|
||||
'*' => [ Gettext => \$pattern ],
|
||||
_decode => \$decode,
|
||||
_encoding => \$encoding,
|
||||
});
|
||||
*${pkg}::Lexicon = \\%${pkg}::i_default::Lexicon;
|
||||
*tense = sub { \$_[1] . ((\$_[2] eq 'present') ? 'ing' : 'ed') }
|
||||
unless defined &tense;
|
||||
|
||||
1;
|
||||
" or die $@;
|
||||
|
||||
my $lh = eval { $pkg->get_handle } or return;
|
||||
my $style = lc($args{Style});
|
||||
if ($style eq 'maketext') {
|
||||
$Loc{$pkg} = sub {
|
||||
$lh->maketext(@_)
|
||||
};
|
||||
}
|
||||
elsif ($style eq 'gettext') {
|
||||
$Loc{$pkg} = sub {
|
||||
my $str = shift;
|
||||
$str =~ s{([\~\[\]])}{~$1}g;
|
||||
$str =~ s{
|
||||
([%\\]%) # 1 - escaped sequence
|
||||
|
|
||||
% (?:
|
||||
([A-Za-z#*]\w*) # 2 - function call
|
||||
\(([^\)]*)\) # 3 - arguments
|
||||
|
|
||||
([1-9]\d*|\*) # 4 - variable
|
||||
)
|
||||
}{
|
||||
$1 ? $1
|
||||
: $2 ? "\[$2,"._unescape($3)."]"
|
||||
: "[_$4]"
|
||||
}egx;
|
||||
return $lh->maketext($str, @_);
|
||||
};
|
||||
}
|
||||
else {
|
||||
die "Unknown Style: $style";
|
||||
}
|
||||
|
||||
return $Loc{$pkg}, sub {
|
||||
$lh = $pkg->get_handle(@_);
|
||||
};
|
||||
}
|
||||
|
||||
sub default_loc {
|
||||
my ($self, %args) = @_;
|
||||
my $style = lc($args{Style});
|
||||
if ($style eq 'maketext') {
|
||||
return sub {
|
||||
my $str = shift;
|
||||
$str =~ s{((?<!~)(?:~~)*)\[_([1-9]\d*|\*)\]}
|
||||
{$1%$2}g;
|
||||
$str =~ s{((?<!~)(?:~~)*)\[([A-Za-z#*]\w*),([^\]]+)\]}
|
||||
{"$1%$2(" . _escape($3) . ')'}eg;
|
||||
_default_gettext($str, @_);
|
||||
};
|
||||
}
|
||||
elsif ($style eq 'gettext') {
|
||||
return \&_default_gettext;
|
||||
}
|
||||
else {
|
||||
die "Unknown Style: $style";
|
||||
}
|
||||
}
|
||||
|
||||
sub _default_gettext {
|
||||
my $str = shift;
|
||||
$str =~ s{
|
||||
% # leading symbol
|
||||
(?: # either one of
|
||||
\d+ # a digit, like %1
|
||||
| # or
|
||||
(\w+)\( # a function call -- 1
|
||||
(?: # either
|
||||
%\d+ # an interpolation
|
||||
| # or
|
||||
([^,]*) # some string -- 2
|
||||
) # end either
|
||||
(?: # maybe followed
|
||||
, # by a comma
|
||||
([^),]*) # and a param -- 3
|
||||
)? # end maybe
|
||||
(?: # maybe followed
|
||||
, # by another comma
|
||||
([^),]*) # and a param -- 4
|
||||
)? # end maybe
|
||||
[^)]* # and other ignorable params
|
||||
\) # closing function call
|
||||
) # closing either one of
|
||||
}{
|
||||
my $digit = $2 || shift;
|
||||
$digit . (
|
||||
$1 ? (
|
||||
($1 eq 'tense') ? (($3 eq 'present') ? 'ing' : 'ed') :
|
||||
($1 eq 'quant') ? ' ' . (($digit > 1) ? ($4 || "$3s") : $3) :
|
||||
''
|
||||
) : ''
|
||||
);
|
||||
}egx;
|
||||
return $str;
|
||||
};
|
||||
|
||||
sub _escape {
|
||||
my $text = shift;
|
||||
$text =~ s/\b_([1-9]\d*)/%$1/g;
|
||||
return $text;
|
||||
}
|
||||
|
||||
sub _unescape {
|
||||
join(',', map {
|
||||
/\A(\s*)%([1-9]\d*|\*)(\s*)\z/ ? "$1_$2$3" : $_
|
||||
} split(/,/, $_[0]));
|
||||
}
|
||||
|
||||
sub auto_path {
|
||||
my ($self, $calldir) = @_;
|
||||
$calldir =~ s#::#/#g;
|
||||
my $path = $INC{$calldir . '.pm'} or return;
|
||||
|
||||
# Try absolute path name.
|
||||
if ($^O eq 'MacOS') {
|
||||
(my $malldir = $calldir) =~ tr#/#:#;
|
||||
$path =~ s#^(.*)$malldir\.pm\z#$1auto:$malldir:#s;
|
||||
} else {
|
||||
$path =~ s#^(.*)$calldir\.pm\z#$1auto/$calldir/#;
|
||||
}
|
||||
|
||||
return $path if -d $path;
|
||||
|
||||
# If that failed, try relative path with normal @INC searching.
|
||||
$path = "auto/$calldir/";
|
||||
foreach my $inc (@INC) {
|
||||
return "$inc/$path" if -d "$inc/$path";
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
=head1 ACKNOWLEDGMENTS
|
||||
|
||||
Thanks to Jos I. Boumans for suggesting this module to be written.
|
||||
|
||||
Thanks to Chia-Liang Kao for suggesting C<Path> and C<loc_lang>.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Locale::Maketext>, L<Locale::Maketext::Lexicon>
|
||||
|
||||
=head1 AUTHORS
|
||||
|
||||
Audrey Tang E<lt>cpan@audreyt.orgE<gt>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 2003, 2004, 2005, 2006 by Audrey Tang E<lt>cpan@audreyt.orgE<gt>.
|
||||
|
||||
This software is released under the MIT license cited below. Additionally,
|
||||
when this software is distributed with B<Perl Kit, Version 5>, you may also
|
||||
redistribute it and/or modify it under the same terms as Perl itself.
|
||||
|
||||
=head2 The "MIT" License
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
=cut
|
||||
775
database/perl/lib/Locale/Maketext/TPJ13.pod
Normal file
775
database/perl/lib/Locale/Maketext/TPJ13.pod
Normal file
@@ -0,0 +1,775 @@
|
||||
# This document contains text in Perl "POD" format.
|
||||
# Use a POD viewer like perldoc or perlman to render it.
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Locale::Maketext::TPJ13 -- article about software localization
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
# This an article, not a module.
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The following article by Sean M. Burke and Jordan Lachler
|
||||
first appeared in I<The Perl Journal> #13
|
||||
and is copyright 1999 The Perl Journal. It appears
|
||||
courtesy of Jon Orwant and The Perl Journal. This document may be
|
||||
distributed under the same terms as Perl itself.
|
||||
|
||||
=head1 Localization and Perl: gettext breaks, Maketext fixes
|
||||
|
||||
by Sean M. Burke and Jordan Lachler
|
||||
|
||||
This article points out cases where gettext (a common system for
|
||||
localizing software interfaces -- i.e., making them work in the user's
|
||||
language of choice) fails because of basic differences between human
|
||||
languages. This article then describes Maketext, a new system capable
|
||||
of correctly treating these differences.
|
||||
|
||||
=head2 A Localization Horror Story: It Could Happen To You
|
||||
|
||||
=over
|
||||
|
||||
"There are a number of languages spoken by human beings in this
|
||||
world."
|
||||
|
||||
-- Harald Tveit Alvestrand, in RFC 1766, "Tags for the
|
||||
Identification of Languages"
|
||||
|
||||
=back
|
||||
|
||||
Imagine that your task for the day is to localize a piece of software
|
||||
-- and luckily for you, the only output the program emits is two
|
||||
messages, like this:
|
||||
|
||||
I scanned 12 directories.
|
||||
|
||||
Your query matched 10 files in 4 directories.
|
||||
|
||||
So how hard could that be? You look at the code that
|
||||
produces the first item, and it reads:
|
||||
|
||||
printf("I scanned %g directories.",
|
||||
$directory_count);
|
||||
|
||||
You think about that, and realize that it doesn't even work right for
|
||||
English, as it can produce this output:
|
||||
|
||||
I scanned 1 directories.
|
||||
|
||||
So you rewrite it to read:
|
||||
|
||||
printf("I scanned %g %s.",
|
||||
$directory_count,
|
||||
$directory_count == 1 ?
|
||||
"directory" : "directories",
|
||||
);
|
||||
|
||||
...which does the Right Thing. (In case you don't recall, "%g" is for
|
||||
locale-specific number interpolation, and "%s" is for string
|
||||
interpolation.)
|
||||
|
||||
But you still have to localize it for all the languages you're
|
||||
producing this software for, so you pull Locale::gettext off of CPAN
|
||||
so you can access the C<gettext> C functions you've heard are standard
|
||||
for localization tasks.
|
||||
|
||||
And you write:
|
||||
|
||||
printf(gettext("I scanned %g %s."),
|
||||
$dir_scan_count,
|
||||
$dir_scan_count == 1 ?
|
||||
gettext("directory") : gettext("directories"),
|
||||
);
|
||||
|
||||
But you then read in the gettext manual (Drepper, Miller, and Pinard 1995)
|
||||
that this is not a good idea, since how a single word like "directory"
|
||||
or "directories" is translated may depend on context -- and this is
|
||||
true, since in a case language like German or Russian, you'd may need
|
||||
these words with a different case ending in the first instance (where the
|
||||
word is the object of a verb) than in the second instance, which you haven't even
|
||||
gotten to yet (where the word is the object of a preposition, "in %g
|
||||
directories") -- assuming these keep the same syntax when translated
|
||||
into those languages.
|
||||
|
||||
So, on the advice of the gettext manual, you rewrite:
|
||||
|
||||
printf( $dir_scan_count == 1 ?
|
||||
gettext("I scanned %g directory.") :
|
||||
gettext("I scanned %g directories."),
|
||||
$dir_scan_count );
|
||||
|
||||
So, you email your various translators (the boss decides that the
|
||||
languages du jour are Chinese, Arabic, Russian, and Italian, so you
|
||||
have one translator for each), asking for translations for "I scanned
|
||||
%g directory." and "I scanned %g directories.". When they reply,
|
||||
you'll put that in the lexicons for gettext to use when it localizes
|
||||
your software, so that when the user is running under the "zh"
|
||||
(Chinese) locale, gettext("I scanned %g directory.") will return the
|
||||
appropriate Chinese text, with a "%g" in there where printf can then
|
||||
interpolate $dir_scan.
|
||||
|
||||
Your Chinese translator emails right back -- he says both of these
|
||||
phrases translate to the same thing in Chinese, because, in linguistic
|
||||
jargon, Chinese "doesn't have number as a grammatical category" --
|
||||
whereas English does. That is, English has grammatical rules that
|
||||
refer to "number", i.e., whether something is grammatically singular
|
||||
or plural; and one of these rules is the one that forces nouns to take
|
||||
a plural suffix (generally "s") when in a plural context, as they are when
|
||||
they follow a number other than "one" (including, oddly enough, "zero").
|
||||
Chinese has no such rules, and so has just the one phrase where English
|
||||
has two. But, no problem, you can have this one Chinese phrase appear
|
||||
as the translation for the two English phrases in the "zh" gettext
|
||||
lexicon for your program.
|
||||
|
||||
Emboldened by this, you dive into the second phrase that your software
|
||||
needs to output: "Your query matched 10 files in 4 directories.". You notice
|
||||
that if you want to treat phrases as indivisible, as the gettext
|
||||
manual wisely advises, you need four cases now, instead of two, to
|
||||
cover the permutations of singular and plural on the two items,
|
||||
$dir_count and $file_count. So you try this:
|
||||
|
||||
printf( $file_count == 1 ?
|
||||
( $directory_count == 1 ?
|
||||
gettext("Your query matched %g file in %g directory.") :
|
||||
gettext("Your query matched %g file in %g directories.") ) :
|
||||
( $directory_count == 1 ?
|
||||
gettext("Your query matched %g files in %g directory.") :
|
||||
gettext("Your query matched %g files in %g directories.") ),
|
||||
$file_count, $directory_count,
|
||||
);
|
||||
|
||||
(The case of "1 file in 2 [or more] directories" could, I suppose,
|
||||
occur in the case of symlinking or something of the sort.)
|
||||
|
||||
It occurs to you that this is not the prettiest code you've ever
|
||||
written, but this seems the way to go. You mail off to the
|
||||
translators asking for translations for these four cases. The
|
||||
Chinese guy replies with the one phrase that these all translate to in
|
||||
Chinese, and that phrase has two "%g"s in it, as it should -- but
|
||||
there's a problem. He translates it word-for-word back: "In %g
|
||||
directories contains %g files match your query." The %g
|
||||
slots are in an order reverse to what they are in English. You wonder
|
||||
how you'll get gettext to handle that.
|
||||
|
||||
But you put it aside for the moment, and optimistically hope that the
|
||||
other translators won't have this problem, and that their languages
|
||||
will be better behaved -- i.e., that they will be just like English.
|
||||
|
||||
But the Arabic translator is the next to write back. First off, your
|
||||
code for "I scanned %g directory." or "I scanned %g directories."
|
||||
assumes there's only singular or plural. But, to use linguistic
|
||||
jargon again, Arabic has grammatical number, like English (but unlike
|
||||
Chinese), but it's a three-term category: singular, dual, and plural.
|
||||
In other words, the way you say "directory" depends on whether there's
|
||||
one directory, or I<two> of them, or I<more than two> of them. Your
|
||||
test of C<($directory == 1)> no longer does the job. And it means
|
||||
that where English's grammatical category of number necessitates
|
||||
only the two permutations of the first sentence based on "directory
|
||||
[singular]" and "directories [plural]", Arabic has three -- and,
|
||||
worse, in the second sentence ("Your query matched %g file in %g
|
||||
directory."), where English has four, Arabic has nine. You sense
|
||||
an unwelcome, exponential trend taking shape.
|
||||
|
||||
Your Italian translator emails you back and says that "I searched 0
|
||||
directories" (a possible English output of your program) is stilted,
|
||||
and if you think that's fine English, that's your problem, but that
|
||||
I<just will not do> in the language of Dante. He insists that where
|
||||
$directory_count is 0, your program should produce the Italian text
|
||||
for "I I<didn't> scan I<any> directories.". And ditto for "I didn't
|
||||
match any files in any directories", although he says the last part
|
||||
about "in any directories" should probably just be left off.
|
||||
|
||||
You wonder how you'll get gettext to handle this; to accommodate the
|
||||
ways Arabic, Chinese, and Italian deal with numbers in just these few
|
||||
very simple phrases, you need to write code that will ask gettext for
|
||||
different queries depending on whether the numerical values in
|
||||
question are 1, 2, more than 2, or in some cases 0, and you still haven't
|
||||
figured out the problem with the different word order in Chinese.
|
||||
|
||||
Then your Russian translator calls on the phone, to I<personally> tell
|
||||
you the bad news about how really unpleasant your life is about to
|
||||
become:
|
||||
|
||||
Russian, like German or Latin, is an inflectional language; that is, nouns
|
||||
and adjectives have to take endings that depend on their case
|
||||
(i.e., nominative, accusative, genitive, etc...) -- which is roughly a matter of
|
||||
what role they have in syntax of the sentence --
|
||||
as well as on the grammatical gender (i.e., masculine, feminine, neuter)
|
||||
and number (i.e., singular or plural) of the noun, as well as on the
|
||||
declension class of the noun. But unlike with most other inflected languages,
|
||||
putting a number-phrase (like "ten" or "forty-three", or their Arabic
|
||||
numeral equivalents) in front of noun in Russian can change the case and
|
||||
number that noun is, and therefore the endings you have to put on it.
|
||||
|
||||
He elaborates: In "I scanned %g directories", you'd I<expect>
|
||||
"directories" to be in the accusative case (since it is the direct
|
||||
object in the sentence) and the plural number,
|
||||
except where $directory_count is 1, then you'd expect the singular, of
|
||||
course. Just like Latin or German. I<But!> Where $directory_count %
|
||||
10 is 1 ("%" for modulo, remember), assuming $directory count is an
|
||||
integer, and except where $directory_count % 100 is 11, "directories"
|
||||
is forced to become grammatically singular, which means it gets the
|
||||
ending for the accusative singular... You begin to visualize the code
|
||||
it'd take to test for the problem so far, I<and still work for Chinese
|
||||
and Arabic and Italian>, and how many gettext items that'd take, but
|
||||
he keeps going... But where $directory_count % 10 is 2, 3, or 4
|
||||
(except where $directory_count % 100 is 12, 13, or 14), the word for
|
||||
"directories" is forced to be genitive singular -- which means another
|
||||
ending... The room begins to spin around you, slowly at first... But
|
||||
with I<all other> integer values, since "directory" is an inanimate
|
||||
noun, when preceded by a number and in the nominative or accusative
|
||||
cases (as it is here, just your luck!), it does stay plural, but it is
|
||||
forced into the genitive case -- yet another ending... And
|
||||
you never hear him get to the part about how you're going to run into
|
||||
similar (but maybe subtly different) problems with other Slavic
|
||||
languages like Polish, because the floor comes up to meet you, and you
|
||||
fade into unconsciousness.
|
||||
|
||||
|
||||
The above cautionary tale relates how an attempt at localization can
|
||||
lead from programmer consternation, to program obfuscation, to a need
|
||||
for sedation. But careful evaluation shows that your choice of tools
|
||||
merely needed further consideration.
|
||||
|
||||
=head2 The Linguistic View
|
||||
|
||||
=over
|
||||
|
||||
"It is more complicated than you think."
|
||||
|
||||
-- The Eighth Networking Truth, from RFC 1925
|
||||
|
||||
=back
|
||||
|
||||
The field of Linguistics has expended a great deal of effort over the
|
||||
past century trying to find grammatical patterns which hold across
|
||||
languages; it's been a constant process
|
||||
of people making generalizations that should apply to all languages,
|
||||
only to find out that, all too often, these generalizations fail --
|
||||
sometimes failing for just a few languages, sometimes whole classes of
|
||||
languages, and sometimes nearly every language in the world except
|
||||
English. Broad statistical trends are evident in what the "average
|
||||
language" is like as far as what its rules can look like, must look
|
||||
like, and cannot look like. But the "average language" is just as
|
||||
unreal a concept as the "average person" -- it runs up against the
|
||||
fact no language (or person) is, in fact, average. The wisdom of past
|
||||
experience leads us to believe that any given language can do whatever
|
||||
it wants, in any order, with appeal to any kind of grammatical
|
||||
categories wants -- case, number, tense, real or metaphoric
|
||||
characteristics of the things that words refer to, arbitrary or
|
||||
predictable classifications of words based on what endings or prefixes
|
||||
they can take, degree or means of certainty about the truth of
|
||||
statements expressed, and so on, ad infinitum.
|
||||
|
||||
Mercifully, most localization tasks are a matter of finding ways to
|
||||
translate whole phrases, generally sentences, where the context is
|
||||
relatively set, and where the only variation in content is I<usually>
|
||||
in a number being expressed -- as in the example sentences above.
|
||||
Translating specific, fully-formed sentences is, in practice, fairly
|
||||
foolproof -- which is good, because that's what's in the phrasebooks
|
||||
that so many tourists rely on. Now, a given phrase (whether in a
|
||||
phrasebook or in a gettext lexicon) in one language I<might> have a
|
||||
greater or lesser applicability than that phrase's translation into
|
||||
another language -- for example, strictly speaking, in Arabic, the
|
||||
"your" in "Your query matched..." would take a different form
|
||||
depending on whether the user is male or female; so the Arabic
|
||||
translation "your[feminine] query" is applicable in fewer cases than
|
||||
the corresponding English phrase, which doesn't distinguish the user's
|
||||
gender. (In practice, it's not feasible to have a program know the
|
||||
user's gender, so the masculine "you" in Arabic is usually used, by
|
||||
default.)
|
||||
|
||||
But in general, such surprises are rare when entire sentences are
|
||||
being translated, especially when the functional context is restricted
|
||||
to that of a computer interacting with a user either to convey a fact
|
||||
or to prompt for a piece of information. So, for purposes of
|
||||
localization, translation by phrase (generally by sentence) is both the
|
||||
simplest and the least problematic.
|
||||
|
||||
=head2 Breaking gettext
|
||||
|
||||
=over
|
||||
|
||||
"It Has To Work."
|
||||
|
||||
-- First Networking Truth, RFC 1925
|
||||
|
||||
=back
|
||||
|
||||
Consider that sentences in a tourist phrasebook are of two types: ones
|
||||
like "How do I get to the marketplace?" that don't have any blanks to
|
||||
fill in, and ones like "How much do these ___ cost?", where there's
|
||||
one or more blanks to fill in (and these are usually linked to a
|
||||
list of words that you can put in that blank: "fish", "potatoes",
|
||||
"tomatoes", etc.). The ones with no blanks are no problem, but the
|
||||
fill-in-the-blank ones may not be really straightforward. If it's a
|
||||
Swahili phrasebook, for example, the authors probably didn't bother to
|
||||
tell you the complicated ways that the verb "cost" changes its
|
||||
inflectional prefix depending on the noun you're putting in the blank.
|
||||
The trader in the marketplace will still understand what you're saying if
|
||||
you say "how much do these potatoes cost?" with the wrong
|
||||
inflectional prefix on "cost". After all, I<you> can't speak proper Swahili,
|
||||
I<you're> just a tourist. But while tourists can be stupid, computers
|
||||
are supposed to be smart; the computer should be able to fill in the
|
||||
blank, and still have the results be grammatical.
|
||||
|
||||
In other words, a phrasebook entry takes some values as parameters
|
||||
(the things that you fill in the blank or blanks), and provides a value
|
||||
based on these parameters, where the way you get that final value from
|
||||
the given values can, properly speaking, involve an arbitrarily
|
||||
complex series of operations. (In the case of Chinese, it'd be not at
|
||||
all complex, at least in cases like the examples at the beginning of
|
||||
this article; whereas in the case of Russian it'd be a rather complex
|
||||
series of operations. And in some languages, the
|
||||
complexity could be spread around differently: while the act of
|
||||
putting a number-expression in front of a noun phrase might not be
|
||||
complex by itself, it may change how you have to, for example, inflect
|
||||
a verb elsewhere in the sentence. This is what in syntax is called
|
||||
"long-distance dependencies".)
|
||||
|
||||
This talk of parameters and arbitrary complexity is just another way
|
||||
to say that an entry in a phrasebook is what in a programming language
|
||||
would be called a "function". Just so you don't miss it, this is the
|
||||
crux of this article: I<A phrase is a function; a phrasebook is a
|
||||
bunch of functions.>
|
||||
|
||||
The reason that using gettext runs into walls (as in the above
|
||||
second-person horror story) is that you're trying to use a string (or
|
||||
worse, a choice among a bunch of strings) to do what you really need a
|
||||
function for -- which is futile. Preforming (s)printf interpolation
|
||||
on the strings which you get back from gettext does allow you to do I<some>
|
||||
common things passably well... sometimes... sort of; but, to paraphrase
|
||||
what some people say about C<csh> script programming, "it fools you
|
||||
into thinking you can use it for real things, but you can't, and you
|
||||
don't discover this until you've already spent too much time trying,
|
||||
and by then it's too late."
|
||||
|
||||
=head2 Replacing gettext
|
||||
|
||||
So, what needs to replace gettext is a system that supports lexicons
|
||||
of functions instead of lexicons of strings. An entry in a lexicon
|
||||
from such a system should I<not> look like this:
|
||||
|
||||
"J'ai trouv\xE9 %g fichiers dans %g r\xE9pertoires"
|
||||
|
||||
[\xE9 is e-acute in Latin-1. Some pod renderers would
|
||||
scream if I used the actual character here. -- SB]
|
||||
|
||||
but instead like this, bearing in mind that this is just a first stab:
|
||||
|
||||
sub I_found_X1_files_in_X2_directories {
|
||||
my( $files, $dirs ) = @_[0,1];
|
||||
$files = sprintf("%g %s", $files,
|
||||
$files == 1 ? 'fichier' : 'fichiers');
|
||||
$dirs = sprintf("%g %s", $dirs,
|
||||
$dirs == 1 ? "r\xE9pertoire" : "r\xE9pertoires");
|
||||
return "J'ai trouv\xE9 $files dans $dirs.";
|
||||
}
|
||||
|
||||
Now, there's no particularly obvious way to store anything but strings
|
||||
in a gettext lexicon; so it looks like we just have to start over and
|
||||
make something better, from scratch. I call my shot at a
|
||||
gettext-replacement system "Maketext", or, in CPAN terms,
|
||||
Locale::Maketext.
|
||||
|
||||
When designing Maketext, I chose to plan its main features in terms of
|
||||
"buzzword compliance". And here are the buzzwords:
|
||||
|
||||
=head2 Buzzwords: Abstraction and Encapsulation
|
||||
|
||||
The complexity of the language you're trying to output a phrase in is
|
||||
entirely abstracted inside (and encapsulated within) the Maketext module
|
||||
for that interface. When you call:
|
||||
|
||||
print $lang->maketext("You have [quant,_1,piece] of new mail.",
|
||||
scalar(@messages));
|
||||
|
||||
you don't know (and in fact can't easily find out) whether this will
|
||||
involve lots of figuring, as in Russian (if $lang is a handle to the
|
||||
Russian module), or relatively little, as in Chinese. That kind of
|
||||
abstraction and encapsulation may encourage other pleasant buzzwords
|
||||
like modularization and stratification, depending on what design
|
||||
decisions you make.
|
||||
|
||||
=head2 Buzzword: Isomorphism
|
||||
|
||||
"Isomorphism" means "having the same structure or form"; in discussions
|
||||
of program design, the word takes on the special, specific meaning that
|
||||
your implementation of a solution to a problem I<has the same
|
||||
structure> as, say, an informal verbal description of the solution, or
|
||||
maybe of the problem itself. Isomorphism is, all things considered,
|
||||
a good thing -- it's what problem-solving (and solution-implementing)
|
||||
should look like.
|
||||
|
||||
What's wrong the with gettext-using code like this...
|
||||
|
||||
printf( $file_count == 1 ?
|
||||
( $directory_count == 1 ?
|
||||
"Your query matched %g file in %g directory." :
|
||||
"Your query matched %g file in %g directories." ) :
|
||||
( $directory_count == 1 ?
|
||||
"Your query matched %g files in %g directory." :
|
||||
"Your query matched %g files in %g directories." ),
|
||||
$file_count, $directory_count,
|
||||
);
|
||||
|
||||
is first off that it's not well abstracted -- these ways of testing
|
||||
for grammatical number (as in the expressions like C<foo == 1 ?
|
||||
singular_form : plural_form>) should be abstracted to each language
|
||||
module, since how you get grammatical number is language-specific.
|
||||
|
||||
But second off, it's not isomorphic -- the "solution" (i.e., the
|
||||
phrasebook entries) for Chinese maps from these four English phrases to
|
||||
the one Chinese phrase that fits for all of them. In other words, the
|
||||
informal solution would be "The way to say what you want in Chinese is
|
||||
with the one phrase 'For your question, in Y directories you would
|
||||
find X files'" -- and so the implemented solution should be,
|
||||
isomorphically, just a straightforward way to spit out that one
|
||||
phrase, with numerals properly interpolated. It shouldn't have to map
|
||||
from the complexity of other languages to the simplicity of this one.
|
||||
|
||||
=head2 Buzzword: Inheritance
|
||||
|
||||
There's a great deal of reuse possible for sharing of phrases between
|
||||
modules for related dialects, or for sharing of auxiliary functions
|
||||
between related languages. (By "auxiliary functions", I mean
|
||||
functions that don't produce phrase-text, but which, say, return an
|
||||
answer to "does this number require a plural noun after it?". Such
|
||||
auxiliary functions would be used in the internal logic of functions
|
||||
that actually do produce phrase-text.)
|
||||
|
||||
In the case of sharing phrases, consider that you have an interface
|
||||
already localized for American English (probably by having been
|
||||
written with that as the native locale, but that's incidental).
|
||||
Localizing it for UK English should, in practical terms, be just a
|
||||
matter of running it past a British person with the instructions to
|
||||
indicate what few phrases would benefit from a change in spelling or
|
||||
possibly minor rewording. In that case, you should be able to put in
|
||||
the UK English localization module I<only> those phrases that are
|
||||
UK-specific, and for all the rest, I<inherit> from the American
|
||||
English module. (And I expect this same situation would apply with
|
||||
Brazilian and Continental Portugese, possibly with some I<very>
|
||||
closely related languages like Czech and Slovak, and possibly with the
|
||||
slightly different "versions" of written Mandarin Chinese, as I hear exist in
|
||||
Taiwan and mainland China.)
|
||||
|
||||
As to sharing of auxiliary functions, consider the problem of Russian
|
||||
numbers from the beginning of this article; obviously, you'd want to
|
||||
write only once the hairy code that, given a numeric value, would
|
||||
return some specification of which case and number a given quantified
|
||||
noun should use. But suppose that you discover, while localizing an
|
||||
interface for, say, Ukranian (a Slavic language related to Russian,
|
||||
spoken by several million people, many of whom would be relieved to
|
||||
find that your Web site's or software's interface is available in
|
||||
their language), that the rules in Ukranian are the same as in Russian
|
||||
for quantification, and probably for many other grammatical functions.
|
||||
While there may well be no phrases in common between Russian and
|
||||
Ukranian, you could still choose to have the Ukranian module inherit
|
||||
from the Russian module, just for the sake of inheriting all the
|
||||
various grammatical methods. Or, probably better organizationally,
|
||||
you could move those functions to a module called C<_E_Slavic> or
|
||||
something, which Russian and Ukrainian could inherit useful functions
|
||||
from, but which would (presumably) provide no lexicon.
|
||||
|
||||
=head2 Buzzword: Concision
|
||||
|
||||
Okay, concision isn't a buzzword. But it should be, so I decree that
|
||||
as a new buzzword, "concision" means that simple common things should
|
||||
be expressible in very few lines (or maybe even just a few characters)
|
||||
of code -- call it a special case of "making simple things easy and
|
||||
hard things possible", and see also the role it played in the
|
||||
MIDI::Simple language, discussed elsewhere in this issue [TPJ#13].
|
||||
|
||||
Consider our first stab at an entry in our "phrasebook of functions":
|
||||
|
||||
sub I_found_X1_files_in_X2_directories {
|
||||
my( $files, $dirs ) = @_[0,1];
|
||||
$files = sprintf("%g %s", $files,
|
||||
$files == 1 ? 'fichier' : 'fichiers');
|
||||
$dirs = sprintf("%g %s", $dirs,
|
||||
$dirs == 1 ? "r\xE9pertoire" : "r\xE9pertoires");
|
||||
return "J'ai trouv\xE9 $files dans $dirs.";
|
||||
}
|
||||
|
||||
You may sense that a lexicon (to use a non-committal catch-all term for a
|
||||
collection of things you know how to say, regardless of whether they're
|
||||
phrases or words) consisting of functions I<expressed> as above would
|
||||
make for rather long-winded and repetitive code -- even if you wisely
|
||||
rewrote this to have quantification (as we call adding a number
|
||||
expression to a noun phrase) be a function called like:
|
||||
|
||||
sub I_found_X1_files_in_X2_directories {
|
||||
my( $files, $dirs ) = @_[0,1];
|
||||
$files = quant($files, "fichier");
|
||||
$dirs = quant($dirs, "r\xE9pertoire");
|
||||
return "J'ai trouv\xE9 $files dans $dirs.";
|
||||
}
|
||||
|
||||
And you may also sense that you do not want to bother your translators
|
||||
with having to write Perl code -- you'd much rather that they spend
|
||||
their I<very costly time> on just translation. And this is to say
|
||||
nothing of the near impossibility of finding a commercial translator
|
||||
who would know even simple Perl.
|
||||
|
||||
In a first-hack implementation of Maketext, each language-module's
|
||||
lexicon looked like this:
|
||||
|
||||
%Lexicon = (
|
||||
"I found %g files in %g directories"
|
||||
=> sub {
|
||||
my( $files, $dirs ) = @_[0,1];
|
||||
$files = quant($files, "fichier");
|
||||
$dirs = quant($dirs, "r\xE9pertoire");
|
||||
return "J'ai trouv\xE9 $files dans $dirs.";
|
||||
},
|
||||
... and so on with other phrase => sub mappings ...
|
||||
);
|
||||
|
||||
but I immediately went looking for some more concise way to basically
|
||||
denote the same phrase-function -- a way that would also serve to
|
||||
concisely denote I<most> phrase-functions in the lexicon for I<most>
|
||||
languages. After much time and even some actual thought, I decided on
|
||||
this system:
|
||||
|
||||
* Where a value in a %Lexicon hash is a contentful string instead of
|
||||
an anonymous sub (or, conceivably, a coderef), it would be interpreted
|
||||
as a sort of shorthand expression of what the sub does. When accessed
|
||||
for the first time in a session, it is parsed, turned into Perl code,
|
||||
and then eval'd into an anonymous sub; then that sub replaces the
|
||||
original string in that lexicon. (That way, the work of parsing and
|
||||
evaling the shorthand form for a given phrase is done no more than
|
||||
once per session.)
|
||||
|
||||
* Calls to C<maketext> (as Maketext's main function is called) happen
|
||||
thru a "language session handle", notionally very much like an IO
|
||||
handle, in that you open one at the start of the session, and use it
|
||||
for "sending signals" to an object in order to have it return the text
|
||||
you want.
|
||||
|
||||
So, this:
|
||||
|
||||
$lang->maketext("You have [quant,_1,piece] of new mail.",
|
||||
scalar(@messages));
|
||||
|
||||
basically means this: look in the lexicon for $lang (which may inherit
|
||||
from any number of other lexicons), and find the function that we
|
||||
happen to associate with the string "You have [quant,_1,piece] of new
|
||||
mail" (which is, and should be, a functioning "shorthand" for this
|
||||
function in the native locale -- English in this case). If you find
|
||||
such a function, call it with $lang as its first parameter (as if it
|
||||
were a method), and then a copy of scalar(@messages) as its second,
|
||||
and then return that value. If that function was found, but was in
|
||||
string shorthand instead of being a fully specified function, parse it
|
||||
and make it into a function before calling it the first time.
|
||||
|
||||
* The shorthand uses code in brackets to indicate method calls that
|
||||
should be performed. A full explanation is not in order here, but a
|
||||
few examples will suffice:
|
||||
|
||||
"You have [quant,_1,piece] of new mail."
|
||||
|
||||
The above code is shorthand for, and will be interpreted as,
|
||||
this:
|
||||
|
||||
sub {
|
||||
my $handle = $_[0];
|
||||
my(@params) = @_;
|
||||
return join '',
|
||||
"You have ",
|
||||
$handle->quant($params[1], 'piece'),
|
||||
"of new mail.";
|
||||
}
|
||||
|
||||
where "quant" is the name of a method you're using to quantify the
|
||||
noun "piece" with the number $params[0].
|
||||
|
||||
A string with no brackety calls, like this:
|
||||
|
||||
"Your search expression was malformed."
|
||||
|
||||
is somewhat of a degenerate case, and just gets turned into:
|
||||
|
||||
sub { return "Your search expression was malformed." }
|
||||
|
||||
However, not everything you can write in Perl code can be written in
|
||||
the above shorthand system -- not by a long shot. For example, consider
|
||||
the Italian translator from the beginning of this article, who wanted
|
||||
the Italian for "I didn't find any files" as a special case, instead
|
||||
of "I found 0 files". That couldn't be specified (at least not easily
|
||||
or simply) in our shorthand system, and it would have to be written
|
||||
out in full, like this:
|
||||
|
||||
sub { # pretend the English strings are in Italian
|
||||
my($handle, $files, $dirs) = @_[0,1,2];
|
||||
return "I didn't find any files" unless $files;
|
||||
return join '',
|
||||
"I found ",
|
||||
$handle->quant($files, 'file'),
|
||||
" in ",
|
||||
$handle->quant($dirs, 'directory'),
|
||||
".";
|
||||
}
|
||||
|
||||
Next to a lexicon full of shorthand code, that sort of sticks out like a
|
||||
sore thumb -- but this I<is> a special case, after all; and at least
|
||||
it's possible, if not as concise as usual.
|
||||
|
||||
As to how you'd implement the Russian example from the beginning of
|
||||
the article, well, There's More Than One Way To Do It, but it could be
|
||||
something like this (using English words for Russian, just so you know
|
||||
what's going on):
|
||||
|
||||
"I [quant,_1,directory,accusative] scanned."
|
||||
|
||||
This shifts the burden of complexity off to the quant method. That
|
||||
method's parameters are: the numeric value it's going to use to
|
||||
quantify something; the Russian word it's going to quantify; and the
|
||||
parameter "accusative", which you're using to mean that this
|
||||
sentence's syntax wants a noun in the accusative case there, although
|
||||
that quantification method may have to overrule, for grammatical
|
||||
reasons you may recall from the beginning of this article.
|
||||
|
||||
Now, the Russian quant method here is responsible not only for
|
||||
implementing the strange logic necessary for figuring out how Russian
|
||||
number-phrases impose case and number on their noun-phrases, but also
|
||||
for inflecting the Russian word for "directory". How that inflection
|
||||
is to be carried out is no small issue, and among the solutions I've
|
||||
seen, some (like variations on a simple lookup in a hash where all
|
||||
possible forms are provided for all necessary words) are
|
||||
straightforward but I<can> become cumbersome when you need to inflect
|
||||
more than a few dozen words; and other solutions (like using
|
||||
algorithms to model the inflections, storing only root forms and
|
||||
irregularities) I<can> involve more overhead than is justifiable for
|
||||
all but the largest lexicons.
|
||||
|
||||
Mercifully, this design decision becomes crucial only in the hairiest
|
||||
of inflected languages, of which Russian is by no means the I<worst> case
|
||||
scenario, but is worse than most. Most languages have simpler
|
||||
inflection systems; for example, in English or Swahili, there are
|
||||
generally no more than two possible inflected forms for a given noun
|
||||
("error/errors"; "kosa/makosa"), and the
|
||||
rules for producing these forms are fairly simple -- or at least,
|
||||
simple rules can be formulated that work for most words, and you can
|
||||
then treat the exceptions as just "irregular", at least relative to
|
||||
your ad hoc rules. A simpler inflection system (simpler rules, fewer
|
||||
forms) means that design decisions are less crucial to maintaining
|
||||
sanity, whereas the same decisions could incur
|
||||
overhead-versus-scalability problems in languages like Russian. It
|
||||
may I<also> be likely that code (possibly in Perl, as with
|
||||
Lingua::EN::Inflect, for English nouns) has already
|
||||
been written for the language in question, whether simple or complex.
|
||||
|
||||
Moreover, a third possibility may even be simpler than anything
|
||||
discussed above: "Just require that all possible (or at least
|
||||
applicable) forms be provided in the call to the given language's quant
|
||||
method, as in:"
|
||||
|
||||
"I found [quant,_1,file,files]."
|
||||
|
||||
That way, quant just has to chose which form it needs, without having
|
||||
to look up or generate anything. While possibly not optimal for
|
||||
Russian, this should work well for most other languages, where
|
||||
quantification is not as complicated an operation.
|
||||
|
||||
=head2 The Devil in the Details
|
||||
|
||||
There's plenty more to Maketext than described above -- for example,
|
||||
there's the details of how language tags ("en-US", "i-pwn", "fi",
|
||||
etc.) or locale IDs ("en_US") interact with actual module naming
|
||||
("BogoQuery/Locale/en_us.pm"), and what magic can ensue; there's the
|
||||
details of how to record (and possibly negotiate) what character
|
||||
encoding Maketext will return text in (UTF8? Latin-1? KOI8?). There's
|
||||
the interesting fact that Maketext is for localization, but nowhere
|
||||
actually has a "C<use locale;>" anywhere in it. For the curious,
|
||||
there's the somewhat frightening details of how I actually
|
||||
implement something like data inheritance so that searches across
|
||||
modules' %Lexicon hashes can parallel how Perl implements method
|
||||
inheritance.
|
||||
|
||||
And, most importantly, there's all the practical details of how to
|
||||
actually go about deriving from Maketext so you can use it for your
|
||||
interfaces, and the various tools and conventions for starting out and
|
||||
maintaining individual language modules.
|
||||
|
||||
That is all covered in the documentation for Locale::Maketext and the
|
||||
modules that come with it, available in CPAN. After having read this
|
||||
article, which covers the why's of Maketext, the documentation,
|
||||
which covers the how's of it, should be quite straightforward.
|
||||
|
||||
=head2 The Proof in the Pudding: Localizing Web Sites
|
||||
|
||||
Maketext and gettext have a notable difference: gettext is in C,
|
||||
accessible thru C library calls, whereas Maketext is in Perl, and
|
||||
really can't work without a Perl interpreter (although I suppose
|
||||
something like it could be written for C). Accidents of history (and
|
||||
not necessarily lucky ones) have made C++ the most common language for
|
||||
the implementation of applications like word processors, Web browsers,
|
||||
and even many in-house applications like custom query systems. Current
|
||||
conditions make it somewhat unlikely that the next one of any of these
|
||||
kinds of applications will be written in Perl, albeit clearly more for
|
||||
reasons of custom and inertia than out of consideration of what is the
|
||||
right tool for the job.
|
||||
|
||||
However, other accidents of history have made Perl a well-accepted
|
||||
language for design of server-side programs (generally in CGI form)
|
||||
for Web site interfaces. Localization of static pages in Web sites is
|
||||
trivial, feasible either with simple language-negotiation features in
|
||||
servers like Apache, or with some kind of server-side inclusions of
|
||||
language-appropriate text into layout templates. However, I think
|
||||
that the localization of Perl-based search systems (or other kinds of
|
||||
dynamic content) in Web sites, be they public or access-restricted,
|
||||
is where Maketext will see the greatest use.
|
||||
|
||||
I presume that it would be only the exceptional Web site that gets
|
||||
localized for English I<and> Chinese I<and> Italian I<and> Arabic
|
||||
I<and> Russian, to recall the languages from the beginning of this
|
||||
article -- to say nothing of German, Spanish, French, Japanese,
|
||||
Finnish, and Hindi, to name a few languages that benefit from large
|
||||
numbers of programmers or Web viewers or both.
|
||||
|
||||
However, the ever-increasing internationalization of the Web (whether
|
||||
measured in terms of amount of content, of numbers of content writers
|
||||
or programmers, or of size of content audiences) makes it increasingly
|
||||
likely that the interface to the average Web-based dynamic content
|
||||
service will be localized for two or maybe three languages. It is my
|
||||
hope that Maketext will make that task as simple as possible, and will
|
||||
remove previous barriers to localization for languages dissimilar to
|
||||
English.
|
||||
|
||||
__END__
|
||||
|
||||
Sean M. Burke (sburkeE<64>cpan.org) has a Master's in linguistics
|
||||
from Northwestern University; he specializes in language technology.
|
||||
Jordan Lachler (lachlerE<64>unm.edu) is a PhD student in the Department of
|
||||
Linguistics at the University of New Mexico; he specializes in
|
||||
morphology and pedagogy of North American native languages.
|
||||
|
||||
=head2 References
|
||||
|
||||
Alvestrand, Harald Tveit. 1995. I<RFC 1766: Tags for the
|
||||
Identification of Languages.>
|
||||
C<L<http://www.ietf.org/rfc/rfc1766.txt>>
|
||||
[Now see RFC 3066.]
|
||||
|
||||
Callon, Ross, editor. 1996. I<RFC 1925: The Twelve
|
||||
Networking Truths.>
|
||||
C<L<http://www.ietf.org/rfc/rfc1925.txt>>
|
||||
|
||||
Drepper, Ulrich, Peter Miller,
|
||||
and FranE<ccedil>ois Pinard. 1995-2001. GNU
|
||||
C<gettext>. Available in C<L<ftp://prep.ai.mit.edu/pub/gnu/>>, with
|
||||
extensive docs in the distribution tarball. [Since
|
||||
I wrote this article in 1998, I now see that the
|
||||
gettext docs are now trying more to come to terms with
|
||||
plurality. Whether useful conclusions have come from it
|
||||
is another question altogether. -- SMB, May 2001]
|
||||
|
||||
Forbes, Nevill. 1964. I<Russian Grammar.> Third Edition, revised
|
||||
by J. C. Dumbreck. Oxford University Press.
|
||||
|
||||
=cut
|
||||
|
||||
#End
|
||||
|
||||
Reference in New Issue
Block a user