Initial Commit
This commit is contained in:
196
database/perl/lib/I18N/Collate.pm
Normal file
196
database/perl/lib/I18N/Collate.pm
Normal file
@@ -0,0 +1,196 @@
|
||||
package I18N::Collate;
|
||||
|
||||
use strict;
|
||||
our $VERSION = '1.02';
|
||||
|
||||
=head1 NAME
|
||||
|
||||
I18N::Collate - compare 8-bit scalar data according to the current locale
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use I18N::Collate;
|
||||
setlocale(LC_COLLATE, 'locale-of-your-choice');
|
||||
$s1 = I18N::Collate->new("scalar_data_1");
|
||||
$s2 = I18N::Collate->new("scalar_data_2");
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
***
|
||||
|
||||
WARNING: starting from the Perl version 5.003_06
|
||||
the I18N::Collate interface for comparing 8-bit scalar data
|
||||
according to the current locale
|
||||
|
||||
HAS BEEN DEPRECATED
|
||||
|
||||
That is, please do not use it anymore for any new applications
|
||||
and please migrate the old applications away from it because its
|
||||
functionality was integrated into the Perl core language in the
|
||||
release 5.003_06.
|
||||
|
||||
See the perllocale manual page for further information.
|
||||
|
||||
***
|
||||
|
||||
This module provides you with objects that will collate
|
||||
according to your national character set, provided that the
|
||||
POSIX setlocale() function is supported on your system.
|
||||
|
||||
You can compare $s1 and $s2 above with
|
||||
|
||||
$s1 le $s2
|
||||
|
||||
to extract the data itself, you'll need a dereference: $$s1
|
||||
|
||||
This module uses POSIX::setlocale(). The basic collation conversion is
|
||||
done by strxfrm() which terminates at NUL characters being a decent C
|
||||
routine. collate_xfrm() handles embedded NUL characters gracefully.
|
||||
|
||||
The available locales depend on your operating system; try whether
|
||||
C<locale -a> shows them or man pages for "locale" or "nlsinfo" or the
|
||||
direct approach C<ls /usr/lib/nls/loc> or C<ls /usr/lib/nls> or
|
||||
C<ls /usr/lib/locale>. Not all the locales that your vendor supports
|
||||
are necessarily installed: please consult your operating system's
|
||||
documentation and possibly your local system administration. The
|
||||
locale names are probably something like C<xx_XX.(ISO)?8859-N> or
|
||||
C<xx_XX.(ISO)?8859N>, for example C<fr_CH.ISO8859-1> is the Swiss (CH)
|
||||
variant of French (fr), ISO Latin (8859) 1 (-1) which is the Western
|
||||
European character set.
|
||||
|
||||
=cut
|
||||
|
||||
# I18N::Collate.pm
|
||||
#
|
||||
# Author: Jarkko Hietaniemi <F<jhi@iki.fi>>
|
||||
# Helsinki University of Technology, Finland
|
||||
#
|
||||
# Acks: Guy Decoux <F<decoux@moulon.inra.fr>> understood
|
||||
# overloading magic much deeper than I and told
|
||||
# how to cut the size of this code by more than half.
|
||||
# (my first version did overload all of lt gt eq le ge cmp)
|
||||
#
|
||||
# Purpose: compare 8-bit scalar data according to the current locale
|
||||
#
|
||||
# Requirements: Perl5 POSIX::setlocale() and POSIX::strxfrm()
|
||||
#
|
||||
# Exports: setlocale 1)
|
||||
# collate_xfrm 2)
|
||||
#
|
||||
# Overloads: cmp # 3)
|
||||
#
|
||||
# Usage: use I18N::Collate;
|
||||
# setlocale(LC_COLLATE, 'locale-of-your-choice'); # 4)
|
||||
# $s1 = I18N::Collate->("scalar_data_1");
|
||||
# $s2 = I18N::Collate->("scalar_data_2");
|
||||
#
|
||||
# now you can compare $s1 and $s2: $s1 le $s2
|
||||
# to extract the data itself, you need to deref: $$s1
|
||||
#
|
||||
# Notes:
|
||||
# 1) this uses POSIX::setlocale
|
||||
# 2) the basic collation conversion is done by strxfrm() which
|
||||
# terminates at NUL characters being a decent C routine.
|
||||
# collate_xfrm handles embedded NUL characters gracefully.
|
||||
# 3) due to cmp and overload magic, lt le eq ge gt work also
|
||||
# 4) the available locales depend on your operating system;
|
||||
# try whether "locale -a" shows them or man pages for
|
||||
# "locale" or "nlsinfo" work or the more direct
|
||||
# approach "ls /usr/lib/nls/loc" or "ls /usr/lib/nls".
|
||||
# Not all the locales that your vendor supports
|
||||
# are necessarily installed: please consult your
|
||||
# operating system's documentation.
|
||||
# The locale names are probably something like
|
||||
# 'xx_XX.(ISO)?8859-N' or 'xx_XX.(ISO)?8859N',
|
||||
# for example 'fr_CH.ISO8859-1' is the Swiss (CH)
|
||||
# variant of French (fr), ISO Latin (8859) 1 (-1)
|
||||
# which is the Western European character set.
|
||||
#
|
||||
# Updated: 19961005
|
||||
#
|
||||
# ---
|
||||
|
||||
use POSIX qw(strxfrm LC_COLLATE);
|
||||
use warnings::register;
|
||||
|
||||
require Exporter;
|
||||
|
||||
our @ISA = qw(Exporter);
|
||||
our @EXPORT = qw(collate_xfrm setlocale LC_COLLATE);
|
||||
our @EXPORT_OK = qw();
|
||||
|
||||
use overload qw(
|
||||
fallback 1
|
||||
cmp collate_cmp
|
||||
);
|
||||
|
||||
our($LOCALE, $C);
|
||||
|
||||
our $please_use_I18N_Collate_even_if_deprecated = 0;
|
||||
sub new {
|
||||
my $new = $_[1];
|
||||
|
||||
if (warnings::enabled() && $] >= 5.003_06) {
|
||||
unless ($please_use_I18N_Collate_even_if_deprecated) {
|
||||
warnings::warn <<___EOD___;
|
||||
***
|
||||
|
||||
WARNING: starting from the Perl version 5.003_06
|
||||
the I18N::Collate interface for comparing 8-bit scalar data
|
||||
according to the current locale
|
||||
|
||||
HAS BEEN DEPRECATED
|
||||
|
||||
That is, please do not use it anymore for any new applications
|
||||
and please migrate the old applications away from it because its
|
||||
functionality was integrated into the Perl core language in the
|
||||
release 5.003_06.
|
||||
|
||||
See the perllocale manual page for further information.
|
||||
|
||||
***
|
||||
___EOD___
|
||||
$please_use_I18N_Collate_even_if_deprecated++;
|
||||
}
|
||||
}
|
||||
|
||||
bless \$new;
|
||||
}
|
||||
|
||||
sub setlocale {
|
||||
my ($category, $locale) = @_[0,1];
|
||||
|
||||
POSIX::setlocale($category, $locale) if (defined $category);
|
||||
# the current $LOCALE
|
||||
$LOCALE = $locale || $ENV{'LC_COLLATE'} || $ENV{'LC_ALL'} || '';
|
||||
}
|
||||
|
||||
sub C {
|
||||
my $s = ${$_[0]};
|
||||
|
||||
$C->{$LOCALE}->{$s} = collate_xfrm($s)
|
||||
unless (defined $C->{$LOCALE}->{$s}); # cache when met
|
||||
|
||||
$C->{$LOCALE}->{$s};
|
||||
}
|
||||
|
||||
sub collate_xfrm {
|
||||
my $s = $_[0];
|
||||
my $x = '';
|
||||
|
||||
for (split(/(\000+)/, $s)) {
|
||||
$x .= (/^\000/) ? $_ : strxfrm("$_\000");
|
||||
}
|
||||
|
||||
$x;
|
||||
}
|
||||
|
||||
sub collate_cmp {
|
||||
&C($_[0]) cmp &C($_[1]);
|
||||
}
|
||||
|
||||
# init $LOCALE
|
||||
|
||||
&I18N::Collate::setlocale();
|
||||
|
||||
1; # keep require happy
|
||||
887
database/perl/lib/I18N/LangTags.pm
Normal file
887
database/perl/lib/I18N/LangTags.pm
Normal file
@@ -0,0 +1,887 @@
|
||||
|
||||
# Time-stamp: "2004-10-06 23:26:33 ADT"
|
||||
# Sean M. Burke <sburke@cpan.org>
|
||||
|
||||
require 5.000;
|
||||
package I18N::LangTags;
|
||||
use strict;
|
||||
require Exporter;
|
||||
our @ISA = qw(Exporter);
|
||||
our @EXPORT = qw();
|
||||
our @EXPORT_OK = qw(is_language_tag same_language_tag
|
||||
extract_language_tags super_languages
|
||||
similarity_language_tag is_dialect_of
|
||||
locale2language_tag alternate_language_tags
|
||||
encode_language_tag panic_languages
|
||||
implicate_supers
|
||||
implicate_supers_strictly
|
||||
);
|
||||
our %EXPORT_TAGS = ('ALL' => \@EXPORT_OK);
|
||||
|
||||
our $VERSION = "0.44";
|
||||
our %Panic;
|
||||
|
||||
sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function
|
||||
|
||||
|
||||
=head1 NAME
|
||||
|
||||
I18N::LangTags - functions for dealing with RFC3066-style language tags
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use I18N::LangTags();
|
||||
|
||||
...or specify whichever of those functions you want to import, like so:
|
||||
|
||||
use I18N::LangTags qw(implicate_supers similarity_language_tag);
|
||||
|
||||
All the exportable functions are listed below -- you're free to import
|
||||
only some, or none at all. By default, none are imported. If you
|
||||
say:
|
||||
|
||||
use I18N::LangTags qw(:ALL)
|
||||
|
||||
...then all are exported. (This saves you from having to use
|
||||
something less obvious like C<use I18N::LangTags qw(/./)>.)
|
||||
|
||||
If you don't import any of these functions, assume a C<&I18N::LangTags::>
|
||||
in front of all the function names in the following examples.
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Language tags are a formalism, described in RFC 3066 (obsoleting
|
||||
1766), for declaring what language form (language and possibly
|
||||
dialect) a given chunk of information is in.
|
||||
|
||||
This library provides functions for common tasks involving language
|
||||
tags as they are needed in a variety of protocols and applications.
|
||||
|
||||
Please see the "See Also" references for a thorough explanation
|
||||
of how to correctly use language tags.
|
||||
|
||||
=over
|
||||
|
||||
=cut
|
||||
|
||||
###########################################################################
|
||||
|
||||
=item * the function is_language_tag($lang1)
|
||||
|
||||
Returns true iff $lang1 is a formally valid language tag.
|
||||
|
||||
is_language_tag("fr") is TRUE
|
||||
is_language_tag("x-jicarilla") is FALSE
|
||||
(Subtags can be 8 chars long at most -- 'jicarilla' is 9)
|
||||
|
||||
is_language_tag("sgn-US") is TRUE
|
||||
(That's American Sign Language)
|
||||
|
||||
is_language_tag("i-Klikitat") is TRUE
|
||||
(True without regard to the fact noone has actually
|
||||
registered Klikitat -- it's a formally valid tag)
|
||||
|
||||
is_language_tag("fr-patois") is TRUE
|
||||
(Formally valid -- altho descriptively weak!)
|
||||
|
||||
is_language_tag("Spanish") is FALSE
|
||||
is_language_tag("french-patois") is FALSE
|
||||
(No good -- first subtag has to match
|
||||
/^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066)
|
||||
|
||||
is_language_tag("x-borg-prot2532") is TRUE
|
||||
(Yes, subtags can contain digits, as of RFC3066)
|
||||
|
||||
=cut
|
||||
|
||||
sub is_language_tag {
|
||||
|
||||
## Changes in the language tagging standards may have to be reflected here.
|
||||
|
||||
my($tag) = lc($_[0]);
|
||||
|
||||
return 0 if $tag eq "i" or $tag eq "x";
|
||||
# Bad degenerate cases that the following
|
||||
# regexp would erroneously let pass
|
||||
|
||||
return $tag =~
|
||||
/^(?: # First subtag
|
||||
[xi] | [a-z]{2,3}
|
||||
)
|
||||
(?: # Subtags thereafter
|
||||
- # separator
|
||||
[a-z0-9]{1,8} # subtag
|
||||
)*
|
||||
$/xs ? 1 : 0;
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
=item * the function extract_language_tags($whatever)
|
||||
|
||||
Returns a list of whatever looks like formally valid language tags
|
||||
in $whatever. Not very smart, so don't get too creative with
|
||||
what you want to feed it.
|
||||
|
||||
extract_language_tags("fr, fr-ca, i-mingo")
|
||||
returns: ('fr', 'fr-ca', 'i-mingo')
|
||||
|
||||
extract_language_tags("It's like this: I'm in fr -- French!")
|
||||
returns: ('It', 'in', 'fr')
|
||||
(So don't just feed it any old thing.)
|
||||
|
||||
The output is untainted. If you don't know what tainting is,
|
||||
don't worry about it.
|
||||
|
||||
=cut
|
||||
|
||||
sub extract_language_tags {
|
||||
|
||||
## Changes in the language tagging standards may have to be reflected here.
|
||||
|
||||
my($text) =
|
||||
$_[0] =~ m/(.+)/ # to make for an untainted result
|
||||
? $1 : ''
|
||||
;
|
||||
|
||||
return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags
|
||||
$text =~
|
||||
m/
|
||||
\b
|
||||
(?: # First subtag
|
||||
[iIxX] | [a-zA-Z]{2,3}
|
||||
)
|
||||
(?: # Subtags thereafter
|
||||
- # separator
|
||||
[a-zA-Z0-9]{1,8} # subtag
|
||||
)*
|
||||
\b
|
||||
/xsg
|
||||
);
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
=item * the function same_language_tag($lang1, $lang2)
|
||||
|
||||
Returns true iff $lang1 and $lang2 are acceptable variant tags
|
||||
representing the same language-form.
|
||||
|
||||
same_language_tag('x-kadara', 'i-kadara') is TRUE
|
||||
(The x/i- alternation doesn't matter)
|
||||
same_language_tag('X-KADARA', 'i-kadara') is TRUE
|
||||
(...and neither does case)
|
||||
same_language_tag('en', 'en-US') is FALSE
|
||||
(all-English is not the SAME as US English)
|
||||
same_language_tag('x-kadara', 'x-kadar') is FALSE
|
||||
(these are totally unrelated tags)
|
||||
same_language_tag('no-bok', 'nb') is TRUE
|
||||
(no-bok is a legacy tag for nb (Norwegian Bokmal))
|
||||
|
||||
C<same_language_tag> works by just seeing whether
|
||||
C<encode_language_tag($lang1)> is the same as
|
||||
C<encode_language_tag($lang2)>.
|
||||
|
||||
(Yes, I know this function is named a bit oddly. Call it historic
|
||||
reasons.)
|
||||
|
||||
=cut
|
||||
|
||||
sub same_language_tag {
|
||||
my $el1 = &encode_language_tag($_[0]);
|
||||
return 0 unless defined $el1;
|
||||
# this avoids the problem of
|
||||
# encode_language_tag($lang1) eq and encode_language_tag($lang2)
|
||||
# being true if $lang1 and $lang2 are both undef
|
||||
|
||||
return $el1 eq &encode_language_tag($_[1]) ? 1 : 0;
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
=item * the function similarity_language_tag($lang1, $lang2)
|
||||
|
||||
Returns an integer representing the degree of similarity between
|
||||
tags $lang1 and $lang2 (the order of which does not matter), where
|
||||
similarity is the number of common elements on the left,
|
||||
without regard to case and to x/i- alternation.
|
||||
|
||||
similarity_language_tag('fr', 'fr-ca') is 1
|
||||
(one element in common)
|
||||
similarity_language_tag('fr-ca', 'fr-FR') is 1
|
||||
(one element in common)
|
||||
|
||||
similarity_language_tag('fr-CA-joual',
|
||||
'fr-CA-PEI') is 2
|
||||
similarity_language_tag('fr-CA-joual', 'fr-CA') is 2
|
||||
(two elements in common)
|
||||
|
||||
similarity_language_tag('x-kadara', 'i-kadara') is 1
|
||||
(x/i- doesn't matter)
|
||||
|
||||
similarity_language_tag('en', 'x-kadar') is 0
|
||||
similarity_language_tag('x-kadara', 'x-kadar') is 0
|
||||
(unrelated tags -- no similarity)
|
||||
|
||||
similarity_language_tag('i-cree-syllabic',
|
||||
'i-cherokee-syllabic') is 0
|
||||
(no B<leftmost> elements in common!)
|
||||
|
||||
=cut
|
||||
|
||||
sub similarity_language_tag {
|
||||
my $lang1 = &encode_language_tag($_[0]);
|
||||
my $lang2 = &encode_language_tag($_[1]);
|
||||
# And encode_language_tag takes care of the whole
|
||||
# no-nyn==nn, i-hakka==zh-hakka, etc, things
|
||||
|
||||
# NB: (i-sil-...)? (i-sgn-...)?
|
||||
|
||||
return undef if !defined($lang1) and !defined($lang2);
|
||||
return 0 if !defined($lang1) or !defined($lang2);
|
||||
|
||||
my @l1_subtags = split('-', $lang1);
|
||||
my @l2_subtags = split('-', $lang2);
|
||||
my $similarity = 0;
|
||||
|
||||
while(@l1_subtags and @l2_subtags) {
|
||||
if(shift(@l1_subtags) eq shift(@l2_subtags)) {
|
||||
++$similarity;
|
||||
} else {
|
||||
last;
|
||||
}
|
||||
}
|
||||
return $similarity;
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
=item * the function is_dialect_of($lang1, $lang2)
|
||||
|
||||
Returns true iff language tag $lang1 represents a subform of
|
||||
language tag $lang2.
|
||||
|
||||
B<Get the order right! It doesn't work the other way around!>
|
||||
|
||||
is_dialect_of('en-US', 'en') is TRUE
|
||||
(American English IS a dialect of all-English)
|
||||
|
||||
is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE
|
||||
is_dialect_of('fr-CA-joual', 'fr') is TRUE
|
||||
(Joual is a dialect of (a dialect of) French)
|
||||
|
||||
is_dialect_of('en', 'en-US') is FALSE
|
||||
(all-English is a NOT dialect of American English)
|
||||
|
||||
is_dialect_of('fr', 'en-CA') is FALSE
|
||||
|
||||
is_dialect_of('en', 'en' ) is TRUE
|
||||
is_dialect_of('en-US', 'en-US') is TRUE
|
||||
(B<Note:> these are degenerate cases)
|
||||
|
||||
is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE
|
||||
(the x/i thing doesn't matter, nor does case)
|
||||
|
||||
is_dialect_of('nn', 'no') is TRUE
|
||||
(because 'nn' (New Norse) is aliased to 'no-nyn',
|
||||
as a special legacy case, and 'no-nyn' is a
|
||||
subform of 'no' (Norwegian))
|
||||
|
||||
=cut
|
||||
|
||||
sub is_dialect_of {
|
||||
|
||||
my $lang1 = &encode_language_tag($_[0]);
|
||||
my $lang2 = &encode_language_tag($_[1]);
|
||||
|
||||
return undef if !defined($lang1) and !defined($lang2);
|
||||
return 0 if !defined($lang1) or !defined($lang2);
|
||||
|
||||
return 1 if $lang1 eq $lang2;
|
||||
return 0 if length($lang1) < length($lang2);
|
||||
|
||||
$lang1 .= '-';
|
||||
$lang2 .= '-';
|
||||
return
|
||||
(substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0;
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
=item * the function super_languages($lang1)
|
||||
|
||||
Returns a list of language tags that are superordinate tags to $lang1
|
||||
-- it gets this by removing subtags from the end of $lang1 until
|
||||
nothing (or just "i" or "x") is left.
|
||||
|
||||
super_languages("fr-CA-joual") is ("fr-CA", "fr")
|
||||
|
||||
super_languages("en-AU") is ("en")
|
||||
|
||||
super_languages("en") is empty-list, ()
|
||||
|
||||
super_languages("i-cherokee") is empty-list, ()
|
||||
...not ("i"), which would be illegal as well as pointless.
|
||||
|
||||
If $lang1 is not a valid language tag, returns empty-list in
|
||||
a list context, undef in a scalar context.
|
||||
|
||||
A notable and rather unavoidable problem with this method:
|
||||
"x-mingo-tom" has an "x" because the whole tag isn't an
|
||||
IANA-registered tag -- but super_languages('x-mingo-tom') is
|
||||
('x-mingo') -- which isn't really right, since 'i-mingo' is
|
||||
registered. But this module has no way of knowing that. (But note
|
||||
that same_language_tag('x-mingo', 'i-mingo') is TRUE.)
|
||||
|
||||
More importantly, you assume I<at your peril> that superordinates of
|
||||
$lang1 are mutually intelligible with $lang1. Consider this
|
||||
carefully.
|
||||
|
||||
=cut
|
||||
|
||||
sub super_languages {
|
||||
my $lang1 = $_[0];
|
||||
return() unless defined($lang1) && &is_language_tag($lang1);
|
||||
|
||||
# a hack for those annoying new (2001) tags:
|
||||
$lang1 =~ s/^nb\b/no-bok/i; # yes, backwards
|
||||
$lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards
|
||||
$lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way
|
||||
# i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark
|
||||
|
||||
my @l1_subtags = split('-', $lang1);
|
||||
|
||||
## Changes in the language tagging standards may have to be reflected here.
|
||||
|
||||
# NB: (i-sil-...)?
|
||||
|
||||
my @supers = ();
|
||||
foreach my $bit (@l1_subtags) {
|
||||
push @supers,
|
||||
scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit;
|
||||
}
|
||||
pop @supers if @supers;
|
||||
shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s;
|
||||
return reverse @supers;
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
=item * the function locale2language_tag($locale_identifier)
|
||||
|
||||
This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1")
|
||||
and maps it to a language tag. If it's not mappable (as with,
|
||||
notably, "C" and "POSIX"), this returns empty-list in a list context,
|
||||
or undef in a scalar context.
|
||||
|
||||
locale2language_tag("en") is "en"
|
||||
|
||||
locale2language_tag("en_US") is "en-US"
|
||||
|
||||
locale2language_tag("en_US.ISO8859-1") is "en-US"
|
||||
|
||||
locale2language_tag("C") is undef or ()
|
||||
|
||||
locale2language_tag("POSIX") is undef or ()
|
||||
|
||||
locale2language_tag("POSIX") is undef or ()
|
||||
|
||||
I'm not totally sure that locale names map satisfactorily to language
|
||||
tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED.
|
||||
|
||||
The output is untainted. If you don't know what tainting is,
|
||||
don't worry about it.
|
||||
|
||||
=cut
|
||||
|
||||
sub locale2language_tag {
|
||||
my $lang =
|
||||
$_[0] =~ m/(.+)/ # to make for an untainted result
|
||||
? $1 : ''
|
||||
;
|
||||
|
||||
return $lang if &is_language_tag($lang); # like "en"
|
||||
|
||||
$lang =~ tr<_><->; # "en_US" -> en-US
|
||||
$lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US
|
||||
# it_IT.utf8@euro => it-IT
|
||||
|
||||
return $lang if &is_language_tag($lang);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
=item * the function encode_language_tag($lang1)
|
||||
|
||||
This function, if given a language tag, returns an encoding of it such
|
||||
that:
|
||||
|
||||
* tags representing different languages never get the same encoding.
|
||||
|
||||
* tags representing the same language always get the same encoding.
|
||||
|
||||
* an encoding of a formally valid language tag always is a string
|
||||
value that is defined, has length, and is true if considered as a
|
||||
boolean.
|
||||
|
||||
Note that the encoding itself is B<not> a formally valid language tag.
|
||||
Note also that you cannot, currently, go from an encoding back to a
|
||||
language tag that it's an encoding of.
|
||||
|
||||
Note also that you B<must> consider the encoded value as atomic; i.e.,
|
||||
you should not consider it as anything but an opaque, unanalysable
|
||||
string value. (The internals of the encoding method may change in
|
||||
future versions, as the language tagging standard changes over time.)
|
||||
|
||||
C<encode_language_tag> returns undef if given anything other than a
|
||||
formally valid language tag.
|
||||
|
||||
The reason C<encode_language_tag> exists is because different language
|
||||
tags may represent the same language; this is normally treatable with
|
||||
C<same_language_tag>, but consider this situation:
|
||||
|
||||
You have a data file that expresses greetings in different languages.
|
||||
Its format is "[language tag]=[how to say 'Hello']", like:
|
||||
|
||||
en-US=Hiho
|
||||
fr=Bonjour
|
||||
i-mingo=Hau'
|
||||
|
||||
And suppose you write a program that reads that file and then runs as
|
||||
a daemon, answering client requests that specify a language tag and
|
||||
then expect the string that says how to greet in that language. So an
|
||||
interaction looks like:
|
||||
|
||||
greeting-client asks: fr
|
||||
greeting-server answers: Bonjour
|
||||
|
||||
So far so good. But suppose the way you're implementing this is:
|
||||
|
||||
my %greetings;
|
||||
die unless open(IN, "<", "in.dat");
|
||||
while(<IN>) {
|
||||
chomp;
|
||||
next unless /^([^=]+)=(.+)/s;
|
||||
my($lang, $expr) = ($1, $2);
|
||||
$greetings{$lang} = $expr;
|
||||
}
|
||||
close(IN);
|
||||
|
||||
at which point %greetings has the contents:
|
||||
|
||||
"en-US" => "Hiho"
|
||||
"fr" => "Bonjour"
|
||||
"i-mingo" => "Hau'"
|
||||
|
||||
And suppose then that you answer client requests for language $wanted
|
||||
by just looking up $greetings{$wanted}.
|
||||
|
||||
If the client asks for "fr", that will look up successfully in
|
||||
%greetings, to the value "Bonjour". And if the client asks for
|
||||
"i-mingo", that will look up successfully in %greetings, to the value
|
||||
"Hau'".
|
||||
|
||||
But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the
|
||||
lookup in %greetings fails. That's the Wrong Thing.
|
||||
|
||||
You could instead do lookups on $wanted with:
|
||||
|
||||
use I18N::LangTags qw(same_language_tag);
|
||||
my $response = '';
|
||||
foreach my $l2 (keys %greetings) {
|
||||
if(same_language_tag($wanted, $l2)) {
|
||||
$response = $greetings{$l2};
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
||||
But that's rather inefficient. A better way to do it is to start your
|
||||
program with:
|
||||
|
||||
use I18N::LangTags qw(encode_language_tag);
|
||||
my %greetings;
|
||||
die unless open(IN, "<", "in.dat");
|
||||
while(<IN>) {
|
||||
chomp;
|
||||
next unless /^([^=]+)=(.+)/s;
|
||||
my($lang, $expr) = ($1, $2);
|
||||
$greetings{
|
||||
encode_language_tag($lang)
|
||||
} = $expr;
|
||||
}
|
||||
close(IN);
|
||||
|
||||
and then just answer client requests for language $wanted by just
|
||||
looking up
|
||||
|
||||
$greetings{encode_language_tag($wanted)}
|
||||
|
||||
And that does the Right Thing.
|
||||
|
||||
=cut
|
||||
|
||||
sub encode_language_tag {
|
||||
# Only similarity_language_tag() is allowed to analyse encodings!
|
||||
|
||||
## Changes in the language tagging standards may have to be reflected here.
|
||||
|
||||
my($tag) = $_[0] || return undef;
|
||||
return undef unless &is_language_tag($tag);
|
||||
|
||||
# For the moment, these legacy variances are few enough that
|
||||
# we can just handle them here with regexps.
|
||||
$tag =~ s/^iw\b/he/i; # Hebrew
|
||||
$tag =~ s/^in\b/id/i; # Indonesian
|
||||
$tag =~ s/^cre\b/cr/i; # Cree
|
||||
$tag =~ s/^jw\b/jv/i; # Javanese
|
||||
$tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger
|
||||
$tag =~ s/^[ix]-navajo\b/nv/i; # Navajo
|
||||
$tag =~ s/^ji\b/yi/i; # Yiddish
|
||||
# SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now,
|
||||
# but maybe they're all so obscure I can ignore them. "Obscure"
|
||||
# meaning either that the language is obscure, and/or that the
|
||||
# XXX form was extant so briefly that it's unlikely it was ever
|
||||
# used. I hope.
|
||||
#
|
||||
# These go FROM the simplex to complex form, to get
|
||||
# similarity-comparison right. And that's okay, since
|
||||
# similarity_language_tag is the only thing that
|
||||
# analyzes our output.
|
||||
$tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka
|
||||
$tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal
|
||||
$tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk
|
||||
|
||||
$tag =~ s/^[xiXI]-//s;
|
||||
# Just lop off any leading "x/i-"
|
||||
|
||||
return "~" . uc($tag);
|
||||
}
|
||||
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
=item * the function alternate_language_tags($lang1)
|
||||
|
||||
This function, if given a language tag, returns all language tags that
|
||||
are alternate forms of this language tag. (I.e., tags which refer to
|
||||
the same language.) This is meant to handle legacy tags caused by
|
||||
the minor changes in language tag standards over the years; and
|
||||
the x-/i- alternation is also dealt with.
|
||||
|
||||
Note that this function does I<not> try to equate new (and never-used,
|
||||
and unusable)
|
||||
ISO639-2 three-letter tags to old (and still in use) ISO639-1
|
||||
two-letter equivalents -- like "ara" -> "ar" -- because
|
||||
"ara" has I<never> been in use as an Internet language tag,
|
||||
and RFC 3066 stipulates that it never should be, since a shorter
|
||||
tag ("ar") exists.
|
||||
|
||||
Examples:
|
||||
|
||||
alternate_language_tags('no-bok') is ('nb')
|
||||
alternate_language_tags('nb') is ('no-bok')
|
||||
alternate_language_tags('he') is ('iw')
|
||||
alternate_language_tags('iw') is ('he')
|
||||
alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka')
|
||||
alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka')
|
||||
alternate_language_tags('en') is ()
|
||||
alternate_language_tags('x-mingo-tom') is ('i-mingo-tom')
|
||||
alternate_language_tags('x-klikitat') is ('i-klikitat')
|
||||
alternate_language_tags('i-klikitat') is ('x-klikitat')
|
||||
|
||||
This function returns empty-list if given anything other than a formally
|
||||
valid language tag.
|
||||
|
||||
=cut
|
||||
|
||||
my %alt = qw( i x x i I X X I );
|
||||
sub alternate_language_tags {
|
||||
my $tag = $_[0];
|
||||
return() unless &is_language_tag($tag);
|
||||
|
||||
my @em; # push 'em real goood!
|
||||
|
||||
# For the moment, these legacy variances are few enough that
|
||||
# we can just handle them here with regexps.
|
||||
|
||||
if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1";
|
||||
} elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1";
|
||||
|
||||
} elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1";
|
||||
} elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1";
|
||||
|
||||
} elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1";
|
||||
} elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1";
|
||||
|
||||
} elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1";
|
||||
} elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1";
|
||||
|
||||
} elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1";
|
||||
} elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1";
|
||||
|
||||
} elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1";
|
||||
} elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1";
|
||||
|
||||
} elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1";
|
||||
} elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1";
|
||||
|
||||
} elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1";
|
||||
} elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1";
|
||||
}
|
||||
|
||||
push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/;
|
||||
return @em;
|
||||
}
|
||||
|
||||
###########################################################################
|
||||
|
||||
{
|
||||
# Init %Panic...
|
||||
|
||||
my @panic = ( # MUST all be lowercase!
|
||||
# Only large ("national") languages make it in this list.
|
||||
# If you, as a user, are so bizarre that the /only/ language
|
||||
# you claim to accept is Galician, then no, we won't do you
|
||||
# the favor of providing Catalan as a panic-fallback for
|
||||
# you. Because if I start trying to add "little languages" in
|
||||
# here, I'll just go crazy.
|
||||
|
||||
# Scandinavian lgs. All based on opinion and hearsay.
|
||||
'sv' => [qw(nb no da nn)],
|
||||
'da' => [qw(nb no sv nn)], # I guess
|
||||
[qw(no nn nb)], [qw(no nn nb sv da)],
|
||||
'is' => [qw(da sv no nb nn)],
|
||||
'fo' => [qw(da is no nb nn sv)], # I guess
|
||||
|
||||
# I think this is about the extent of tolerable intelligibility
|
||||
# among large modern Romance languages.
|
||||
'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French
|
||||
'ca' => [qw(es pt it fr)],
|
||||
'es' => [qw(ca it fr pt)],
|
||||
'it' => [qw(es fr ca pt)],
|
||||
'fr' => [qw(es it ca pt)],
|
||||
|
||||
# Also assume that speakers of the main Indian languages prefer
|
||||
# to read/hear Hindi over English
|
||||
[qw(
|
||||
as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur
|
||||
)] => 'hi',
|
||||
# Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri,
|
||||
# Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya,
|
||||
# Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu.
|
||||
'hi' => [qw(bn pa as or)],
|
||||
# I welcome finer data for the other Indian languages.
|
||||
# E.g., what should Oriya's list be, besides just Hindi?
|
||||
|
||||
# And the panic languages for English is, of course, nil!
|
||||
|
||||
# My guesses at Slavic intelligibility:
|
||||
([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian
|
||||
([qw(sr hr bs)]) x 2, # Serbian, Croatian, Bosnian
|
||||
'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak
|
||||
|
||||
'ms' => 'id', 'id' => 'ms', # Malay + Indonesian
|
||||
|
||||
'et' => 'fi', 'fi' => 'et', # Estonian + Finnish
|
||||
|
||||
#?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai
|
||||
|
||||
);
|
||||
my($k,$v);
|
||||
while(@panic) {
|
||||
($k,$v) = splice(@panic,0,2);
|
||||
foreach my $k (ref($k) ? @$k : $k) {
|
||||
foreach my $v (ref($v) ? @$v : $v) {
|
||||
push @{$Panic{$k} ||= []}, $v unless $k eq $v;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
=item * the function @langs = panic_languages(@accept_languages)
|
||||
|
||||
This function takes a list of 0 or more language
|
||||
tags that constitute a given user's Accept-Language list, and
|
||||
returns a list of tags for I<other> (non-super)
|
||||
languages that are probably acceptable to the user, to be
|
||||
used I<if all else fails>.
|
||||
|
||||
For example, if a user accepts only 'ca' (Catalan) and
|
||||
'es' (Spanish), and the documents/interfaces you have
|
||||
available are just in German, Italian, and Chinese, then
|
||||
the user will most likely want the Italian one (and not
|
||||
the Chinese or German one!), instead of getting
|
||||
nothing. So C<panic_languages('ca', 'es')> returns
|
||||
a list containing 'it' (Italian).
|
||||
|
||||
English ('en') is I<always> in the return list, but
|
||||
whether it's at the very end or not depends
|
||||
on the input languages. This function works by consulting
|
||||
an internal table that stipulates what common
|
||||
languages are "close" to each other.
|
||||
|
||||
A useful construct you might consider using is:
|
||||
|
||||
@fallbacks = super_languages(@accept_languages);
|
||||
push @fallbacks, panic_languages(
|
||||
@accept_languages, @fallbacks,
|
||||
);
|
||||
|
||||
=cut
|
||||
|
||||
sub panic_languages {
|
||||
# When in panic or in doubt, run in circles, scream, and shout!
|
||||
my(@out, %seen);
|
||||
foreach my $t (@_) {
|
||||
next unless $t;
|
||||
next if $seen{$t}++; # so we don't return it or hit it again
|
||||
# push @out, super_languages($t); # nah, keep that separate
|
||||
push @out, @{ $Panic{lc $t} || next };
|
||||
}
|
||||
return grep !$seen{$_}++, @out, 'en';
|
||||
}
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
=item * the function implicate_supers( ...languages... )
|
||||
|
||||
This takes a list of strings (which are presumed to be language-tags;
|
||||
strings that aren't, are ignored); and after each one, this function
|
||||
inserts super-ordinate forms that don't already appear in the list.
|
||||
The original list, plus these insertions, is returned.
|
||||
|
||||
In other words, it takes this:
|
||||
|
||||
pt-br de-DE en-US fr pt-br-janeiro
|
||||
|
||||
and returns this:
|
||||
|
||||
pt-br pt de-DE de en-US en fr pt-br-janeiro
|
||||
|
||||
This function is most useful in the idiom
|
||||
|
||||
implicate_supers( I18N::LangTags::Detect::detect() );
|
||||
|
||||
(See L<I18N::LangTags::Detect>.)
|
||||
|
||||
|
||||
=item * the function implicate_supers_strictly( ...languages... )
|
||||
|
||||
This works like C<implicate_supers> except that the implicated
|
||||
forms are added to the end of the return list.
|
||||
|
||||
In other words, implicate_supers_strictly takes a list of strings
|
||||
(which are presumed to be language-tags; strings that aren't, are
|
||||
ignored) and after the whole given list, it inserts the super-ordinate forms
|
||||
of all given tags, minus any tags that already appear in the input list.
|
||||
|
||||
In other words, it takes this:
|
||||
|
||||
pt-br de-DE en-US fr pt-br-janeiro
|
||||
|
||||
and returns this:
|
||||
|
||||
pt-br de-DE en-US fr pt-br-janeiro pt de en
|
||||
|
||||
The reason this function has "_strictly" in its name is that when
|
||||
you're processing an Accept-Language list according to the RFCs, if
|
||||
you interpret the RFCs quite strictly, then you would use
|
||||
implicate_supers_strictly, but for normal use (i.e., common-sense use,
|
||||
as far as I'm concerned) you'd use implicate_supers.
|
||||
|
||||
=cut
|
||||
|
||||
sub implicate_supers {
|
||||
my @languages = grep is_language_tag($_), @_;
|
||||
my %seen_encoded;
|
||||
foreach my $lang (@languages) {
|
||||
$seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1
|
||||
}
|
||||
|
||||
my(@output_languages);
|
||||
foreach my $lang (@languages) {
|
||||
push @output_languages, $lang;
|
||||
foreach my $s ( I18N::LangTags::super_languages($lang) ) {
|
||||
# Note that super_languages returns the longest first.
|
||||
last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) };
|
||||
push @output_languages, $s;
|
||||
}
|
||||
}
|
||||
return uniq( @output_languages );
|
||||
|
||||
}
|
||||
|
||||
sub implicate_supers_strictly {
|
||||
my @tags = grep is_language_tag($_), @_;
|
||||
return uniq( @_, map super_languages($_), @_ );
|
||||
}
|
||||
|
||||
|
||||
|
||||
###########################################################################
|
||||
1;
|
||||
__END__
|
||||
|
||||
=back
|
||||
|
||||
=head1 ABOUT LOWERCASING
|
||||
|
||||
I've considered making all the above functions that output language
|
||||
tags return all those tags strictly in lowercase. Having all your
|
||||
language tags in lowercase does make some things easier. But you
|
||||
might as well just lowercase as you like, or call
|
||||
C<encode_language_tag($lang1)> where appropriate.
|
||||
|
||||
=head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS
|
||||
|
||||
In some future version of I18N::LangTags, I plan to include support
|
||||
for RFC2482-style language tags -- which are basically just normal
|
||||
language tags with their ASCII characters shifted into Plane 14.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
* L<I18N::LangTags::List|I18N::LangTags::List>
|
||||
|
||||
* RFC 3066, C<L<http://www.ietf.org/rfc/rfc3066.txt>>, "Tags for the
|
||||
Identification of Languages". (Obsoletes RFC 1766)
|
||||
|
||||
* RFC 2277, C<L<http://www.ietf.org/rfc/rfc2277.txt>>, "IETF Policy on
|
||||
Character Sets and Languages".
|
||||
|
||||
* RFC 2231, C<L<http://www.ietf.org/rfc/rfc2231.txt>>, "MIME Parameter
|
||||
Value and Encoded Word Extensions: Character Sets, Languages, and
|
||||
Continuations".
|
||||
|
||||
* RFC 2482, C<L<http://www.ietf.org/rfc/rfc2482.txt>>,
|
||||
"Language Tagging in Unicode Plain Text".
|
||||
|
||||
* Locale::Codes, in
|
||||
C<L<http://www.perl.com/CPAN/modules/by-module/Locale/>>
|
||||
|
||||
* ISO 639-2, "Codes for the representation of names of languages",
|
||||
including two-letter and three-letter codes,
|
||||
C<L<http://www.loc.gov/standards/iso639-2/php/code_list.php>>
|
||||
|
||||
* The IANA list of registered languages (hopefully up-to-date),
|
||||
C<L<http://www.iana.org/assignments/language-tags>>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (c) 1998+ Sean M. Burke. All rights reserved.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
The programs and documentation in this dist are distributed in
|
||||
the hope that they will be useful, but without any warranty; without
|
||||
even the implied warranty of merchantability or fitness for a
|
||||
particular purpose.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Sean M. Burke C<sburke@cpan.org>
|
||||
|
||||
=cut
|
||||
|
||||
242
database/perl/lib/I18N/LangTags/Detect.pm
Normal file
242
database/perl/lib/I18N/LangTags/Detect.pm
Normal file
@@ -0,0 +1,242 @@
|
||||
|
||||
# Time-stamp: "2004-06-20 21:47:55 ADT"
|
||||
|
||||
require 5;
|
||||
package I18N::LangTags::Detect;
|
||||
use strict;
|
||||
|
||||
our ( $MATCH_SUPERS, $USING_LANGUAGE_TAGS,
|
||||
$USE_LITERALS, $MATCH_SUPERS_TIGHTLY);
|
||||
|
||||
BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } }
|
||||
# define the constant 'DEBUG' at compile-time
|
||||
|
||||
our $VERSION = "1.08";
|
||||
our @ISA = ();
|
||||
use I18N::LangTags qw(alternate_language_tags locale2language_tag);
|
||||
|
||||
sub _uniq { my %seen; return grep(!($seen{$_}++), @_); }
|
||||
sub _normalize {
|
||||
my(@languages) =
|
||||
map lc($_),
|
||||
grep $_,
|
||||
map {; $_, alternate_language_tags($_) } @_;
|
||||
return _uniq(@languages) if wantarray;
|
||||
return $languages[0];
|
||||
}
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
# The extent of our functional interface:
|
||||
|
||||
sub detect () { return __PACKAGE__->ambient_langprefs; }
|
||||
|
||||
#===========================================================================
|
||||
|
||||
sub ambient_langprefs { # always returns things untainted
|
||||
my $base_class = $_[0];
|
||||
|
||||
return $base_class->http_accept_langs
|
||||
if length( $ENV{'REQUEST_METHOD'} || '' ); # I'm a CGI
|
||||
# it's off in its own routine because it's complicated
|
||||
|
||||
# Not running as a CGI: try to puzzle out from the environment
|
||||
my @languages;
|
||||
|
||||
foreach my $envname (qw( LANGUAGE LC_ALL LC_MESSAGES LANG )) {
|
||||
next unless $ENV{$envname};
|
||||
DEBUG and print "Noting \$$envname: $ENV{$envname}\n";
|
||||
push @languages,
|
||||
map locale2language_tag($_),
|
||||
# if it's a lg tag, fine, pass thru (untainted)
|
||||
# if it's a locale ID, try converting to a lg tag (untainted),
|
||||
# otherwise nix it.
|
||||
|
||||
split m/[,:]/,
|
||||
$ENV{$envname}
|
||||
;
|
||||
last; # first one wins
|
||||
}
|
||||
|
||||
if($ENV{'IGNORE_WIN32_LOCALE'}) {
|
||||
# no-op
|
||||
} elsif(&_try_use('Win32::Locale')) {
|
||||
# If we have that module installed...
|
||||
push @languages, Win32::Locale::get_language() || ''
|
||||
if defined &Win32::Locale::get_language;
|
||||
}
|
||||
return _normalize @languages;
|
||||
}
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
|
||||
sub http_accept_langs {
|
||||
# Deal with HTTP "Accept-Language:" stuff. Hassle.
|
||||
# This code is more lenient than RFC 3282, which you must read.
|
||||
# Hm. Should I just move this into I18N::LangTags at some point?
|
||||
no integer;
|
||||
|
||||
my $in = (@_ > 1) ? $_[1] : $ENV{'HTTP_ACCEPT_LANGUAGE'};
|
||||
# (always ends up untainting)
|
||||
|
||||
return() unless defined $in and length $in;
|
||||
|
||||
$in =~ s/\([^\)]*\)//g; # nix just about any comment
|
||||
|
||||
if( $in =~ m/^\s*([a-zA-Z][-a-zA-Z]+)\s*$/s ) {
|
||||
# Very common case: just one language tag
|
||||
return _normalize $1;
|
||||
} elsif( $in =~ m/^\s*[a-zA-Z][-a-zA-Z]+(?:\s*,\s*[a-zA-Z][-a-zA-Z]+)*\s*$/s ) {
|
||||
# Common case these days: just "foo, bar, baz"
|
||||
return _normalize( $in =~ m/([a-zA-Z][-a-zA-Z]+)/g );
|
||||
}
|
||||
|
||||
# Else it's complicated...
|
||||
|
||||
$in =~ s/\s+//g; # Yes, we can just do without the WS!
|
||||
my @in = $in =~ m/([^,]+)/g;
|
||||
my %pref;
|
||||
|
||||
my $q;
|
||||
foreach my $tag (@in) {
|
||||
next unless $tag =~
|
||||
m/^([a-zA-Z][-a-zA-Z]+)
|
||||
(?:
|
||||
;q=
|
||||
(
|
||||
\d* # a bit too broad of a RE, but so what.
|
||||
(?:
|
||||
\.\d+
|
||||
)?
|
||||
)
|
||||
)?
|
||||
$
|
||||
/sx
|
||||
;
|
||||
$q = (defined $2 and length $2) ? $2 : 1;
|
||||
#print "$1 with q=$q\n";
|
||||
push @{ $pref{$q} }, lc $1;
|
||||
}
|
||||
|
||||
return _normalize(
|
||||
# Read off %pref, in descending key order...
|
||||
map @{$pref{$_}},
|
||||
sort {$b <=> $a}
|
||||
keys %pref
|
||||
);
|
||||
}
|
||||
|
||||
#===========================================================================
|
||||
|
||||
my %tried = ();
|
||||
# memoization of whether we've used this module, or found it unusable.
|
||||
|
||||
sub _try_use { # Basically a wrapper around "require Modulename"
|
||||
# "Many men have tried..." "They tried and failed?" "They tried and died."
|
||||
return $tried{$_[0]} if exists $tried{$_[0]}; # memoization
|
||||
|
||||
my $module = $_[0]; # ASSUME sane module name!
|
||||
{ no strict 'refs';
|
||||
no warnings 'once';
|
||||
return($tried{$module} = 1)
|
||||
if %{$module . "::Lexicon"} or @{$module . "::ISA"};
|
||||
# weird case: we never use'd it, but there it is!
|
||||
}
|
||||
|
||||
print " About to use $module ...\n" if DEBUG;
|
||||
{
|
||||
local $SIG{'__DIE__'};
|
||||
local @INC = @INC;
|
||||
pop @INC if $INC[-1] eq '.';
|
||||
eval "require $module"; # used to be "use $module", but no point in that.
|
||||
}
|
||||
if($@) {
|
||||
print "Error using $module \: $@\n" if DEBUG > 1;
|
||||
return $tried{$module} = 0;
|
||||
} else {
|
||||
print " OK, $module is used\n" if DEBUG;
|
||||
return $tried{$module} = 1;
|
||||
}
|
||||
}
|
||||
|
||||
#---------------------------------------------------------------------------
|
||||
1;
|
||||
__END__
|
||||
|
||||
|
||||
=head1 NAME
|
||||
|
||||
I18N::LangTags::Detect - detect the user's language preferences
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use I18N::LangTags::Detect;
|
||||
my @user_wants = I18N::LangTags::Detect::detect();
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
It is a common problem to want to detect what language(s) the user would
|
||||
prefer output in.
|
||||
|
||||
=head1 FUNCTIONS
|
||||
|
||||
This module defines one public function,
|
||||
C<I18N::LangTags::Detect::detect()>. This function is not exported
|
||||
(nor is even exportable), and it takes no parameters.
|
||||
|
||||
In scalar context, the function returns the most preferred language
|
||||
tag (or undef if no preference was seen).
|
||||
|
||||
In list context (which is usually what you want),
|
||||
the function returns a
|
||||
(possibly empty) list of language tags representing (best first) what
|
||||
languages the user apparently would accept output in. You will
|
||||
probably want to pass the output of this through
|
||||
C<I18N::LangTags::implicate_supers_tightly(...)>
|
||||
or
|
||||
C<I18N::LangTags::implicate_supers(...)>, like so:
|
||||
|
||||
my @languages =
|
||||
I18N::LangTags::implicate_supers_tightly(
|
||||
I18N::LangTags::Detect::detect()
|
||||
);
|
||||
|
||||
|
||||
=head1 ENVIRONMENT
|
||||
|
||||
This module looks at several environment variables:
|
||||
REQUEST_METHOD, HTTP_ACCEPT_LANGUAGE,
|
||||
LANGUAGE, LC_ALL, LC_MESSAGES, and LANG.
|
||||
|
||||
It will also use the L<Win32::Locale> module, if it's installed
|
||||
and IGNORE_WIN32_LOCALE is not set to a true value in the
|
||||
environment.
|
||||
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<I18N::LangTags>, L<Win32::Locale>, L<Locale::Maketext>.
|
||||
|
||||
(This module's core code started out as a routine in Locale::Maketext;
|
||||
but I moved it here once I realized it was more generally useful.)
|
||||
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (c) 1998-2004 Sean M. Burke. All rights reserved.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
|
||||
The programs and documentation in this dist are distributed in
|
||||
the hope that they will be useful, but without any warranty; without
|
||||
even the implied warranty of merchantability or fitness for a
|
||||
particular purpose.
|
||||
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Sean M. Burke C<sburke@cpan.org>
|
||||
|
||||
=cut
|
||||
|
||||
# a tip: Put a bit of chopped up pickled ginger in your salad. It's tasty!
|
||||
1779
database/perl/lib/I18N/LangTags/List.pm
Normal file
1779
database/perl/lib/I18N/LangTags/List.pm
Normal file
File diff suppressed because it is too large
Load Diff
291
database/perl/lib/I18N/Langinfo.pm
Normal file
291
database/perl/lib/I18N/Langinfo.pm
Normal file
@@ -0,0 +1,291 @@
|
||||
package I18N::Langinfo;
|
||||
|
||||
use 5.006;
|
||||
use strict;
|
||||
use warnings;
|
||||
use Carp;
|
||||
|
||||
require Exporter;
|
||||
require XSLoader;
|
||||
|
||||
our @ISA = qw(Exporter);
|
||||
|
||||
our @EXPORT = qw(langinfo);
|
||||
|
||||
our @EXPORT_OK = qw(
|
||||
ABDAY_1
|
||||
ABDAY_2
|
||||
ABDAY_3
|
||||
ABDAY_4
|
||||
ABDAY_5
|
||||
ABDAY_6
|
||||
ABDAY_7
|
||||
ABMON_1
|
||||
ABMON_10
|
||||
ABMON_11
|
||||
ABMON_12
|
||||
ABMON_2
|
||||
ABMON_3
|
||||
ABMON_4
|
||||
ABMON_5
|
||||
ABMON_6
|
||||
ABMON_7
|
||||
ABMON_8
|
||||
ABMON_9
|
||||
ALT_DIGITS
|
||||
AM_STR
|
||||
CODESET
|
||||
CRNCYSTR
|
||||
DAY_1
|
||||
DAY_2
|
||||
DAY_3
|
||||
DAY_4
|
||||
DAY_5
|
||||
DAY_6
|
||||
DAY_7
|
||||
D_FMT
|
||||
D_T_FMT
|
||||
ERA
|
||||
ERA_D_FMT
|
||||
ERA_D_T_FMT
|
||||
ERA_T_FMT
|
||||
MON_1
|
||||
MON_10
|
||||
MON_11
|
||||
MON_12
|
||||
MON_2
|
||||
MON_3
|
||||
MON_4
|
||||
MON_5
|
||||
MON_6
|
||||
MON_7
|
||||
MON_8
|
||||
MON_9
|
||||
NOEXPR
|
||||
NOSTR
|
||||
PM_STR
|
||||
RADIXCHAR
|
||||
THOUSEP
|
||||
T_FMT
|
||||
T_FMT_AMPM
|
||||
YESEXPR
|
||||
YESSTR
|
||||
);
|
||||
|
||||
our $VERSION = '0.19';
|
||||
|
||||
XSLoader::load();
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
I18N::Langinfo - query locale information
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use I18N::Langinfo;
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
The langinfo() function queries various locale information that can be
|
||||
used to localize output and user interfaces. It uses the current underlying
|
||||
locale, regardless of whether or not it was called from within the scope of
|
||||
S<C<use locale>>. The langinfo() function requires
|
||||
one numeric argument that identifies the locale constant to query:
|
||||
if no argument is supplied, C<$_> is used. The numeric constants
|
||||
appropriate to be used as arguments are exportable from I18N::Langinfo.
|
||||
|
||||
The following example will import the langinfo() function itself and
|
||||
three constants to be used as arguments to langinfo(): a constant for
|
||||
the abbreviated first day of the week (the numbering starts from
|
||||
Sunday = 1) and two more constants for the affirmative and negative
|
||||
answers for a yes/no question in the current locale.
|
||||
|
||||
use I18N::Langinfo qw(langinfo ABDAY_1 YESSTR NOSTR);
|
||||
|
||||
my ($abday_1, $yesstr, $nostr) =
|
||||
map { langinfo($_) } (ABDAY_1, YESSTR, NOSTR);
|
||||
|
||||
print "$abday_1? [$yesstr/$nostr] ";
|
||||
|
||||
In other words, in the "C" (or English) locale the above will probably
|
||||
print something like:
|
||||
|
||||
Sun? [yes/no]
|
||||
|
||||
but under a French locale
|
||||
|
||||
dim? [oui/non]
|
||||
|
||||
The usually available constants are as follows.
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
For abbreviated and full length days of the week and months of the year:
|
||||
|
||||
ABDAY_1 ABDAY_2 ABDAY_3 ABDAY_4 ABDAY_5 ABDAY_6 ABDAY_7
|
||||
ABMON_1 ABMON_2 ABMON_3 ABMON_4 ABMON_5 ABMON_6
|
||||
ABMON_7 ABMON_8 ABMON_9 ABMON_10 ABMON_11 ABMON_12
|
||||
DAY_1 DAY_2 DAY_3 DAY_4 DAY_5 DAY_6 DAY_7
|
||||
MON_1 MON_2 MON_3 MON_4 MON_5 MON_6
|
||||
MON_7 MON_8 MON_9 MON_10 MON_11 MON_12
|
||||
|
||||
=item *
|
||||
|
||||
For the date-time, date, and time formats used by the strftime() function
|
||||
(see L<POSIX>):
|
||||
|
||||
D_T_FMT D_FMT T_FMT
|
||||
|
||||
=item *
|
||||
|
||||
For the locales for which it makes sense to have ante meridiem and post
|
||||
meridiem time formats:
|
||||
|
||||
AM_STR PM_STR T_FMT_AMPM
|
||||
|
||||
=item *
|
||||
|
||||
For the character code set being used (such as "ISO8859-1", "cp850",
|
||||
"koi8-r", "sjis", "utf8", etc.), and for the currency string:
|
||||
|
||||
CODESET CRNCYSTR
|
||||
|
||||
=item *
|
||||
|
||||
For an alternate representation of digits, for the
|
||||
radix character used between the integer and the fractional part
|
||||
of decimal numbers, the group separator string for large-ish floating point
|
||||
numbers (yes, the final two are redundant with
|
||||
L<POSIX::localeconv()|POSIX/localeconv>):
|
||||
|
||||
ALT_DIGITS RADIXCHAR THOUSEP
|
||||
|
||||
=item *
|
||||
|
||||
For the affirmative and negative responses and expressions:
|
||||
|
||||
YESSTR YESEXPR NOSTR NOEXPR
|
||||
|
||||
=item *
|
||||
|
||||
For the eras based on typically some ruler, such as the Japanese Emperor
|
||||
(naturally only defined in the appropriate locales):
|
||||
|
||||
ERA ERA_D_FMT ERA_D_T_FMT ERA_T_FMT
|
||||
|
||||
=back
|
||||
|
||||
=head2 For systems without C<nl_langinfo>
|
||||
|
||||
Starting in Perl 5.28, this module is available even on systems that lack a
|
||||
native C<nl_langinfo>. On such systems, it uses various methods to construct
|
||||
what that function, if present, would return. But there are potential
|
||||
glitches. These are the items that could be different:
|
||||
|
||||
=over
|
||||
|
||||
=item C<ERA>
|
||||
|
||||
Unimplemented, so returns C<"">.
|
||||
|
||||
=item C<CODESET>
|
||||
|
||||
Unimplemented, except on Windows, due to the vagaries of vendor locale names,
|
||||
returning C<""> on non-Windows.
|
||||
|
||||
=item C<YESEXPR>
|
||||
|
||||
=item C<YESSTR>
|
||||
|
||||
=item C<NOEXPR>
|
||||
|
||||
=item C<NOSTR>
|
||||
|
||||
Only the values for English are returned. C<YESSTR> and C<NOSTR> have been
|
||||
removed from POSIX 2008, and are retained here for backwards compatibility.
|
||||
Your platform's C<nl_langinfo> may not support them.
|
||||
|
||||
=item C<D_FMT>
|
||||
|
||||
Always evaluates to C<%x>, the locale's appropriate date representation.
|
||||
|
||||
=item C<T_FMT>
|
||||
|
||||
Always evaluates to C<%X>, the locale's appropriate time representation.
|
||||
|
||||
=item C<D_T_FMT>
|
||||
|
||||
Always evaluates to C<%c>, the locale's appropriate date and time
|
||||
representation.
|
||||
|
||||
=item C<CRNCYSTR>
|
||||
|
||||
The return may be incorrect for those rare locales where the currency symbol
|
||||
replaces the radix character.
|
||||
Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
|
||||
to work differently.
|
||||
|
||||
=item C<ALT_DIGITS>
|
||||
|
||||
Currently this gives the same results as Linux does.
|
||||
Send email to L<mailto:perlbug@perl.org> if you have examples of it needing
|
||||
to work differently.
|
||||
|
||||
=item C<ERA_D_FMT>
|
||||
|
||||
=item C<ERA_T_FMT>
|
||||
|
||||
=item C<ERA_D_T_FMT>
|
||||
|
||||
=item C<T_FMT_AMPM>
|
||||
|
||||
These are derived by using C<strftime()>, and not all versions of that function
|
||||
know about them. C<""> is returned for these on such systems.
|
||||
|
||||
=back
|
||||
|
||||
See your L<nl_langinfo(3)> for more information about the available
|
||||
constants. (Often this means having to look directly at the
|
||||
F<langinfo.h> C header file.)
|
||||
|
||||
=head2 EXPORT
|
||||
|
||||
By default only the C<langinfo()> function is exported.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Before Perl 5.28, the returned values are unreliable for the C<RADIXCHAR> and
|
||||
C<THOUSEP> locale constants.
|
||||
|
||||
Starting in 5.28, changing locales on threaded builds is supported on systems
|
||||
that offer thread-safe locale functions. These include POSIX 2008 systems and
|
||||
Windows starting with Visual Studio 2005, and this module will work properly
|
||||
in such situations. However, on threaded builds on Windows prior to Visual
|
||||
Studio 2015, retrieving the items C<CRNCYSTR> and C<THOUSEP> can result in a
|
||||
race with a thread that has converted to use the global locale. It is quite
|
||||
uncommon for a thread to have done this. It would be possible to construct a
|
||||
workaround for this; patches welcome: see L<perlapi/switch_to_global_locale>.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<perllocale>, L<POSIX/localeconv>, L<POSIX/setlocale>, L<nl_langinfo(3)>.
|
||||
|
||||
The langinfo() function is just a wrapper for the C nl_langinfo() interface.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Jarkko Hietaniemi, E<lt>jhi@hut.fiE<gt>. Now maintained by Perl 5 porters.
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
Copyright 2001 by Jarkko Hietaniemi
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
Reference in New Issue
Block a user