Initial Commit

2025-12-03 16:38:10 +01:00
parent c5e26bf594
commit b732d8d4b5
17680 changed files with 5977495 additions and 2 deletions
--- a/database/perl/lib/Unicode/Collate/Locale.pm
+++ b/database/perl/lib/Unicode/Collate/Locale.pm
@@ -0,0 +1,554 @@
+package Unicode::Collate::Locale;
+
+use strict;
+use warnings;
+use Carp;
+use base qw(Unicode::Collate);
+
+our $VERSION = '1.29';
+
+# Extension appended to a %LocaleFile value to form a tailoring file name.
+my $PL_EXT  = '.pl';
+
+# Maps each supported locale name to the base name of its tailoring file
+# (the file is loaded by _fetchpl).  Locales listed in the qw() below use
+# their own name as the file base name.
+my %LocaleFile = map { ($_, $_) } qw(
+   af ar as az be bn ca cs cu cy da dsb ee eo es et fa fi fil fo gu
+   ha haw he hi hr hu hy ig is ja kk kl kn ko kok lkt ln lt lv
+   mk ml mr mt nb nn nso om or pa pl ro sa se si sk sl sq sr sv
+   ta te th tn to tr uk ur vi vo wae wo yo zh
+);
+# 'default' maps to an empty (false) file name, so no file is loaded for it.
+   $LocaleFile{'default'} = '';
+# aliases: these locales reuse the tailoring file of another locale
+   $LocaleFile{'bs'}      = 'hr';
+   $LocaleFile{'bs_Cyrl'} = 'sr';
+   $LocaleFile{'sr_Latn'} = 'hr';
+# short file names: the file base name differs from the locale name
+   $LocaleFile{'de__phonebook'}   = 'de_phone';
+   $LocaleFile{'de_AT_phonebook'} = 'de_at_ph';
+   $LocaleFile{'es__traditional'} = 'es_trad';
+   $LocaleFile{'fr_CA'}           = 'fr_ca';
+   $LocaleFile{'fi__phonebook'}   = 'fi_phone';
+   $LocaleFile{'si__dictionary'}  = 'si_dict';
+   $LocaleFile{'sv__reformed'}    = 'sv_refo';
+   $LocaleFile{'ug_Cyrl'}         = 'ug_cyrl';
+   $LocaleFile{'zh__big5han'}     = 'zh_big5';
+   $LocaleFile{'zh__gb2312han'}   = 'zh_gb';
+   $LocaleFile{'zh__pinyin'}      = 'zh_pin';
+   $LocaleFile{'zh__stroke'}      = 'zh_strk';
+   $LocaleFile{'zh__zhuyin'}      = 'zh_zhu';
+
+# Maps an abbreviated variant code (left column) to its full name (right
+# column).  _locale rewrites a trailing alias component of a locale name to
+# the full form before looking the name up in %LocaleFile.
+my %TypeAlias = qw(
+    phone     phonebook
+    phonebk   phonebook
+    dict      dictionary
+    reform    reformed
+    trad      traditional
+    big5      big5han
+    gb2312    gb2312han
+);
+
+# Resolve a user-supplied locale name to a key of %LocaleFile.
+#
+# The name is lower-cased, the separators '-', ' ' and '.' are turned into
+# '_', and a trailing variant alias (see %TypeAlias) is expanded.  If the
+# normalized name is not itself a known locale, it is split into language,
+# script (a four-letter component), region (a component shorter than four
+# letters) and variant, and progressively less specific combinations are
+# tried.  Returns the first name with a non-empty %LocaleFile entry, or
+# 'default' when the argument is false or nothing matches.
+sub _locale {
+    my $locale = shift;
+    return 'default' if !$locale;
+
+    $locale = lc $locale;
+    $locale =~ tr/\-\ \./_/;
+    # Expand an abbreviated variant code; any other last component is kept.
+    $locale =~ s/_([0-9a-z]+)\z/'_' . ($TypeAlias{$1} || $1)/e;
+    return $locale if $LocaleFile{$locale};
+
+    my ($lan, @parts) = split /_/, $locale;
+    my $scr = (@parts && length($parts[0]) == 4) ? ucfirst shift @parts : '';
+    my $reg = (@parts && length($parts[0]) <  4) ? uc      shift @parts : '';
+    my $var =  @parts                            ?         shift @parts : '';
+
+    # Candidates from most to least specific; the variant forms are only
+    # attempted when a variant component is present.
+    my @candidates = (
+        ( $var ne ''
+            ? (
+                "${lan}_${scr}_${reg}_$var",
+                "${lan}_${scr}__$var", # empty $scr should not be ${lan}__$var.
+                "${lan}_${reg}_$var",  # empty $reg may be ${lan}__$var.
+                "${lan}__$var",
+              )
+            : ()
+        ),
+        "${lan}_${scr}_${reg}",
+        "${lan}_${scr}",
+        "${lan}_${reg}",
+        $lan,
+    );
+
+    for my $name (@candidates) {
+        return $name if $LocaleFile{$name};
+    }
+    return 'default';
+}
+
+# Accessor: returns the 'accepted_locale' entry of the collator object,
+# i.e. the locale name that new() resolved via _locale().
+sub getlocale {
+    my ($self) = @_;
+    return $self->{accepted_locale};
+}
+
+# Accessor: returns the 'locale_version' entry of the collator object.
+# NOTE(review): presumably supplied by the locale's tailoring file - confirm.
+sub locale_version {
+    my ($self) = @_;
+    return $self->{locale_version};
+}
+
+# Load the tailoring data for an accepted locale name.
+#
+# Looks up the file base name in %LocaleFile, appends $PL_EXT and evaluates
+# "Unicode/Collate/Locale/<file>" with do().  Returns the value the file
+# evaluates to (new() uses it as a hash reference), or nothing when the
+# locale has no tailoring file (e.g. 'default').  Croaks when the file
+# cannot be found, or when it fails to compile or run.
+sub _fetchpl {
+    my $accepted = shift;
+    my $f = $LocaleFile{$accepted};
+    return if !$f;
+    $f .= $PL_EXT;
+
+    # A '/'-separated relative path allows do() to search @INC.
+    my $path = "Unicode/Collate/Locale/$f";
+
+    # do() yields undef both when the file is unreadable and when it fails
+    # to compile or dies; only the latter sets $@.  Capture $@ under a
+    # localized copy so that a stale error from an earlier eval is never
+    # misreported, and croak only after that local scope has ended.
+    my ($h, $err);
+    {
+        local $@;
+        $h   = do $path;
+        $err = $@;
+    }
+    if (!$h) {
+        croak "Unicode/Collate/Locale/$f can't be loaded: $err" if $err;
+        croak "Unicode/Collate/Locale/$f can't be found";
+    }
+    return $h;
+}
+
+# Constructor: Unicode::Collate::Locale->new(locale => $name, %tailoring).
+#
+# Resolves the 'locale' argument with _locale() and records the result as
+# 'accepted_locale'.  Croaks if a 'table' argument is given.  The locale's
+# tailoring data (from _fetchpl) is then merged into the arguments: keys the
+# caller did not supply are copied, a caller-supplied 'entry' is appended to
+# the locale's own 'entry', and any other clash croaks.  The merged
+# arguments are passed on to the parent class constructor.
+sub new {
+    my ($class, %hash) = @_;
+    $hash{accepted_locale} = _locale($hash{locale});
+
+    croak "your table can't be used with Unicode::Collate::Locale"
+        if exists $hash{table};
+
+    # No tailoring file (e.g. 'default') means there is nothing to merge.
+    my $tailoring = _fetchpl($hash{accepted_locale}) || {};
+    for my $k (keys %$tailoring) {
+        if (!exists $hash{$k}) {
+            $hash{$k} = $tailoring->{$k};
+            next;
+        }
+        croak "$k is reserved by $hash{locale}, can't be overwritten"
+            if $k ne 'entry';
+        # The locale's entries come first, then the caller's.
+        $hash{$k} = $tailoring->{$k} . $hash{$k};
+    }
+    return $class->SUPER::new(%hash);
+}
+
+1;
+__END__
+
+=head1 NAME
+
+Unicode::Collate::Locale - Linguistic tailoring for DUCET via Unicode::Collate
+
+=head1 SYNOPSIS
+
+  use Unicode::Collate::Locale;
+
+  #construct
+  $Collator = Unicode::Collate::Locale->
+      new(locale => $locale_name, %tailoring);
+
+  #sort
+  @sorted = $Collator->sort(@not_sorted);
+
+  #compare
+  $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
+
+B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
+according to Perl's Unicode support. See L<perlunicode>,
+L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
+Otherwise you can use C<preprocess> (cf. C<Unicode::Collate>)
+or should decode them before.
+
+=head1 DESCRIPTION
+
+This module provides linguistic tailoring for collation,
+taking advantage of C<Unicode::Collate>.
+
+=head2 Constructor
+
+The C<new> method returns a collator object.
+
+A parameter list for the constructor is a hash, which can include
+a special key C<locale> and its value (case-insensitive) standing
+for a Unicode base language code (two or three-letter).
+For example, C<Unicode::Collate::Locale-E<gt>new(locale =E<gt> 'ES')>
+returns a collator tailored for Spanish.
+
+C<$locale_name> may be suffixed with a Unicode script code (four-letter),
+a Unicode region (territory) code, a Unicode language variant code.
+These codes are case-insensitive, and separated with C<'_'> or C<'-'>.
+E.g. C<en_US> for English in USA,
+C<az_Cyrl> for Azerbaijani in the Cyrillic script,
+C<es_ES_traditional> for Spanish in Spain (Traditional).
+
+If C<$locale_name> is not available,
+fallback is selected in the following order:
+
+    1. language with a variant code
+    2. language with a script code
+    3. language with a region code
+    4. language
+    5. default
+
+Tailoring tags provided by C<Unicode::Collate> are allowed as long as
+they are not used for C<locale> support.  Esp. the C<table> tag
+is always untailorable, since it is reserved for DUCET.
+
+However C<entry> is allowed, even if it is used for C<locale> support,
+to add or override mappings.
+
+E.g. a collator for Spanish, which ignores diacritics and case difference
+(i.e. level 1), with reversed case ordering and no normalization.
+
+    Unicode::Collate::Locale->new(
+        level => 1,
+        locale => 'es',
+        upper_before_lower => 1,
+        normalization => undef
+    )
+
+Overriding a behavior already tailored by C<locale> is disallowed
+if such a tailoring is passed to C<new()>.
+
+    Unicode::Collate::Locale->new(
+        locale => 'da',
+        upper_before_lower => 0, # causes error as reserved by 'da'
+    )
+
+However C<change()> inherited from C<Unicode::Collate> allows
+such a tailoring that is reserved by C<locale>. Examples:
+
+    new(locale => 'fr_ca')->change(backwards => undef)
+    new(locale => 'da')->change(upper_before_lower => 0)
+    new(locale => 'ja')->change(overrideCJK => undef)
+
+=head2 Methods
+
+C<Unicode::Collate::Locale> is a subclass of C<Unicode::Collate>
+and methods other than C<new> are inherited from C<Unicode::Collate>.
+
+Here is a list of additional methods:
+
+=over 4
+
+=item C<$Collator-E<gt>getlocale>
+
+Returns the locale name that was accepted and is actually used for collation.
+If linguistic tailoring is not provided for a language code you passed
+(intentionally for some languages, or due to the incomplete implementation),
+this method returns a string C<'default'> meaning no special tailoring.
+
+=item C<$Collator-E<gt>locale_version>
+
+(Since Unicode::Collate::Locale 0.87)
+Returns the version number (perhaps C</\d\.\d\d/>) of the locale, as that
+of F<Locale/*.pl>.
+
+B<Note:> F<Locale/*.pl> that a collator uses should be identified by
+a combination of return values from C<getlocale> and C<locale_version>.
+
+=back
+
+=head2 A list of tailorable locales
+
+      locale name       description
+    --------------------------------------------------------------
+      af                Afrikaans
+      ar                Arabic
+      as                Assamese
+      az                Azerbaijani (Azeri)
+      be                Belarusian
+      bn                Bengali
+      bs                Bosnian (tailored as Croatian)
+      bs_Cyrl           Bosnian in Cyrillic (tailored as Serbian)
+      ca                Catalan
+      cs                Czech
+      cu                Church Slavic
+      cy                Welsh
+      da                Danish
+      de__phonebook     German (umlaut as 'ae', 'oe', 'ue')
+      de_AT_phonebook   Austrian German (umlaut primary greater)
+      dsb               Lower Sorbian
+      ee                Ewe
+      eo                Esperanto
+      es                Spanish
+      es__traditional   Spanish ('ch' and 'll' as a grapheme)
+      et                Estonian
+      fa                Persian
+      fi                Finnish (v and w are primary equal)
+      fi__phonebook     Finnish (v and w as separate characters)
+      fil               Filipino
+      fo                Faroese
+      fr_CA             Canadian French
+      gu                Gujarati
+      ha                Hausa
+      haw               Hawaiian
+      he                Hebrew
+      hi                Hindi
+      hr                Croatian
+      hu                Hungarian
+      hy                Armenian
+      ig                Igbo
+      is                Icelandic
+      ja                Japanese [1]
+      kk                Kazakh
+      kl                Kalaallisut
+      kn                Kannada
+      ko                Korean [2]
+      kok               Konkani
+      lkt               Lakota
+      ln                Lingala
+      lt                Lithuanian
+      lv                Latvian
+      mk                Macedonian
+      ml                Malayalam
+      mr                Marathi
+      mt                Maltese
+      nb                Norwegian Bokmal
+      nn                Norwegian Nynorsk
+      nso               Northern Sotho
+      om                Oromo
+      or                Oriya
+      pa                Punjabi
+      pl                Polish
+      ro                Romanian
+      sa                Sanskrit
+      se                Northern Sami
+      si                Sinhala
+      si__dictionary    Sinhala (U+0DA5 = U+0DA2,0DCA,0DA4)
+      sk                Slovak
+      sl                Slovenian
+      sq                Albanian
+      sr                Serbian
+      sr_Latn           Serbian in Latin (tailored as Croatian)
+      sv                Swedish (v and w are primary equal)
+      sv__reformed      Swedish (v and w as separate characters)
+      ta                Tamil
+      te                Telugu
+      th                Thai
+      tn                Tswana
+      to                Tonga
+      tr                Turkish
+      ug_Cyrl           Uyghur in Cyrillic
+      uk                Ukrainian
+      ur                Urdu
+      vi                Vietnamese
+      vo                Volapu"k
+      wae               Walser
+      wo                Wolof
+      yo                Yoruba
+      zh                Chinese
+      zh__big5han       Chinese (ideographs: big5 order)
+      zh__gb2312han     Chinese (ideographs: GB-2312 order)
+      zh__pinyin        Chinese (ideographs: pinyin order) [3]
+      zh__stroke        Chinese (ideographs: stroke order) [3]
+      zh__zhuyin        Chinese (ideographs: zhuyin order) [3]
+    --------------------------------------------------------------
+
+Locales according to the default UCA rules include
+am (Amharic) without C<[reorder Ethi]>,
+bg (Bulgarian) without C<[reorder Cyrl]>,
+chr (Cherokee) without C<[reorder Cher]>,
+de (German),
+en (English),
+fr (French),
+ga (Irish),
+id (Indonesian),
+it (Italian),
+ka (Georgian) without C<[reorder Geor]>,
+mn (Mongolian) without C<[reorder Cyrl Mong]>,
+ms (Malay),
+nl (Dutch),
+pt (Portuguese),
+ru (Russian) without C<[reorder Cyrl]>,
+sw (Swahili),
+zu (Zulu).
+
+B<Note>
+
+[1] ja: Ideographs are sorted in JIS X 0208 order.
+Fullwidth and halfwidth forms are identical to their regular form.
+The difference between hiragana and katakana is at the 4th level,
+the comparison also requires C<(variable =E<gt> 'Non-ignorable')>,
+and then C<katakana_before_hiragana> has no effect.
+
+[2] ko: Plenty of ideographs are sorted by their reading. Such
+an ideograph is primary (level 1) equal to, and secondary (level 2)
+greater than, the corresponding hangul syllable.
+
+[3] zh__pinyin, zh__stroke and zh__zhuyin: implemented alt='short',
+where a smaller number of ideographs are tailored.
+
+=head2 A list of variant codes and their aliases
+
+      variant code       alias
+    ------------------------------------------
+      dictionary         dict
+      phonebook          phone     phonebk
+      reformed           reform
+      traditional        trad
+    ------------------------------------------
+      big5han            big5
+      gb2312han          gb2312
+      pinyin
+      stroke
+      zhuyin
+    ------------------------------------------
+
+Note: 'pinyin' is Han in Latin, 'zhuyin' is Han in Bopomofo.
+
+=head1 INSTALL
+
+Installation of C<Unicode::Collate::Locale> requires F<Collate/Locale.pm>,
+F<Collate/Locale/*.pl>, F<Collate/CJK/*.pm> and F<Collate/allkeys.txt>.
+On building, C<Unicode::Collate::Locale> doesn't require
+any of F<data/*.txt>, F<gendata/*>, and F<mklocale>.
+Tests for C<Unicode::Collate::Locale> are named F<t/loc_*.t>.
+
+=head1 CAVEAT
+
+=over 4
+
+=item Tailoring is not maximum
+
+Even if a certain letter is tailored, its equivalents are not always
+tailored along with it. For example, even though W is tailored,
+fullwidth W (C<U+FF37>), W with acute (C<U+1E82>), etc. are not
+tailored. The result may depend on whether source strings are
+normalized or not, and whether decomposed or composed.
+Thus C<(normalization =E<gt> undef)> is less preferred.
+
+=item Collation reordering is not supported
+
+The order of any groups including scripts is not changed.
+
+=back
+
+=head2 Reference
+
+      locale            based CLDR or other reference
+    --------------------------------------------------------------------
+      af                30 = 1.8.1
+      ar                30 = 28 ("compat" wo [reorder Arab]) = 1.9.0
+      as                30 = 28 (without [reorder Beng..]) = 23
+      az                30 = 24 ("standard" wo [reorder Latn Cyrl])
+      be                30 = 28 (without [reorder Cyrl])
+      bn                30 = 28 ("standard" wo [reorder Beng..]) = 2.0.1
+      bs                30 = 28 (type="standard": [import hr])
+      bs_Cyrl           30 = 28 (type="standard": [import sr])
+      ca                30 = 23 (alt="proposed" type="standard")
+      cs                30 = 1.8.1 (type="standard")
+      cu                34 = 30 (without [reorder Cyrl])
+      cy                30 = 1.8.1
+      da                22.1 = 1.8.1 (type="standard")
+      de__phonebook     30 = 2.0 (type="phonebook")
+      de_AT_phonebook   30 = 27 (type="phonebook")
+      dsb               30 = 26
+      ee                30 = 21
+      eo                30 = 1.8.1
+      es                30 = 1.9.0 (type="standard")
+      es__traditional   30 = 1.8.1 (type="traditional")
+      et                30 = 26
+      fa                22.1 = 1.8.1
+      fi                22.1 = 1.8.1 (type="standard" alt="proposed")
+      fi__phonebook     22.1 = 1.8.1 (type="phonebook")
+      fil               30 = 1.9.0 (type="standard") = 1.8.1
+      fo                22.1 = 1.8.1 (alt="proposed" type="standard")
+      fr_CA             30 = 1.9.0
+      gu                30 = 28 ("standard" wo [reorder Gujr..]) = 1.9.0
+      ha                30 = 1.9.0
+      haw               30 = 24
+      he                30 = 28 (without [reorder Hebr]) = 23
+      hi                30 = 28 (without [reorder Deva..]) = 1.9.0
+      hr                30 = 28 ("standard" wo [reorder Latn Cyrl]) = 1.9.0
+      hu                22.1 = 1.8.1 (alt="proposed" type="standard")
+      hy                30 = 28 (without [reorder Armn]) = 1.8.1
+      ig                30 = 1.8.1
+      is                22.1 = 1.8.1 (type="standard")
+      ja                22.1 = 1.8.1 (type="standard")
+      kk                30 = 28 (without [reorder Cyrl])
+      kl                22.1 = 1.8.1 (type="standard")
+      kn                30 = 28 ("standard" wo [reorder Knda..]) = 1.9.0
+      ko                22.1 = 1.8.1 (type="standard")
+      kok               30 = 28 (without [reorder Deva..]) = 1.8.1
+      lkt               30 = 25
+      ln                30 = 2.0 (type="standard") = 1.8.1
+      lt                22.1 = 1.9.0
+      lv                22.1 = 1.9.0 (type="standard") = 1.8.1
+      mk                30 = 28 (without [reorder Cyrl])
+      ml                22.1 = 1.9.0
+      mr                30 = 28 (without [reorder Deva..]) = 1.8.1
+      mt                22.1 = 1.9.0
+      nb                22.1 = 2.0   (type="standard")
+      nn                22.1 = 2.0   (type="standard")
+      nso           [*] 26 = 1.8.1
+      om                22.1 = 1.8.1
+      or                30 = 28 (without [reorder Orya..]) = 1.9.0
+      pa                22.1 = 1.8.1
+      pl                30 = 1.8.1
+      ro                30 = 1.9.0 (type="standard")
+      sa            [*] 1.9.1 = 1.8.1 (type="standard" alt="proposed")
+      se                22.1 = 1.8.1 (type="standard")
+      si                30 = 28 ("standard" wo [reorder Sinh..]) = 1.9.0
+      si__dictionary    30 = 28 ("dictionary" wo [reorder Sinh..]) = 1.9.0
+      sk                22.1 = 1.9.0 (type="standard")
+      sl                22.1 = 1.8.1 (type="standard" alt="proposed")
+      sq                22.1 = 1.8.1 (alt="proposed" type="standard")
+      sr                30 = 28 (without [reorder Cyrl])
+      sr_Latn           30 = 28 (type="standard": [import hr])
+      sv                22.1 = 1.9.0 (type="standard")
+      sv__reformed      22.1 = 1.8.1 (type="reformed")
+      ta                22.1 = 1.9.0
+      te                30 = 28 (without [reorder Telu..]) = 1.9.0
+      th                22.1 = 22
+      tn            [*] 26 = 1.8.1
+      to                22.1 = 22
+      tr                22.1 = 1.8.1 (type="standard")
+      uk                30 = 28 (without [reorder Cyrl])
+      ug_Cyrl           https://en.wikipedia.org/wiki/Uyghur_Cyrillic_alphabet
+      ur                22.1 = 1.9.0
+      vi                22.1 = 1.8.1
+      vo                30 = 25
+      wae               30 = 2.0
+      wo            [*] 1.9.1 = 1.8.1
+      yo                30 = 1.8.1
+      zh                22.1 = 1.8.1 (type="standard")
+      zh__big5han       22.1 = 1.8.1 (type="big5han")
+      zh__gb2312han     22.1 = 1.8.1 (type="gb2312han")
+      zh__pinyin        22.1 = 2.0   (type='pinyin' alt='short')
+      zh__stroke        22.1 = 1.9.1 (type='stroke' alt='short')
+      zh__zhuyin        22.1 = 22    (type='zhuyin' alt='short')
+    --------------------------------------------------------------------
+
+[*] http://www.unicode.org/repos/cldr/tags/latest/seed/collation/
+
+=head1 AUTHOR
+
+The Unicode::Collate::Locale module for perl was written
+by SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>.
+This module is Copyright(C) 2004-2020, SADAHIRO Tomoyuki. Japan.
+All rights reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+=over 4
+
+=item Unicode Collation Algorithm - UTS #10
+
+L<http://www.unicode.org/reports/tr10/>
+
+=item The Default Unicode Collation Element Table (DUCET)
+
+L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
+
+=item Unicode Locale Data Markup Language (LDML) - UTS #35
+
+L<http://www.unicode.org/reports/tr35/>
+
+=item CLDR - Unicode Common Locale Data Repository
+
+L<http://cldr.unicode.org/>
+
+=item L<Unicode::Collate>
+
+=item L<Unicode::Normalize>
+
+=back
+
+=cut