Initial Commit
This commit is contained in:
599
database/perl/lib/unicore/Name.pm
Normal file
599
database/perl/lib/unicore/Name.pm
Normal file
@@ -0,0 +1,599 @@
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is machine-generated by ..\lib\unicore\mktables from the Unicode
# database, Version 13.0.0.  Any changes made here will be lost!

# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
# This file is for internal use by core Perl only.  The format and even the
# name or existence of this file are subject to change without notice.  Don't
# use it directly.  Use Unicode::UCD to access the Unicode character data
# base.

package charnames;

# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names.  The following
# routines can be used to translate between name and code point and vice
# versa:
#
#   name_to_code_point_special($name, $loose)
#   code_point_to_name_special($code_point)
#
# NOTE(review): because this module is machine-generated, the corrections
# made to the two routines below (anchored Hangul prefix, lexical $name)
# presumably also have to be made in the generator, or they will be lost the
# next time the file is regenerated.

{ # Closure

    use strict;

    # Match a legal code point: 4-6 hex digits.  If there are 6, the first
    # two must be "10"; if there are 5, the first must not be a 0.  Written
    # this way to decrease backtracking.  The first regex allows the code
    # point to be at the end of a word (used for loose matching, where the
    # separators have been squeezed out), but to work properly the word
    # shouldn't end with a valid hex character.  The second one won't match
    # a code point at the end of a word, and doesn't have the run-on issue.
    my $run_on_code_point_re = qr/(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b)/;
    my $code_point_re = qr/(?^aa:\b(?^aax: (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b))/;

    # In the following hash, the keys are the bases of names which include
    # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01.  The value
    # of each key is another hash which is used to get the low and high ends
    # for each range of code points that apply to the name.  'low' and 'high'
    # are parallel arrays, sorted in ascending order.
    my %names_ending_in_code_point = (
        'CJK COMPATIBILITY IDEOGRAPH' => {
            'low'  => [ 63744, 64112, 194560 ],
            'high' => [ 64109, 64217, 195101 ],
        },
        'CJK UNIFIED IDEOGRAPH' => {
            'low'  => [ 13312, 19968, 131072, 173824, 177984, 178208, 183984, 196608 ],
            'high' => [ 19903, 40956, 173789, 177972, 178205, 183969, 191456, 201546 ],
        },
        'KHITAN SMALL SCRIPT CHARACTER' => {
            'low'  => [ 101120 ],
            'high' => [ 101589 ],
        },
        'NUSHU CHARACTER' => {
            'low'  => [ 110960 ],
            'high' => [ 111355 ],
        },
        'TANGUT IDEOGRAPH' => {
            'low'  => [ 94208 ],
            'high' => [ 100343 ],
        },
        'TANGUT IDEOGRAPH SUPPLEMENT' => {
            'low'  => [ 101632 ],
            'high' => [ 101640 ],
        },
    );

    # The following hash is a copy of the previous one, except is for loose
    # matching, so each name has blanks and dashes squeezed out.
    my %loose_names_ending_in_code_point = (
        'CJKCOMPATIBILITYIDEOGRAPH' => {
            'low'  => [ 63744, 64112, 194560 ],
            'high' => [ 64109, 64217, 195101 ],
        },
        'CJKUNIFIEDIDEOGRAPH' => {
            'low'  => [ 13312, 19968, 131072, 173824, 177984, 178208, 183984, 196608 ],
            'high' => [ 19903, 40956, 173789, 177972, 178205, 183969, 191456, 201546 ],
        },
        'KHITANSMALLSCRIPTCHARACTER' => {
            'low'  => [ 101120 ],
            'high' => [ 101589 ],
        },
        'NUSHUCHARACTER' => {
            'low'  => [ 110960 ],
            'high' => [ 111355 ],
        },
        'TANGUTIDEOGRAPH' => {
            'low'  => [ 94208 ],
            'high' => [ 100343 ],
        },
        'TANGUTIDEOGRAPHSUPPLEMENT' => {
            'low'  => [ 101632 ],
            'high' => [ 101640 ],
        },
    );

    # And the following array gives the inverse mapping from code points to
    # names.  Lowest code points are first.  'legal' is the sorted set of
    # characters that may appear in a name of the series: newline, blank,
    # '-', the hex digits, and the letters of 'name'.
    # NOTE(review): the blank in each 'legal' string was reconstructed (every
    # 'name' contains blanks); confirm against freshly generated output.
    our @code_points_ending_in_code_point = (
        {
            'low'   => 13312,
            'high'  => 19903,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low'   => 19968,
            'high'  => 40956,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low'   => 63744,
            'high'  => 64109,
            'name'  => 'CJK COMPATIBILITY IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKLMOPRTY",
        },
        {
            'low'   => 64112,
            'high'  => 64217,
            'name'  => 'CJK COMPATIBILITY IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKLMOPRTY",
        },
        {
            'low'   => 94208,
            'high'  => 100343,
            'name'  => 'TANGUT IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHINOPRTU",
        },
        {
            'low'   => 101120,
            'high'  => 101589,
            'name'  => 'KHITAN SMALL SCRIPT CHARACTER',
            'legal' => "\n -0123456789ABCDEFHIKLMNPRST",
        },
        {
            'low'   => 101632,
            'high'  => 101640,
            'name'  => 'TANGUT IDEOGRAPH SUPPLEMENT',
            'legal' => "\n -0123456789ABCDEFGHILMNOPRSTU",
        },
        {
            'low'   => 110960,
            'high'  => 111355,
            'name'  => 'NUSHU CHARACTER',
            'legal' => "\n -0123456789ABCDEFHNRSTU",
        },
        {
            'low'   => 131072,
            'high'  => 173789,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low'   => 173824,
            'high'  => 177972,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low'   => 177984,
            'high'  => 178205,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low'   => 178208,
            'high'  => 183969,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low'   => 183984,
            'high'  => 191456,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
        {
            'low'   => 194560,
            'high'  => 195101,
            'name'  => 'CJK COMPATIBILITY IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKLMOPRTY",
        },
        {
            'low'   => 196608,
            'high'  => 201546,
            'name'  => 'CJK UNIFIED IDEOGRAPH',
            'legal' => "\n -0123456789ABCDEFGHIJKNOPRU",
        },
    );

    # Is exportable, make read-only
    Internals::SvREADONLY(@code_points_ending_in_code_point, 1);

    # Convert from code point to Jamo short name for use in composing Hangul
    # syllable names
    my %Jamo = (
        # Leading consonants; 4363 has an empty short name
        4352 => 'G',   4353 => 'GG',  4354 => 'N',   4355 => 'D',
        4356 => 'DD',  4357 => 'R',   4358 => 'M',   4359 => 'B',
        4360 => 'BB',  4361 => 'S',   4362 => 'SS',  4363 => '',
        4364 => 'J',   4365 => 'JJ',  4366 => 'C',   4367 => 'K',
        4368 => 'T',   4369 => 'P',   4370 => 'H',

        # Vowels
        4449 => 'A',   4450 => 'AE',  4451 => 'YA',  4452 => 'YAE',
        4453 => 'EO',  4454 => 'E',   4455 => 'YEO', 4456 => 'YE',
        4457 => 'O',   4458 => 'WA',  4459 => 'WAE', 4460 => 'OE',
        4461 => 'YO',  4462 => 'U',   4463 => 'WEO', 4464 => 'WE',
        4465 => 'WI',  4466 => 'YU',  4467 => 'EU',  4468 => 'YI',
        4469 => 'I',

        # Trailing consonants
        4520 => 'G',   4521 => 'GG',  4522 => 'GS',  4523 => 'N',
        4524 => 'NJ',  4525 => 'NH',  4526 => 'D',   4527 => 'L',
        4528 => 'LG',  4529 => 'LM',  4530 => 'LB',  4531 => 'LS',
        4532 => 'LT',  4533 => 'LP',  4534 => 'LH',  4535 => 'M',
        4536 => 'B',   4537 => 'BS',  4538 => 'S',   4539 => 'SS',
        4540 => 'NG',  4541 => 'J',   4542 => 'C',   4543 => 'K',
        4544 => 'T',   4545 => 'P',   4546 => 'H',
    );

    # Leading consonant (can be null)
    my %Jamo_L = (
        ''   => 11,
        'B'  => 7,   'BB' => 8,   'C'  => 14,  'D'  => 3,   'DD' => 4,
        'G'  => 0,   'GG' => 1,   'H'  => 18,  'J'  => 12,  'JJ' => 13,
        'K'  => 15,  'M'  => 6,   'N'  => 2,   'P'  => 17,  'R'  => 5,
        'S'  => 9,   'SS' => 10,  'T'  => 16,
    );

    # Vowel
    my %Jamo_V = (
        'A'   => 0,   'AE'  => 1,   'E'   => 5,   'EO'  => 4,
        'EU'  => 18,  'I'   => 20,  'O'   => 8,   'OE'  => 11,
        'U'   => 13,  'WA'  => 9,   'WAE' => 10,  'WE'  => 15,
        'WEO' => 14,  'WI'  => 16,  'YA'  => 2,   'YAE' => 3,
        'YE'  => 7,   'YEO' => 6,   'YI'  => 19,  'YO'  => 12,
        'YU'  => 17,
    );

    # Optional trailing consonant
    my %Jamo_T = (
        'B'  => 17,  'BS' => 18,  'C'  => 23,  'D'  => 7,   'G'  => 1,
        'GG' => 2,   'GS' => 3,   'H'  => 27,  'J'  => 22,  'K'  => 24,
        'L'  => 8,   'LB' => 11,  'LG' => 9,   'LH' => 15,  'LM' => 10,
        'LP' => 14,  'LS' => 12,  'LT' => 13,  'M'  => 16,  'N'  => 4,
        'NG' => 21,  'NH' => 6,   'NJ' => 5,   'P'  => 26,  'S'  => 19,
        'SS' => 20,  'T'  => 25,
    );

    # Computed re that splits up a Hangul name into LVT or LV syllables
    my $syllable_re = qr/(|B|BB|C|D|DD|G|GG|H|J|JJ|K|M|N|P|R|S|SS|T)(A|AE|E|EO|EU|I|O|OE|U|WA|WAE|WE|WEO|WI|YA|YAE|YE|YEO|YI|YO|YU)(B|BS|C|D|G|GG|GS|H|J|K|L|LB|LG|LH|LM|LP|LS|LT|M|N|NG|NH|NJ|P|S|SS|T)?/;

    my $HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
    my $loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";

    # These constants names and values were taken from the Unicode standard,
    # version 5.1, section 3.12.  They are used in conjunction with Hangul
    # syllables
    my $SBase = 0xAC00;
    my $LBase = 0x1100;
    my $VBase = 0x1161;
    my $TBase = 0x11A7;
    my $SCount = 11172;
    my $LCount = 19;
    my $VCount = 21;
    my $TCount = 28;
    my $NCount = $VCount * $TCount;

    sub name_to_code_point_special {
        my ($name, $loose) = @_;

        # Returns undef if not one of the specially handled names; otherwise
        # returns the code point equivalent to the input name.
        # $loose is non-zero if to use loose matching; 'name' in that case
        # must be input as upper case with all blanks and dashes squeezed
        # out.

        # A Hangul syllable name is the fixed prefix followed by the Jamo
        # short names.  The prefix is anchored to the start of the name so
        # that text preceding it (e.g. "GHANGUL SYLLABLE A") cannot be
        # spliced together with what follows it into a valid syllable.
        my $hangul_prefix = $loose ? $loose_HANGUL_SYLLABLE
                                   : $HANGUL_SYLLABLE;
        if ($name =~ s/^\Q$hangul_prefix\E//) {
            return if $name !~ /^$syllable_re$/;
            my $L = $Jamo_L{$1};
            my $V = $Jamo_V{$2};
            my $T = (defined $3) ? $Jamo_T{$3} : 0;   # No trailing consonant
            return ($L * $VCount + $V) * $TCount + $T + $SBase;
        }

        # Name must end in 'code_point' for this to handle.
        return if ((   $loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
                || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));

        my $base = $1;
        my $code_point = CORE::hex $2;
        my $names_ref;

        if ($loose) {
            $names_ref = \%loose_names_ending_in_code_point;
        }
        else {
            # In strict matching the code point is separated from the base
            # by a dash, which is not part of the hash key.
            return if $base !~ s/-$//;
            $names_ref = \%names_ending_in_code_point;
        }

        # Name must be one of the ones which has the code point in it.
        my $ranges = $names_ref->{$base} or return;

        # Look through the list of ranges that apply to this name to see if
        # the code point is in one of them.  The ranges are sorted, so can
        # stop as soon as one starts above the code point.
        for my $i (0 .. $#{ $ranges->{'low'} }) {
            return if $ranges->{'low'}[$i] > $code_point;
            next if $ranges->{'high'}[$i] < $code_point;

            # Here, the code point is in the range.
            return $code_point;
        }

        # Here, looked like the name had a code point number in it, but
        # did not match one of the valid ones.
        return;
    }

    sub code_point_to_name_special {
        my $code_point = shift;

        # Returns the name of a code point if algorithmically determinable;
        # undef if not

        # If in the Hangul range, calculate the name based on Unicode's
        # algorithm
        if ($code_point >= $SBase && $code_point <= $SBase + $SCount - 1) {
            use integer;
            my $SIndex = $code_point - $SBase;
            my $L = $LBase + $SIndex / $NCount;
            my $V = $VBase + ($SIndex % $NCount) / $TCount;
            my $T = $TBase + $SIndex % $TCount;

            # $name is lexical so that the package variable of the same
            # name is not written as a side effect of the call.
            my $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
            $name .= $Jamo{$T} if $T != $TBase;
            return $name;
        }

        # Look through list of these code points for one in range.  The
        # list is sorted, so can stop at the first range that starts above
        # the code point.
        foreach my $hash (@code_points_ending_in_code_point) {
            return if $code_point < $hash->{'low'};
            if ($code_point <= $hash->{'high'}) {
                return sprintf("%s-%04X", $hash->{'name'}, $code_point);
            }
        }
        return;            # None found
    }
} # End closure

1;
|
||||
Reference in New Issue
Block a user