Initial Commit
This commit is contained in:
60
database/perl/vendor/lib/Unicode/GCString.pm
vendored
Normal file
60
database/perl/vendor/lib/Unicode/GCString.pm
vendored
Normal file
@@ -0,0 +1,60 @@
|
||||
#-*-perl-*-
|
||||
|
||||
package Unicode::GCString;
|
||||
require 5.008;
|
||||
|
||||
=encoding utf-8
|
||||
|
||||
=cut
|
||||
|
||||
### Pragmas:
|
||||
use strict;
|
||||
use warnings;
|
||||
use vars qw($VERSION @EXPORT_OK @ISA);
|
||||
|
||||
### Exporting:
|
||||
use Exporter;
|
||||
our @EXPORT_OK = qw();
|
||||
our %EXPORT_TAGS = ('all' => [@EXPORT_OK]);
|
||||
|
||||
### Inheritance:
|
||||
our @ISA = qw(Exporter);
|
||||
|
||||
### Other modules:
|
||||
use Unicode::LineBreak;
|
||||
|
||||
### Globals
|
||||
|
||||
# The package version
|
||||
our $VERSION = '2013.10';
|
||||
|
||||
use overload
|
||||
'@{}' => \&as_arrayref,
|
||||
'${}' => \&as_scalarref,
|
||||
'""' => \&as_string,
|
||||
'.' => \&concat,
|
||||
#XXX'.=' => \&concat, #FIXME:segfault
|
||||
'cmp' => \&cmp,
|
||||
'<>' => \&next,
|
||||
;
|
||||
|
||||
# Constructor.
#
#   Unicode::GCString->new(STRING)
#   Unicode::GCString->new(STRING, LINEBREAK)
#   Unicode::GCString->new(STRING, KEY => VALUE, ...)
#
# With at most two arguments after the class name, the arguments are handed
# to _new() unchanged.  With more, everything after STRING is treated as
# options for a fresh Unicode::LineBreak object, which is then passed to
# _new() together with STRING.
#
# NOTE(review): _new() is not defined in this file; presumably it is supplied
# by the XS code -- confirm.  @_ is deliberately forwarded as-is in the short
# form so the callee sees the caller's original arguments.
#
# Returns the object produced by _new(), blessed into the invoking class.
sub new {
    my $class = shift;

    # Short form: STRING alone, or STRING plus a ready-made LINEBREAK object.
    return bless __PACKAGE__->_new(@_), $class
        if @_ <= 2;

    # Long form: STRING followed by KEY => VALUE pairs.
    my $string    = shift;
    my $linebreak = Unicode::LineBreak->new(@_);
    return bless __PACKAGE__->_new($string, $linebreak), $class;
}
|
||||
|
||||
# Instance method, also used for the '@{}' overload.
# Calls as_array() on the invocant in list context and returns a reference
# to a new array holding the returned elements.
sub as_arrayref {
    my $self = shift;
    return [ $self->as_array ];
}
|
||||
|
||||
1;
|
||||
304
database/perl/vendor/lib/Unicode/GCString.pod
vendored
Normal file
304
database/perl/vendor/lib/Unicode/GCString.pod
vendored
Normal file
@@ -0,0 +1,304 @@
|
||||
=encoding utf-8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Unicode::GCString - String as Sequence of UAX #29 Grapheme Clusters
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Unicode::GCString;
|
||||
$gcstring = Unicode::GCString->new($string);
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Unicode::GCString treats Unicode string as a sequence of
|
||||
I<extended grapheme clusters> defined by Unicode Standard Annex #29 [UAX #29].
|
||||
|
||||
B<Grapheme cluster> is a sequence of Unicode character(s) that consists of one
|
||||
B<grapheme base> and optional B<grapheme extender> and/or
|
||||
B<“prepend” character>. It is close to what people consider as a I<character>.
|
||||
|
||||
=head2 Public Interface
|
||||
|
||||
=head3 Constructors
|
||||
|
||||
=over 4
|
||||
|
||||
=item new (STRING, [KEY => VALUE, ...])
|
||||
|
||||
=item new (STRING, [LINEBREAK])
|
||||
|
||||
I<Constructor>.
|
||||
Create new grapheme cluster string (Unicode::GCString object) from
|
||||
Unicode string STRING.
|
||||
|
||||
About optional KEY => VALUE pairs see L<Unicode::LineBreak/Options>.
|
||||
On second form, L<Unicode::LineBreak> object LINEBREAK controls
|
||||
breaking features.
|
||||
|
||||
B<Note>:
|
||||
The first form was introduced by release 2012.10.
|
||||
|
||||
=item copy
|
||||
|
||||
I<Copy constructor>.
|
||||
Create a copy of grapheme cluster string.
|
||||
Next position of new string is set at beginning.
|
||||
|
||||
=back
|
||||
|
||||
=head3 Sizes
|
||||
|
||||
=over 4
|
||||
|
||||
=item chars
|
||||
|
||||
I<Instance method>.
|
||||
Returns number of Unicode characters grapheme cluster string includes,
|
||||
i.e. length as Unicode string.
|
||||
|
||||
=item columns
|
||||
|
||||
I<Instance method>.
|
||||
Returns total number of columns of grapheme clusters
|
||||
defined by built-in character database.
|
||||
For more details see L<Unicode::LineBreak/DESCRIPTION>.
|
||||
|
||||
=item length
|
||||
|
||||
I<Instance method>.
|
||||
Returns number of grapheme clusters contained in grapheme cluster string.
|
||||
|
||||
=back
|
||||
|
||||
=head3 Operations as String
|
||||
|
||||
=over 4
|
||||
|
||||
=item as_string
|
||||
|
||||
=item C<">OBJECTC<">
|
||||
|
||||
I<Instance method>.
|
||||
Convert grapheme cluster string to Unicode string explicitly.
|
||||
|
||||
=item cmp (STRING)
|
||||
|
||||
=item STRING C<cmp> STRING
|
||||
|
||||
I<Instance method>.
|
||||
Compare strings. There are no oddities.
|
||||
One of each STRING may be Unicode string.
|
||||
|
||||
=item concat (STRING)
|
||||
|
||||
=item STRING C<.> STRING
|
||||
|
||||
I<Instance method>.
|
||||
Concatenate STRINGs. One of each STRING may be Unicode string.
|
||||
Note that number of columns (see columns()) or grapheme clusters
|
||||
(see length()) of resulting string is not always equal to sum of both
|
||||
strings.
|
||||
Next position of new string is that set on the left value.
|
||||
|
||||
=item join ([STRING, ...])
|
||||
|
||||
I<Instance method>.
|
||||
Join STRINGs inserting grapheme cluster string.
|
||||
Any of STRINGs may be Unicode string.
|
||||
|
||||
=item substr (OFFSET, [LENGTH, [REPLACEMENT]])
|
||||
|
||||
I<Instance method>.
|
||||
Returns substring of grapheme cluster string.
|
||||
OFFSET and LENGTH are based on grapheme clusters.
|
||||
If REPLACEMENT is specified, substring is replaced by it.
|
||||
REPLACEMENT may be Unicode string.
|
||||
|
||||
Note:
|
||||
This method cannot return the lvalue, unlike built-in substr().
|
||||
|
||||
=back
|
||||
|
||||
=head3 Operations as Sequence of Grapheme Clusters
|
||||
|
||||
=over 4
|
||||
|
||||
=item as_array
|
||||
|
||||
=item C<@{>OBJECTC<}>
|
||||
|
||||
=item as_arrayref
|
||||
|
||||
I<Instance method>.
|
||||
Convert grapheme cluster string to an array of grapheme clusters.
|
||||
|
||||
=item eos
|
||||
|
||||
I<Instance method>.
|
||||
Test if current position is at end of grapheme cluster string.
|
||||
|
||||
=item item ([OFFSET])
|
||||
|
||||
I<Instance method>.
|
||||
Returns OFFSET-th grapheme cluster.
|
||||
If OFFSET was not specified, returns next grapheme cluster.
|
||||
|
||||
=item next
|
||||
|
||||
=item C<E<lt>>OBJECTC<E<gt>>
|
||||
|
||||
I<Instance method>, iterative.
|
||||
Returns next grapheme cluster and increment next position.
|
||||
|
||||
=item pos ([OFFSET])
|
||||
|
||||
I<Instance method>.
|
||||
If optional OFFSET is specified, set next position by it.
|
||||
Returns next position of grapheme cluster string.
|
||||
|
||||
=back
|
||||
|
||||
=begin comment
|
||||
|
||||
=head4 Methods planned to be deprecated
|
||||
|
||||
=over 4
|
||||
|
||||
=item flag ([OFFSET, [VALUE]])
|
||||
|
||||
I<Instance method>.
|
||||
Get or set flag value of OFFSET-th grapheme cluster.
|
||||
If OFFSET was not specified, returns flag value of next grapheme cluster.
|
||||
Flag value is a non-negative integer not greater than 255 and initially is 0.
|
||||
|
||||
Predefined flags are:
|
||||
|
||||
=over 4
|
||||
|
||||
=item Unicode::LineBreak::ALLOW_BEFORE
|
||||
|
||||
Allow line breaking just before this grapheme cluster.
|
||||
|
||||
=item Unicode::LineBreak::PROHIBIT_BEFORE
|
||||
|
||||
Prohibit line breaking just before this grapheme cluster.
|
||||
|
||||
=back
|
||||
|
||||
=item lbclass ([OFFSET])
|
||||
|
||||
I<Instance method>.
|
||||
Returns Line Breaking Class (See L<Unicode::LineBreak>) of the first
|
||||
character of OFFSET-th grapheme cluster.
|
||||
If OFFSET was not specified, returns class of next grapheme cluster.
|
||||
|
||||
B<Note>:
|
||||
Use lbc().
|
||||
|
||||
=item lbclass_ext ([OFFSET])
|
||||
|
||||
I<Instance method>.
|
||||
Returns Line Breaking Class (See L<Unicode::LineBreak>) of the last
|
||||
grapheme extender of OFFSET-th grapheme cluster. If there are no
|
||||
grapheme extenders or its class is CM, value of lbclass() is returned.
|
||||
|
||||
B<Note>:
|
||||
Use lbcext().
|
||||
|
||||
=back
|
||||
|
||||
=end comment
|
||||
|
||||
=head3 Miscellaneous
|
||||
|
||||
=over 4
|
||||
|
||||
=item lbc
|
||||
|
||||
I<Instance method>.
|
||||
Returns Line Breaking Class (See L<Unicode::LineBreak>) of the first
|
||||
character of first grapheme cluster.
|
||||
|
||||
=item lbcext
|
||||
|
||||
I<Instance method>.
|
||||
Returns Line Breaking Class (See L<Unicode::LineBreak>) of the last
|
||||
grapheme extender of last grapheme cluster.
|
||||
If there are no grapheme extenders or its class is CM, value of last
|
||||
grapheme base will be returned.
|
||||
|
||||
=back
|
||||
|
||||
=head1 CAVEATS
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
The grapheme cluster should not be referred to as "grapheme"
|
||||
even though Larry does.
|
||||
|
||||
=item *
|
||||
|
||||
On Perl around 5.10.1, implicit conversion from Unicode::GCString object to
|
||||
Unicode string sometimes confuses the C<"utf8_mg_pos_cache_update"> cache.
|
||||
|
||||
For example, instead of doing
|
||||
|
||||
$sub = substr($gcstring, $i, $j);
|
||||
|
||||
do
|
||||
|
||||
$sub = substr("$gcstring", $i, $j);
|
||||
|
||||
$sub = substr($gcstring->as_string, $i, $j);
|
||||
|
||||
=item *
|
||||
|
||||
This module implements I<default> algorithm for determining grapheme cluster
|
||||
boundaries. Tailoring mechanism has not been supported yet.
|
||||
|
||||
=back
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
Consult $VERSION variable.
|
||||
|
||||
=head2 Incompatible Changes
|
||||
|
||||
=over 4
|
||||
|
||||
=item Release 2013.10
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
The new() method can take non-Unicode string argument.
|
||||
In this case it will be decoded by iso-8859-1 (Latin 1) character set.
|
||||
That method of former releases would die with non-Unicode inputs.
|
||||
|
||||
=back
|
||||
|
||||
=back
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
[UAX #29]
|
||||
Mark Davis (ed.) (2009-2013).
|
||||
I<Unicode Standard Annex #29: Unicode Text Segmentation>, Revisions 15-23.
|
||||
L<http://www.unicode.org/reports/tr29/>.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright (C) 2009-2013 Hatuka*nezumi - IKEDA Soji.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
248
database/perl/vendor/lib/Unicode/LineBreak.pm
vendored
Normal file
248
database/perl/vendor/lib/Unicode/LineBreak.pm
vendored
Normal file
@@ -0,0 +1,248 @@
|
||||
#-*- perl -*-
|
||||
|
||||
package Unicode::LineBreak;
|
||||
require 5.008;
|
||||
|
||||
### Pragmas:
|
||||
use strict;
|
||||
use warnings;
|
||||
use vars qw($VERSION @EXPORT_OK @ISA $Config @Config);
|
||||
|
||||
### Exporting:
|
||||
use Exporter;
|
||||
our @EXPORT_OK = qw(UNICODE_VERSION SOMBOK_VERSION context);
|
||||
our %EXPORT_TAGS = ('all' => [@EXPORT_OK]);
|
||||
|
||||
### Inheritance:
|
||||
our @ISA = qw(Exporter);
|
||||
|
||||
### Other modules:
|
||||
use Carp qw(croak carp);
|
||||
use Encode qw(is_utf8);
|
||||
use MIME::Charset;
|
||||
use Unicode::GCString;
|
||||
|
||||
### Globals
|
||||
|
||||
### The package version
|
||||
our $VERSION = '2019.001';
|
||||
|
||||
### Public Configuration Attributes
|
||||
our @Config = (
|
||||
BreakIndent => 'YES',
|
||||
CharMax => 998,
|
||||
ColMax => 76,
|
||||
ColMin => 0,
|
||||
ComplexBreaking => 'YES',
|
||||
Context => 'NONEASTASIAN',
|
||||
EAWidth => undef,
|
||||
Format => 'SIMPLE',
|
||||
HangulAsAL => 'NO',
|
||||
LBClass => undef,
|
||||
LegacyCM => 'YES',
|
||||
Newline => "\n",
|
||||
Prep => undef,
|
||||
Sizing => 'UAX11',
|
||||
Urgent => undef,
|
||||
ViramaAsJoiner => 'YES',
|
||||
);
|
||||
our $Config = {};
|
||||
eval { require Unicode::LineBreak::Defaults; };
|
||||
push @Config, (%$Config);
|
||||
|
||||
### Exportable constants
|
||||
use Unicode::LineBreak::Constants;
|
||||
use constant 1.01;
|
||||
my $package = __PACKAGE__;
|
||||
my @consts = grep { s/^${package}::(\w\w+)$/$1/ } keys %constant::declared;
|
||||
push @EXPORT_OK, @consts;
|
||||
push @{$EXPORT_TAGS{'all'}}, @consts;
|
||||
|
||||
### Load XS module
|
||||
require XSLoader;
|
||||
XSLoader::load('Unicode::LineBreak', $VERSION);
|
||||
|
||||
### Load dynamic constants
|
||||
# For each property prefix ('EA', 'LB'), take the list of value names returned
# by EAWidths() / LBClasses() and install a sub named PREFIX_VALUE in this
# package that returns the value's position in that list.  Every generated
# name is also appended to @EXPORT_OK and to the 'all' export tag.
# NOTE(review): EAWidths() and LBClasses() are not defined in this file;
# presumably they come from the XS module loaded above -- confirm.
for my $spec (['EA', EAWidths()], ['LB', LBClasses()]) {
    my ($prefix, @values) = @{$spec};

    for my $index (0 .. $#values) {
        # Symbolic glob assignment below needs strict refs turned off.
        no strict;
        my $name = join '_', $prefix, $values[$index];
        # The index is interpolated into the source text, so each generated
        # sub returns a literal number.
        *{$name} = eval "sub { $index }";
        push @EXPORT_OK, $name;
        push @{$EXPORT_TAGS{'all'}}, $name;
    }
}
|
||||
|
||||
### Privates
|
||||
my $EASTASIAN_CHARSETS = qr{
|
||||
^BIG5 |
|
||||
^CP9\d\d |
|
||||
^EUC- |
|
||||
^GB18030 | ^GB2312 | ^GBK |
|
||||
^HZ |
|
||||
^ISO-2022- |
|
||||
^KS_C_5601 |
|
||||
^SHIFT_JIS
|
||||
}ix;
|
||||
|
||||
my $EASTASIAN_LANGUAGES = qr{
|
||||
^AIN |
|
||||
^JA\b | ^JPN |
|
||||
^KO\b | ^KOR |
|
||||
^ZH\b | ^CHI
|
||||
}ix;
|
||||
|
||||
use overload
|
||||
'%{}' => \&as_hashref,
|
||||
'${}' => \&as_scalarref,
|
||||
'""' => \&as_string,
|
||||
;
|
||||
|
||||
# Constructor.
#
#   Unicode::LineBreak->new([KEY => VALUE, ...])
#
# Obtains a bare object from _new(), applies the package-wide defaults held
# in @Config, then applies the caller's KEY => VALUE pairs on top so that
# they take effect after the defaults.  Returns the object blessed into the
# invoking class.
# NOTE(review): _new() is not defined in this file; presumably it is provided
# by the XS module loaded via XSLoader -- confirm.
sub new {
    my $class = shift;

    my $object = __PACKAGE__->_new();
    $object->config(@Config);    # defaults first
    $object->config(@_);         # then caller-supplied options
    return bless $object, $class;
}
|
||||
|
||||
# Instance method: get or update configuration.
#
#   $value = $lb->config(KEY);               # get
#   $lb->config(KEY => VALUE, ...);          # set
#
# Option names are matched case-insensitively against a set of long-form
# synonyms (CharactersMax, ColumnsMax, ColumnsMin, SizingMethod,
# UrgentBreaking) and obsoleted names (TailorEA, TailorLB, UserBreaking);
# these are translated to the current names before _config() is called.
# Use of an obsoleted name emits a warning through carp.  Any other name is
# passed to _config() exactly as given.
#
# Get mode (exactly one argument) returns whatever _config() returns, except
# for the obsoleted names:
#   TailorEA / TailorLB - the stored list of [ORD, VALUE] pairs flattened
#                         into one array reference, or [] when unset;
#   UserBreaking        - the stored Prep value, or [] when unset.
#
# Set mode collects all translated pairs first and makes a single _config()
# call with them; nothing is called when no pairs were given.
sub config ($@) {
    my $self = shift;

    # Long-form names that are plain synonyms of a current option.
    my %synonym_of = (
        CHARACTERSMAX  => 'CharMax',
        COLUMNSMAX     => 'ColMax',
        COLUMNSMIN     => 'ColMin',
        SIZINGMETHOD   => 'Sizing',
        URGENTBREAKING => 'Urgent',
    );
    # Obsoleted names whose value is one flat [ORD => VALUE, ...] list.
    my %tailoring_for = (
        TAILOREA => 'EAWidth',
        TAILORLB => 'LBClass',
    );

    # Get config.
    if (@_ == 1) {
        my $name  = shift;
        my $canon = uc $name;

        return $self->_config($synonym_of{$canon})
            if exists $synonym_of{$canon};

        if (exists $tailoring_for{$canon}) {
            my $current = $tailoring_for{$canon};
            carp "$name is obsoleted. Use $current";
            my $pairs = $self->_config($current);
            return [] unless defined $pairs;
            # Flatten [[ORD, VALUE], ...] back into the legacy flat list.
            return [map { ($_->[0] => $_->[1]) } @{$pairs}];
        }

        if ($canon eq 'USERBREAKING') {
            carp "$name is obsoleted. Use Prep";
            my $prep = $self->_config('Prep');
            return [] unless defined $prep;
            return $prep;
        }

        return $self->_config($name);
    }

    # Set config.
    my @settings;
    while (@_) {
        # A trailing key without a value gets undef as its value.
        my ($name, $value) = splice @_, 0, 2;
        my $canon = uc $name;

        if (exists $synonym_of{$canon}) {
            push @settings, $synonym_of{$canon} => $value;
        } elsif (exists $tailoring_for{$canon}) {
            my $current = $tailoring_for{$canon};
            carp "$name is obsoleted. Use $current";
            # An undef value first cancels earlier tailoring, then each
            # ORD => VALUE pair of the legacy list becomes its own entry.
            push @settings, $current => undef;
            if (defined $value) {
                my @flat = @{$value};
                while (@flat) {
                    my ($ord, $property) = splice @flat, 0, 2;
                    push @settings, $current => [ $ord => $property ];
                }
            }
        } elsif ($canon eq 'USERBREAKING') {
            carp "$name is obsoleted. Use Prep";
            # Reset Prep, then add the legacy method(s) one entry at a time.
            push @settings, 'Prep' => undef;
            if (defined $value) {
                if (ref $value eq 'ARRAY') {
                    push @settings, 'Prep' => $_ for @{$value};
                } else {
                    push @settings, 'Prep' => $value;
                }
            }
        } else {
            push @settings, $name => $value;
        }
    }

    $self->_config(@settings) if scalar @settings;
}
|
||||
|
||||
# Function (not a method).
#
#   $context = context([Charset => CHARSET], [Language => LANGUAGE]);
#
# Option names are matched case-insensitively; other options are ignored.
#   Charset  - either an object (its as_string() is used) or a plain name,
#              which is normalised through MIME::Charset->new(...)->as_string.
#   Language - upper-cased, with the first "_" replaced by "-".
#
# Returns 'EASTASIAN' when the charset matches $EASTASIAN_CHARSETS or,
# failing that, when the language matches $EASTASIAN_LANGUAGES; otherwise
# returns 'NONEASTASIAN'.
sub context (@) {
    my %opts = @_;

    my ($charset, $language);
    foreach my $name (keys %opts) {
        my $option = uc $name;
        if ($option eq 'CHARSET') {
            $charset =
                ref $opts{$name}
                ? $opts{$name}->as_string
                : MIME::Charset->new($opts{$name})->as_string;
        } elsif ($option eq 'LANGUAGE') {
            $language = uc $opts{$name};
            $language =~ s/_/-/;
        }
    }

    # The charset is checked before the language.
    return 'EASTASIAN'
        if $charset and $charset =~ /$EASTASIAN_CHARSETS/;
    return 'EASTASIAN'
        if $language and $language =~ /$EASTASIAN_LANGUAGES/;
    return 'NONEASTASIAN';
}
|
||||
|
||||
1;
|
||||
983
database/perl/vendor/lib/Unicode/LineBreak.pod
vendored
Normal file
983
database/perl/vendor/lib/Unicode/LineBreak.pod
vendored
Normal file
@@ -0,0 +1,983 @@
|
||||
=encoding utf-8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Unicode::LineBreak - UAX #14 Unicode Line Breaking Algorithm
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Unicode::LineBreak;
|
||||
$lb = Unicode::LineBreak->new();
|
||||
$broken = $lb->break($string);
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Unicode::LineBreak performs Line Breaking Algorithm described in Unicode
|
||||
Standard Annex #14 [UAX #14]. East_Asian_Width informative property
|
||||
defined by Annex #11 [UAX #11] will be concerned to determine breaking
|
||||
positions.
|
||||
|
||||
=head2 Terminology
|
||||
|
||||
Following terms are used for convenience.
|
||||
|
||||
B<Mandatory break> is obligatory line breaking behavior defined by core
|
||||
rules and performed regardless of surrounding characters.
|
||||
B<Arbitrary break> is line breaking behavior allowed by core rules
|
||||
and chosen by user to perform it.
|
||||
Arbitrary break includes B<direct break> and B<indirect break>
|
||||
defined by [UAX #14].
|
||||
|
||||
B<Alphabetic characters> are characters usually no line breaks are allowed
|
||||
between pairs of them, except that other characters provide break
|
||||
opportunities.
|
||||
B<Ideographic characters> are characters that usually allow line breaks
|
||||
both before and after themselves.
|
||||
[UAX #14] classifies most of alphabetic to AL and most of ideographic to ID
|
||||
(These terms are inaccurate from the point of view by grammatology).
|
||||
On several scripts, breaking positions are not obvious by each characters
|
||||
therefore heuristic based on dictionary is used.
|
||||
|
||||
B<Number of columns> of a string is not always equal to the number of characters it contains:
|
||||
Each of characters is either B<wide>, B<narrow> or nonspacing;
|
||||
they occupy 2, 1 or 0 columns, respectively.
|
||||
Several characters may be both wide and narrow by the contexts they are used.
|
||||
Characters may have more various widths by customization.
|
||||
|
||||
=head1 PUBLIC INTERFACE
|
||||
|
||||
=head2 Line Breaking
|
||||
|
||||
=over 4
|
||||
|
||||
=item new ([KEY => VALUE, ...])
|
||||
|
||||
I<Constructor>.
|
||||
About KEY => VALUE pairs see L</Options>.
|
||||
|
||||
=item break (STRING)
|
||||
|
||||
I<Instance method>.
|
||||
Break Unicode string STRING and returns it.
|
||||
In array context, returns array of lines contained in the result.
|
||||
|
||||
=item break_partial (STRING)
|
||||
|
||||
I<Instance method>.
|
||||
Same as break() but accepts incremental inputs.
|
||||
Give C<undef> as STRING argument to specify that input was completed.
|
||||
|
||||
=item config (KEY)
|
||||
|
||||
=item config (KEY => VALUE, ...)
|
||||
|
||||
I<Instance method>.
|
||||
Get or update configuration. About KEY => VALUE pairs see L</Options>.
|
||||
|
||||
=item copy
|
||||
|
||||
I<Copy constructor>.
|
||||
Create a copy of object instance.
|
||||
|
||||
=begin comment
|
||||
|
||||
=item reset
|
||||
|
||||
I<Undocumented>.
|
||||
|
||||
=end comment
|
||||
|
||||
=back
|
||||
|
||||
=head2 Getting Information
|
||||
|
||||
=over 4
|
||||
|
||||
=item breakingRule (BEFORESTR, AFTERSTR)
|
||||
|
||||
I<Instance method>.
|
||||
Get possible line breaking behavior between strings BEFORESTR and AFTERSTR.
|
||||
See L</Constants> for returned value.
|
||||
|
||||
B<Note>:
|
||||
This method gives just approximate description of line breaking behavior.
|
||||
Use break() and so on to wrap actual texts.
|
||||
|
||||
=item context ([Charset => CHARSET], [Language => LANGUAGE])
|
||||
|
||||
I<Function>.
|
||||
Get language/region context used by character set CHARSET or
|
||||
language LANGUAGE.
|
||||
|
||||
=back
|
||||
|
||||
=begin comment
|
||||
|
||||
=head3 Methods Planned to be Deprecated
|
||||
|
||||
=over 4
|
||||
|
||||
=item lbrule (BEFORE, AFTER)
|
||||
|
||||
I<Instance method>.
|
||||
Get possible line breaking behavior between class BEFORE and class AFTER.
|
||||
See L</Constants> for returned value.
|
||||
|
||||
B<Note>:
|
||||
This method gives just approximate description of line breaking behavior.
|
||||
Use break() and so on to wrap actual texts.
|
||||
|
||||
B<Note>:
|
||||
Use breakingRule().
|
||||
|
||||
=item strsize (LEN, PRE, SPC, STR)
|
||||
|
||||
I<Instance method>.
|
||||
Calculate I<number of columns> of Unicode string
|
||||
PRE.SPC.STR based on character widths defined by [UAX #11].
|
||||
|
||||
B<Note>:
|
||||
Use L<Unicode::GCString/columns>.
|
||||
|
||||
=back
|
||||
|
||||
=end comment
|
||||
|
||||
=head2 Options
|
||||
|
||||
L</new> and L</config> methods accept following pairs.
|
||||
Some of them affect number of columns ([B<E>]),
|
||||
grapheme cluster segmentation ([B<G>])
|
||||
(see also L<Unicode::GCString>) or
|
||||
line breaking behavior ([B<L>]).
|
||||
|
||||
=over 4
|
||||
|
||||
=item BreakIndent => C<"YES"> | C<"NO">
|
||||
|
||||
[B<L>]
|
||||
Always allows break after SPACEs at beginning of line, a.k.a. indent.
|
||||
[UAX #14] does not take account of such usage of SPACE.
|
||||
Default is C<"YES">.
|
||||
|
||||
B<Note>:
|
||||
This option was introduced at release 1.011.
|
||||
|
||||
=item CharMax => NUMBER
|
||||
|
||||
[B<L>]
|
||||
Possible maximum number of characters in one line,
|
||||
not counting trailing SPACEs and newline sequence.
|
||||
Note that number of characters generally doesn't represent length of line.
|
||||
Default is C<998>.
|
||||
C<0> means unlimited (as of release 2012.01).
|
||||
|
||||
=item ColMin => NUMBER
|
||||
|
||||
[B<L>]
|
||||
Minimum number of columns which line broken arbitrarily may include, not
|
||||
counting trailing spaces and newline sequences.
|
||||
Default is C<0>.
|
||||
|
||||
=item ColMax => NUMBER
|
||||
|
||||
[B<L>]
|
||||
Maximum number of columns line may include not counting trailing spaces and
|
||||
newline sequence. In other words, maximum length of line.
|
||||
Default is C<76>.
|
||||
|
||||
=back
|
||||
|
||||
See also L</Urgent> option and L</User-Defined Breaking Behaviors>.
|
||||
|
||||
=over 4
|
||||
|
||||
=item ComplexBreaking => C<"YES"> | C<"NO">
|
||||
|
||||
[B<L>]
|
||||
Performs heuristic breaking on South East Asian complex context.
|
||||
Default is, if word segmentation for South East Asian writing systems is
|
||||
enabled, C<"YES">.
|
||||
|
||||
=item Context => CONTEXT
|
||||
|
||||
[B<E>][B<L>]
|
||||
Specify language/region context.
|
||||
Currently available contexts are C<"EASTASIAN"> and C<"NONEASTASIAN">.
|
||||
Default context is C<"NONEASTASIAN">.
|
||||
|
||||
In C<"EASTASIAN"> context, characters with East_Asian_Width property
|
||||
ambiguous (A) are treated as "wide" and with Line Breaking Class AI as
|
||||
ideographic (ID).
|
||||
|
||||
In C<"NONEASTASIAN"> context, characters with East_Asian_Width property
|
||||
ambiguous (A) are treated as "narrow" and with Line Breaking Class AI as
|
||||
alphabetic (AL).
|
||||
|
||||
=item EAWidth => C<[> ORD C<=E<gt>> PROPERTY C<]>
|
||||
|
||||
=item EAWidth => C<undef>
|
||||
|
||||
[B<E>]
|
||||
Tailor classification of East_Asian_Width property.
|
||||
ORD is UCS scalar value of character or array reference of them.
|
||||
PROPERTY is one of East_Asian_Width property values
|
||||
and extended values
|
||||
(See L</Constants>).
|
||||
This option may be specified multiple times.
|
||||
If C<undef> is specified, all tailoring assigned before will be canceled.
|
||||
|
||||
By default, no tailorings are available.
|
||||
See also L</Tailoring Character Properties>.
|
||||
|
||||
=item Format => METHOD
|
||||
|
||||
[B<L>]
|
||||
Specify the method to format broken lines.
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<"SIMPLE">
|
||||
|
||||
Default method.
|
||||
Just only insert newline at arbitrary breaking positions.
|
||||
|
||||
=item C<"NEWLINE">
|
||||
|
||||
Insert or replace newline sequences with that specified by L</Newline> option,
|
||||
remove SPACEs leading newline sequences or end-of-text. Then append newline
|
||||
at end of text if it does not exist.
|
||||
|
||||
=item C<"TRIM">
|
||||
|
||||
Insert newline at arbitrary breaking positions. Remove SPACEs leading
|
||||
newline sequences.
|
||||
|
||||
=item C<undef>
|
||||
|
||||
Do nothing, even inserting any newlines.
|
||||
|
||||
=item Subroutine reference
|
||||
|
||||
See L</Formatting Lines>.
|
||||
|
||||
=back
|
||||
|
||||
=item HangulAsAL => C<"YES"> | C<"NO">
|
||||
|
||||
[B<L>]
|
||||
Treat hangul syllables and conjoining jamos as alphabetic characters (AL).
|
||||
Default is C<"NO">.
|
||||
|
||||
=item LBClass => C<[> ORD C<=E<gt>> CLASS C<]>
|
||||
|
||||
=item LBClass => C<undef>
|
||||
|
||||
[B<G>][B<L>]
|
||||
Tailor classification of line breaking property.
|
||||
ORD is UCS scalar value of character or array reference of them.
|
||||
CLASS is one of line breaking classes (See L</Constants>).
|
||||
This option may be specified multiple times.
|
||||
If C<undef> is specified, all tailoring assigned before will be canceled.
|
||||
|
||||
By default, no tailorings are available.
|
||||
See also L</Tailoring Character Properties>.
|
||||
|
||||
=item LegacyCM => C<"YES"> | C<"NO">
|
||||
|
||||
[B<G>][B<L>]
|
||||
Treat combining characters led by a SPACE as an isolated combining character
|
||||
(ID).
|
||||
As of Unicode 5.0, such use of SPACE is not recommended.
|
||||
Default is C<"YES">.
|
||||
|
||||
=item Newline => STRING
|
||||
|
||||
[B<L>]
|
||||
Unicode string to be used for newline sequence.
|
||||
Default is C<"\n">.
|
||||
|
||||
=item Prep => METHOD
|
||||
|
||||
[B<L>]
|
||||
Add user-defined line breaking behavior(s).
|
||||
This option may be specified multiple times.
|
||||
Following methods are available.
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<"NONBREAKURI">
|
||||
|
||||
Won't break URIs.
|
||||
|
||||
=item C<"BREAKURI">
|
||||
|
||||
Break URIs according to a rule suitable for printed materials.
|
||||
For more details see [CMOS], sections 6.17 and 17.11.
|
||||
|
||||
=item C<[> REGEX, SUBREF C<]>
|
||||
|
||||
The sequences matching regular expression REGEX will be broken by
|
||||
subroutine referred by SUBREF.
|
||||
For more details see L</User-Defined Breaking Behaviors>.
|
||||
|
||||
=item C<undef>
|
||||
|
||||
Cancel all methods assigned before.
|
||||
|
||||
=back
|
||||
|
||||
=item Sizing => METHOD
|
||||
|
||||
[B<L>]
|
||||
Specify method to calculate size of string.
|
||||
Following options are available.
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<"UAX11">
|
||||
|
||||
Default method.
|
||||
Sizes are computed by columns of each character according to built-in
|
||||
character database.
|
||||
|
||||
=item C<undef>
|
||||
|
||||
Number of grapheme clusters (see L<Unicode::GCString>) contained in the string.
|
||||
|
||||
=item Subroutine reference
|
||||
|
||||
See L</Calculating String Size>.
|
||||
|
||||
=back
|
||||
|
||||
See also L</ColMax>, L</ColMin> and L</EAWidth> options.
|
||||
|
||||
=item Urgent => METHOD
|
||||
|
||||
[B<L>]
|
||||
Specify method to handle excessing lines.
|
||||
Following options are available.
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<"CROAK">
|
||||
|
||||
Print error message and die.
|
||||
|
||||
=item C<"FORCE">
|
||||
|
||||
Force breaking excessing fragment.
|
||||
|
||||
=item C<undef>
|
||||
|
||||
Default method.
|
||||
Won't break excessing fragment.
|
||||
|
||||
=item Subroutine reference
|
||||
|
||||
See L</User-Defined Breaking Behaviors>.
|
||||
|
||||
=back
|
||||
|
||||
=item ViramaAsJoiner => C<"YES"> | C<"NO">
|
||||
|
||||
[B<G>]
|
||||
Virama sign ("halant" in Hindi, "coeng" in Khmer) and its succeeding letter
|
||||
are not broken.
|
||||
Default is C<"YES">.
|
||||
B<Note>:
|
||||
This option was introduced by release 2012.001_29.
|
||||
On previous releases, it was fixed to C<"NO">.
|
||||
"Default" grapheme cluster defined by [UAX #29] does not include this
|
||||
feature.
|
||||
|
||||
=back
|
||||
|
||||
=begin comment
|
||||
|
||||
=head3 Obsoleted Options
|
||||
|
||||
=over 4
|
||||
|
||||
=item TailorEA => C<[> ORD C<=E<gt>> PROPERTY, ... C<]>
|
||||
|
||||
Obsoleted equivalent to L</EAWidth>.
|
||||
|
||||
=item TailorLB => C<[> ORD C<=E<gt>> CLASS, ... C<]>
|
||||
|
||||
Obsoleted equivalent to L</LBClass>.
|
||||
|
||||
=item UserBreaking => C<[>METHOD, ...C<]>
|
||||
|
||||
Obsoleted equivalent to L</Prep>.
|
||||
|
||||
=back
|
||||
|
||||
=end comment
|
||||
|
||||
=head2 Constants
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<EA_Na>, C<EA_N>, C<EA_A>, C<EA_W>, C<EA_H>, C<EA_F>
|
||||
|
||||
Index values to specify six East_Asian_Width property values defined by
|
||||
[UAX #11]:
|
||||
narrow (Na), neutral (N), ambiguous (A), wide (W), halfwidth (H) and
|
||||
fullwidth (F).
|
||||
|
||||
=item C<EA_Z>
|
||||
|
||||
Index value to specify nonspacing characters.
|
||||
|
||||
B<Note>:
|
||||
This "nonspacing" value is extension by this module,
|
||||
not a part of [UAX #11].
|
||||
|
||||
=begin comment
|
||||
|
||||
C<EA_ZA> and C<EA_ZW>: Undocumented.
|
||||
|
||||
Earlier releases had only C<EA_Z>.
|
||||
C<EA_ZA> and C<EA_ZW> were added by release 2012.10.
|
||||
|
||||
=end comment
|
||||
|
||||
=item C<LB_BK>, C<LB_CR>, C<LB_LF>, C<LB_NL>, C<LB_SP>, C<LB_OP>, C<LB_CL>, C<LB_CP>, C<LB_QU>, C<LB_GL>, C<LB_NS>, C<LB_EX>, C<LB_SY>, C<LB_IS>, C<LB_PR>, C<LB_PO>, C<LB_NU>, C<LB_AL>, C<LB_HL>, C<LB_ID>, C<LB_IN>, C<LB_HY>, C<LB_BA>, C<LB_BB>, C<LB_B2>, C<LB_CB>, C<LB_ZW>, C<LB_CM>, C<LB_WJ>, C<LB_H2>, C<LB_H3>, C<LB_JL>, C<LB_JV>, C<LB_JT>, C<LB_SG>, C<LB_AI>, C<LB_CJ>, C<LB_SA>, C<LB_XX>, C<LB_RI>
|
||||
|
||||
Index values to specify 40 line breaking property values (classes)
|
||||
defined by [UAX #14].
|
||||
|
||||
B<Note>: Property value CP was introduced by Unicode 5.2.0.
|
||||
Property values HL and CJ were introduced by Unicode 6.1.0.
|
||||
Property value RI was introduced by Unicode 6.2.0.
|
||||
|
||||
=item C<MANDATORY>, C<DIRECT>, C<INDIRECT>, C<PROHIBITED>
|
||||
|
||||
Four values to specify line breaking behaviors:
|
||||
Mandatory break; Both direct break and indirect break are allowed;
|
||||
Indirect break is allowed but direct break is prohibited;
|
||||
Prohibited break.
|
||||
|
||||
=item C<Unicode::LineBreak::SouthEastAsian::supported>
|
||||
|
||||
Flag to determine if word segmentation for South East Asian writing systems is
|
||||
enabled.
|
||||
If this feature was enabled, a non-empty string is set.
|
||||
Otherwise, C<undef> is set.
|
||||
|
||||
B<N.B.>: Current release supports Thai script of modern Thai language only.
|
||||
|
||||
=item C<UNICODE_VERSION>
|
||||
|
||||
A string to specify version of Unicode standard this module refers.
|
||||
|
||||
=back
|
||||
|
||||
=head1 CUSTOMIZATION
|
||||
|
||||
=head2 Formatting Lines
|
||||
|
||||
If you specify subroutine reference as a value of L</Format> option,
|
||||
it should accept three arguments:
|
||||
|
||||
$MODIFIED = &subroutine(SELF, EVENT, STR);
|
||||
|
||||
SELF is a Unicode::LineBreak object,
|
||||
EVENT is a string to determine the context that subroutine was called in,
|
||||
and STR is a fragment of Unicode string leading or trailing breaking position.
|
||||
|
||||
EVENT |When Fired |Value of STR
|
||||
-----------------------------------------------------------------
|
||||
"sot" |Beginning of text |Fragment of first line
|
||||
"sop" |After mandatory break|Fragment of next line
|
||||
"sol" |After arbitrary break|Fragment on sequel of line
|
||||
"" |Just before any |Complete line without trailing
|
||||
|breaks |SPACEs
|
||||
"eol" |Arbitrary break |SPACEs leading breaking position
|
||||
"eop" |Mandatory break |Newline and its leading SPACEs
|
||||
"eot" |End of text |SPACEs (and newline) at end of
|
||||
| |text
|
||||
-----------------------------------------------------------------
|
||||
|
||||
Subroutine should return modified text fragment or may return
|
||||
C<undef> to express that no modification occurred.
|
||||
Note that modification in the context of C<"sot">, C<"sop"> or C<"sol"> may
|
||||
affect decision of successive breaking positions while in the others won't.
|
||||
|
||||
B<Note>:
|
||||
String arguments are actually sequences of grapheme clusters.
|
||||
See L<Unicode::GCString>.
|
||||
|
||||
For example, following code folds lines removing trailing spaces:
|
||||
|
||||
sub fmt {
|
||||
if ($_[1] =~ /^eo/) {
|
||||
return "\n";
|
||||
}
|
||||
return undef;
|
||||
}
|
||||
my $lb = Unicode::LineBreak->new(Format => \&fmt);
|
||||
$output = $lb->break($text);
|
||||
|
||||
=head2 User-Defined Breaking Behaviors
|
||||
|
||||
When a line generated by arbitrary break is expected to be beyond measure of
|
||||
either CharMax, ColMax or ColMin, B<urgent break> may be
|
||||
performed on successive string.
|
||||
If you specify subroutine reference as a value of L</Urgent> option,
|
||||
it should accept two arguments:
|
||||
|
||||
@BROKEN = &subroutine(SELF, STR);
|
||||
|
||||
SELF is a Unicode::LineBreak object and STR is a Unicode string to be broken.
|
||||
|
||||
Subroutine should return an array of broken string STR.
|
||||
|
||||
B<Note>:
|
||||
String argument is actually a sequence of grapheme clusters.
|
||||
See L<Unicode::GCString>.
|
||||
|
||||
For example, the following code inserts hyphens into the names of some chemical substances (such as Titin) so that they may be folded:
|
||||
|
||||
sub hyphenize {
|
||||
return map {$_ =~ s/yl$/yl-/; $_} split /(\w+?yl(?=\w))/, $_[1];
|
||||
}
|
||||
my $lb = Unicode::LineBreak->new(Urgent => \&hyphenize);
|
||||
$output = $lb->break("Methionylthreonylthreonylglutaminylarginyl...");
|
||||
|
||||
If you specify [REGEX, SUBREF] array reference as any of L</Prep> option,
|
||||
subroutine should accept two arguments:
|
||||
|
||||
@BROKEN = &subroutine(SELF, STR);
|
||||
|
||||
SELF is a Unicode::LineBreak object and
|
||||
STR is a Unicode string matched with REGEX.
|
||||
|
||||
Subroutine should return an array of broken string STR.
|
||||
|
||||
For example, following code will break HTTP URLs using [CMOS] rule.
|
||||
|
||||
my $url = qr{http://[\x21-\x7E]+}i;
|
||||
sub breakurl {
|
||||
my $self = shift;
|
||||
my $str = shift;
|
||||
return split m{(?<=[/]) (?=[^/]) |
|
||||
(?<=[^-.]) (?=[-~.,_?\#%=&]) |
|
||||
(?<=[=&]) (?=.)}x, $str;
|
||||
}
|
||||
my $lb = Unicode::LineBreak->new(Prep => [$url, \&breakurl]);
|
||||
$output = $lb->break($string);
|
||||
|
||||
=head3 Preserving State
|
||||
|
||||
Unicode::LineBreak object can behave as hash reference.
|
||||
Any items may be preserved throughout its life.
|
||||
|
||||
For example, following code will separate paragraphs with empty lines.
|
||||
|
||||
sub paraformat {
|
||||
my $self = shift;
|
||||
my $action = shift;
|
||||
my $str = shift;
|
||||
|
||||
if ($action eq 'sot' or $action eq 'sop') {
|
||||
$self->{'line'} = '';
|
||||
} elsif ($action eq '') {
|
||||
$self->{'line'} = $str;
|
||||
} elsif ($action eq 'eol') {
|
||||
return "\n";
|
||||
} elsif ($action eq 'eop') {
|
||||
if (length $self->{'line'}) {
|
||||
return "\n\n";
|
||||
} else {
|
||||
return "\n";
|
||||
}
|
||||
} elsif ($action eq 'eot') {
|
||||
return "\n";
|
||||
}
|
||||
return undef;
|
||||
}
|
||||
my $lb = Unicode::LineBreak->new(Format => \&paraformat);
|
||||
$output = $lb->break($string);
|
||||
|
||||
=head2 Calculating String Size
|
||||
|
||||
If you specify subroutine reference as a value of L</Sizing> option,
|
||||
it will be called with five arguments:
|
||||
|
||||
$COLS = &subroutine(SELF, LEN, PRE, SPC, STR);
|
||||
|
||||
SELF is a Unicode::LineBreak object, LEN is size of preceding string,
|
||||
PRE is preceding Unicode string, SPC is additional SPACEs and STR is a
|
||||
Unicode string to be processed.
|
||||
|
||||
Subroutine should return calculated number of columns of C<PRE.SPC.STR>.
|
||||
The number of columns may not be an integer: Unit of the number may be freely chosen, however, it should be same as those of L</ColMin> and L</ColMax> option.
|
||||
|
||||
B<Note>:
|
||||
String arguments are actually sequences of grapheme clusters.
|
||||
See L<Unicode::GCString>.
|
||||
|
||||
For example, following code processes lines with tab stops by each eight columns.
|
||||
|
||||
sub tabbedsizing {
|
||||
my ($self, $cols, $pre, $spc, $str) = @_;
|
||||
|
||||
my $spcstr = $spc.$str;
|
||||
while ($spcstr->lbc == LB_SP) {
|
||||
my $c = $spcstr->item(0);
|
||||
if ($c eq "\t") {
|
||||
$cols += 8 - $cols % 8;
|
||||
} else {
|
||||
$cols += $c->columns;
|
||||
}
|
||||
$spcstr = $spcstr->substr(1);
|
||||
}
|
||||
$cols += $spcstr->columns;
|
||||
return $cols;
|
||||
};
|
||||
my $lb = Unicode::LineBreak->new(LBClass => [ord("\t") => LB_SP],
|
||||
Sizing => \&tabbedsizing);
|
||||
$output = $lb->break($string);
|
||||
|
||||
=head2 Tailoring Character Properties
|
||||
|
||||
Character properties may be tailored by L</LBClass> and L</EAWidth>
|
||||
options. Some constants are defined for convenience of tailoring.
|
||||
|
||||
=head3 Line Breaking Properties
|
||||
|
||||
=head4 Non-starters of Kana-like Characters
|
||||
|
||||
By default, several hiragana, katakana and characters corresponding to kana
|
||||
are treated as non-starters (NS or CJ).
|
||||
When the following pair(s) are specified for value of L</LBClass> option,
|
||||
these characters are treated as normal ideographic characters (ID).
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<KANA_NONSTARTERS() =E<gt> LB_ID>
|
||||
|
||||
All of characters below.
|
||||
|
||||
=item C<IDEOGRAPHIC_ITERATION_MARKS() =E<gt> LB_ID>
|
||||
|
||||
Ideographic iteration marks.
|
||||
U+3005 IDEOGRAPHIC ITERATION MARK, U+303B VERTICAL IDEOGRAPHIC ITERATION MARK, U+309D HIRAGANA ITERATION MARK, U+309E HIRAGANA VOICED ITERATION MARK, U+30FD KATAKANA ITERATION MARK and U+30FE KATAKANA VOICED ITERATION MARK.
|
||||
|
||||
N.B. Some of them are neither hiragana nor katakana.
|
||||
|
||||
=item C<KANA_SMALL_LETTERS() =E<gt> LB_ID>
|
||||
|
||||
=item C<KANA_PROLONGED_SOUND_MARKS() =E<gt> LB_ID>
|
||||
|
||||
Hiragana or katakana small letters:
|
||||
Hiragana small letters U+3041 A, U+3043 I, U+3045 U, U+3047 E, U+3049 O, U+3063 TU, U+3083 YA, U+3085 YU, U+3087 YO, U+308E WA, U+3095 KA, U+3096 KE.
|
||||
Katakana small letters U+30A1 A, U+30A3 I, U+30A5 U, U+30A7 E, U+30A9 O, U+30C3 TU, U+30E3 YA, U+30E5 YU, U+30E7 YO, U+30EE WA, U+30F5 KA, U+30F6 KE.
|
||||
Katakana phonetic extensions U+31F0 KU - U+31FF RO.
|
||||
Halfwidth katakana small letters U+FF67 A - U+FF6F TU.
|
||||
|
||||
Hiragana or katakana prolonged sound marks:
|
||||
U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK and
|
||||
U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK.
|
||||
|
||||
N.B. These letters are optionally treated either as non-starter or
|
||||
as normal ideographic. See [JIS X 4051] 6.1.1, [JLREQ] 3.1.7 or
|
||||
[UAX14].
|
||||
|
||||
N.B. U+3095, U+3096, U+30F5, U+30F6 are considered to be
|
||||
neither hiragana nor katakana.
|
||||
|
||||
=item C<MASU_MARK() =E<gt> LB_ID>
|
||||
|
||||
U+303C MASU MARK.
|
||||
|
||||
N.B. Although this character is not kana, it is usually regarded as
|
||||
abbreviation to sequence of hiragana E<0x307E> E<0x3059> or
|
||||
katakana E<0x30DE> E<0x30B9>, MA and SU.
|
||||
|
||||
N.B. This character is classified as non-starter (NS) by [UAX #14]
|
||||
and as the class corresponding to ID by [JIS X 4051] and [JLREQ].
|
||||
|
||||
=back
|
||||
|
||||
=head4 Ambiguous Quotation Marks
|
||||
|
||||
By default, some punctuations are ambiguous quotation marks (QU).
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<BACKWARD_QUOTES() =E<gt> LB_OP, FORWARD_QUOTES() =E<gt> LB_CL>
|
||||
|
||||
Some languages (Dutch, English, Italian, Portuguese, Spanish, Turkish and
|
||||
most East Asian) use rotated-9-style punctuations (E<0x2018> E<0x201C>) as
|
||||
opening and 9-style punctuations (E<0x2019> E<0x201D>) as closing quotation
|
||||
marks.
|
||||
|
||||
=item C<FORWARD_QUOTES() =E<gt> LB_OP, BACKWARD_QUOTES() =E<gt> LB_CL>
|
||||
|
||||
Some others (Czech, German and Slovak) use 9-style punctuations
|
||||
(E<0x2019> E<0x201D>) as opening and rotated-9-style punctuations
|
||||
(E<0x2018> E<0x201C>) as closing quotation marks.
|
||||
|
||||
=item C<BACKWARD_GUILLEMETS() =E<gt> LB_OP, FORWARD_GUILLEMETS() =E<gt> LB_CL>
|
||||
|
||||
French, Greek, Russian etc. use left-pointing guillemets (E<0x00AB> E<0x2039>)
|
||||
as opening and right-pointing guillemets (E<0x00BB> E<0x203A>) as closing
|
||||
quotation marks.
|
||||
|
||||
=item C<FORWARD_GUILLEMETS() =E<gt> LB_OP, BACKWARD_GUILLEMETS() =E<gt> LB_CL>
|
||||
|
||||
German and Slovak use right-pointing guillemets (E<0x00BB> E<0x203A>) as
|
||||
opening and left-pointing guillemets (E<0x00AB> E<0x2039>) as closing
|
||||
quotation marks.
|
||||
|
||||
=back
|
||||
|
||||
Danish, Finnish, Norwegian and Swedish use 9-style or right-pointing
|
||||
punctuations (E<0x2019> E<0x201D> E<0x00BB> E<0x203A>) as both opening and
|
||||
closing quotation marks.
|
||||
|
||||
=head4 IDEOGRAPHIC SPACE
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<IDEOGRAPHIC_SPACE() =E<gt> LB_BA>
|
||||
|
||||
U+3000 IDEOGRAPHIC SPACE won't be placed at beginning of line.
|
||||
This is default behavior.
|
||||
|
||||
=item C<IDEOGRAPHIC_SPACE() =E<gt> LB_ID>
|
||||
|
||||
IDEOGRAPHIC SPACE can be placed at beginning of line.
|
||||
This was default behavior by Unicode 6.2 and earlier.
|
||||
|
||||
=item C<IDEOGRAPHIC_SPACE() =E<gt> LB_SP>
|
||||
|
||||
IDEOGRAPHIC SPACE won't be placed at beginning of line,
|
||||
and will protrude from end of line.
|
||||
|
||||
=back
|
||||
|
||||
=head3 East_Asian_Width Properties
|
||||
|
||||
Some particular letters of Latin, Greek and Cyrillic scripts have ambiguous
|
||||
(A) East_Asian_Width property. Thus, these characters are treated as wide
|
||||
in C<"EASTASIAN"> context.
|
||||
Specifying C<EAWidth =E<gt> [ AMBIGUOUS_>*C<() =E<gt> EA_N ]>,
|
||||
those characters are always treated as narrow.
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<AMBIGUOUS_ALPHABETICS() =E<gt> EA_N>
|
||||
|
||||
Treat all of characters below as East_Asian_Width neutral (N).
|
||||
|
||||
=item C<AMBIGUOUS_CYRILLIC() =E<gt> EA_N>
|
||||
|
||||
=item C<AMBIGUOUS_GREEK() =E<gt> EA_N>
|
||||
|
||||
=item C<AMBIGUOUS_LATIN() =E<gt> EA_N>
|
||||
|
||||
Treat letters having ambiguous (A) width of Cyrillic, Greek and Latin scripts
|
||||
as neutral (N).
|
||||
|
||||
=back
|
||||
|
||||
On the other hand, although several characters were occasionally rendered as wide characters by a number of implementations for East Asian character sets, they are given narrow (Na) East_Asian_Width property just because they have fullwidth (F) compatibility characters.
|
||||
Specifying C<EAWidth> as below, those characters are treated as ambiguous
|
||||
--- wide on C<"EASTASIAN"> context.
|
||||
|
||||
=over 4
|
||||
|
||||
=item C<QUESTIONABLE_NARROW_SIGNS() =E<gt> EA_A>
|
||||
|
||||
U+00A2 CENT SIGN, U+00A3 POUND SIGN, U+00A5 YEN SIGN (or yuan sign),
|
||||
U+00A6 BROKEN BAR, U+00AC NOT SIGN, U+00AF MACRON.
|
||||
|
||||
=back
|
||||
|
||||
=head2 Configuration File
|
||||
|
||||
Built-in defaults of option parameters for L</new> and L</config> method
|
||||
can be overridden by configuration files:
|
||||
F<Unicode/LineBreak/Defaults.pm>.
|
||||
For more details read F<Unicode/LineBreak/Defaults.pm.sample>.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Please report bugs or buggy behaviors to developer.
|
||||
|
||||
CPAN Request Tracker:
|
||||
L<http://rt.cpan.org/Public/Dist/Display.html?Name=Unicode-LineBreak>.
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
Consult $VERSION variable.
|
||||
|
||||
=head2 Incompatible Changes
|
||||
|
||||
=over 4
|
||||
|
||||
=item Release 2012.06
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
eawidth() method was deprecated.
|
||||
L<Unicode::GCString/columns> may be used instead.
|
||||
|
||||
=item *
|
||||
|
||||
lbclass() method was deprecated.
|
||||
Use L<Unicode::GCString/lbc> or L<Unicode::GCString/lbcext>.
|
||||
|
||||
=back
|
||||
|
||||
=back
|
||||
|
||||
=head2 Conformance to Standards
|
||||
|
||||
Character properties this module is based on are defined by
|
||||
Unicode Standard version 8.0.0.
|
||||
|
||||
This module is intended to implement UAX14-C2.
|
||||
|
||||
=head1 IMPLEMENTATION NOTES
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
Some ideographic characters may be treated either as NS or as ID by choice.
|
||||
|
||||
=item *
|
||||
|
||||
Hangul syllables and conjoining jamos may be treated as
|
||||
either ID or AL by choice.
|
||||
|
||||
=item *
|
||||
|
||||
Characters assigned to AI may be resolved to either AL or ID by choice.
|
||||
|
||||
=item *
|
||||
|
||||
Character(s) assigned to CB are not resolved.
|
||||
|
||||
=item *
|
||||
|
||||
Characters assigned to CJ are always resolved to NS.
|
||||
More flexible tailoring mechanism is provided.
|
||||
|
||||
=item *
|
||||
|
||||
When word segmentation for South East Asian writing systems is not supported,
|
||||
characters assigned to SA are resolved to AL,
|
||||
except that characters that have Grapheme_Cluster_Break property value
|
||||
Extend or SpacingMark are resolved to CM.
|
||||
|
||||
=item *
|
||||
|
||||
Characters assigned to SG or XX are resolved to AL.
|
||||
|
||||
=item *
|
||||
|
||||
Code points of following UCS ranges are given fixed property values even
|
||||
if they have not been assigned any characters.
|
||||
|
||||
Ranges | UAX #14 | UAX #11 | Description
|
||||
-------------------------------------------------------------
|
||||
U+20A0..U+20CF | PR [*1] | N [*2] | Currency symbols
|
||||
U+3400..U+4DBF | ID | W | CJK ideographs
|
||||
U+4E00..U+9FFF | ID | W | CJK ideographs
|
||||
U+D800..U+DFFF | AL (SG) | N | Surrogates
|
||||
U+E000..U+F8FF | AL (XX) | F or N (A) | Private use
|
||||
U+F900..U+FAFF | ID | W | CJK ideographs
|
||||
U+20000..U+2FFFD | ID | W | CJK ideographs
|
||||
U+30000..U+3FFFD | ID | W | Old hanzi
|
||||
U+F0000..U+FFFFD | AL (XX) | F or N (A) | Private use
|
||||
U+100000..U+10FFFD | AL (XX) | F or N (A) | Private use
|
||||
Other unassigned | AL (XX) | N | Unassigned,
|
||||
| | | reserved or
|
||||
| | | noncharacters
|
||||
-------------------------------------------------------------
|
||||
[*1] Except U+20A7 PESETA SIGN (PO),
|
||||
U+20B6 LIVRE TOURNOIS SIGN (PO), U+20BB NORDIC MARK SIGN (PO)
|
||||
and U+20BE LARI SIGN (PO).
|
||||
[*2] Except U+20A9 WON SIGN (H) and U+20AC EURO SIGN
|
||||
(F or N (A)).
|
||||
|
||||
=item *
|
||||
|
||||
Characters belonging to General Category Mn, Me, Cc, Cf, Zl or Zp are
|
||||
treated as nonspacing by this module.
|
||||
|
||||
=back
|
||||
|
||||
=head1 REFERENCES
|
||||
|
||||
=over 4
|
||||
|
||||
=item [CMOS]
|
||||
|
||||
I<The Chicago Manual of Style>, 15th edition.
|
||||
University of Chicago Press, 2003.
|
||||
|
||||
=item [JIS X 4051]
|
||||
|
||||
JIS X 4051:2004
|
||||
I<日本語文書の組版方法> (I<Formatting Rules for Japanese Documents>).
|
||||
Japanese Standards Association, 2004.
|
||||
|
||||
=item [JLREQ]
|
||||
|
||||
Anan, Yasuhiro et al.
|
||||
I<Requirements for Japanese Text Layout>,
|
||||
W3C Working Group Note 3 April 2012.
|
||||
L<http://www.w3.org/TR/2012/NOTE-jlreq-20120403/>.
|
||||
|
||||
=begin comment
|
||||
|
||||
=item [Kubota]
|
||||
|
||||
Kubota, Tomohiro (2001-2002).
|
||||
Width problems, "I<Problems on Interoperativity between Unicode and CJK Local Encodings>".
|
||||
L<http://web.archive.org/web/people.debian.org/~kubota/unicode-symbols-width2.html>.
|
||||
|
||||
=end comment
|
||||
|
||||
=item [UAX #11]
|
||||
|
||||
A. Freytag (ed.) (2008-2009).
|
||||
I<Unicode Standard Annex #11: East Asian Width>, Revisions 17-19.
|
||||
L<http://unicode.org/reports/tr11/>.
|
||||
|
||||
=item [UAX #14]
|
||||
|
||||
A. Freytag and A. Heninger (eds.) (2008-2015).
|
||||
I<Unicode Standard Annex #14: Unicode Line Breaking Algorithm>, Revisions 22-35.
|
||||
L<http://unicode.org/reports/tr14/>.
|
||||
|
||||
=item [UAX #29]
|
||||
|
||||
Mark Davis (ed.) (2009-2013).
|
||||
I<Unicode Standard Annex #29: Unicode Text Segmentation>, Revisions 15-23.
|
||||
L<http://www.unicode.org/reports/tr29/>.
|
||||
|
||||
=back
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Text::LineFold>, L<Text::Wrap>, L<Unicode::GCString>.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Copyright (C) 2009-2018 Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>.
|
||||
|
||||
This program is free software; you can redistribute it and/or modify it
|
||||
under the same terms as Perl itself.
|
||||
|
||||
=cut
|
||||
68
database/perl/vendor/lib/Unicode/LineBreak/Constants.pm
vendored
Normal file
68
database/perl/vendor/lib/Unicode/LineBreak/Constants.pm
vendored
Normal file
@@ -0,0 +1,68 @@
|
||||
#-*- perl -*-
|
||||
|
||||
package Unicode::LineBreak;
|
||||
|
||||
# Constants shared by Unicode::LineBreak.  Everything below is declared in
# the Unicode::LineBreak package (see the "package" statement above).

use strict;
use warnings;

# Line breaking actions.  The one-letter names are shorthands for the
# descriptive names declared right after them.
use constant {
    M => 4,
    D => 3,
    I => 2,
    P => 1,
};
use constant {
    MANDATORY  => M,
    DIRECT     => D,
    INDIRECT   => I,
    PROHIBITED => P,
    URGENT     => 200,
};

# Bit flags describing whether a break is allowed before a fragment.
use constant {
    ALLOW_BEFORE    => 2,
    PROHIBIT_BEFORE => 1,
    BREAK_BEFORE    => 2,          # deprecated.
    FLAGS           => (2 | 1),
};

# Sets of code points used to tailor character properties.  Each constant
# is a reference to an array of code points.
use constant {
    AMBIGUOUS_CYRILLIC => [0x0401, 0x0410..0x044F, 0x0451, ],
    AMBIGUOUS_GREEK => [0x0391..0x03A9, 0x03B1..0x03C1, 0x03C3..0x03C9, ],
    AMBIGUOUS_LATIN => [0x00C6, 0x00D0, 0x00D8, 0x00DE, 0x00DF, 0x00E0,
        0x00E1, 0x00E6, 0x00E8, 0x00E9, 0x00EA, 0x00EC, 0x00ED, 0x00F0,
        0x00F2, 0x00F3, 0x00F8, 0x00F9, 0x00FA, 0x00FC, 0x00FE, 0x0101,
        0x0111, 0x0113, 0x011B, 0x0126, 0x0127, 0x012B, 0x0131, 0x0132,
        0x0133, 0x0138, 0x013F, 0x0140, 0x0141, 0x0142, 0x0144, 0x0148,
        0x0149, 0x014A, 0x014B, 0x014D, 0x0152, 0x0153, 0x0166, 0x0167,
        0x016B, 0x01CE, 0x01D0, 0x01D2, 0x01D4, 0x01D6, 0x01D8, 0x01DA,
        0x01DC, 0x0251, 0x0261, ],
    IDEOGRAPHIC_ITERATION_MARKS => [0x3005, 0x303B, 0x309D, 0x309E, 0x30FD,
        0x30FE, ],
    KANA_PROLONGED_SOUND_MARKS => [0x30FC, 0xFF70, ],
    KANA_SMALL_LETTERS => [0x3041, 0x3043, 0x3045, 0x3047, 0x3049, 0x3063,
        0x3083, 0x3085, 0x3087, 0x308E,
        0x3095, 0x3096,
        0x30A1, 0x30A3, 0x30A5, 0x30A7, 0x30A9, 0x30C3,
        0x30E3, 0x30E5, 0x30E7, 0x30EE,
        0x30F5, 0x30F6,
        0x31F0..0x31FF, 0xFF67..0xFF6F, ],
    MASU_MARK => [0x303C, ],
    QUESTIONABLE_NARROW_SIGNS => [0x00A2, 0x00A3, 0x00A5, 0x00A6, 0x00AC,
        0x00AF, ],
};

# Unions of the sets above.  They are declared in a separate statement
# because they call constants that must already exist at compile time.
use constant {
    AMBIGUOUS_ALPHABETICS => [
        @{AMBIGUOUS_CYRILLIC()}, @{AMBIGUOUS_GREEK()},
        @{AMBIGUOUS_LATIN()}, ],
    KANA_NONSTARTERS => [
        @{IDEOGRAPHIC_ITERATION_MARKS()}, @{KANA_PROLONGED_SOUND_MARKS()},
        @{KANA_SMALL_LETTERS()}, @{MASU_MARK()}, ],
};

# Ambiguous quotation marks.  The "BACKWORD_" spelling is the historical
# name and is kept so that existing callers continue to work.
use constant {
    BACKWORD_GUILLEMETS => [
        0x00AB, 0x2039, ],
    FORWARD_GUILLEMETS => [
        0x00BB, 0x203A, ],
    BACKWORD_QUOTES => [
        0x2018, 0x201C, ],
    FORWARD_QUOTES => [
        0x2019, 0x201D, ],
};

# Correctly spelled aliases.  The module documentation refers to these
# sets as BACKWARD_QUOTES() and BACKWARD_GUILLEMETS(), so both spellings
# are provided; each alias returns the very same array reference as its
# "BACKWORD_" counterpart.
# NOTE(review): whether these names are exported depends on how the main
# module builds its export list, which is not visible here -- confirm.
use constant {
    BACKWARD_GUILLEMETS => BACKWORD_GUILLEMETS(),
    BACKWARD_QUOTES     => BACKWORD_QUOTES(),
};

# obsoleted names.
use constant {
    LEFT_GUILLEMETS  => BACKWORD_GUILLEMETS(),
    RIGHT_GUILLEMETS => FORWARD_GUILLEMETS(),
    LEFT_QUOTES      => BACKWORD_QUOTES(),
    RIGHT_QUOTES     => FORWARD_QUOTES(),
};

use constant {
    IDEOGRAPHIC_SPACE => [ 0x3000, ],
};
|
||||
|
||||
1;
|
||||
114
database/perl/vendor/lib/Unicode/LineBreak/Defaults.pm.sample
vendored
Normal file
114
database/perl/vendor/lib/Unicode/LineBreak/Defaults.pm.sample
vendored
Normal file
@@ -0,0 +1,114 @@
|
||||
#-*- perl -*-
|
||||
|
||||
package Unicode::LineBreak;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Unicode::LineBreak::Defaults - Configuration for Unicode::LineBreak
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
Edit this file and install it as Unicode/LineBreak/Defaults.pm
|
||||
to activate custom settings.
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Following settings are available.
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
BreakIndent
|
||||
|
||||
=item *
|
||||
|
||||
CharMax
|
||||
|
||||
=item *
|
||||
|
||||
ColMin
|
||||
|
||||
=item *
|
||||
|
||||
ColMax
|
||||
|
||||
=item *
|
||||
|
||||
ComplexBreaking
|
||||
|
||||
=item *
|
||||
|
||||
Context
|
||||
|
||||
=item *
|
||||
|
||||
EAWidth
|
||||
|
||||
=item *
|
||||
|
||||
Format
|
||||
|
||||
=item *
|
||||
|
||||
HangulAsAL
|
||||
|
||||
=item *
|
||||
|
||||
LBClass
|
||||
|
||||
=item *
|
||||
|
||||
LegacyCM
|
||||
|
||||
=item *
|
||||
|
||||
Newline
|
||||
|
||||
=item *
|
||||
|
||||
Prep
|
||||
|
||||
=item *
|
||||
|
||||
Sizing
|
||||
|
||||
=item *
|
||||
|
||||
Urgent
|
||||
|
||||
=item *
|
||||
|
||||
ViramaAsJoiner
|
||||
|
||||
=back
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Unicode::LineBreak>
|
||||
|
||||
=cut
|
||||
|
||||
#--------------------------------------------------------------------------#
|
||||
# Add your own settings below.
|
||||
#--------------------------------------------------------------------------#
|
||||
|
||||
## Default settings on current release are:
|
||||
# $Config->{BreakIndent} = 'YES';
|
||||
# $Config->{CharMax} = 998;
|
||||
# $Config->{ColMin} = 0;
|
||||
# $Config->{ColMax} = 76;
|
||||
# $Config->{ComplexBreaking} = 'YES';
|
||||
# $Config->{Context} = 'NONEASTASIAN';
|
||||
# $Config->{EAWidth} = undef;
|
||||
# $Config->{Format} = 'SIMPLE';
|
||||
# $Config->{HangulAsAL} = 'NO';
|
||||
# $Config->{LBClass} = undef;
|
||||
# $Config->{LegacyCM} = 'YES';
|
||||
# $Config->{Newline} = "\n";
|
||||
# $Config->{Prep} = undef;
|
||||
# $Config->{Sizing} = 'UAX11';
|
||||
# $Config->{Urgent} = undef;
|
||||
# $Config->{ViramaAsJoiner} = 'YES';
|
||||
|
||||
1;
|
||||
21
database/perl/vendor/lib/Unicode/UTF8.pm
vendored
Normal file
21
database/perl/vendor/lib/Unicode/UTF8.pm
vendored
Normal file
@@ -0,0 +1,21 @@
|
||||
package Unicode::UTF8;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
# Compile-time setup for Unicode::UTF8: declares the version and export
# lists, loads the XS implementation, and installs Exporter's import().
BEGIN {
|
||||
# Version string; also passed to XSLoader::load() below.
our $VERSION = '0.62';
|
||||
# Nothing is exported by default; these names are available on request.
our @EXPORT_OK = qw[ decode_utf8 encode_utf8 valid_utf8 ];
|
||||
# The ":all" tag selects every name listed in @EXPORT_OK.
our %EXPORT_TAGS = (
|
||||
all => [ @EXPORT_OK ],
|
||||
);
|
||||
|
||||
# Load the compiled (XS) part of this package.
require XSLoader;
|
||||
XSLoader::load(__PACKAGE__, $VERSION);
|
||||
|
||||
# Alias Exporter's import() into this package rather than inheriting
# from Exporter through @ISA.
require Exporter;
|
||||
*import = \&Exporter::import;
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
207
database/perl/vendor/lib/Unicode/UTF8.pod
vendored
Normal file
207
database/perl/vendor/lib/Unicode/UTF8.pod
vendored
Normal file
@@ -0,0 +1,207 @@
|
||||
=head1 NAME
|
||||
|
||||
Unicode::UTF8 - Encoding and decoding of UTF-8 encoding form
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Unicode::UTF8 qw[decode_utf8 encode_utf8];
|
||||
|
||||
use warnings FATAL => 'utf8'; # fatalize encoding glitches
|
||||
$string = decode_utf8($octets);
|
||||
$octets = encode_utf8($string);
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module provides functions to encode and decode UTF-8 encoding form as
|
||||
specified by Unicode and ISO/IEC 10646:2011.
|
||||
|
||||
=head1 FUNCTIONS
|
||||
|
||||
=head2 decode_utf8
|
||||
|
||||
$string = decode_utf8($octets);
|
||||
$string = decode_utf8($octets, $fallback);
|
||||
|
||||
Returns a decoded representation of C<$octets> in UTF-8 encoding as a character
|
||||
string.
|
||||
|
||||
C<$fallback> is an optional C<CODE> reference which provides an error-handling
|
||||
mechanism, allowing customization of error handling. The default error-handling
|
||||
mechanism is to replace any ill-formed UTF-8 sequences or encoded code points
|
||||
which can't be interchanged with REPLACEMENT CHARACTER (U+FFFD).
|
||||
|
||||
$string = $fallback->($octets, $is_usv, $position);
|
||||
|
||||
C<$fallback> is invoked with three arguments: C<$octets>, C<$is_usv> and
|
||||
C<$position>. C<$octets> is a sequence of one or more octets containing the
|
||||
maximal subpart of the ill-formed subsequence or encoded code point which
|
||||
can't be interchanged. C<$is_usv> is a boolean indicating whether or not
|
||||
C<$octets> represent an encoded Unicode scalar value. C<$position> is an
|
||||
unsigned integer containing the zero based octet position at which the error
|
||||
occurred within the octets provided to C<decode_utf8()>. C<$fallback> must
|
||||
return a character string consisting of zero or more Unicode scalar values.
|
||||
Unicode scalar values consist of code points in the range U+0000..U+D7FF and
|
||||
U+E000..U+10FFFF.
|
||||
|
||||
=head2 encode_utf8
|
||||
|
||||
$octets = encode_utf8($string);
|
||||
$octets = encode_utf8($string, $fallback);
|
||||
|
||||
Returns an encoded representation of C<$string> in UTF-8 encoding as an octet
|
||||
string.
|
||||
|
||||
C<$fallback> is an optional C<CODE> reference which provides an error-handling
|
||||
mechanism, allowing customization of error handling. The default error-handling
|
||||
mechanism is to replace any code points which can't be interchanged or represented
|
||||
in UTF-8 encoding form with REPLACEMENT CHARACTER (U+FFFD).
|
||||
|
||||
$string = $fallback->($codepoint, $is_usv, $position);
|
||||
|
||||
C<$fallback> is invoked with three arguments: C<$codepoint>, C<$is_usv> and
|
||||
C<$position>. C<$codepoint> is an unsigned integer containing the code point
|
||||
which can't be interchanged or represented in UTF-8 encoding form. C<$is_usv>
|
||||
is a boolean indicating whether or not C<$codepoint> is a Unicode scalar value.
|
||||
C<$position> is an unsigned integer containing the zero-based character position
|
||||
at which the error occurred within the string provided to C<encode_utf8()>.
|
||||
C<$fallback> must return a character string consisting of zero or more Unicode
|
||||
scalar values. Unicode scalar values consist of code points in the range
|
||||
U+0000..U+D7FF and U+E000..U+10FFFF.
|
||||
|
||||
=head2 valid_utf8
|
||||
|
||||
$boolean = valid_utf8($octets);
|
||||
|
||||
Returns a boolean indicating whether or not the given C<$octets> consist of
|
||||
well-formed UTF-8 sequences.
|
||||
|
||||
=head1 EXPORTS
|
||||
|
||||
None by default. All functions can be exported using the C<:all> tag or individually.
|
||||
|
||||
=head1 DIAGNOSTICS
|
||||
|
||||
=over 4
|
||||
|
||||
=item Can't decode a wide character string
|
||||
|
||||
(F) Wide character in octets.
|
||||
|
||||
=item Can't validate a wide character string
|
||||
|
||||
(F) Wide character in octets.
|
||||
|
||||
=item Can't decode ill-formed UTF-8 octet sequence <%s> in position %u
|
||||
|
||||
(W utf8) Encountered an ill-formed UTF-8 octet sequence. <%s> contains a
|
||||
hexadecimal representation of the maximal subpart of the ill-formed subsequence.
|
||||
|
||||
=item Can't interchange noncharacter code point U+%X in position %u
|
||||
|
||||
(W utf8, nonchar) Noncharacters are code points that are permanently reserved
|
||||
in the Unicode Standard for internal use. They are forbidden for use in open
|
||||
interchange of Unicode text data. Noncharacters consist of the values U+nFFFE
|
||||
and U+nFFFF (where n is from 0x0 to 0x10) and the values U+FDD0..U+FDEF.
|
||||
|
||||
=item Can't represent surrogate code point U+%X in position %u
|
||||
|
||||
(W utf8, surrogate) Surrogate code points are designated only for surrogate code
|
||||
units in the UTF-16 character encoding form. Surrogates consist of code points
|
||||
in the range U+D800 to U+DFFF.
|
||||
|
||||
=item Can't represent super code point \x{%X} in position %u
|
||||
|
||||
(W utf8, non_unicode) Code points greater than U+10FFFF. Perl's extended codespace.
|
||||
|
||||
=item Can't decode ill-formed UTF-X octet sequence <%s> in position %u
|
||||
|
||||
(F) Encountered an ill-formed octet sequence in Perl's internal representation
|
||||
of wide characters.
|
||||
|
||||
=back
|
||||
|
||||
The sub-categories C<nonchar>, C<surrogate> and C<non_unicode> are only available
|
||||
on Perl 5.14 or greater. See L<perllexwarn> for available categories and hierarchies.
|
||||
|
||||
=head1 COMPARISON
|
||||
|
||||
Here is a summary of features for comparison with L<Encode>'s UTF-8 implementation:
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
Simple API which makes use of Perl's standard warning categories.
|
||||
|
||||
=item *
|
||||
|
||||
Recognizes all noncharacters regardless of Perl version
|
||||
|
||||
=item *
|
||||
|
||||
Implements Unicode's recommended practice for using U+FFFD.
|
||||
|
||||
=item *
|
||||
|
||||
Better diagnostics in warning messages
|
||||
|
||||
=item *
|
||||
|
||||
Detects and reports inconsistency in Perl's internal representation of
|
||||
wide characters (UTF-X)
|
||||
|
||||
=item *
|
||||
|
||||
Preserves taintedness of decoded C<$octets> or encoded C<$string>
|
||||
|
||||
=item *
|
||||
|
||||
Better performance ~ 600% - 1200% (JA: 600%, AR: 700%, SV: 900%, EN: 1200%,
|
||||
see benchmarks directory in git repository)
|
||||
|
||||
=back
|
||||
|
||||
=head1 CONFORMANCE
|
||||
|
||||
It's the author's belief that this UTF-8 implementation is conformant with
|
||||
the Unicode Standard Version 6.0. Any deviation from the Unicode Standard
|
||||
is to be considered a bug.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
=over 4
|
||||
|
||||
=item L<Encode>
|
||||
|
||||
=item L<http://www.unicode.org/>
|
||||
|
||||
=back
|
||||
|
||||
=head1 SUPPORT
|
||||
|
||||
=head2 BUGS
|
||||
|
||||
Please report any bugs by email to C<bug-unicode-utf8 at rt.cpan.org>, or
|
||||
through the web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=Unicode-UTF8>.
|
||||
You will be automatically notified of any progress on the request by the system.
|
||||
|
||||
=head2 SOURCE CODE
|
||||
|
||||
This is open source software. The code repository is available for public
|
||||
review and contribution under the terms of the license.
|
||||
|
||||
L<http://github.com/chansen/p5-unicode-utf8>
|
||||
|
||||
git clone http://github.com/chansen/p5-unicode-utf8
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Christian Hansen C<chansen@cpan.org>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 2011-2017 by Christian Hansen.
|
||||
|
||||
This is free software; you can redistribute it and/or modify it under
|
||||
the same terms as the Perl 5 programming language system itself.
|
||||
|
||||
Reference in New Issue
Block a user