Initial Commit

2025-12-03 16:38:10 +01:00
parent c5e26bf594
commit b732d8d4b5
17680 changed files with 5977495 additions and 2 deletions
--- a/database/perl/vendor/lib/Unicode/GCString.pm
+++ b/database/perl/vendor/lib/Unicode/GCString.pm
@@ -0,0 +1,60 @@
+#-*-perl-*-
+
+package Unicode::GCString;
+require 5.008;
+
+=encoding utf-8
+
+=cut
+
+### Pragmas:
+use strict;
+use warnings;
+use vars qw($VERSION @EXPORT_OK @ISA);
+
+### Exporting:
+use Exporter;
+our @EXPORT_OK = qw();
+our %EXPORT_TAGS = ('all' => [@EXPORT_OK]);
+
+### Inheritance:
+our @ISA = qw(Exporter);
+
+### Other modules:
+use Unicode::LineBreak;
+
+### Globals
+
+# The package version
+our $VERSION = '2013.10';
+
+use overload                   # operator overloading for Unicode::GCString objects
+    '@{}' => \&as_arrayref,    # array dereference; as_arrayref is defined further down in this file
+    '${}' => \&as_scalarref,   # scalar dereference; handler not defined in this file (presumably XS -- TODO confirm)
+    '""' => \&as_string,       # stringification; the POD documents as_string as conversion to a Unicode string
+    '.' => \&concat,           # concatenation; handler not defined in this file (presumably XS -- TODO confirm)
+    #XXX'.=' => \&concat, #FIXME:segfault
+    'cmp' => \&cmp,            # string comparison; handler not defined in this file (presumably XS -- TODO confirm)
+    '<>' => \&next,            # iteration; the POD documents next as returning the next grapheme cluster
+    ;
+
+# Constructor.
+#   new(STRING)
+#   new(STRING, LINEBREAK)
+#   new(STRING, KEY => VALUE, ...)
+# With at most two arguments they are handed to _new() as they are.
+# With more, everything after STRING is used to build a Unicode::LineBreak
+# object, and _new() receives STRING plus that object.
+# Whatever _new() returns is blessed into the invoking class.
+sub new {
+    my $class = shift;
+
+    # Short forms: pass the argument list straight through.
+    return bless __PACKAGE__->_new(@_), $class
+        unless scalar @_ > 2;
+
+    # Option-list form: the KEY => VALUE pairs configure a line breaker.
+    my $text = shift;
+    my $linebreak = Unicode::LineBreak->new(@_);
+    my $gcstring = __PACKAGE__->_new($text, $linebreak);
+    return bless $gcstring, $class;
+}
+
+# Handler for the '@{}' overload: returns a reference to a fresh array
+# holding the list produced by as_array().
+sub as_arrayref {
+    my ($gcstring) = @_;
+    return [ $gcstring->as_array ];
+}
+
+1;
--- a/database/perl/vendor/lib/Unicode/GCString.pod
+++ b/database/perl/vendor/lib/Unicode/GCString.pod
@@ -0,0 +1,304 @@
+=encoding utf-8
+
+=head1 NAME
+
+Unicode::GCString - String as Sequence of UAX #29 Grapheme Clusters
+
+=head1 SYNOPSIS
+
+    use Unicode::GCString;
+    $gcstring = Unicode::GCString->new($string);
+    
+=head1 DESCRIPTION
+
+Unicode::GCString treats Unicode string as a sequence of
+I<extended grapheme clusters> defined by Unicode Standard Annex #29 [UAX #29].
+
+B<Grapheme cluster> is a sequence of Unicode character(s) that consists of one
+B<grapheme base> and optional B<grapheme extender> and/or
+B<“prepend” character>.  It is close to what people consider a I<character>.
+
+=head2 Public Interface
+
+=head3 Constructors
+
+=over 4
+
+=item new (STRING, [KEY => VALUE, ...])
+
+=item new (STRING, [LINEBREAK])
+
+I<Constructor>.
+Create new grapheme cluster string (Unicode::GCString object) from
+Unicode string STRING.
+
+About optional KEY => VALUE pairs see L<Unicode::LineBreak/Options>.
+On second form, L<Unicode::LineBreak> object LINEBREAK controls
+breaking features.
+
+B<Note>:
+The first form was introduced by release 2012.10.
+
+=item copy
+
+I<Copy constructor>.
+Create a copy of grapheme cluster string.
+Next position of new string is set at beginning.
+
+=back
+
+=head3 Sizes
+
+=over 4
+
+=item chars
+
+I<Instance method>.
+Returns number of Unicode characters grapheme cluster string includes,
+i.e. length as Unicode string.
+
+=item columns
+
+I<Instance method>.
+Returns total number of columns of grapheme clusters
+defined by built-in character database.
+For more details see L<Unicode::LineBreak/DESCRIPTION>.
+
+=item length
+
+I<Instance method>.
+Returns number of grapheme clusters contained in grapheme cluster string.
+
+=back
+
+=head3 Operations as String
+
+=over 4
+
+=item as_string
+
+=item C<">OBJECTC<">
+
+I<Instance method>.
+Convert grapheme cluster string to Unicode string explicitly.
+
+=item cmp (STRING)
+
+=item STRING C<cmp> STRING
+
+I<Instance method>.
+Compare strings.  There are no oddities.
+One of each STRING may be Unicode string.
+
+=item concat (STRING)
+
+=item STRING C<.> STRING
+
+I<Instance method>.
+Concatenate STRINGs.  One of each STRING may be Unicode string.
+Note that number of columns (see columns()) or grapheme clusters
+(see length()) of resulting string is not always equal to sum of both
+strings.
+Next position of new string is that set on the left value.
+
+=item join ([STRING, ...])
+
+I<Instance method>.
+Join STRINGs inserting grapheme cluster string.
+Any of STRINGs may be Unicode string.
+
+=item substr (OFFSET, [LENGTH, [REPLACEMENT]])
+
+I<Instance method>.
+Returns substring of grapheme cluster string.
+OFFSET and LENGTH are based on grapheme clusters.
+If REPLACEMENT is specified, substring is replaced by it.
+REPLACEMENT may be Unicode string.
+
+Note:
+This method cannot return the lvalue, unlike built-in substr().
+
+=back
+
+=head3 Operations as Sequence of Grapheme Clusters
+
+=over 4
+
+=item as_array
+
+=item C<@{>OBJECTC<}>
+
+=item as_arrayref
+
+I<Instance method>.
+Convert grapheme cluster string to an array of grapheme clusters.
+
+=item eos
+
+I<Instance method>.
+Test if current position is at end of grapheme cluster string.
+
+=item item ([OFFSET])
+
+I<Instance method>.
+Returns OFFSET-th grapheme cluster.
+If OFFSET was not specified, returns next grapheme cluster.
+
+=item next
+
+=item C<E<lt>>OBJECTC<E<gt>>
+
+I<Instance method>, iterative.
+Returns next grapheme cluster and increment next position.
+
+=item pos ([OFFSET])
+
+I<Instance method>.
+If optional OFFSET is specified, set next position by it.
+Returns next position of grapheme cluster string.
+
+=back
+
+=begin comment
+
+=head4 Methods planned to be deprecated
+
+=over 4
+
+=item flag ([OFFSET, [VALUE]])
+
+I<Instance method>.
+Get or set flag value of OFFSET-th grapheme cluster.
+If OFFSET was not specified, returns flag value of next grapheme cluster.
+Flag value is a non-zero integer not greater than 255; initially it is 0.
+
+Predefined flags are:
+
+=over 4
+
+=item Unicode::LineBreak::ALLOW_BEFORE
+
+Allow line breaking just before this grapheme cluster.
+
+=item Unicode::LineBreak::PROHIBIT_BEFORE
+
+Prohibit line breaking just before this grapheme cluster.
+
+=back
+
+=item lbclass ([OFFSET])
+
+I<Instance method>.
+Returns Line Breaking Class (See L<Unicode::LineBreak>) of the first
+character of OFFSET-th grapheme cluster.
+If OFFSET was not specified, returns class of next grapheme cluster.
+
+B<Note>:
+Use lbc().
+
+=item lbclass_ext ([OFFSET])
+
+I<Instance method>.
+Returns Line Breaking Class (See L<Unicode::LineBreak>) of the last
+grapheme extender of OFFSET-th grapheme cluster.  If there are no
+grapheme extenders or its class is CM, value of lbclass() is returned.
+
+B<Note>:
+Use lbcext().
+
+=back
+
+=end comment
+
+=head3 Miscellaneous
+
+=over 4
+
+=item lbc
+
+I<Instance method>.
+Returns Line Breaking Class (See L<Unicode::LineBreak>) of the first
+character of first grapheme cluster.
+
+=item lbcext
+
+I<Instance method>.
+Returns Line Breaking Class (See L<Unicode::LineBreak>) of the last
+grapheme extender of last grapheme cluster.
+If there are no grapheme extenders or its class is CM, value of last
+grapheme base will be returned.
+
+=back
+
+=head1 CAVEATS
+
+=over 4
+
+=item *
+
+The grapheme cluster should not be referred to as "grapheme"
+even though Larry does.
+
+=item *
+
+On Perl around 5.10.1, implicit conversion from Unicode::GCString object to
+Unicode string sometimes let C<"utf8_mg_pos_cache_update"> cache be confused.
+
+For example, instead of doing
+
+    $sub = substr($gcstring, $i, $j);
+
+do
+
+    $sub = substr("$gcstring", $i, $j);
+
+    $sub = substr($gcstring->as_string, $i, $j);
+
+=item *
+
+This module implements I<default> algorithm for determining grapheme cluster
+boundaries.  Tailoring mechanism has not been supported yet.
+
+=back
+
+=head1 VERSION
+
+Consult $VERSION variable.
+
+=head2 Incompatible Changes
+
+=over 4
+
+=item Release 2013.10
+
+=over 4
+
+=item *
+
+The new() method can take non-Unicode string argument.
+In this case it will be decoded by iso-8859-1 (Latin 1) character set.
+That method of former releases would die with non-Unicode inputs.
+
+=back
+
+=back
+
+=head1 SEE ALSO
+
+[UAX #29]
+Mark Davis (ed.) (2009-2013).
+I<Unicode Standard Annex #29: Unicode Text Segmentation>, Revisions 15-23.
+L<http://www.unicode.org/reports/tr29/>.
+
+=head1 AUTHOR
+
+Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>
+
+=head1 COPYRIGHT
+
+Copyright (C) 2009-2013 Hatuka*nezumi - IKEDA Soji.
+
+This program is free software; you can redistribute it and/or modify it
+under the same terms as Perl itself.
+
+=cut
--- a/database/perl/vendor/lib/Unicode/LineBreak.pm
+++ b/database/perl/vendor/lib/Unicode/LineBreak.pm
@@ -0,0 +1,248 @@
+#-*- perl -*-
+
+package Unicode::LineBreak;
+require 5.008;
+
+### Pragmas:
+use strict;
+use warnings;
+use vars qw($VERSION @EXPORT_OK @ISA $Config @Config);
+
+### Exporting:
+use Exporter;
+our @EXPORT_OK = qw(UNICODE_VERSION SOMBOK_VERSION context);
+our %EXPORT_TAGS = ('all' => [@EXPORT_OK]);
+
+### Inheritance:
+our @ISA = qw(Exporter);
+
+### Other modules:
+use Carp qw(croak carp);
+use Encode qw(is_utf8);
+use MIME::Charset;
+use Unicode::GCString;
+
+### Globals
+
+### The package version
+our $VERSION = '2019.001';
+
+### Public Configuration Attributes
+our @Config = (                # built-in defaults; new() applies this list before the caller's options
+    BreakIndent => 'YES',      # allow a break after SPACEs at beginning of line
+    CharMax => 998,            # maximum characters per line (POD: 0 means unlimited)
+    ColMax => 76,              # maximum columns per line
+    ColMin => 0,               # minimum columns for an arbitrarily broken line
+    ComplexBreaking => 'YES',  # heuristic breaking for South East Asian text
+    Context => 'NONEASTASIAN', # language/region context
+    EAWidth => undef,          # no East_Asian_Width tailoring
+    Format => 'SIMPLE',        # only insert newlines at breaking positions
+    HangulAsAL => 'NO',        # do not treat hangul as alphabetic (AL)
+    LBClass => undef,          # no line breaking class tailoring
+    LegacyCM => 'YES',         # SPACE + combining character handled as an isolated combining character
+    Newline => "\n",           # newline sequence to insert
+    Prep => undef,             # no user-defined breaking behaviors
+    Sizing => 'UAX11',         # sizes computed from the built-in character database
+    Urgent => undef,           # do not break over-long fragments
+    ViramaAsJoiner => 'YES',   # do not break between a virama and the following letter
+);
+our $Config = {};              # presumably filled in by the optional Defaults module below -- TODO confirm
+eval { require Unicode::LineBreak::Defaults; };   # optional module; a load failure is deliberately ignored
+push @Config, (%$Config);      # whatever ended up in %$Config is appended after the built-in defaults
+
+### Exportable constants
+use Unicode::LineBreak::Constants;
+use constant 1.01;
+my $package = __PACKAGE__;     # interpolated into the pattern on the next line
+my @consts = grep { s/^${package}::(\w\w+)$/$1/ } keys %constant::declared;   # keep names matching "<this package>::NAME" (NAME: 2+ word chars), prefix stripped
+push @EXPORT_OK, @consts;      # make those names importable on request
+push @{$EXPORT_TAGS{'all'}}, @consts;   # ... and include them in the ':all' tag
+
+### Load XS module
+require XSLoader;
+XSLoader::load('Unicode::LineBreak', $VERSION);
+
+### Load dynamic constants
+foreach my $p ((['EA', EAWidths()], ['LB', LBClasses()])) {   # one [prefix, value names...] list per property; EAWidths/LBClasses are not defined in this file (presumably XS -- TODO confirm)
+    my $prop = shift @{$p};    # 'EA' or 'LB'; the rest of @$p are the value names
+    my $idx = 0;               # position of the value name in the list; becomes the constant's value
+    foreach my $val (@{$p}) {
+        no strict;             # the glob assignment below uses a symbolic name
+        my $const = "${prop}_${val}";       # e.g. EA_<val> / LB_<val>
+        *{$const} = eval "sub { $idx }";    # string eval bakes the current $idx into the generated sub
+        push @EXPORT_OK, $const;            # importable on request
+        push @{$EXPORT_TAGS{'all'}}, $const;   # and part of the ':all' tag
+        $idx++;
+    }
+}
+
+### Privates
+my $EASTASIAN_CHARSETS = qr{
+    ^BIG5 |
+    ^CP9\d\d |
+    ^EUC- |
+    ^GB18030 | ^GB2312 | ^GBK |
+    ^HZ |
+    ^ISO-2022- |
+    ^KS_C_5601 |
+    ^SHIFT_JIS
+}ix;   # charset-name prefixes for which context() answers 'EASTASIAN'; case-insensitive, /x ignores the layout whitespace
+
+my $EASTASIAN_LANGUAGES = qr{
+    ^AIN |
+    ^JA\b | ^JPN |
+    ^KO\b | ^KOR |
+    ^ZH\b | ^CHI
+}ix;   # language-tag prefixes for which context() answers 'EASTASIAN'; \b keeps the two-letter codes from matching longer words
+
+use overload                   # operator overloading for Unicode::LineBreak objects
+    '%{}' => \&as_hashref,     # hash dereference; the POD describes keeping per-object state this way
+    '${}' => \&as_scalarref,   # scalar dereference; handler not defined in this file (presumably XS -- TODO confirm)
+    '""' => \&as_string,       # stringification; handler not defined in this file (presumably XS -- TODO confirm)
+    ;
+
+# Constructor: new([KEY => VALUE, ...])
+# Creates the underlying object with _new(), applies the package-wide
+# defaults held in @Config, then the caller's arguments, and finally
+# blesses the object into the invoking class.
+sub new {
+    my ($class, @options) = @_;
+
+    my $linebreak = __PACKAGE__->_new();
+    # Defaults first, caller-supplied settings second.
+    foreach my $settings (\@Config, \@options) {
+        $linebreak->config(@{$settings});
+    }
+    return bless $linebreak, $class;
+}
+
+# Get or update configuration.
+#   config(KEY)                returns the value of a single option.
+#   config(KEY => VALUE, ...)  sets options.
+# The long legacy names CharactersMax, ColumnsMax, ColumnsMin, SizingMethod
+# and UrgentBreaking, and the obsoleted TailorEA, TailorLB and UserBreaking
+# (which also trigger a carp), are matched case-insensitively and mapped
+# onto the current option names.  Everything else goes to _config() as is.
+sub config ($@) {
+    my $self = shift;
+
+    # Exactly one argument: read access.
+    if (@_ == 1) {
+        my $k = shift;
+
+        return $self->_config('CharMax') if uc $k eq uc 'CharactersMax';
+        return $self->_config('ColMax')  if uc $k eq uc 'ColumnsMax';
+        return $self->_config('ColMin')  if uc $k eq uc 'ColumnsMin';
+        return $self->_config('Sizing')  if uc $k eq uc 'SizingMethod';
+
+        if (uc $k eq uc 'TailorEA') {
+            carp "$k is obsoleted.  Use EAWidth";
+            # Flatten the stored [ORD, PROPERTY] pairs into one list.
+            my $tailored = $self->_config('EAWidth');
+            return [] unless defined $tailored;
+            return [map { ($_->[0] => $_->[1]) } @{$tailored}];
+        }
+        if (uc $k eq uc 'TailorLB') {
+            carp "$k is obsoleted.  Use LBClass";
+            # Flatten the stored [ORD, CLASS] pairs into one list.
+            my $tailored = $self->_config('LBClass');
+            return [] unless defined $tailored;
+            return [map { ($_->[0] => $_->[1]) } @{$tailored}];
+        }
+
+        return $self->_config('Urgent') if uc $k eq uc 'UrgentBreaking';
+
+        if (uc $k eq uc 'UserBreaking') {
+            carp "$k is obsoleted.  Use Prep";
+            my $prep = $self->_config('Prep');
+            return defined $prep ? $prep : [];
+        }
+
+        return $self->_config($k);
+    }
+
+    # Otherwise: write access.  Translate each KEY => VALUE pair, then hand
+    # the whole translated list to _config() in one call.
+    my @settings;
+    while (@_) {
+        my $k = shift;
+        my $value = shift;
+
+        if (uc $k eq uc 'CharactersMax') {
+            push @settings, 'CharMax' => $value;
+        } elsif (uc $k eq uc 'ColumnsMax') {
+            push @settings, 'ColMax' => $value;
+        } elsif (uc $k eq uc 'ColumnsMin') {
+            push @settings, 'ColMin' => $value;
+        } elsif (uc $k eq uc 'SizingMethod') {
+            push @settings, 'Sizing' => $value;
+        } elsif (uc $k eq uc 'TailorLB') {
+            carp "$k is obsoleted.  Use LBClass";
+            # A leading undef entry, then one entry per ORD => CLASS pair.
+            push @settings, 'LBClass' => undef;
+            if (defined $value) {
+                my @flat = @{$value};
+                while (@flat) {
+                    my ($ord, $class) = splice @flat, 0, 2;
+                    push @settings, 'LBClass' => [ $ord => $class ];
+                }
+            }
+        } elsif (uc $k eq uc 'TailorEA') {
+            carp "$k is obsoleted.  Use EAWidth";
+            # A leading undef entry, then one entry per ORD => PROPERTY pair.
+            push @settings, 'EAWidth' => undef;
+            if (defined $value) {
+                my @flat = @{$value};
+                while (@flat) {
+                    my ($ord, $property) = splice @flat, 0, 2;
+                    push @settings, 'EAWidth' => [ $ord => $property ];
+                }
+            }
+        } elsif (uc $k eq uc 'UserBreaking') {
+            carp "$k is obsoleted.  Use Prep";
+            # A leading undef entry, then one Prep entry per method.
+            push @settings, 'Prep' => undef;
+            if (defined $value) {
+                my @methods = ref $value eq 'ARRAY' ? @{$value} : ($value);
+                push @settings, 'Prep' => $_ foreach @methods;
+            }
+        } elsif (uc $k eq uc 'UrgentBreaking') {
+            push @settings, 'Urgent' => $value;
+        } else {
+            push @settings, $k => $value;
+        }
+    }
+
+    $self->_config(@settings) if scalar @settings;
+}
+
+# Function: context([Charset => CHARSET], [Language => LANGUAGE])
+# Option names are matched case-insensitively.  CHARSET may be a string
+# (turned into a name through MIME::Charset) or an object that provides
+# as_string().  LANGUAGE is upper-cased and its first "_" becomes "-".
+# Returns 'EASTASIAN' when the charset matches $EASTASIAN_CHARSETS or,
+# failing that, the language matches $EASTASIAN_LANGUAGES; returns
+# 'NONEASTASIAN' otherwise.
+sub context (@) {
+    my %given = @_;
+    my ($charset, $language);
+
+    foreach my $name (keys %given) {
+        my $key = uc $name;
+        if ($key eq 'CHARSET') {
+            my $value = $given{$name};
+            $charset = ref($value)
+                ? $value->as_string
+                : MIME::Charset->new($value)->as_string;
+        } elsif ($key eq 'LANGUAGE') {
+            $language = uc $given{$name};
+            $language =~ s/_/-/;
+        }
+    }
+
+    # The charset is consulted first, then the language.
+    return 'EASTASIAN'
+        if $charset and $charset =~ /$EASTASIAN_CHARSETS/;
+    return 'EASTASIAN'
+        if $language and $language =~ /$EASTASIAN_LANGUAGES/;
+    return 'NONEASTASIAN';
+}
+
+1;
--- a/database/perl/vendor/lib/Unicode/LineBreak.pod
+++ b/database/perl/vendor/lib/Unicode/LineBreak.pod
@@ -0,0 +1,983 @@
+=encoding utf-8
+
+=head1 NAME
+
+Unicode::LineBreak - UAX #14 Unicode Line Breaking Algorithm
+
+=head1 SYNOPSIS
+
+    use Unicode::LineBreak;
+    $lb = Unicode::LineBreak->new();
+    $broken = $lb->break($string);
+
+=head1 DESCRIPTION
+
+Unicode::LineBreak performs Line Breaking Algorithm described in Unicode
+Standard Annex #14 [UAX #14]. East_Asian_Width informative property
+defined by Annex #11 [UAX #11] will be concerned to determine breaking
+positions.
+
+=head2 Terminology
+
+Following terms are used for convenience.
+
+B<Mandatory break> is obligatory line breaking behavior defined by core
+rules and performed regardless of surrounding characters.
+B<Arbitrary break> is line breaking behavior allowed by core rules
+and chosen by user to perform it.
+Arbitrary break includes B<direct break> and B<indirect break>
+defined by [UAX #14].
+
+B<Alphabetic characters> are characters between pairs of which line breaks
+are usually not allowed, unless other characters provide break
+opportunities.
+B<Ideographic characters> are characters that usually allow line breaks
+both before and after themselves.
+[UAX #14] classifies most of alphabetic to AL and most of ideographic to ID
+(These terms are inaccurate from the point of view by grammatology).
+On several scripts, breaking positions are not obvious from each character;
+therefore a heuristic based on a dictionary is used.
+
+B<Number of columns> of a string is not always equal to the number of characters it contains:
+Each of characters is either B<wide>, B<narrow> or nonspacing;
+they occupy 2, 1 or 0 columns, respectively.
+Several characters may be both wide and narrow by the contexts they are used.
+Characters may have more various widths by customization.
+
+=head1 PUBLIC INTERFACE
+
+=head2 Line Breaking
+
+=over 4
+
+=item new ([KEY => VALUE, ...])
+
+I<Constructor>.
+About KEY => VALUE pairs see L</Options>.
+
+=item break (STRING)
+
+I<Instance method>.
+Break Unicode string STRING and returns it.
+In array context, returns array of lines contained in the result.
+
+=item break_partial (STRING)
+
+I<Instance method>.
+Same as break() but accepts incremental inputs.
+Give C<undef> as STRING argument to specify that input was completed.
+
+=item config (KEY)
+
+=item config (KEY => VALUE, ...)
+
+I<Instance method>.
+Get or update configuration.  About KEY => VALUE pairs see L</Options>.
+
+=item copy
+
+I<Copy constructor>.
+Create a copy of object instance.
+
+=begin comment
+
+=item reset
+
+I<Undocumented>.
+
+=end comment
+
+=back
+
+=head2 Getting Informations
+
+=over 4
+
+=item breakingRule (BEFORESTR, AFTERSTR)
+
+I<Instance method>.
+Get possible line breaking behavior between strings BEFORESTR and AFTERSTR.
+See L</Constants> for returned value.
+
+B<Note>:
+This method gives just approximate description of line breaking behavior.
+Use break() and so on to wrap actual texts.
+
+=item context ([Charset => CHARSET], [Language => LANGUAGE])
+
+I<Function>.
+Get language/region context used by character set CHARSET or
+language LANGUAGE.
+
+=back
+
+=begin comment
+
+=head3 Methods Planned to be Deprecated
+
+=over 4
+
+=item lbrule (BEFORE, AFTER)
+
+I<Instance method>.
+Get possible line breaking behavior between class BEFORE and class AFTER.
+See L</Constants> for returned value.
+
+B<Note>:
+This method gives just approximate description of line breaking behavior.
+Use break() and so on to wrap actual texts.
+
+B<Note>:
+Use breakingRule().
+
+=item strsize (LEN, PRE, SPC, STR)
+
+I<Instance method>.
+Calculate I<number of columns> of Unicode string
+PRE.SPC.STR based on character widths defined by [UAX #11].
+
+B<Note>:
+Use L<Unicode::GCString/columns>.
+
+=back
+
+=end comment
+
+=head2 Options
+
+L</new> and L</config> methods accept following pairs.
+Some of them affect number of columns ([B<E>]),
+grapheme cluster segmentation ([B<G>])
+(see also L<Unicode::GCString>) or
+line breaking behavior ([B<L>]).
+
+=over 4
+
+=item BreakIndent => C<"YES"> | C<"NO">
+
+[B<L>]
+Always allows break after SPACEs at beginning of line, a.k.a. indent.
+[UAX #14] does not take account of such usage of SPACE.
+Default is C<"YES">.
+
+B<Note>:
+This option was introduced at release 1.011.
+
+=item CharMax => NUMBER
+
+[B<L>]
+Possible maximum number of characters in one line,
+not counting trailing SPACEs and newline sequence.
+Note that number of characters generally doesn't represent length of line.
+Default is C<998>.
+C<0> means unlimited (as of release 2012.01).
+
+=item ColMin => NUMBER
+
+[B<L>]
+Minimum number of columns which line broken arbitrarily may include, not
+counting trailing spaces and newline sequences.
+Default is C<0>.
+
+=item ColMax => NUMBER
+
+[B<L>]
+Maximum number of columns line may include not counting trailing spaces and
+newline sequence.  In other words, maximum length of line.
+Default is C<76>.
+
+=back
+
+See also L</Urgent> option and L</User-Defined Breaking Behaviors>.
+
+=over 4
+
+=item ComplexBreaking => C<"YES"> | C<"NO">
+
+[B<L>]
+Performs heuristic breaking on South East Asian complex context.
+Default is, if word segmentation for South East Asian writing systems is
+enabled, C<"YES">.
+
+=item Context => CONTEXT
+
+[B<E>][B<L>]
+Specify language/region context.
+Currently available contexts are C<"EASTASIAN"> and C<"NONEASTASIAN">.
+Default context is C<"NONEASTASIAN">.
+
+In C<"EASTASIAN"> context, characters with East_Asian_Width property
+ambiguous (A) are treated as "wide" and with Line Breaking Class AI as
+ideographic (ID).
+
+In C<"NONEASTASIAN"> context, characters with East_Asian_Width property
+ambiguous (A) are treated as "narrow" and with Line Breaking Class AI as
+alphabetic (AL).
+
+=item EAWidth => C<[> ORD C<=E<gt>> PROPERTY C<]>
+
+=item EAWidth => C<undef>
+
+[B<E>]
+Tailor classification of East_Asian_Width property.
+ORD is UCS scalar value of character or array reference of them.
+PROPERTY is one of East_Asian_Width property values
+and extended values
+(See L</Constants>).
+This option may be specified multiple times.
+If C<undef> is specified, all tailoring assigned before will be canceled.
+
+By default, no tailorings are available.
+See also L</Tailoring Character Properties>.
+
+=item Format => METHOD
+
+[B<L>]
+Specify the method to format broken lines.
+
+=over 4
+
+=item C<"SIMPLE">
+
+Default method.
+Just only insert newline at arbitrary breaking positions.
+
+=item C<"NEWLINE">
+
+Insert or replace newline sequences with that specified by L</Newline> option,
+remove SPACEs leading newline sequences or end-of-text.  Then append newline
+at end of text if it does not exist.
+
+=item C<"TRIM">
+
+Insert newline at arbitrary breaking positions. Remove SPACEs leading
+newline sequences.
+
+=item C<undef>
+
+Do nothing, not even inserting any newlines.
+
+=item Subroutine reference
+
+See L</Formatting Lines>.
+
+=back
+
+=item HangulAsAL => C<"YES"> | C<"NO">
+
+[B<L>]
+Treat hangul syllables and conjoining jamos as alphabetic characters (AL).
+Default is C<"NO">.
+
+=item LBClass => C<[> ORD C<=E<gt>> CLASS C<]>
+
+=item LBClass => C<undef>
+
+[B<G>][B<L>]
+Tailor classification of line breaking property.
+ORD is UCS scalar value of character or array reference of them.
+CLASS is one of line breaking classes (See L</Constants>).
+This option may be specified multiple times.
+If C<undef> is specified, all tailoring assigned before will be canceled.
+
+By default, no tailorings are available.
+See also L</Tailoring Character Properties>.
+
+=item LegacyCM => C<"YES"> | C<"NO">
+
+[B<G>][B<L>]
+Treat combining characters led by a SPACE as an isolated combining character
+(ID).
+As of Unicode 5.0, such use of SPACE is not recommended.
+Default is C<"YES">.
+
+=item Newline => STRING
+
+[B<L>]
+Unicode string to be used for newline sequence.
+Default is C<"\n">.
+
+=item Prep => METHOD
+
+[B<L>]
+Add user-defined line breaking behavior(s).
+This option may be specified multiple times.
+Following methods are available.
+
+=over 4
+
+=item C<"NONBREAKURI">
+
+Won't break URIs.
+
+=item C<"BREAKURI">
+
+Break URIs according to a rule suitable for printed materials.
+For more details see [CMOS], sections 6.17 and 17.11.
+
+=item C<[> REGEX, SUBREF C<]>
+
+The sequences matching regular expression REGEX will be broken by
+subroutine referred by SUBREF.
+For more details see L</User-Defined Breaking Behaviors>.
+
+=item C<undef>
+
+Cancel all methods assigned before.
+
+=back
+
+=item Sizing => METHOD
+
+[B<L>]
+Specify method to calculate size of string.
+Following options are available.
+
+=over 4
+
+=item C<"UAX11">
+
+Default method.
+Sizes are computed by columns of each character according to built-in
+character database.
+
+=item C<undef>
+
+Number of grapheme clusters (see L<Unicode::GCString>) contained in the string.
+
+=item Subroutine reference
+
+See L</Calculating String Size>.
+
+=back
+
+See also L</ColMax>, L</ColMin> and L</EAWidth> options.
+
+=item Urgent => METHOD
+
+[B<L>]
+Specify method to handle excessing lines.
+Following options are available.
+
+=over 4
+
+=item C<"CROAK">
+
+Print error message and die.
+
+=item C<"FORCE">
+
+Force breaking excessing fragment.
+
+=item C<undef>
+
+Default method.
+Won't break excessing fragment.
+
+=item Subroutine reference
+
+See L</User-Defined Breaking Behaviors>.
+
+=back
+
+=item ViramaAsJoiner => C<"YES"> | C<"NO">
+
+[B<G>]
+Virama sign ("halant" in Hindi, "coeng" in Khmer) and its succeeding letter
+are not broken.
+Default is C<"YES">.
+B<Note>:
+This option was introduced by release 2012.001_29.
+On previous releases, it was fixed to C<"NO">.
+"Default" grapheme cluster defined by [UAX #29] does not include this
+feature.
+
+=back
+
+=begin comment
+
+=head3 Obsoleted Options
+
+=over 4
+
+=item TailorEA => C<[> ORD C<=E<gt>> PROPERTY, ... C<]>
+
+Obsoleted equivalent to L</EAWidth>.
+
+=item TailorLB => C<[> ORD C<=E<gt>> CLASS, ... C<]>
+
+Obsoleted equivalent to L</LBClass>.
+
+=item UserBreaking => C<[>METHOD, ...C<]>
+
+Obsoleted equivalent to L</Prep>.
+
+=back
+
+=end comment
+
+=head2 Constants
+
+=over 4
+
+=item C<EA_Na>, C<EA_N>, C<EA_A>, C<EA_W>, C<EA_H>, C<EA_F>
+
+Index values to specify six East_Asian_Width property values defined by
+[UAX #11]:
+narrow (Na), neutral (N), ambiguous (A), wide (W), halfwidth (H) and
+fullwidth (F).
+
+=item C<EA_Z>
+
+Index value to specify nonspacing characters.
+
+B<Note>:
+This "nonspacing" value is extension by this module,
+not a part of [UAX #11].
+
+=begin comment
+
+C<EA_ZA> and C<EA_ZW>: Undocumented.
+
+Earlier releases had only C<EA_Z>.
+C<EA_ZA> and C<EA_ZW> were added by release 2012.10.
+
+=end comment
+
+=item C<LB_BK>, C<LB_CR>, C<LB_LF>, C<LB_NL>, C<LB_SP>, C<LB_OP>, C<LB_CL>, C<LB_CP>, C<LB_QU>, C<LB_GL>, C<LB_NS>, C<LB_EX>, C<LB_SY>, C<LB_IS>, C<LB_PR>, C<LB_PO>, C<LB_NU>, C<LB_AL>, C<LB_HL>, C<LB_ID>, C<LB_IN>, C<LB_HY>, C<LB_BA>, C<LB_BB>, C<LB_B2>, C<LB_CB>, C<LB_ZW>, C<LB_CM>, C<LB_WJ>, C<LB_H2>, C<LB_H3>, C<LB_JL>, C<LB_JV>, C<LB_JT>, C<LB_SG>, C<LB_AI>, C<LB_CJ>, C<LB_SA>, C<LB_XX>, C<LB_RI>
+
+Index values to specify 40 line breaking property values (classes)
+defined by [UAX #14].
+
+B<Note>: Property value CP was introduced by Unicode 5.2.0.
+Property values HL and CJ were introduced by Unicode 6.1.0.
+Property value RI was introduced by Unicode 6.2.0.
+
+=item C<MANDATORY>, C<DIRECT>, C<INDIRECT>, C<PROHIBITED>
+
+Four values to specify line breaking behaviors:
+Mandatory break; Both direct break and indirect break are allowed;
+Indirect break is allowed but direct break is prohibited;
+Prohibited break.
+
+=item C<Unicode::LineBreak::SouthEastAsian::supported>
+
+Flag to determine if word segmentation for South East Asian writing systems is
+enabled.
+If this feature was enabled, a non-empty string is set. 
+Otherwise, C<undef> is set.
+
+B<N.B.>: Current release supports Thai script of modern Thai language only.
+
+=item C<UNICODE_VERSION>
+
+A string to specify version of Unicode standard this module refers to.
+
+=back
+
+=head1 CUSTOMIZATION
+
+=head2 Formatting Lines
+
+If you specify subroutine reference as a value of L</Format> option,
+it should accept three arguments:
+
+    $MODIFIED = &subroutine(SELF, EVENT, STR);
+
+SELF is a Unicode::LineBreak object,
+EVENT is a string to determine the context that subroutine was called in,
+and STR is a fragment of Unicode string leading or trailing breaking position.
+
+    EVENT |When Fired           |Value of STR
+    -----------------------------------------------------------------
+    "sot" |Beginning of text    |Fragment of first line
+    "sop" |After mandatory break|Fragment of next line
+    "sol" |After arbitrary break|Fragment on sequel of line
+    ""    |Just before any      |Complete line without trailing
+          |breaks               |SPACEs
+    "eol" |Arbitrary break      |SPACEs leading breaking position
+    "eop" |Mandatory break      |Newline and its leading SPACEs
+    "eot" |End of text          |SPACEs (and newline) at end of
+          |                     |text
+    -----------------------------------------------------------------
+
+Subroutine should return modified text fragment or may return
+C<undef> to express that no modification occurred.
+Note that modification in the context of C<"sot">, C<"sop"> or C<"sol"> may
+affect decision of successive breaking positions while in the others won't.
+
+B<Note>:
+String arguments are actually sequences of grapheme clusters.
+See L<Unicode::GCString>.
+
+For example, following code folds lines removing trailing spaces:
+
+    sub fmt {
+        if ($_[1] =~ /^eo/) {
+            return "\n";
+        }
+        return undef;
+    }
+    my $lb = Unicode::LineBreak->new(Format => \&fmt);
+    $output = $lb->break($text);
+
+=head2 User-Defined Breaking Behaviors
+
+When a line generated by arbitrary break is expected to be beyond measure of
+either CharMax, ColMax or ColMin, B<urgent break> may be
+performed on successive string.
+If you specify subroutine reference as a value of L</Urgent> option,
+it should accept two arguments:
+
+    @BROKEN = &subroutine(SELF, STR);
+
+SELF is a Unicode::LineBreak object and STR is a Unicode string to be broken.
+
+Subroutine should return an array of broken string STR.
+
+B<Note>:
+String argument is actually a sequence of grapheme clusters.
+See L<Unicode::GCString>.
+
+For example, following code inserts hyphen to the name of several chemical substances (such as Titin) so that it may be folded:
+
+    sub hyphenize {
+        return map {$_ =~ s/yl$/yl-/; $_} split /(\w+?yl(?=\w))/, $_[1];
+    }
+    my $lb = Unicode::LineBreak->new(Urgent => \&hyphenize);
+    $output = $lb->break("Methionylthreonylthreonylglutaminylarginyl...");
+
+If you specify [REGEX, SUBREF] array reference as any of L</Prep> option,
+subroutine should accept two arguments:
+
+    @BROKEN = &subroutine(SELF, STR);
+
+SELF is a Unicode::LineBreak object and
+STR is a Unicode string matched with REGEX.
+
+Subroutine should return an array of broken string STR.
+
+For example, following code will break HTTP URLs using [CMOS] rule.
+
+    my $url = qr{http://[\x21-\x7E]+}i;
+    sub breakurl {
+        my $self = shift;
+        my $str = shift;
+        return split m{(?<=[/]) (?=[^/]) |
+                       (?<=[^-.]) (?=[-~.,_?\#%=&]) |
+                       (?<=[=&]) (?=.)}x, $str;
+    }
+    my $lb = Unicode::LineBreak->new(Prep => [$url, \&breakurl]);
+    $output = $lb->break($string);
+
+=head3 Preserving State
+
+Unicode::LineBreak object can behave as hash reference.
+Any items may be preserved throughout its life.
+
+For example, following code will separate paragraphs with empty lines.
+
+    sub paraformat {
+        my $self = shift;
+        my $action = shift;
+        my $str = shift;
+        
+        if ($action eq 'sot' or $action eq 'sop') {
+            $self->{'line'} = '';
+        } elsif ($action eq '') {
+            $self->{'line'} = $str;
+        } elsif ($action eq 'eol') {
+            return "\n";
+        } elsif ($action eq 'eop') {
+            if (length $self->{'line'}) {
+                return "\n\n";
+            } else {
+                return "\n";
+            }
+        } elsif ($action eq 'eot') {
+            return "\n";
+        }
+        return undef;
+    }
+    my $lb = Unicode::LineBreak->new(Format => \&paraformat);
+    $output = $lb->break($string);
+
+=head2 Calculating String Size
+
+If you specify subroutine reference as a value of L</Sizing> option,
+it will be called with five arguments:
+
+    $COLS = &subroutine(SELF, LEN, PRE, SPC, STR);
+
+SELF is a Unicode::LineBreak object, LEN is size of preceding string,
+PRE is preceding Unicode string, SPC is additional SPACEs and STR is a
+Unicode string to be processed.
+
+Subroutine should return calculated number of columns of C<PRE.SPC.STR>.
+The number of columns may not be an integer: Unit of the number may be freely chosen, however, it should be same as those of L</ColMin> and L</ColMax> option.
+
+B<Note>:
+String arguments are actually sequences of grapheme clusters.
+See L<Unicode::GCString>.
+
+For example, following code processes lines with tab stops by each eight columns.
+
+    sub tabbedsizing {            # Sizing callback with tab stops every 8 columns
+        my ($self, $cols, $pre, $spc, $str) = @_;  # SELF, LEN, PRE, SPC, STR
+    
+        my $spcstr = $spc.$str;                    # grapheme cluster strings, joined
+        while ($spcstr->lbc == LB_SP) {            # assumes lbc reports the leading cluster's class
+            my $c = $spcstr->item(0);              # first grapheme cluster
+            if ($c eq "\t") {
+                $cols += 8 - $cols % 8;            # advance to the next multiple of 8
+            } else {
+                $cols += $c->columns;              # other SP-class cluster: its own width
+            }
+            $spcstr = $spcstr->substr(1);          # drop the cluster just measured
+        }
+        $cols += $spcstr->columns;                 # width of what remains
+        return $cols;
+    };
+    my $lb = Unicode::LineBreak->new(LBClass => [ord("\t") => LB_SP],  # make TAB class SP
+                                     Sizing => \&tabbedsizing);
+    $output = $lb->break($string);
+
+=head2 Tailoring Character Properties
+
+Character properties may be tailored by L</LBClass> and L</EAWidth>
+options.  Some constants are defined for convenience of tailoring.
+
+=head3 Line Breaking Properties
+
+=head4 Non-starters of Kana-like Characters
+
+By default, several hiragana, katakana and characters corresponding to kana
+are treated as non-starters (NS or CJ).
+When the following pair(s) are specified for value of L</LBClass> option,
+these characters are treated as normal ideographic characters (ID).
+
+=over 4
+
+=item C<KANA_NONSTARTERS() =E<gt> LB_ID>
+
+All of characters below.
+
+=item C<IDEOGRAPHIC_ITERATION_MARKS() =E<gt> LB_ID>
+
+Ideographic iteration marks.
+U+3005 IDEOGRAPHIC ITERATION MARK, U+303B VERTICAL IDEOGRAPHIC ITERATION MARK, U+309D HIRAGANA ITERATION MARK, U+309E HIRAGANA VOICED ITERATION MARK, U+30FD KATAKANA ITERATION MARK and U+30FE KATAKANA VOICED ITERATION MARK.
+
+N.B. Some of them are neither hiragana nor katakana.
+
+=item C<KANA_SMALL_LETTERS() =E<gt> LB_ID>
+
+=item C<KANA_PROLONGED_SOUND_MARKS() =E<gt> LB_ID>
+
+Hiragana or katakana small letters:
+Hiragana small letters U+3041 A, U+3043 I, U+3045 U, U+3047 E, U+3049 O, U+3063 TU, U+3083 YA, U+3085 YU, U+3087 YO, U+308E WA, U+3095 KA, U+3096 KE. 
+Katakana small letters U+30A1 A, U+30A3 I, U+30A5 U, U+30A7 E, U+30A9 O, U+30C3 TU, U+30E3 YA, U+30E5 YU, U+30E7 YO, U+30EE WA, U+30F5 KA, U+30F6 KE.
+Katakana phonetic extensions U+31F0 KU - U+31FF RO.
+Halfwidth katakana small letters U+FF67 A - U+FF6F TU.
+
+Hiragana or katakana prolonged sound marks:
+U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK and
+U+FF70 HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK.
+
+N.B. These letters are optionally treated either as non-starter or
+as normal ideographic.  See [JIS X 4051] 6.1.1, [JLREQ] 3.1.7 or
+[UAX #14].
+
+N.B. U+3095, U+3096, U+30F5, U+30F6 are considered to be
+neither hiragana nor katakana.
+
+=item C<MASU_MARK() =E<gt> LB_ID>
+
+U+303C MASU MARK.
+
+N.B. Although this character is not kana, it is usually regarded as
+abbreviation to sequence of hiragana E<0x307E> E<0x3059> or
+katakana E<0x30DE> E<0x30B9>, MA and SU.
+
+N.B. This character is classified as non-starter (NS) by [UAX #14]
+and as the class corresponding to ID by [JIS X 4051] and [JLREQ].
+
+=back
+
+=head4 Ambiguous Quotation Marks
+
+By default, some punctuations are ambiguous quotation marks (QU).
+
+=over 4
+
+=item C<BACKWORD_QUOTES() =E<gt> LB_OP, FORWARD_QUOTES() =E<gt> LB_CL>
+
+Some languages (Dutch, English, Italian, Portuguese, Spanish, Turkish and
+most East Asian) use rotated-9-style punctuations (E<0x2018> E<0x201C>) as
+opening and 9-style punctuations (E<0x2019> E<0x201D>) as closing quotation
+marks.
+
+=item C<FORWARD_QUOTES() =E<gt> LB_OP, BACKWORD_QUOTES() =E<gt> LB_CL>
+
+Some others (Czech, German and Slovak) use 9-style punctuations
+(E<0x2019> E<0x201D>) as opening and rotated-9-style punctuations
+(E<0x2018> E<0x201C>) as closing quotation marks.
+
+=item C<BACKWORD_GUILLEMETS() =E<gt> LB_OP, FORWARD_GUILLEMETS() =E<gt> LB_CL>
+
+French, Greek, Russian etc. use left-pointing guillemets (E<0x00AB> E<0x2039>)
+as opening and right-pointing guillemets (E<0x00BB> E<0x203A>) as closing
+quotation marks.
+
+=item C<FORWARD_GUILLEMETS() =E<gt> LB_OP, BACKWORD_GUILLEMETS() =E<gt> LB_CL>
+
+German and Slovak use right-pointing guillemets (E<0x00BB> E<0x203A>) as
+opening and left-pointing guillemets (E<0x00AB> E<0x2039>) as closing
+quotation marks.
+
+=back
+
+Danish, Finnish, Norwegian and Swedish use 9-style or right-pointing
+punctuations (E<0x2019> E<0x201D> E<0x00BB> E<0x203A>) as both opening and
+closing quotation marks.
+
+=head4 IDEOGRAPHIC SPACE
+
+=over 4
+
+=item C<IDEOGRAPHIC_SPACE() =E<gt> LB_BA>
+
+U+3000 IDEOGRAPHIC SPACE won't be placed at beginning of line.
+This is default behavior.
+
+=item C<IDEOGRAPHIC_SPACE() =E<gt> LB_ID>
+
+IDEOGRAPHIC SPACE can be placed at beginning of line.
+This was default behavior by Unicode 6.2 and earlier.
+
+=item C<IDEOGRAPHIC_SPACE() =E<gt> LB_SP>
+
+IDEOGRAPHIC SPACE won't be placed at beginning of line,
+and will protrude from end of line.
+
+=back
+
+=head3 East_Asian_Width Properties
+
+Some particular letters of Latin, Greek and Cyrillic scripts have ambiguous
+(A) East_Asian_Width property.  Thus, these characters are treated as wide
+in C<"EASTASIAN"> context.
+Specifying C<EAWidth =E<gt> [ AMBIGUOUS_>*C<() =E<gt> EA_N ]>,
+those characters are always treated as narrow.
+
+=over 4
+
+=item C<AMBIGUOUS_ALPHABETICS() =E<gt> EA_N>
+
+Treat all of characters below as East_Asian_Width neutral (N).
+
+=item C<AMBIGUOUS_CYRILLIC() =E<gt> EA_N>
+
+=item C<AMBIGUOUS_GREEK() =E<gt> EA_N>
+
+=item C<AMBIGUOUS_LATIN() =E<gt> EA_N>
+
+Treat letters having ambiguous (A) width of Cyrillic, Greek and Latin scripts
+as neutral (N).
+
+=back
+
+On the other hand, although several characters were occasionally rendered as wide characters by a number of implementations for East Asian character sets, they are given narrow (Na) East_Asian_Width property just because they have fullwidth (F) compatibility characters.
+Specifying C<EAWidth> as below, those characters are treated as ambiguous
+--- wide on C<"EASTASIAN"> context.
+
+=over 4
+
+=item C<QUESTIONABLE_NARROW_SIGNS() =E<gt> EA_A>
+
+U+00A2 CENT SIGN, U+00A3 POUND SIGN, U+00A5 YEN SIGN (or yuan sign),
+U+00A6 BROKEN BAR, U+00AC NOT SIGN, U+00AF MACRON.
+
+=back
+
+=head2 Configuration File
+
+Built-in defaults of option parameters for L</new> and L</config> method
+can be overridden by configuration files:
+F<Unicode/LineBreak/Defaults.pm>.
+For more details read F<Unicode/LineBreak/Defaults.pm.sample>.
+
+=head1 BUGS
+
+Please report bugs or buggy behaviors to developer.
+
+CPAN Request Tracker:
+L<http://rt.cpan.org/Public/Dist/Display.html?Name=Unicode-LineBreak>.
+
+=head1 VERSION
+
+Consult $VERSION variable.
+
+=head2 Incompatible Changes
+
+=over 4
+
+=item Release 2012.06
+
+=over 4
+
+=item *
+
+eawidth() method was deprecated.
+L<Unicode::GCString/columns> may be used instead.
+
+=item *
+
+lbclass() method was deprecated.
+Use L<Unicode::GCString/lbc> or L<Unicode::GCString/lbcext>.
+
+=back
+
+=back
+
+=head2 Conformance to Standards
+
+Character properties this module is based on are defined by
+Unicode Standard version 8.0.0.
+
+This module is intended to implement UAX14-C2.
+
+=head1 IMPLEMENTATION NOTES
+
+=over 4
+
+=item *
+
+Some ideographic characters may be treated either as NS or as ID by choice.
+
+=item *
+
+Hangul syllables and conjoining jamos may be treated as
+either ID or AL by choice.
+
+=item *
+
+Characters assigned to AI may be resolved to either AL or ID by choice.
+
+=item *
+
+Character(s) assigned to CB are not resolved.
+
+=item *
+
+Characters assigned to CJ are always resolved to NS.
+More flexible tailoring mechanism is provided.
+
+=item *
+
+When word segmentation for South East Asian writing systems is not supported,
+characters assigned to SA are resolved to AL,
+except that characters that have Grapheme_Cluster_Break property value
+Extend or SpacingMark are resolved to CM.
+
+=item *
+
+Characters assigned to SG or XX are resolved to AL.
+
+=item *
+
+Code points of following UCS ranges are given fixed property values even
+if they have not been assigned any characters.
+
+    Ranges             | UAX #14    | UAX #11    | Description
+    -------------------------------------------------------------
+    U+20A0..U+20CF     | PR [*1]    | N [*2]     | Currency symbols
+    U+3400..U+4DBF     | ID         | W          | CJK ideographs
+    U+4E00..U+9FFF     | ID         | W          | CJK ideographs
+    U+D800..U+DFFF     | AL (SG)    | N          | Surrogates
+    U+E000..U+F8FF     | AL (XX)    | F or N (A) | Private use
+    U+F900..U+FAFF     | ID         | W          | CJK ideographs
+    U+20000..U+2FFFD   | ID         | W          | CJK ideographs
+    U+30000..U+3FFFD   | ID         | W          | Old hanzi
+    U+F0000..U+FFFFD   | AL (XX)    | F or N (A) | Private use
+    U+100000..U+10FFFD | AL (XX)    | F or N (A) | Private use
+    Other unassigned   | AL (XX)    | N          | Unassigned,
+                       |            |            | reserved or
+                       |            |            | noncharacters
+    -------------------------------------------------------------
+    [*1] Except U+20A7 PESETA SIGN (PO),
+      U+20B6 LIVRE TOURNOIS SIGN (PO), U+20BB NORDIC MARK SIGN (PO)
+      and U+20BE LARI SIGN (PO).
+    [*2] Except U+20A9 WON SIGN (H) and U+20AC EURO SIGN
+      (F or N (A)).
+
+=item *
+
+Characters belonging to General Category Mn, Me, Cc, Cf, Zl or Zp are
+treated as nonspacing by this module.
+
+=back
+
+=head1 REFERENCES
+
+=over 4
+
+=item [CMOS]
+
+I<The Chicago Manual of Style>, 15th edition.
+University of Chicago Press, 2003.
+
+=item [JIS X 4051]
+
+JIS X 4051:2004
+I<日本語文書の組版方法> (I<Formatting Rules for Japanese Documents>).
+Japanese Standards Association, 2004.
+
+=item [JLREQ]
+
+Anan, Yasuhiro et al.
+I<Requirements for Japanese Text Layout>,
+W3C Working Group Note 3 April 2012.
+L<http://www.w3.org/TR/2012/NOTE-jlreq-20120403/>.
+
+=begin comment
+
+=item [Kubota]
+
+Kubota, Tomohiro (2001-2002).
+Width problems, "I<Problems on Interoperativity between Unicode and CJK Local Encodings>".
+L<http://web.archive.org/web/people.debian.org/~kubota/unicode-symbols-width2.html>.
+
+=end comment
+
+=item [UAX #11]
+
+A. Freytag (ed.) (2008-2009).
+I<Unicode Standard Annex #11: East Asian Width>, Revisions 17-19.
+L<http://unicode.org/reports/tr11/>.
+
+=item [UAX #14]
+
+A. Freytag and A. Heninger (eds.) (2008-2015).
+I<Unicode Standard Annex #14: Unicode Line Breaking Algorithm>, Revisions 22-35.
+L<http://unicode.org/reports/tr14/>.
+
+=item [UAX #29]
+
+Mark Davis (ed.) (2009-2013).
+I<Unicode Standard Annex #29: Unicode Text Segmentation>, Revisions 15-23.
+L<http://www.unicode.org/reports/tr29/>.
+
+=back
+
+=head1 SEE ALSO
+
+L<Text::LineFold>, L<Text::Wrap>, L<Unicode::GCString>.
+
+=head1 AUTHOR
+
+Copyright (C) 2009-2018 Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>.
+
+This program is free software; you can redistribute it and/or modify it 
+under the same terms as Perl itself.
+
+=cut
--- a/database/perl/vendor/lib/Unicode/LineBreak/Constants.pm
+++ b/database/perl/vendor/lib/Unicode/LineBreak/Constants.pm
@@ -0,0 +1,68 @@
+#-*- perl -*-
+
+package Unicode::LineBreak;
+
+use constant { M => 4, D => 3, I => 2, P => 1,}; # Short names for the four break-action values.
+use constant { MANDATORY => M, DIRECT => D, INDIRECT => I, PROHIBITED => P, # Long names, same values.
+               URGENT => 200,}; # Separate value, well apart from the four above.
+
+use constant { ALLOW_BEFORE => 2, PROHIBIT_BEFORE => 1, # Two distinct single-bit values.
+               BREAK_BEFORE => 2, # Deprecated; same value as ALLOW_BEFORE.
+               FLAGS => (2 | 1) }; # Bitwise OR of both values above.
+
+use constant { # Code point lists (array references) for tailoring character properties.
+    AMBIGUOUS_CYRILLIC => [0x0401, 0x0410..0x044F, 0x0451, ], # Cyrillic, ambiguous (A) width.
+    AMBIGUOUS_GREEK => [0x0391..0x03A9, 0x03B1..0x03C1, 0x03C3..0x03C9, ], # Greek, ambiguous width.
+    AMBIGUOUS_LATIN => [0x00C6, 0x00D0, 0x00D8, 0x00DE, 0x00DF, 0x00E0, # Latin, ambiguous width.
+        0x00E1, 0x00E6, 0x00E8, 0x00E9, 0x00EA, 0x00EC, 0x00ED, 0x00F0, 
+        0x00F2, 0x00F3, 0x00F8, 0x00F9, 0x00FA, 0x00FC, 0x00FE, 0x0101, 
+        0x0111, 0x0113, 0x011B, 0x0126, 0x0127, 0x012B, 0x0131, 0x0132, 
+        0x0133, 0x0138, 0x013F, 0x0140, 0x0141, 0x0142, 0x0144, 0x0148, 
+        0x0149, 0x014A, 0x014B, 0x014D, 0x0152, 0x0153, 0x0166, 0x0167, 
+        0x016B, 0x01CE, 0x01D0, 0x01D2, 0x01D4, 0x01D6, 0x01D8, 0x01DA, 
+        0x01DC, 0x0251, 0x0261, ],
+    IDEOGRAPHIC_ITERATION_MARKS => [0x3005, 0x303B, 0x309D, 0x309E, 0x30FD, 
+        0x30FE, ], # Ideographic, hiragana and katakana iteration marks.
+    KANA_PROLONGED_SOUND_MARKS => [0x30FC, 0xFF70, ], # Prolonged sound mark and its halfwidth form.
+    KANA_SMALL_LETTERS => [0x3041, 0x3043, 0x3045, 0x3047, 0x3049, 0x3063, 
+        0x3083, 0x3085, 0x3087, 0x308E, # Hiragana small letters A..WA (this line and the one above).
+        0x3095, 0x3096, # Hiragana small KA, KE.
+        0x30A1, 0x30A3, 0x30A5, 0x30A7, 0x30A9, 0x30C3, 
+        0x30E3, 0x30E5, 0x30E7, 0x30EE, # Katakana small letters A..WA (this line and the one above).
+        0x30F5, 0x30F6, # Katakana small KA, KE.
+        0x31F0..0x31FF, 0xFF67..0xFF6F, ], # Katakana phonetic extensions; halfwidth small letters.
+    MASU_MARK => [0x303C, ], # U+303C MASU MARK.
+    QUESTIONABLE_NARROW_SIGNS => [0x00A2, 0x00A3, 0x00A5, 0x00A6, 0x00AC, 
+        0x00AF, ], # Narrow (Na) signs that may be retailored as ambiguous.
+};
+use constant { # Unions; a separate statement because the lists above must already be defined.
+    AMBIGUOUS_ALPHABETICS => [
+        @{AMBIGUOUS_CYRILLIC()}, @{AMBIGUOUS_GREEK()},
+        @{AMBIGUOUS_LATIN()}, ],
+    KANA_NONSTARTERS => [
+        @{IDEOGRAPHIC_ITERATION_MARKS()}, @{KANA_PROLONGED_SOUND_MARKS()},
+        @{KANA_SMALL_LETTERS()}, @{MASU_MARK()}, ]
+};
+use constant { # Quotation mark lists. NOTE(review): "BACKWORD" is the spelling actually defined here.
+    BACKWORD_GUILLEMETS => [
+        0x00AB, 0x2039, ], # Left-pointing guillemets.
+    FORWARD_GUILLEMETS => [
+        0x00BB, 0x203A, ], # Right-pointing guillemets.
+    BACKWORD_QUOTES => [
+        0x2018, 0x201C, ], # Rotated-9-style quotation marks.
+    FORWARD_QUOTES => [
+        0x2019, 0x201D, ], # 9-style quotation marks.
+};
+# Obsolete names, kept as aliases of the constants above.
+use constant {
+    LEFT_GUILLEMETS => BACKWORD_GUILLEMETS(),
+    RIGHT_GUILLEMETS => FORWARD_GUILLEMETS(),
+    LEFT_QUOTES => BACKWORD_QUOTES(),
+    RIGHT_QUOTES => FORWARD_QUOTES(),
+};
+
+use constant {
+    IDEOGRAPHIC_SPACE => [ 0x3000, ], # U+3000 IDEOGRAPHIC SPACE, as a one-element list.
+};
+
+1;
--- a/database/perl/vendor/lib/Unicode/LineBreak/Defaults.pm.sample
+++ b/database/perl/vendor/lib/Unicode/LineBreak/Defaults.pm.sample
@@ -0,0 +1,114 @@
+#-*- perl -*-
+
+package Unicode::LineBreak;
+
+=head1 NAME
+
+Unicode::LineBreak::Defaults - Configuration for Unicode::LineBreak
+
+=head1 SYNOPSIS
+
+Edit this file and install it as Unicode/LineBreak/Defaults.pm
+to activate custom settings.
+
+=head1 DESCRIPTION
+
+Following settings are available.
+
+=over 4
+
+=item *
+
+BreakIndent
+
+=item *
+
+CharMax
+
+=item *
+
+ColMin
+
+=item *
+
+ColMax
+
+=item *
+
+ComplexBreaking
+
+=item *
+
+Context
+
+=item *
+
+EAWidth
+
+=item *
+
+Format
+
+=item *
+
+HangulAsAL
+
+=item *
+
+LBClass
+
+=item *
+
+LegacyCM
+
+=item *
+
+Newline
+
+=item *
+
+Prep
+
+=item *
+
+Sizing
+
+=item *
+
+Urgent
+
+=item *
+
+ViramaAsJoiner
+
+=back
+
+=head1 SEE ALSO
+
+L<Unicode::LineBreak>
+
+=cut
+
+#--------------------------------------------------------------------------#
+# Add your own settings below.
+#--------------------------------------------------------------------------#
+
+## Default settings on current release are:
+# $Config->{BreakIndent} = 'YES';
+# $Config->{CharMax} = 998;
+# $Config->{ColMin} = 0;
+# $Config->{ColMax} = 76;
+# $Config->{ComplexBreaking} = 'YES';
+# $Config->{Context} = 'NONEASTASIAN';
+# $Config->{EAWidth} = undef;
+# $Config->{Format} = 'SIMPLE';
+# $Config->{HangulAsAL} = 'NO';
+# $Config->{LBClass} = undef;
+# $Config->{LegacyCM} = 'YES';
+# $Config->{Newline} = "\n";
+# $Config->{Prep} = undef;
+# $Config->{Sizing} = 'UAX11';
+# $Config->{Urgent} = undef;
+# $Config->{ViramaAsJoiner} = 'YES';
+
+1;
--- a/database/perl/vendor/lib/Unicode/UTF8.pm
+++ b/database/perl/vendor/lib/Unicode/UTF8.pm
@@ -0,0 +1,21 @@
+package Unicode::UTF8;
+
+use strict;
+use warnings;
+
+BEGIN { # Runs at compile time, as soon as this block has been parsed.
+    our $VERSION    = '0.62';
+    our @EXPORT_OK  = qw[ decode_utf8 encode_utf8 valid_utf8 ]; # Exported on request only.
+    our %EXPORT_TAGS = (
+        all => [ @EXPORT_OK ], # ":all" selects every name listed above.
+    );
+
+    require XSLoader;
+    XSLoader::load(__PACKAGE__, $VERSION); # Load the compiled part; presumably defines the functions above.
+
+    require Exporter;
+    *import = \&Exporter::import; # Reuse Exporter's import() without inheriting from Exporter.
+}
+
+1;
+
--- a/database/perl/vendor/lib/Unicode/UTF8.pod
+++ b/database/perl/vendor/lib/Unicode/UTF8.pod
@@ -0,0 +1,207 @@
+=head1 NAME
+
+Unicode::UTF8 - Encoding and decoding of UTF-8 encoding form
+
+=head1 SYNOPSIS
+
+    use Unicode::UTF8 qw[decode_utf8 encode_utf8];
+    
+    use warnings FATAL => 'utf8'; # fatalize encoding glitches
+    $string = decode_utf8($octets);
+    $octets = encode_utf8($string);
+
+=head1 DESCRIPTION
+
+This module provides functions to encode and decode UTF-8 encoding form as 
+specified by Unicode and ISO/IEC 10646:2011.
+
+=head1 FUNCTIONS
+
+=head2 decode_utf8
+
+    $string = decode_utf8($octets);
+    $string = decode_utf8($octets, $fallback);
+
+Returns a decoded representation of C<$octets> in UTF-8 encoding as a character
+string.
+
+C<$fallback> is an optional C<CODE> reference which provides an error-handling 
+mechanism, allowing customization of error handling. The default error-handling 
+mechanism is to replace any ill-formed UTF-8 sequences or encoded code points 
+which can't be interchanged with REPLACEMENT CHARACTER (U+FFFD).
+
+    $string = $fallback->($octets, $is_usv, $position);
+
+C<$fallback> is invoked with three arguments: C<$octets>, C<$is_usv> and 
+C<$position>. C<$octets> is a sequence of one or more octets containing the 
+maximal subpart of the ill-formed subsequence or encoded code point which 
+can't be interchanged. C<$is_usv> is a boolean indicating whether or not 
+C<$octets> represent an encoded Unicode scalar value. C<$position> is an 
+unsigned integer containing the zero based octet position at which the error 
+occurred within the octets provided to C<decode_utf8()>. C<$fallback> must 
+return a character string consisting of zero or more Unicode scalar values. 
+Unicode scalar values consist of code points in the range U+0000..U+D7FF and 
+U+E000..U+10FFFF.
+
+=head2 encode_utf8
+
+    $octets = encode_utf8($string);
+    $octets = encode_utf8($string, $fallback);
+
+Returns an encoded representation of C<$string> in UTF-8 encoding as an octet
+string.
+
+C<$fallback> is an optional C<CODE> reference which provides an error-handling 
+mechanism, allowing customization of error handling. The default error-handling 
+mechanism is to replace any code points which can't be interchanged or represented 
+in UTF-8 encoding form with REPLACEMENT CHARACTER (U+FFFD).
+
+    $string = $fallback->($codepoint, $is_usv, $position);
+
+C<$fallback> is invoked with three arguments: C<$codepoint>, C<$is_usv> and 
+C<$position>. C<$codepoint> is an unsigned integer containing the code point 
+which can't be interchanged or represented in UTF-8 encoding form. C<$is_usv> 
+is a boolean indicating whether or not C<$codepoint> is a Unicode scalar value. 
+C<$position> is an unsigned integer containing the zero based character position 
+at which the error occurred within the string provided to C<encode_utf8()>. 
+C<$fallback> must return a character string consisting of zero or more Unicode 
+scalar values. Unicode scalar values consist of code points in the range 
+U+0000..U+D7FF and U+E000..U+10FFFF.
+
+=head2 valid_utf8
+
+    $boolean = valid_utf8($octets);
+
+Returns a boolean indicating whether or not the given C<$octets> consist of 
+well-formed UTF-8 sequences.
+
+=head1 EXPORTS
+
+None by default. All functions can be exported using the C<:all> tag or individually.
+
+=head1 DIAGNOSTICS
+
+=over 4
+
+=item Can't decode a wide character string
+
+(F) Wide character in octets.
+
+=item Can't validate a wide character string
+
+(F) Wide character in octets.
+
+=item Can't decode ill-formed UTF-8 octet sequence <%s> in position %u
+
+(W utf8) Encountered an ill-formed UTF-8 octet sequence. <%s> contains a 
+hexadecimal representation of the maximal subpart of the ill-formed subsequence.
+
+=item Can't interchange noncharacter code point U+%X in position %u
+
+(W utf8, nonchar) Noncharacters are code points that are permanently reserved 
+in the Unicode Standard for internal use. They are forbidden for use in open 
+interchange of Unicode text data. Noncharacters consist of the values U+nFFFE 
+and U+nFFFF (where n is a hexadecimal value from 0 to 10) and the values U+FDD0..U+FDEF.
+
+=item Can't represent surrogate code point U+%X in position %u
+
+(W utf8, surrogate) Surrogate code points are designated only for surrogate code 
+units in the UTF-16 character encoding form. Surrogates consist of code points 
+in the range U+D800 to U+DFFF.
+
+=item Can't represent super code point \x{%X} in position %u
+
+(W utf8, non_unicode) Code points greater than U+10FFFF. Perl's extended codespace.
+
+=item Can't decode ill-formed UTF-X octet sequence <%s> in position %u
+
+(F) Encountered an ill-formed octet sequence in Perl's internal representation 
+of wide characters.
+
+=back
+
+The sub-categories C<nonchar>, C<surrogate> and C<non_unicode> are only available 
+on Perl 5.14 or greater. See L<perllexwarn> for available categories and hierarchies.
+
+=head1 COMPARISON
+
+Here is a summary of features for comparison with L<Encode>'s UTF-8 implementation:
+
+=over 4
+
+=item *
+
+Simple API which makes use of Perl's standard warning categories.
+
+=item *
+
+Recognizes all noncharacters regardless of Perl version
+
+=item *
+
+Implements Unicode's recommended practice for using U+FFFD.
+
+=item *
+
+Better diagnostics in warning messages
+
+=item *
+
+Detects and reports inconsistency in Perl's internal representation of 
+wide characters (UTF-X)
+
+=item *
+
+Preserves taintedness of decoded C<$octets> or encoded C<$string>
+
+=item *
+
+Better performance ~ 600% - 1200% (JA: 600%, AR: 700%, SV: 900%, EN: 1200%, 
+see benchmarks directory in git repository)
+
+=back
+
+=head1 CONFORMANCE
+
+It's the author's belief that this UTF-8 implementation is conformant with 
+the Unicode Standard Version 6.0. Any deviation from the Unicode Standard 
+is to be considered a bug.
+
+=head1 SEE ALSO
+
+=over 4
+
+=item L<Encode>
+
+=item L<http://www.unicode.org/>
+
+=back
+
+=head1 SUPPORT
+
+=head2 BUGS
+
+Please report any bugs by email to C<bug-unicode-utf8 at rt.cpan.org>, or 
+through the web interface at L<http://rt.cpan.org/Public/Dist/Display.html?Name=Unicode-UTF8>. 
+You will be automatically notified of any progress on the request by the system.
+
+=head2 SOURCE CODE
+
+This is open source software. The code repository is available for public 
+review and contribution under the terms of the license.
+
+L<http://github.com/chansen/p5-unicode-utf8>
+
+    git clone http://github.com/chansen/p5-unicode-utf8
+
+=head1 AUTHOR
+
+Christian Hansen C<chansen@cpan.org>
+
+=head1 COPYRIGHT
+
+Copyright 2011-2017 by Christian Hansen.
+
+This is free software; you can redistribute it and/or modify it under
+the same terms as the Perl 5 programming language system itself.
+