594 lines
16 KiB
Perl
594 lines
16 KiB
Perl
#-*- perl -*-
|
|
|
|
package Text::LineFold;
|
|
require 5.008;
|
|
|
|
=encoding utf-8
|
|
|
|
=head1 NAME
|
|
|
|
Text::LineFold - Line Folding for Plain Text
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use Text::LineFold;
|
|
$lf = Text::LineFold->new();
|
|
|
|
# Fold lines
|
|
$folded = $lf->fold($string, 'PLAIN');
|
|
$indented = $lf->fold(' ' x 8, ' ' x 4, $string);
|
|
|
|
# Unfold lines
|
|
$unfolded = $lf->unfold($string, 'FIXED');
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
Text::LineFold folds or unfolds lines of plain text.
|
|
As it mainly focuses on plain text e-mail messages,
|
|
RFC 3676 flowed format is also supported.
|
|
|
|
=cut
|
|
|
|
### Pragmas:
|
|
use strict;
|
|
use vars qw($VERSION @EXPORT_OK @ISA $Config);
|
|
|
|
### Exporting:
|
|
use Exporter;
|
|
|
|
### Inheritance:
|
|
our @ISA = qw(Exporter Unicode::LineBreak);
|
|
|
|
### Other modules:
|
|
use Carp qw(croak carp);
|
|
use Encode qw(is_utf8);
|
|
use MIME::Charset;
|
|
use Unicode::LineBreak qw(:all);
|
|
|
|
### Globals
|
|
|
|
### The package Version
|
|
our $VERSION = '2018.012';
|
|
|
|
### Public Configuration Attributes
|
|
our $Config = {
|
|
### %{$Unicode::LineBreak::Config},
|
|
Charset => 'UTF-8',
|
|
Language => 'XX',
|
|
OutputCharset => undef,
|
|
TabSize => 8,
|
|
};
|
|
|
|
### Privates
|
|
|
|
my %FORMAT_FUNCS = (
|
|
'FIXED' => sub {
|
|
my $self = shift;
|
|
my $action = shift;
|
|
my $str = shift;
|
|
if ($action =~ /^so[tp]/) {
|
|
$self->{_} = {};
|
|
$self->{_}->{'ColMax'} = $self->config('ColMax');
|
|
$self->config('ColMax' => 0) if $str =~ /^>/;
|
|
} elsif ($action eq "") {
|
|
$self->{_}->{line} = $str;
|
|
} elsif ($action eq "eol") {
|
|
return $self->config('Newline');
|
|
} elsif ($action =~ /^eo/) {
|
|
if (length $self->{_}->{line} and $self->config('ColMax')) {
|
|
$str = $self->config('Newline').$self->config('Newline');
|
|
} else {
|
|
$str = $self->config('Newline');
|
|
}
|
|
$self->config('ColMax' => $self->{_}->{'ColMax'});
|
|
delete $self->{_};
|
|
return $str;
|
|
}
|
|
undef;
|
|
},
|
|
'FLOWED' => sub { # RFC 3676
|
|
my $self = shift;
|
|
my $action = shift;
|
|
my $str = shift;
|
|
if ($action eq 'sol') {
|
|
if ($self->{_}->{prefix}) {
|
|
return $self->{_}->{prefix}.' '.$str;
|
|
}
|
|
} elsif ($action =~ /^so/) {
|
|
$self->{_} = {};
|
|
if ($str =~ /^(>+)/) {
|
|
$self->{_}->{prefix} = $1;
|
|
} else {
|
|
$self->{_}->{prefix} = '';
|
|
}
|
|
} elsif ($action eq "") {
|
|
if ($str =~ /^(?: |From )/
|
|
or $str =~ /^>/ and !length $self->{_}->{prefix}) {
|
|
return $self->{_}->{line} = ' ' . $str;
|
|
}
|
|
$self->{_}->{line} = $str;
|
|
} elsif ($action eq 'eol') {
|
|
$str = ' ' if length $str;
|
|
return $str.' '.$self->config('Newline');
|
|
} elsif ($action =~ /^eo/) {
|
|
if (length $self->{_}->{line} and !length $self->{_}->{prefix}) {
|
|
$str = ' '.$self->config('Newline').$self->config('Newline');
|
|
} else {
|
|
$str = $self->config('Newline');
|
|
}
|
|
delete $self->{_};
|
|
return $str;
|
|
}
|
|
undef;
|
|
},
|
|
'PLAIN' => sub {
|
|
return $_[0]->config('Newline') if $_[1] =~ /^eo/;
|
|
undef;
|
|
},
|
|
);
|
|
|
|
=head2 Public Interface
|
|
|
|
=over 4
|
|
|
|
=item new ([KEY => VALUE, ...])
|
|
|
|
I<Constructor>.
|
|
About KEY => VALUE pairs see config method.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub new {
|
|
my $class = shift;
|
|
my $self = bless __PACKAGE__->SUPER::new(), $class;
|
|
$self->config(@_);
|
|
$self;
|
|
}
|
|
|
|
=over 4
|
|
|
|
=item $self->config (KEY)
|
|
|
|
=item $self->config ([KEY => VAL, ...])
|
|
|
|
I<Instance method>.
|
|
Get or update configuration. Following KEY => VALUE pairs may be specified.
|
|
|
|
=over 4
|
|
|
|
=item Charset => CHARSET
|
|
|
|
Character set that is used to encode string.
|
|
It may be string or L<MIME::Charset> object.
|
|
Default is C<"UTF-8">.
|
|
|
|
=item Language => LANGUAGE
|
|
|
|
Along with Charset option, this may be used to define language/region
|
|
context.
|
|
Default is C<"XX">.
|
|
See also L<Unicode::LineBreak/Context> option.
|
|
|
|
=item Newline => STRING
|
|
|
|
String to be used for newline sequence.
|
|
Default is C<"\n">.
|
|
|
|
=item OutputCharset => CHARSET
|
|
|
|
Character set that is used to encode result of fold()/unfold().
|
|
It may be string or L<MIME::Charset> object.
|
|
If a special value C<"_UNICODE_"> is specified, result will be Unicode string.
|
|
Default is the value of Charset option.
|
|
|
|
=item TabSize => NUMBER
|
|
|
|
Column width of tab stops.
|
|
When 0 is specified, tab stops are ignored.
|
|
Default is 8.
|
|
|
|
=item BreakIndent
|
|
|
|
=item CharMax
|
|
|
|
=item ColMax
|
|
|
|
=item ColMin
|
|
|
|
=item ComplexBreaking
|
|
|
|
=item EAWidth
|
|
|
|
=item HangulAsAL
|
|
|
|
=item LBClass
|
|
|
|
=item LegacyCM
|
|
|
|
=item Prep
|
|
|
|
=item Urgent
|
|
|
|
See L<Unicode::LineBreak/Options>.
|
|
|
|
=back
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub config {
|
|
my $self = shift;
|
|
my @opts = qw{Charset Language OutputCharset TabSize};
|
|
my %opts = map { (uc $_ => $_) } @opts;
|
|
my $newline = undef;
|
|
|
|
# Get config.
|
|
if (scalar @_ == 1) {
|
|
if ($opts{uc $_[0]}) {
|
|
return $self->{$opts{uc $_[0]}};
|
|
}
|
|
return $self->SUPER::config($_[0]);
|
|
}
|
|
|
|
# Set config.
|
|
my @o = ();
|
|
while (scalar @_) {
|
|
my $k = shift;
|
|
my $v = shift;
|
|
if ($opts{uc $k}) {
|
|
$self->{$opts{uc $k}} = $v;
|
|
} elsif (uc $k eq uc 'Newline') {
|
|
$newline = $v;
|
|
} else {
|
|
push @o, $k => $v;
|
|
}
|
|
}
|
|
$self->SUPER::config(@o) if scalar @o;
|
|
|
|
# Character set and language assumed.
|
|
if (ref $self->{Charset} eq 'MIME::Charset') {
|
|
$self->{_charset} = $self->{Charset};
|
|
} else {
|
|
$self->{Charset} ||= $Config->{Charset};
|
|
$self->{_charset} = MIME::Charset->new($self->{Charset});
|
|
}
|
|
$self->{Charset} = $self->{_charset}->as_string;
|
|
my $ocharset = uc($self->{OutputCharset} || $self->{Charset});
|
|
$ocharset = MIME::Charset->new($ocharset)
|
|
unless ref $ocharset eq 'MIME::Charset' or $ocharset eq '_UNICODE_';
|
|
unless ($ocharset eq '_UNICODE_') {
|
|
$self->{_charset}->encoder($ocharset);
|
|
$self->{OutputCharset} = $ocharset->as_string;
|
|
}
|
|
$self->{Language} = uc($self->{Language} || $Config->{Language});
|
|
|
|
## Context
|
|
$self->SUPER::config(Context =>
|
|
context(Charset => $self->{Charset},
|
|
Language => $self->{Language}));
|
|
|
|
## Set sizing method.
|
|
$self->SUPER::config(Sizing => sub {
|
|
my ($self, $cols, $pre, $spc, $str) = @_;
|
|
|
|
my $tabsize = $self->{TabSize};
|
|
my $spcstr = $spc.$str;
|
|
$spcstr->pos(0);
|
|
while (!$spcstr->eos and $spcstr->item->lbc == LB_SP) {
|
|
my $c = $spcstr->next;
|
|
if ($c eq "\t") {
|
|
$cols += $tabsize - $cols % $tabsize if $tabsize;
|
|
} else {
|
|
$cols += $c->columns;
|
|
}
|
|
}
|
|
return $cols + $spcstr->substr($spcstr->pos)->columns;
|
|
});
|
|
|
|
## Classify horizontal tab as line breaking class SP.
|
|
$self->SUPER::config(LBClass => [ord("\t") => LB_SP]);
|
|
## Tab size
|
|
if (defined $self->{TabSize}) {
|
|
croak "Invalid TabSize option" unless $self->{TabSize} =~ /^\d+$/;
|
|
$self->{TabSize} += 0;
|
|
} else {
|
|
$self->{TabSize} = $Config->{TabSize};
|
|
}
|
|
|
|
## Newline
|
|
if (defined $newline) {
|
|
$newline = $self->{_charset}->decode($newline)
|
|
unless is_utf8($newline);
|
|
$self->SUPER::config(Newline => $newline);
|
|
}
|
|
}
|
|
|
|
=over 4
|
|
|
|
=item $self->fold (STRING, [METHOD])
|
|
|
|
=item $self->fold (INITIAL_TAB, SUBSEQUENT_TAB, STRING, ...)
|
|
|
|
I<Instance method>.
|
|
fold() folds lines of string STRING and returns it.
|
|
Surplus SPACEs and horizontal tabs at end of line are removed,
|
|
newline sequences are replaced by that specified by Newline option
|
|
and newline is appended at end of text if it does not exist.
|
|
Horizontal tabs are treated as tab stops according to TabSize option.
|
|
|
|
By the first style, following options may be specified for METHOD argument.
|
|
|
|
=over 4
|
|
|
|
=item C<"FIXED">
|
|
|
|
Lines preceded by C<"E<gt>"> won't be folded.
|
|
Paragraphs are separated by empty line.
|
|
|
|
=item C<"FLOWED">
|
|
|
|
C<"Format=Flowed; DelSp=Yes"> formatting defined by RFC 3676.
|
|
|
|
=item C<"PLAIN">
|
|
|
|
Default method. All lines are folded.
|
|
|
|
=back
|
|
|
|
Second style is similar to L<Text::Wrap/wrap()>.
|
|
All lines are folded.
|
|
INITIAL_TAB is inserted at beginning of paragraphs and SUBSEQUENT_TAB
|
|
at beginning of other broken lines.
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
# Special breaking characters: VT, FF, NEL, LS, PS
|
|
my $special_break = qr/([\x{000B}\x{000C}\x{0085}\x{2028}\x{2029}])/os;
|
|
|
|
sub fold {
|
|
my $self = shift;
|
|
my $str;
|
|
|
|
if (2 < scalar @_) {
|
|
my $initial_tab = shift || '';
|
|
$initial_tab = $self->{_charset}->decode($initial_tab)
|
|
unless is_utf8($initial_tab);
|
|
my $subsequent_tab = shift || '';
|
|
$subsequent_tab = $self->{_charset}->decode($subsequent_tab)
|
|
unless is_utf8($subsequent_tab);
|
|
my @str = @_;
|
|
|
|
## Decode and concat strings.
|
|
$str = shift @str;
|
|
$str = $self->{_charset}->decode($str) unless is_utf8($str);
|
|
foreach my $s (@str) {
|
|
next unless defined $s and length $s;
|
|
|
|
$s = $self->{_charset}->decode($s) unless is_utf8($s);
|
|
unless (length $str) {
|
|
$str = $s;
|
|
} elsif ($str =~ /(\s|$special_break)$/ or
|
|
$s =~ /^(\s|$special_break)/) {
|
|
$str .= $s;
|
|
} else {
|
|
$str .= ' ' if $self->breakingRule($str, $s) == INDIRECT;
|
|
$str .= $s;
|
|
}
|
|
}
|
|
|
|
## Set format method.
|
|
$self->SUPER::config(Format => sub {
|
|
my $self = shift;
|
|
my $event = shift;
|
|
my $str = shift;
|
|
if ($event =~ /^eo/) { return $self->config('Newline'); }
|
|
if ($event =~ /^so[tp]/) { return $initial_tab.$str; }
|
|
if ($event eq 'sol') { return $subsequent_tab.$str; }
|
|
undef;
|
|
});
|
|
} else {
|
|
$str = shift;
|
|
my $method = uc(shift || '');
|
|
return '' unless defined $str and length $str;
|
|
|
|
## Decode string.
|
|
$str = $self->{_charset}->decode($str) unless is_utf8($str);
|
|
|
|
## Set format method.
|
|
$self->SUPER::config(Format => $FORMAT_FUNCS{$method} ||
|
|
$FORMAT_FUNCS{'PLAIN'});
|
|
}
|
|
|
|
## Do folding.
|
|
my $result = '';
|
|
foreach my $s (split $special_break, $str) {
|
|
if ($s =~ $special_break) {
|
|
$result .= $s;
|
|
} else {
|
|
$result .= $self->break($str);
|
|
}
|
|
}
|
|
|
|
## Encode result.
|
|
if ($self->{OutputCharset} eq '_UNICODE_') {
|
|
return $result;
|
|
} else {
|
|
return $self->{_charset}->encode($result);
|
|
}
|
|
}
|
|
|
|
=over 4
|
|
|
|
=item $self->unfold (STRING, METHOD)
|
|
|
|
Conjunct folded paragraphs of string STRING and returns it.
|
|
|
|
Following options may be specified for METHOD argument.
|
|
|
|
=over 4
|
|
|
|
=item C<"FIXED">
|
|
|
|
Default method.
|
|
Lines preceded by C<"E<gt>"> won't be conjuncted.
|
|
Treat empty line as paragraph separator.
|
|
|
|
=item C<"FLOWED">
|
|
|
|
Unfold C<"Format=Flowed; DelSp=Yes"> formatting defined by RFC 3676.
|
|
|
|
=item C<"FLOWEDSP">
|
|
|
|
Unfold C<"Format=Flowed; DelSp=No"> formatting defined by RFC 3676.
|
|
|
|
=begin comment
|
|
|
|
=item C<"OBSFLOWED">
|
|
|
|
Unfold C<"Format=Flowed> formatting defined by (obsoleted) RFC 2646
|
|
as well as possible.
|
|
|
|
=end comment
|
|
|
|
=back
|
|
|
|
=back
|
|
|
|
=cut
|
|
|
|
sub unfold {
|
|
my $self = shift;
|
|
my $str = shift;
|
|
return '' unless defined $str and length $str;
|
|
|
|
## Get format method.
|
|
my $method = uc(shift || 'FIXED');
|
|
$method = 'FIXED' unless $method =~ /^(?:FIXED|FLOWED|FLOWEDSP|OBSFLOWED)$/;
|
|
my $delsp = $method eq 'FLOWED';
|
|
|
|
## Decode string and canonizalize newline.
|
|
$str = $self->{_charset}->decode($str) unless is_utf8($str);
|
|
$str =~ s/\r\n|\r/\n/g;
|
|
|
|
## Do unfolding.
|
|
my $result = '';
|
|
foreach my $s (split $special_break, $str) {
|
|
if ($s eq '') {
|
|
next;
|
|
} elsif ($s =~ $special_break) {
|
|
$result .= $s;
|
|
next;
|
|
} elsif ($method eq 'FIXED') {
|
|
pos($s) = 0;
|
|
while ($s !~ /\G\z/cg) {
|
|
if ($s =~ /\G\n/cg) {
|
|
$result .= $self->config('Newline');
|
|
} elsif ($s =~ /\G(.+)\n\n/cg) {
|
|
$result .= $1.$self->config('Newline');
|
|
} elsif ($s =~ /\G(>.*)\n/cg) {
|
|
$result .= $1.$self->config('Newline');
|
|
} elsif ($s =~ /\G(.+)\n(?=>)/cg) {
|
|
$result .= $1.$self->config('Newline');
|
|
} elsif ($s =~ /\G(.+?)( *)\n(?=(.+))/cg) {
|
|
my ($l, $s, $n) = ($1, $2, $3);
|
|
$result .= $l;
|
|
if ($n =~ /^ /) {
|
|
$result .= $self->config('Newline');
|
|
} elsif (length $s) {
|
|
$result .= $s;
|
|
} elsif (length $l) {
|
|
$result .= ' '
|
|
if $self->breakingRule($l, $n) == INDIRECT;
|
|
}
|
|
} elsif ($s =~ /\G(.+)\n/cg) {
|
|
$result .= $1.$self->config('Newline');
|
|
} elsif ($s =~ /\G(.+)/cg) {
|
|
$result .= $1.$self->config('Newline');
|
|
last;
|
|
}
|
|
}
|
|
} elsif ($method eq 'FLOWED' or $method eq 'FLOWEDSP' or
|
|
$method eq 'OBSFLOWED') {
|
|
my $prefix = undef;
|
|
pos($s) = 0;
|
|
while ($s !~ /\G\z/cg) {
|
|
if ($s =~ /\G(>+) ?(.*?)( ?)\n/cg) {
|
|
my ($p, $l, $s) = ($1, $2, $3);
|
|
unless (defined $prefix) {
|
|
$result .= $p.' '.$l;
|
|
} elsif ($p ne $prefix) {
|
|
$result .= $self->config('Newline');
|
|
$result .= $p.' '.$l;
|
|
} else {
|
|
$result .= $l;
|
|
}
|
|
unless (length $s) {
|
|
$result .= $self->config('Newline');
|
|
$prefix = undef;
|
|
} else {
|
|
$prefix = $p;
|
|
$result .= $s unless $delsp;
|
|
}
|
|
} elsif ($s =~ /\G ?(.*?)( ?)\n/cg) {
|
|
my ($l, $s) = ($1, $2);
|
|
unless (defined $prefix) {
|
|
$result .= $l;
|
|
} elsif ('' ne $prefix) {
|
|
$result .= $self->config('Newline');
|
|
$result .= $l;
|
|
} else {
|
|
$result .= $l;
|
|
}
|
|
unless (length $s) {
|
|
$result .= $self->config('Newline');
|
|
$prefix = undef;
|
|
} else {
|
|
$result .= $s unless $delsp;
|
|
$prefix = '';
|
|
}
|
|
} elsif ($s =~ /\G ?(.*)/cg) {
|
|
$result .= $1.$self->config('Newline');
|
|
last;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
## Encode result.
|
|
if ($self->{OutputCharset} eq '_UNICODE_') {
|
|
return $result;
|
|
} else {
|
|
return $self->{_charset}->encode($result);
|
|
}
|
|
}
|
|
|
|
=head1 BUGS
|
|
|
|
Please report bugs or buggy behaviors to developer.
|
|
|
|
CPAN Request Tracker:
|
|
L<http://rt.cpan.org/Public/Dist/Display.html?Name=Unicode-LineBreak>.
|
|
|
|
=head1 VERSION
|
|
|
|
Consult $VERSION variable.
|
|
|
|
=head1 SEE ALSO
|
|
|
|
L<Unicode::LineBreak>, L<Text::Wrap>.
|
|
|
|
=head1 AUTHOR
|
|
|
|
Copyright (C) 2009-2012 Hatuka*nezumi - IKEDA Soji <hatuka(at)nezumi.nu>.
|
|
|
|
This program is free software; you can redistribute it and/or modify it
|
|
under the same terms as Perl itself.
|
|
|
|
=cut
|
|
|
|
1;
|