Initial Commit
This commit is contained in:
395
database/perl/lib/Encode/Alias.pm
Normal file
395
database/perl/lib/Encode/Alias.pm
Normal file
@@ -0,0 +1,395 @@
|
||||
package Encode::Alias;
|
||||
use strict;
|
||||
use warnings;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.24 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
|
||||
|
||||
use Exporter 'import';
|
||||
|
||||
# Public, encouraged API is exported by default
|
||||
|
||||
our @EXPORT =
|
||||
qw (
|
||||
define_alias
|
||||
find_alias
|
||||
);
|
||||
|
||||
our @Alias; # ordered matching list
|
||||
our %Alias; # cached known aliases
|
||||
|
||||
sub find_alias {
|
||||
my $class = shift;
|
||||
my $find = shift;
|
||||
unless ( exists $Alias{$find} ) {
|
||||
$Alias{$find} = undef; # Recursion guard
|
||||
for ( my $i = 0 ; $i < @Alias ; $i += 2 ) {
|
||||
my $alias = $Alias[$i];
|
||||
my $val = $Alias[ $i + 1 ];
|
||||
my $new;
|
||||
if ( ref($alias) eq 'Regexp' && $find =~ $alias ) {
|
||||
DEBUG and warn "eval $val";
|
||||
$new = eval $val;
|
||||
DEBUG and $@ and warn "$val, $@";
|
||||
}
|
||||
elsif ( ref($alias) eq 'CODE' ) {
|
||||
DEBUG and warn "$alias", "->", "($find)";
|
||||
$new = $alias->($find);
|
||||
}
|
||||
elsif ( lc($find) eq lc($alias) ) {
|
||||
$new = $val;
|
||||
}
|
||||
if ( defined($new) ) {
|
||||
next if $new eq $find; # avoid (direct) recursion on bugs
|
||||
DEBUG and warn "$alias, $new";
|
||||
my $enc =
|
||||
( ref($new) ) ? $new : Encode::find_encoding($new);
|
||||
if ($enc) {
|
||||
$Alias{$find} = $enc;
|
||||
last;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# case insensitive search when canonical is not in all lowercase
|
||||
# RT ticket #7835
|
||||
unless ( $Alias{$find} ) {
|
||||
my $lcfind = lc($find);
|
||||
for my $name ( keys %Encode::Encoding, keys %Encode::ExtModule )
|
||||
{
|
||||
$lcfind eq lc($name) or next;
|
||||
$Alias{$find} = Encode::find_encoding($name);
|
||||
DEBUG and warn "$find => $name";
|
||||
}
|
||||
}
|
||||
}
|
||||
if (DEBUG) {
|
||||
my $name;
|
||||
if ( my $e = $Alias{$find} ) {
|
||||
$name = $e->name;
|
||||
}
|
||||
else {
|
||||
$name = "";
|
||||
}
|
||||
warn "find_alias($class, $find)->name = $name";
|
||||
}
|
||||
return $Alias{$find};
|
||||
}
|
||||
|
||||
sub define_alias {
|
||||
while (@_) {
|
||||
my $alias = shift;
|
||||
my $name = shift;
|
||||
unshift( @Alias, $alias => $name ) # newer one has precedence
|
||||
if defined $alias;
|
||||
if ( ref($alias) ) {
|
||||
|
||||
# clear %Alias cache to allow overrides
|
||||
my @a = keys %Alias;
|
||||
for my $k (@a) {
|
||||
if ( ref($alias) eq 'Regexp' && $k =~ $alias ) {
|
||||
DEBUG and warn "delete \$Alias\{$k\}";
|
||||
delete $Alias{$k};
|
||||
}
|
||||
elsif ( ref($alias) eq 'CODE' && $alias->($k) ) {
|
||||
DEBUG and warn "delete \$Alias\{$k\}";
|
||||
delete $Alias{$k};
|
||||
}
|
||||
}
|
||||
}
|
||||
elsif (defined $alias) {
|
||||
DEBUG and warn "delete \$Alias\{$alias\}";
|
||||
delete $Alias{$alias};
|
||||
}
|
||||
elsif (DEBUG) {
|
||||
require Carp;
|
||||
Carp::croak("undef \$alias");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# HACK: Encode must be used after define_alias is declarated as Encode calls define_alias
|
||||
use Encode ();
|
||||
|
||||
# Allow latin-1 style names as well
|
||||
# 0 1 2 3 4 5 6 7 8 9 10
|
||||
our @Latin2iso = ( 0, 1, 2, 3, 4, 9, 10, 13, 14, 15, 16 );
|
||||
|
||||
# Allow winlatin1 style names as well
|
||||
our %Winlatin2cp = (
|
||||
'latin1' => 1252,
|
||||
'latin2' => 1250,
|
||||
'cyrillic' => 1251,
|
||||
'greek' => 1253,
|
||||
'turkish' => 1254,
|
||||
'hebrew' => 1255,
|
||||
'arabic' => 1256,
|
||||
'baltic' => 1257,
|
||||
'vietnamese' => 1258,
|
||||
);
|
||||
|
||||
init_aliases();
|
||||
|
||||
sub undef_aliases {
|
||||
@Alias = ();
|
||||
%Alias = ();
|
||||
}
|
||||
|
||||
sub init_aliases {
|
||||
undef_aliases();
|
||||
|
||||
# Try all-lower-case version should all else fails
|
||||
define_alias( qr/^(.*)$/ => '"\L$1"' );
|
||||
|
||||
# UTF/UCS stuff
|
||||
define_alias( qr/^(unicode-1-1-)?UTF-?7$/i => '"UTF-7"' );
|
||||
define_alias( qr/^UCS-?2-?LE$/i => '"UCS-2LE"' );
|
||||
define_alias(
|
||||
qr/^UCS-?2-?(BE)?$/i => '"UCS-2BE"',
|
||||
qr/^UCS-?4-?(BE|LE|)?$/i => 'uc("UTF-32$1")',
|
||||
qr/^iso-10646-1$/i => '"UCS-2BE"'
|
||||
);
|
||||
define_alias(
|
||||
qr/^UTF-?(16|32)-?BE$/i => '"UTF-$1BE"',
|
||||
qr/^UTF-?(16|32)-?LE$/i => '"UTF-$1LE"',
|
||||
qr/^UTF-?(16|32)$/i => '"UTF-$1"',
|
||||
);
|
||||
|
||||
# ASCII
|
||||
define_alias( qr/^(?:US-?)ascii$/i => '"ascii"' );
|
||||
define_alias( 'C' => 'ascii' );
|
||||
define_alias( qr/\b(?:ISO[-_]?)?646(?:[-_]?US)?$/i => '"ascii"' );
|
||||
|
||||
# Allow variants of iso-8859-1 etc.
|
||||
define_alias( qr/\biso[-_]?(\d+)[-_](\d+)$/i => '"iso-$1-$2"' );
|
||||
|
||||
# At least HP-UX has these.
|
||||
define_alias( qr/\biso8859(\d+)$/i => '"iso-8859-$1"' );
|
||||
|
||||
# More HP stuff.
|
||||
define_alias(
|
||||
qr/\b(?:hp-)?(arabic|greek|hebrew|kana|roman|thai|turkish)8$/i =>
|
||||
'"${1}8"' );
|
||||
|
||||
# The Official name of ASCII.
|
||||
define_alias( qr/\bANSI[-_]?X3\.4[-_]?1968$/i => '"ascii"' );
|
||||
|
||||
# This is a font issue, not an encoding issue.
|
||||
# (The currency symbol of the Latin 1 upper half
|
||||
# has been redefined as the euro symbol.)
|
||||
define_alias( qr/^(.+)\@euro$/i => '"$1"' );
|
||||
|
||||
define_alias( qr/\b(?:iso[-_]?)?latin[-_]?(\d+)$/i =>
|
||||
'defined $Encode::Alias::Latin2iso[$1] ? "iso-8859-$Encode::Alias::Latin2iso[$1]" : undef'
|
||||
);
|
||||
|
||||
define_alias(
|
||||
qr/\bwin(latin[12]|cyrillic|baltic|greek|turkish|
|
||||
hebrew|arabic|baltic|vietnamese)$/ix =>
|
||||
'"cp" . $Encode::Alias::Winlatin2cp{lc($1)}'
|
||||
);
|
||||
|
||||
# Common names for non-latin preferred MIME names
|
||||
define_alias(
|
||||
'ascii' => 'US-ascii',
|
||||
'cyrillic' => 'iso-8859-5',
|
||||
'arabic' => 'iso-8859-6',
|
||||
'greek' => 'iso-8859-7',
|
||||
'hebrew' => 'iso-8859-8',
|
||||
'thai' => 'iso-8859-11',
|
||||
);
|
||||
# RT #20781
|
||||
define_alias(qr/\btis-?620\b/i => '"iso-8859-11"');
|
||||
|
||||
# At least AIX has IBM-NNN (surprisingly...) instead of cpNNN.
|
||||
# And Microsoft has their own naming (again, surprisingly).
|
||||
# And windows-* is registered in IANA!
|
||||
define_alias(
|
||||
qr/\b(?:cp|ibm|ms|windows)[-_ ]?(\d{2,4})$/i => '"cp$1"' );
|
||||
|
||||
# Sometimes seen with a leading zero.
|
||||
# define_alias( qr/\bcp037\b/i => '"cp37"');
|
||||
|
||||
# Mac Mappings
|
||||
# predefined in *.ucm; unneeded
|
||||
# define_alias( qr/\bmacIcelandic$/i => '"macIceland"');
|
||||
define_alias( qr/^(?:x[_-])?mac[_-](.*)$/i => '"mac$1"' );
|
||||
# http://rt.cpan.org/Ticket/Display.html?id=36326
|
||||
define_alias( qr/^macintosh$/i => '"MacRoman"' );
|
||||
# https://rt.cpan.org/Ticket/Display.html?id=78125
|
||||
define_alias( qr/^macce$/i => '"MacCentralEurRoman"' );
|
||||
# Ououououou. gone. They are different!
|
||||
# define_alias( qr/\bmacRomanian$/i => '"macRumanian"');
|
||||
|
||||
# Standardize on the dashed versions.
|
||||
define_alias( qr/\bkoi8[\s\-_]*([ru])$/i => '"koi8-$1"' );
|
||||
|
||||
unless ($Encode::ON_EBCDIC) {
|
||||
|
||||
# for Encode::CN
|
||||
define_alias( qr/\beuc.*cn$/i => '"euc-cn"' );
|
||||
define_alias( qr/\bcn.*euc$/i => '"euc-cn"' );
|
||||
|
||||
# define_alias( qr/\bGB[- ]?(\d+)$/i => '"euc-cn"' )
|
||||
# CP936 doesn't have vendor-addon for GBK, so they're identical.
|
||||
define_alias( qr/^gbk$/i => '"cp936"' );
|
||||
|
||||
# This fixes gb2312 vs. euc-cn confusion, practically
|
||||
define_alias( qr/\bGB[-_ ]?2312(?!-?raw)/i => '"euc-cn"' );
|
||||
|
||||
# for Encode::JP
|
||||
define_alias( qr/\bjis$/i => '"7bit-jis"' );
|
||||
define_alias( qr/\beuc.*jp$/i => '"euc-jp"' );
|
||||
define_alias( qr/\bjp.*euc$/i => '"euc-jp"' );
|
||||
define_alias( qr/\bujis$/i => '"euc-jp"' );
|
||||
define_alias( qr/\bshift.*jis$/i => '"shiftjis"' );
|
||||
define_alias( qr/\bsjis$/i => '"shiftjis"' );
|
||||
define_alias( qr/\bwindows-31j$/i => '"cp932"' );
|
||||
|
||||
# for Encode::KR
|
||||
define_alias( qr/\beuc.*kr$/i => '"euc-kr"' );
|
||||
define_alias( qr/\bkr.*euc$/i => '"euc-kr"' );
|
||||
|
||||
# This fixes ksc5601 vs. euc-kr confusion, practically
|
||||
define_alias( qr/(?:x-)?uhc$/i => '"cp949"' );
|
||||
define_alias( qr/(?:x-)?windows-949$/i => '"cp949"' );
|
||||
define_alias( qr/\bks_c_5601-1987$/i => '"cp949"' );
|
||||
|
||||
# for Encode::TW
|
||||
define_alias( qr/\bbig-?5$/i => '"big5-eten"' );
|
||||
define_alias( qr/\bbig5-?et(?:en)?$/i => '"big5-eten"' );
|
||||
define_alias( qr/\btca[-_]?big5$/i => '"big5-eten"' );
|
||||
define_alias( qr/\bbig5-?hk(?:scs)?$/i => '"big5-hkscs"' );
|
||||
define_alias( qr/\bhk(?:scs)?[-_]?big5$/i => '"big5-hkscs"' );
|
||||
}
|
||||
|
||||
# https://github.com/dankogai/p5-encode/issues/37
|
||||
define_alias(qr/cp65000/i => '"UTF-7"');
|
||||
define_alias(qr/cp65001/i => '"utf-8-strict"');
|
||||
|
||||
# utf8 is blessed :)
|
||||
define_alias( qr/\bUTF-8$/i => '"utf-8-strict"' );
|
||||
|
||||
# At last, Map white space and _ to '-'
|
||||
define_alias( qr/^([^\s_]+)[\s_]+([^\s_]*)$/i => '"$1-$2"' );
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
# TODO: HP-UX '8' encodings arabic8 greek8 hebrew8 kana8 thai8 turkish8
|
||||
# TODO: HP-UX '15' encodings japanese15 korean15 roi15
|
||||
# TODO: Cyrillic encoding ISO-IR-111 (useful?)
|
||||
# TODO: Armenian encoding ARMSCII-8
|
||||
# TODO: Hebrew encoding ISO-8859-8-1
|
||||
# TODO: Thai encoding TCVN
|
||||
# TODO: Vietnamese encodings VPS
|
||||
# TODO: Mac Asian+African encodings: Arabic Armenian Bengali Burmese
|
||||
# ChineseSimp ChineseTrad Devanagari Ethiopic ExtArabic
|
||||
# Farsi Georgian Gujarati Gurmukhi Hebrew Japanese
|
||||
# Kannada Khmer Korean Laotian Malayalam Mongolian
|
||||
# Oriya Sinhalese Symbol Tamil Telugu Tibetan Vietnamese
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::Alias - alias definitions to encodings
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode;
|
||||
use Encode::Alias;
|
||||
define_alias( "newName" => ENCODING);
|
||||
define_alias( qr/.../ => ENCODING);
|
||||
define_alias( sub { return ENCODING if ...; } );
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Allows newName to be used as an alias for ENCODING. ENCODING may be
|
||||
either the name of an encoding or an encoding object (as described
|
||||
in L<Encode>).
|
||||
|
||||
Currently the first argument to define_alias() can be specified in the
|
||||
following ways:
|
||||
|
||||
=over 4
|
||||
|
||||
=item As a simple string.
|
||||
|
||||
=item As a qr// compiled regular expression, e.g.:
|
||||
|
||||
define_alias( qr/^iso8859-(\d+)$/i => '"iso-8859-$1"' );
|
||||
|
||||
In this case, if I<ENCODING> is not a reference, it is C<eval>-ed
|
||||
in order to allow C<$1> etc. to be substituted. The example is one
|
||||
way to alias names as used in X11 fonts to the MIME names for the
|
||||
iso-8859-* family. Note the double quotes inside the single quotes.
|
||||
|
||||
(or, you don't have to do this yourself because this example is predefined)
|
||||
|
||||
If you are using a regex here, you have to use the quotes as shown or
|
||||
it won't work. Also note that regex handling is tricky even for the
|
||||
experienced. Use this feature with caution.
|
||||
|
||||
=item As a code reference, e.g.:
|
||||
|
||||
define_alias( sub {shift =~ /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
|
||||
|
||||
The same effect as the example above in a different way. The coderef
|
||||
takes the alias name as an argument and returns a canonical name on
|
||||
success or undef if not. Note the second argument is ignored if provided.
|
||||
Use this with even more caution than the regex version.
|
||||
|
||||
=back
|
||||
|
||||
=head3 Changes in code reference aliasing
|
||||
|
||||
As of Encode 1.87, the older form
|
||||
|
||||
define_alias( sub { return /^iso8859-(\d+)$/i ? "iso-8859-$1" : undef } );
|
||||
|
||||
no longer works.
|
||||
|
||||
Encode up to 1.86 internally used "local $_" to implement this older
|
||||
form. But consider the code below;
|
||||
|
||||
use Encode;
|
||||
$_ = "eeeee" ;
|
||||
while (/(e)/g) {
|
||||
my $utf = decode('aliased-encoding-name', $1);
|
||||
print "position:",pos,"\n";
|
||||
}
|
||||
|
||||
Prior to Encode 1.86 this fails because of "local $_".
|
||||
|
||||
=head2 Alias overloading
|
||||
|
||||
You can override predefined aliases by simply applying define_alias().
|
||||
The new alias is always evaluated first, and when necessary,
|
||||
define_alias() flushes the internal cache to make the new definition
|
||||
available.
|
||||
|
||||
# redirect SHIFT_JIS to MS/IBM Code Page 932, which is a
|
||||
# superset of SHIFT_JIS
|
||||
|
||||
define_alias( qr/shift.*jis$/i => '"cp932"' );
|
||||
define_alias( qr/sjis$/i => '"cp932"' );
|
||||
|
||||
If you want to zap all predefined aliases, you can use
|
||||
|
||||
Encode::Alias->undef_aliases;
|
||||
|
||||
to do so. And
|
||||
|
||||
Encode::Alias->init_aliases;
|
||||
|
||||
gets the factory settings back.
|
||||
|
||||
Note that define_alias() will not be able to override the canonical name
|
||||
of encodings. Encodings are first looked up by canonical name before
|
||||
potential aliases are tried.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>, L<Encode::Supported>
|
||||
|
||||
=cut
|
||||
|
||||
120
database/perl/lib/Encode/Byte.pm
Normal file
120
database/perl/lib/Encode/Byte.pm
Normal file
@@ -0,0 +1,120 @@
|
||||
package Encode::Byte;
|
||||
use strict;
|
||||
use warnings;
|
||||
use Encode;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use XSLoader;
|
||||
XSLoader::load( __PACKAGE__, $VERSION );
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::Byte - Single Byte Encodings
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$greek = encode("iso-8859-7", $utf8); # loads Encode::Byte implicitly
|
||||
$utf8 = decode("iso-8859-7", $greek); # ditto
|
||||
|
||||
=head1 ABSTRACT
|
||||
|
||||
This module implements various single byte encodings. For most cases it uses
|
||||
\x80-\xff (upper half) to map non-ASCII characters. Encodings
|
||||
supported are as follows.
|
||||
|
||||
Canonical Alias Description
|
||||
--------------------------------------------------------------------
|
||||
# ISO 8859 series
|
||||
(iso-8859-1 is in built-in)
|
||||
iso-8859-2 latin2 [ISO]
|
||||
iso-8859-3 latin3 [ISO]
|
||||
iso-8859-4 latin4 [ISO]
|
||||
iso-8859-5 [ISO]
|
||||
iso-8859-6 [ISO]
|
||||
iso-8859-7 [ISO]
|
||||
iso-8859-8 [ISO]
|
||||
iso-8859-9 latin5 [ISO]
|
||||
iso-8859-10 latin6 [ISO]
|
||||
iso-8859-11
|
||||
(iso-8859-12 is nonexistent)
|
||||
iso-8859-13 latin7 [ISO]
|
||||
iso-8859-14 latin8 [ISO]
|
||||
iso-8859-15 latin9 [ISO]
|
||||
iso-8859-16 latin10 [ISO]
|
||||
|
||||
# Cyrillic
|
||||
koi8-f
|
||||
koi8-r cp878 [RFC1489]
|
||||
koi8-u [RFC2319]
|
||||
|
||||
# Vietnamese
|
||||
viscii
|
||||
|
||||
# all cp* are also available as ibm-*, ms-*, and windows-*
|
||||
# also see L<http://msdn.microsoft.com/en-us/library/aa752010%28VS.85%29.aspx>
|
||||
|
||||
cp424
|
||||
cp437
|
||||
cp737
|
||||
cp775
|
||||
cp850
|
||||
cp852
|
||||
cp855
|
||||
cp856
|
||||
cp857
|
||||
cp860
|
||||
cp861
|
||||
cp862
|
||||
cp863
|
||||
cp864
|
||||
cp865
|
||||
cp866
|
||||
cp869
|
||||
cp874
|
||||
cp1006
|
||||
cp1250 WinLatin2
|
||||
cp1251 WinCyrillic
|
||||
cp1252 WinLatin1
|
||||
cp1253 WinGreek
|
||||
cp1254 WinTurkish
|
||||
cp1255 WinHebrew
|
||||
cp1256 WinArabic
|
||||
cp1257 WinBaltic
|
||||
cp1258 WinVietnamese
|
||||
|
||||
# Macintosh
|
||||
# Also see L<http://developer.apple.com/technotes/tn/tn1150.html>
|
||||
MacArabic
|
||||
MacCentralEurRoman
|
||||
MacCroatian
|
||||
MacCyrillic
|
||||
MacFarsi
|
||||
MacGreek
|
||||
MacHebrew
|
||||
MacIcelandic
|
||||
MacRoman
|
||||
MacRomanian
|
||||
MacRumanian
|
||||
MacSami
|
||||
MacThai
|
||||
MacTurkish
|
||||
MacUkrainian
|
||||
|
||||
# More vendor encodings
|
||||
AdobeStandardEncoding
|
||||
nextstep
|
||||
hp-roman8
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
To find how to use this module in detail, see L<Encode>.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>
|
||||
|
||||
=cut
|
||||
66
database/perl/lib/Encode/CJKConstants.pm
Normal file
66
database/perl/lib/Encode/CJKConstants.pm
Normal file
@@ -0,0 +1,66 @@
|
||||
#
|
||||
# $Id: CJKConstants.pm,v 2.2 2006/06/03 20:28:48 dankogai Exp $
|
||||
#
|
||||
|
||||
package Encode::CJKConstants;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
our $RCSID = q$Id: CJKConstants.pm,v 2.2 2006/06/03 20:28:48 dankogai Exp $;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use Carp;
|
||||
|
||||
require Exporter;
|
||||
our @ISA = qw(Exporter);
|
||||
our @EXPORT = qw();
|
||||
our @EXPORT_OK = qw(%CHARCODE %ESC %RE);
|
||||
our %EXPORT_TAGS = ( 'all' => [ @EXPORT_OK, @EXPORT ] );
|
||||
|
||||
my %_0208 = (
|
||||
1978 => '\e\$\@',
|
||||
1983 => '\e\$B',
|
||||
1990 => '\e&\@\e\$B',
|
||||
);
|
||||
|
||||
our %CHARCODE = (
|
||||
UNDEF_EUC => "\xa2\xae", # <20><> in EUC
|
||||
UNDEF_SJIS => "\x81\xac", # <20><> in SJIS
|
||||
UNDEF_JIS => "\xa2\xf7", # <20><> -- used in unicode
|
||||
UNDEF_UNICODE => "\x20\x20", # <20><> -- used in unicode
|
||||
);
|
||||
|
||||
our %ESC = (
|
||||
GB_2312 => "\e\$A",
|
||||
JIS_0208 => "\e\$B",
|
||||
JIS_0212 => "\e\$(D",
|
||||
KSC_5601 => "\e\$(C",
|
||||
ASC => "\e\(B",
|
||||
KANA => "\e\(I",
|
||||
'2022_KR' => "\e\$)C",
|
||||
);
|
||||
|
||||
our %RE = (
|
||||
ASCII => '[\x00-\x7f]',
|
||||
BIN => '[\x00-\x06\x7f\xff]',
|
||||
EUC_0212 => '\x8f[\xa1-\xfe][\xa1-\xfe]',
|
||||
EUC_C => '[\xa1-\xfe][\xa1-\xfe]',
|
||||
EUC_KANA => '\x8e[\xa1-\xdf]',
|
||||
JIS_0208 => "$_0208{1978}|$_0208{1983}|$_0208{1990}",
|
||||
JIS_0212 => "\e" . '\$\(D',
|
||||
ISO_ASC => "\e" . '\([BJ]',
|
||||
JIS_KANA => "\e" . '\(I',
|
||||
'2022_KR' => "\e" . '\$\)C',
|
||||
SJIS_C => '[\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc]',
|
||||
SJIS_KANA => '[\xa1-\xdf]',
|
||||
UTF8 => '[\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]'
|
||||
);
|
||||
|
||||
1;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::CJKConstants.pm -- Internally used by Encode::??::ISO_2022_*
|
||||
|
||||
=cut
|
||||
|
||||
74
database/perl/lib/Encode/CN.pm
Normal file
74
database/perl/lib/Encode/CN.pm
Normal file
@@ -0,0 +1,74 @@
|
||||
package Encode::CN;
|
||||
BEGIN {
|
||||
if ( ord("A") == 193 ) {
|
||||
die "Encode::CN not supported on EBCDIC\n";
|
||||
}
|
||||
}
|
||||
use strict;
|
||||
use warnings;
|
||||
use Encode;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
use XSLoader;
|
||||
XSLoader::load( __PACKAGE__, $VERSION );
|
||||
|
||||
# Relocated from Encode.pm
|
||||
|
||||
use Encode::CN::HZ;
|
||||
|
||||
# use Encode::CN::2022_CN;
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::CN - China-based Chinese Encodings
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$euc_cn = encode("euc-cn", $utf8); # loads Encode::CN implicitly
|
||||
$utf8 = decode("euc-cn", $euc_cn); # ditto
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module implements China-based Chinese charset encodings.
|
||||
Encodings supported are as follows.
|
||||
|
||||
Canonical Alias Description
|
||||
--------------------------------------------------------------------
|
||||
euc-cn /\beuc.*cn$/i EUC (Extended Unix Character)
|
||||
/\bcn.*euc$/i
|
||||
/\bGB[-_ ]?2312(?:\D.*$|$)/i (see below)
|
||||
gb2312-raw The raw (low-bit) GB2312 character map
|
||||
gb12345-raw Traditional chinese counterpart to
|
||||
GB2312 (raw)
|
||||
iso-ir-165 GB2312 + GB6345 + GB8565 + additions
|
||||
MacChineseSimp GB2312 + Apple Additions
|
||||
cp936 Code Page 936, also known as GBK
|
||||
(Extended GuoBiao)
|
||||
hz 7-bit escaped GB2312 encoding
|
||||
--------------------------------------------------------------------
|
||||
|
||||
To find how to use this module in detail, see L<Encode>.
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
Due to size concerns, C<GB 18030> (an extension to C<GBK>) is distributed
|
||||
separately on CPAN, under the name L<Encode::HanExtra>. That module
|
||||
also contains extra Taiwan-based encodings.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
When you see C<charset=gb2312> on mails and web pages, they really
|
||||
mean C<euc-cn> encodings. To fix that, C<gb2312> is aliased to C<euc-cn>.
|
||||
Use C<gb2312-raw> when you really mean it.
|
||||
|
||||
The ASCII region (0x00-0x7f) is preserved for all encodings, even though
|
||||
this conflicts with mappings by the Unicode Consortium.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>
|
||||
|
||||
=cut
|
||||
201
database/perl/lib/Encode/CN/HZ.pm
Normal file
201
database/perl/lib/Encode/CN/HZ.pm
Normal file
@@ -0,0 +1,201 @@
|
||||
package Encode::CN::HZ;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use utf8 ();
|
||||
|
||||
use vars qw($VERSION);
|
||||
$VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use Encode qw(:fallbacks);
|
||||
|
||||
use parent qw(Encode::Encoding);
|
||||
__PACKAGE__->Define('hz');
|
||||
|
||||
# HZ is a combination of ASCII and escaped GB, so we implement it
|
||||
# with the GB2312(raw) encoding here. Cf. RFCs 1842 & 1843.
|
||||
|
||||
# not ported for EBCDIC. Which should be used, "~" or "\x7E"?
|
||||
|
||||
sub needs_lines { 1 }
|
||||
|
||||
sub decode ($$;$) {
|
||||
my ( $obj, $str, $chk ) = @_;
|
||||
return undef unless defined $str;
|
||||
|
||||
my $GB = Encode::find_encoding('gb2312-raw');
|
||||
my $ret = substr($str, 0, 0); # to propagate taintedness
|
||||
my $in_ascii = 1; # default mode is ASCII.
|
||||
|
||||
while ( length $str ) {
|
||||
if ($in_ascii) { # ASCII mode
|
||||
if ( $str =~ s/^([\x00-\x7D\x7F]+)// ) { # no '~' => ASCII
|
||||
$ret .= $1;
|
||||
|
||||
# EBCDIC should need ascii2native, but not ported.
|
||||
}
|
||||
elsif ( $str =~ s/^\x7E\x7E// ) { # escaped tilde
|
||||
$ret .= '~';
|
||||
}
|
||||
elsif ( $str =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
|
||||
1; # no-op
|
||||
}
|
||||
elsif ( $str =~ s/^\x7E\x7B// ) { # '~{'
|
||||
$in_ascii = 0; # to GB
|
||||
}
|
||||
else { # encounters an invalid escape, \x80 or greater
|
||||
last;
|
||||
}
|
||||
}
|
||||
else { # GB mode; the byte ranges are as in RFC 1843.
|
||||
no warnings 'uninitialized';
|
||||
if ( $str =~ s/^((?:[\x21-\x77][\x21-\x7E])+)// ) {
|
||||
my $prefix = $1;
|
||||
$ret .= $GB->decode( $prefix, $chk );
|
||||
}
|
||||
elsif ( $str =~ s/^\x7E\x7D// ) { # '~}'
|
||||
$in_ascii = 1;
|
||||
}
|
||||
else { # invalid
|
||||
last;
|
||||
}
|
||||
}
|
||||
}
|
||||
$_[1] = '' if $chk; # needs_lines guarantees no partial character
|
||||
return $ret;
|
||||
}
|
||||
|
||||
sub cat_decode {
|
||||
my ( $obj, undef, $src, $pos, $trm, $chk ) = @_;
|
||||
my ( $rdst, $rsrc, $rpos ) = \@_[ 1 .. 3 ];
|
||||
|
||||
my $GB = Encode::find_encoding('gb2312-raw');
|
||||
my $ret = '';
|
||||
my $in_ascii = 1; # default mode is ASCII.
|
||||
|
||||
my $ini_pos = pos($$rsrc);
|
||||
|
||||
substr( $src, 0, $pos ) = '';
|
||||
|
||||
my $ini_len = bytes::length($src);
|
||||
|
||||
# $trm is the first of the pair '~~', then 2nd tilde is to be removed.
|
||||
# XXX: Is better C<$src =~ s/^\x7E// or die if ...>?
|
||||
$src =~ s/^\x7E// if $trm eq "\x7E";
|
||||
|
||||
while ( length $src ) {
|
||||
my $now;
|
||||
if ($in_ascii) { # ASCII mode
|
||||
if ( $src =~ s/^([\x00-\x7D\x7F])// ) { # no '~' => ASCII
|
||||
$now = $1;
|
||||
}
|
||||
elsif ( $src =~ s/^\x7E\x7E// ) { # escaped tilde
|
||||
$now = '~';
|
||||
}
|
||||
elsif ( $src =~ s/^\x7E\cJ// ) { # '\cJ' == LF in ASCII
|
||||
next;
|
||||
}
|
||||
elsif ( $src =~ s/^\x7E\x7B// ) { # '~{'
|
||||
$in_ascii = 0; # to GB
|
||||
next;
|
||||
}
|
||||
else { # encounters an invalid escape, \x80 or greater
|
||||
last;
|
||||
}
|
||||
}
|
||||
else { # GB mode; the byte ranges are as in RFC 1843.
|
||||
if ( $src =~ s/^((?:[\x21-\x77][\x21-\x7F])+)// ) {
|
||||
$now = $GB->decode( $1, $chk );
|
||||
}
|
||||
elsif ( $src =~ s/^\x7E\x7D// ) { # '~}'
|
||||
$in_ascii = 1;
|
||||
next;
|
||||
}
|
||||
else { # invalid
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
||||
next if !defined $now;
|
||||
|
||||
$ret .= $now;
|
||||
|
||||
if ( $now eq $trm ) {
|
||||
$$rdst .= $ret;
|
||||
$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
|
||||
pos($$rsrc) = $ini_pos;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
$$rdst .= $ret;
|
||||
$$rpos = $ini_pos + $pos + $ini_len - bytes::length($src);
|
||||
pos($$rsrc) = $ini_pos;
|
||||
return ''; # terminator not found
|
||||
}
|
||||
|
||||
sub encode($$;$) {
|
||||
my ( $obj, $str, $chk ) = @_;
|
||||
return undef unless defined $str;
|
||||
|
||||
my $GB = Encode::find_encoding('gb2312-raw');
|
||||
my $ret = substr($str, 0, 0); # to propagate taintedness;
|
||||
my $in_ascii = 1; # default mode is ASCII.
|
||||
|
||||
no warnings 'utf8'; # $str may be malformed UTF8 at the end of a chunk.
|
||||
|
||||
while ( length $str ) {
|
||||
if ( $str =~ s/^([[:ascii:]]+)// ) {
|
||||
my $tmp = $1;
|
||||
$tmp =~ s/~/~~/g; # escapes tildes
|
||||
if ( !$in_ascii ) {
|
||||
$ret .= "\x7E\x7D"; # '~}'
|
||||
$in_ascii = 1;
|
||||
}
|
||||
$ret .= pack 'a*', $tmp; # remove UTF8 flag.
|
||||
}
|
||||
elsif ( $str =~ s/(.)// ) {
|
||||
my $s = $1;
|
||||
my $tmp = $GB->encode( $s, $chk || 0 );
|
||||
last if !defined $tmp;
|
||||
if ( length $tmp == 2 ) { # maybe a valid GB char (XXX)
|
||||
if ($in_ascii) {
|
||||
$ret .= "\x7E\x7B"; # '~{'
|
||||
$in_ascii = 0;
|
||||
}
|
||||
$ret .= $tmp;
|
||||
}
|
||||
elsif ( length $tmp ) { # maybe FALLBACK in ASCII (XXX)
|
||||
if ( !$in_ascii ) {
|
||||
$ret .= "\x7E\x7D"; # '~}'
|
||||
$in_ascii = 1;
|
||||
}
|
||||
$ret .= $tmp;
|
||||
}
|
||||
}
|
||||
else { # if $str is malformed UTF8 *and* if length $str != 0.
|
||||
last;
|
||||
}
|
||||
}
|
||||
$_[1] = $str if $chk;
|
||||
|
||||
# The state at the end of the chunk is discarded, even if in GB mode.
|
||||
# That results in the combination of GB-OUT and GB-IN, i.e. "~}~{".
|
||||
# Parhaps it is harmless, but further investigations may be required...
|
||||
|
||||
if ( !$in_ascii ) {
|
||||
$ret .= "\x7E\x7D"; # '~}'
|
||||
$in_ascii = 1;
|
||||
}
|
||||
utf8::encode($ret); # https://rt.cpan.org/Ticket/Display.html?id=35120
|
||||
return $ret;
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::CN::HZ -- internally used by Encode::CN
|
||||
|
||||
=cut
|
||||
7
database/perl/lib/Encode/Changes.e2x
Normal file
7
database/perl/lib/Encode/Changes.e2x
Normal file
@@ -0,0 +1,7 @@
|
||||
#
|
||||
# $Id: Changes.e2x,v 2.0 2004/05/16 20:55:15 dankogai Exp $
|
||||
# Revision history for Perl extension Encode::$_Name_.
|
||||
#
|
||||
|
||||
0.01 $_Now_
|
||||
Autogenerated by enc2xs version $_Version_.
|
||||
170
database/perl/lib/Encode/Config.pm
Normal file
170
database/perl/lib/Encode/Config.pm
Normal file
@@ -0,0 +1,170 @@
|
||||
#
|
||||
# Demand-load module list
|
||||
#
|
||||
package Encode::Config;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.5 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
our %ExtModule = (
|
||||
|
||||
# Encode::Byte
|
||||
#iso-8859-1 is in Encode.pm itself
|
||||
'iso-8859-2' => 'Encode::Byte',
|
||||
'iso-8859-3' => 'Encode::Byte',
|
||||
'iso-8859-4' => 'Encode::Byte',
|
||||
'iso-8859-5' => 'Encode::Byte',
|
||||
'iso-8859-6' => 'Encode::Byte',
|
||||
'iso-8859-7' => 'Encode::Byte',
|
||||
'iso-8859-8' => 'Encode::Byte',
|
||||
'iso-8859-9' => 'Encode::Byte',
|
||||
'iso-8859-10' => 'Encode::Byte',
|
||||
'iso-8859-11' => 'Encode::Byte',
|
||||
'iso-8859-13' => 'Encode::Byte',
|
||||
'iso-8859-14' => 'Encode::Byte',
|
||||
'iso-8859-15' => 'Encode::Byte',
|
||||
'iso-8859-16' => 'Encode::Byte',
|
||||
'koi8-f' => 'Encode::Byte',
|
||||
'koi8-r' => 'Encode::Byte',
|
||||
'koi8-u' => 'Encode::Byte',
|
||||
'viscii' => 'Encode::Byte',
|
||||
'cp424' => 'Encode::Byte',
|
||||
'cp437' => 'Encode::Byte',
|
||||
'cp737' => 'Encode::Byte',
|
||||
'cp775' => 'Encode::Byte',
|
||||
'cp850' => 'Encode::Byte',
|
||||
'cp852' => 'Encode::Byte',
|
||||
'cp855' => 'Encode::Byte',
|
||||
'cp856' => 'Encode::Byte',
|
||||
'cp857' => 'Encode::Byte',
|
||||
'cp858' => 'Encode::Byte',
|
||||
'cp860' => 'Encode::Byte',
|
||||
'cp861' => 'Encode::Byte',
|
||||
'cp862' => 'Encode::Byte',
|
||||
'cp863' => 'Encode::Byte',
|
||||
'cp864' => 'Encode::Byte',
|
||||
'cp865' => 'Encode::Byte',
|
||||
'cp866' => 'Encode::Byte',
|
||||
'cp869' => 'Encode::Byte',
|
||||
'cp874' => 'Encode::Byte',
|
||||
'cp1006' => 'Encode::Byte',
|
||||
'cp1250' => 'Encode::Byte',
|
||||
'cp1251' => 'Encode::Byte',
|
||||
'cp1252' => 'Encode::Byte',
|
||||
'cp1253' => 'Encode::Byte',
|
||||
'cp1254' => 'Encode::Byte',
|
||||
'cp1255' => 'Encode::Byte',
|
||||
'cp1256' => 'Encode::Byte',
|
||||
'cp1257' => 'Encode::Byte',
|
||||
'cp1258' => 'Encode::Byte',
|
||||
'AdobeStandardEncoding' => 'Encode::Byte',
|
||||
'MacArabic' => 'Encode::Byte',
|
||||
'MacCentralEurRoman' => 'Encode::Byte',
|
||||
'MacCroatian' => 'Encode::Byte',
|
||||
'MacCyrillic' => 'Encode::Byte',
|
||||
'MacFarsi' => 'Encode::Byte',
|
||||
'MacGreek' => 'Encode::Byte',
|
||||
'MacHebrew' => 'Encode::Byte',
|
||||
'MacIcelandic' => 'Encode::Byte',
|
||||
'MacRoman' => 'Encode::Byte',
|
||||
'MacRomanian' => 'Encode::Byte',
|
||||
'MacRumanian' => 'Encode::Byte',
|
||||
'MacSami' => 'Encode::Byte',
|
||||
'MacThai' => 'Encode::Byte',
|
||||
'MacTurkish' => 'Encode::Byte',
|
||||
'MacUkrainian' => 'Encode::Byte',
|
||||
'nextstep' => 'Encode::Byte',
|
||||
'hp-roman8' => 'Encode::Byte',
|
||||
#'gsm0338' => 'Encode::Byte',
|
||||
'gsm0338' => 'Encode::GSM0338',
|
||||
|
||||
# Encode::EBCDIC
|
||||
'cp37' => 'Encode::EBCDIC',
|
||||
'cp500' => 'Encode::EBCDIC',
|
||||
'cp875' => 'Encode::EBCDIC',
|
||||
'cp1026' => 'Encode::EBCDIC',
|
||||
'cp1047' => 'Encode::EBCDIC',
|
||||
'posix-bc' => 'Encode::EBCDIC',
|
||||
|
||||
# Encode::Symbol
|
||||
'dingbats' => 'Encode::Symbol',
|
||||
'symbol' => 'Encode::Symbol',
|
||||
'AdobeSymbol' => 'Encode::Symbol',
|
||||
'AdobeZdingbat' => 'Encode::Symbol',
|
||||
'MacDingbats' => 'Encode::Symbol',
|
||||
'MacSymbol' => 'Encode::Symbol',
|
||||
|
||||
# Encode::Unicode
|
||||
'UCS-2BE' => 'Encode::Unicode',
|
||||
'UCS-2LE' => 'Encode::Unicode',
|
||||
'UTF-16' => 'Encode::Unicode',
|
||||
'UTF-16BE' => 'Encode::Unicode',
|
||||
'UTF-16LE' => 'Encode::Unicode',
|
||||
'UTF-32' => 'Encode::Unicode',
|
||||
'UTF-32BE' => 'Encode::Unicode',
|
||||
'UTF-32LE' => 'Encode::Unicode',
|
||||
'UTF-7' => 'Encode::Unicode::UTF7',
|
||||
);
|
||||
|
||||
unless ( ord("A") == 193 ) {
|
||||
%ExtModule = (
|
||||
%ExtModule,
|
||||
'euc-cn' => 'Encode::CN',
|
||||
'gb12345-raw' => 'Encode::CN',
|
||||
'gb2312-raw' => 'Encode::CN',
|
||||
'hz' => 'Encode::CN',
|
||||
'iso-ir-165' => 'Encode::CN',
|
||||
'cp936' => 'Encode::CN',
|
||||
'MacChineseSimp' => 'Encode::CN',
|
||||
|
||||
'7bit-jis' => 'Encode::JP',
|
||||
'euc-jp' => 'Encode::JP',
|
||||
'iso-2022-jp' => 'Encode::JP',
|
||||
'iso-2022-jp-1' => 'Encode::JP',
|
||||
'jis0201-raw' => 'Encode::JP',
|
||||
'jis0208-raw' => 'Encode::JP',
|
||||
'jis0212-raw' => 'Encode::JP',
|
||||
'cp932' => 'Encode::JP',
|
||||
'MacJapanese' => 'Encode::JP',
|
||||
'shiftjis' => 'Encode::JP',
|
||||
|
||||
'euc-kr' => 'Encode::KR',
|
||||
'iso-2022-kr' => 'Encode::KR',
|
||||
'johab' => 'Encode::KR',
|
||||
'ksc5601-raw' => 'Encode::KR',
|
||||
'cp949' => 'Encode::KR',
|
||||
'MacKorean' => 'Encode::KR',
|
||||
|
||||
'big5-eten' => 'Encode::TW',
|
||||
'big5-hkscs' => 'Encode::TW',
|
||||
'cp950' => 'Encode::TW',
|
||||
'MacChineseTrad' => 'Encode::TW',
|
||||
|
||||
#'big5plus' => 'Encode::HanExtra',
|
||||
#'euc-tw' => 'Encode::HanExtra',
|
||||
#'gb18030' => 'Encode::HanExtra',
|
||||
|
||||
'MIME-Header' => 'Encode::MIME::Header',
|
||||
'MIME-B' => 'Encode::MIME::Header',
|
||||
'MIME-Q' => 'Encode::MIME::Header',
|
||||
|
||||
'MIME-Header-ISO_2022_JP' => 'Encode::MIME::Header::ISO_2022_JP',
|
||||
);
|
||||
}
|
||||
|
||||
#
|
||||
# Why not export ? to keep ConfigLocal Happy!
|
||||
#
|
||||
while ( my ( $enc, $mod ) = each %ExtModule ) {
|
||||
$Encode::ExtModule{$enc} = $mod;
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::Config -- internally used by Encode
|
||||
|
||||
=cut
|
||||
13
database/perl/lib/Encode/ConfigLocal_PM.e2x
Normal file
13
database/perl/lib/Encode/ConfigLocal_PM.e2x
Normal file
@@ -0,0 +1,13 @@
|
||||
#
|
||||
# Local demand-load module list
|
||||
#
|
||||
# You should not edit this file by hand! use "enc2xs -C"
|
||||
#
|
||||
package Encode::ConfigLocal;
|
||||
our $VERSION = $_LocalVer_;
|
||||
|
||||
use strict;
|
||||
|
||||
$_ModLines_
|
||||
|
||||
1;
|
||||
45
database/perl/lib/Encode/EBCDIC.pm
Normal file
45
database/perl/lib/Encode/EBCDIC.pm
Normal file
@@ -0,0 +1,45 @@
|
||||
package Encode::EBCDIC;
|
||||
use strict;
|
||||
use warnings;
|
||||
use Encode;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use XSLoader;
|
||||
XSLoader::load( __PACKAGE__, $VERSION );
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::EBCDIC - EBCDIC Encodings
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$posix_bc = encode("posix-bc", $utf8); # loads Encode::EBCDIC implicitly
|
||||
$utf8 = decode("", $posix_bc); # ditto
|
||||
|
||||
=head1 ABSTRACT
|
||||
|
||||
This module implements various EBCDIC-Based encodings. Encodings
|
||||
supported are as follows.
|
||||
|
||||
Canonical Alias Description
|
||||
--------------------------------------------------------------------
|
||||
cp37
|
||||
cp500
|
||||
cp875
|
||||
cp1026
|
||||
cp1047
|
||||
posix-bc
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
To find how to use this module in detail, see L<Encode>.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>, L<perlebcdic>
|
||||
|
||||
=cut
|
||||
253
database/perl/lib/Encode/Encoder.pm
Normal file
253
database/perl/lib/Encode/Encoder.pm
Normal file
@@ -0,0 +1,253 @@
|
||||
#
|
||||
# $Id: Encoder.pm,v 2.3 2013/09/14 07:51:59 dankogai Exp $
|
||||
#
|
||||
package Encode::Encoder;
|
||||
use strict;
|
||||
use warnings;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
require Exporter;
|
||||
our @ISA = qw(Exporter);
|
||||
our @EXPORT_OK = qw ( encoder );
|
||||
|
||||
our $AUTOLOAD;
|
||||
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
|
||||
use Encode qw(encode decode find_encoding from_to);
|
||||
use Carp;
|
||||
|
||||
sub new {
|
||||
my ( $class, $data, $encname ) = @_;
|
||||
unless ($encname) {
|
||||
$encname = Encode::is_utf8($data) ? 'utf8' : '';
|
||||
}
|
||||
else {
|
||||
my $obj = find_encoding($encname)
|
||||
or croak __PACKAGE__, ": unknown encoding: $encname";
|
||||
$encname = $obj->name;
|
||||
}
|
||||
my $self = {
|
||||
data => $data,
|
||||
encoding => $encname,
|
||||
};
|
||||
bless $self => $class;
|
||||
}
|
||||
|
||||
sub encoder { __PACKAGE__->new(@_) }
|
||||
|
||||
sub data {
|
||||
my ( $self, $data ) = @_;
|
||||
if ( defined $data ) {
|
||||
$self->{data} = $data;
|
||||
return $data;
|
||||
}
|
||||
else {
|
||||
return $self->{data};
|
||||
}
|
||||
}
|
||||
|
||||
sub encoding {
|
||||
my ( $self, $encname ) = @_;
|
||||
if ($encname) {
|
||||
my $obj = find_encoding($encname)
|
||||
or confess __PACKAGE__, ": unknown encoding: $encname";
|
||||
$self->{encoding} = $obj->name;
|
||||
return $self;
|
||||
}
|
||||
else {
|
||||
return $self->{encoding};
|
||||
}
|
||||
}
|
||||
|
||||
sub bytes {
|
||||
my ( $self, $encname ) = @_;
|
||||
$encname ||= $self->{encoding};
|
||||
my $obj = find_encoding($encname)
|
||||
or confess __PACKAGE__, ": unknown encoding: $encname";
|
||||
$self->{data} = $obj->decode( $self->{data}, 1 );
|
||||
$self->{encoding} = '';
|
||||
return $self;
|
||||
}
|
||||
|
||||
sub DESTROY { # defined so it won't autoload.
|
||||
DEBUG and warn shift;
|
||||
}
|
||||
|
||||
sub AUTOLOAD {
|
||||
my $self = shift;
|
||||
my $type = ref($self)
|
||||
or confess "$self is not an object";
|
||||
my $myname = $AUTOLOAD;
|
||||
$myname =~ s/.*://; # strip fully-qualified portion
|
||||
my $obj = find_encoding($myname)
|
||||
or confess __PACKAGE__, ": unknown encoding: $myname";
|
||||
DEBUG and warn $self->{encoding}, " => ", $obj->name;
|
||||
if ( $self->{encoding} ) {
|
||||
from_to( $self->{data}, $self->{encoding}, $obj->name, 1 );
|
||||
}
|
||||
else {
|
||||
$self->{data} = $obj->encode( $self->{data}, 1 );
|
||||
}
|
||||
$self->{encoding} = $obj->name;
|
||||
return $self;
|
||||
}
|
||||
|
||||
use overload
|
||||
q("") => sub { $_[0]->{data} },
|
||||
q(0+) => sub { use bytes(); bytes::length( $_[0]->{data} ) },
|
||||
fallback => 1,
|
||||
;
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::Encoder -- Object Oriented Encoder
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode::Encoder;
|
||||
# Encode::encode("ISO-8859-1", $data);
|
||||
Encode::Encoder->new($data)->iso_8859_1; # OOP way
|
||||
# shortcut
|
||||
use Encode::Encoder qw(encoder);
|
||||
encoder($data)->iso_8859_1;
|
||||
# you can stack them!
|
||||
encoder($data)->iso_8859_1->base64; # provided base64() is defined
|
||||
# you can use it as a decoder as well
|
||||
encoder($base64)->bytes('base64')->latin1;
|
||||
# stringified
|
||||
print encoder($data)->utf8->latin1; # prints the string in latin1
|
||||
# numified
|
||||
encoder("\x{abcd}\x{ef}g")->utf8 == 6; # true. bytes::length($data)
|
||||
|
||||
=head1 ABSTRACT
|
||||
|
||||
B<Encode::Encoder> allows you to use Encode in an object-oriented
|
||||
style. This is not only more intuitive than a functional approach,
|
||||
but also handier when you want to stack encodings. Suppose you want
|
||||
your UTF-8 string converted to Latin1 then Base64: you can simply say
|
||||
|
||||
my $base64 = encoder($utf8)->latin1->base64;
|
||||
|
||||
instead of
|
||||
|
||||
my $latin1 = encode("latin1", $utf8);
|
||||
my $base64 = encode_base64($utf8);
|
||||
|
||||
or the lazier and more convoluted
|
||||
|
||||
my $base64 = encode_base64(encode("latin1", $utf8));
|
||||
|
||||
=head1 Description
|
||||
|
||||
Here is how to use this module.
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
There are at least two instance variables stored in a hash reference,
|
||||
{data} and {encoding}.
|
||||
|
||||
=item *
|
||||
|
||||
When there is no method, it takes the method name as the name of the
|
||||
encoding and encodes the instance I<data> with I<encoding>. If successful,
|
||||
the instance I<encoding> is set accordingly.
|
||||
|
||||
=item *
|
||||
|
||||
You can retrieve the result via -E<gt>data but usually you don't have to
|
||||
because the stringify operator ("") is overridden to do exactly that.
|
||||
|
||||
=back
|
||||
|
||||
=head2 Predefined Methods
|
||||
|
||||
This module predefines the methods below:
|
||||
|
||||
=over 4
|
||||
|
||||
=item $e = Encode::Encoder-E<gt>new([$data, $encoding]);
|
||||
|
||||
returns an encoder object. Its data is initialized with $data if
|
||||
present, and its encoding is set to $encoding if present.
|
||||
|
||||
When $encoding is omitted, it defaults to utf8 if $data is already in
|
||||
utf8 or "" (empty string) otherwise.
|
||||
|
||||
=item encoder()
|
||||
|
||||
is an alias of Encode::Encoder-E<gt>new(). This one is exported on demand.
|
||||
|
||||
=item $e-E<gt>data([$data])
|
||||
|
||||
When $data is present, sets the instance data to $data and returns the
|
||||
object itself. Otherwise, the current instance data is returned.
|
||||
|
||||
=item $e-E<gt>encoding([$encoding])
|
||||
|
||||
When $encoding is present, sets the instance encoding to $encoding and
|
||||
returns the object itself. Otherwise, the current instance encoding is
|
||||
returned.
|
||||
|
||||
=item $e-E<gt>bytes([$encoding])
|
||||
|
||||
decodes instance data from $encoding, or the instance encoding if
|
||||
omitted. If the conversion is successful, the instance encoding
|
||||
will be set to "".
|
||||
|
||||
The name I<bytes> was deliberately picked to avoid namespace tainting
|
||||
-- this module may be used as a base class so method names that appear
|
||||
in Encode::Encoding are avoided.
|
||||
|
||||
=back
|
||||
|
||||
=head2 Example: base64 transcoder
|
||||
|
||||
This module is designed to work with L<Encode::Encoding>.
|
||||
To make the Base64 transcoder example above really work, you could
|
||||
write a module like this:
|
||||
|
||||
package Encode::Base64;
|
||||
use parent 'Encode::Encoding';
|
||||
__PACKAGE__->Define('base64');
|
||||
use MIME::Base64;
|
||||
sub encode{
|
||||
my ($obj, $data) = @_;
|
||||
return encode_base64($data);
|
||||
}
|
||||
sub decode{
|
||||
my ($obj, $data) = @_;
|
||||
return decode_base64($data);
|
||||
}
|
||||
1;
|
||||
__END__
|
||||
|
||||
And your caller module would be something like this:
|
||||
|
||||
use Encode::Encoder;
|
||||
use Encode::Base64;
|
||||
|
||||
# now you can really do the following
|
||||
|
||||
encoder($data)->iso_8859_1->base64;
|
||||
encoder($base64)->bytes('base64')->latin1;
|
||||
|
||||
=head2 Operator Overloading
|
||||
|
||||
This module overloads two operators, stringify ("") and numify (0+).
|
||||
|
||||
Stringify dumps the data inside the object.
|
||||
|
||||
Numify returns the number of bytes in the instance data.
|
||||
|
||||
They come in handy when you want to print or find the size of data.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>,
|
||||
L<Encode::Encoding>
|
||||
|
||||
=cut
|
||||
356
database/perl/lib/Encode/Encoding.pm
Normal file
356
database/perl/lib/Encode/Encoding.pm
Normal file
@@ -0,0 +1,356 @@
|
||||
package Encode::Encoding;
|
||||
|
||||
# Base class for classes which implement encodings
|
||||
use strict;
|
||||
use warnings;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
our @CARP_NOT = qw(Encode Encode::Encoder);
|
||||
|
||||
use Carp ();
|
||||
use Encode ();
|
||||
use Encode::MIME::Name;
|
||||
|
||||
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
|
||||
|
||||
sub Define {
|
||||
my $obj = shift;
|
||||
my $canonical = shift;
|
||||
$obj = bless { Name => $canonical }, $obj unless ref $obj;
|
||||
|
||||
# warn "$canonical => $obj\n";
|
||||
Encode::define_encoding( $obj, $canonical, @_ );
|
||||
}
|
||||
|
||||
sub name { return shift->{'Name'} }
|
||||
|
||||
sub mime_name {
|
||||
return Encode::MIME::Name::get_mime_name(shift->name);
|
||||
}
|
||||
|
||||
sub renew {
|
||||
my $self = shift;
|
||||
my $clone = bless {%$self} => ref($self);
|
||||
$clone->{renewed}++; # so the caller can see it
|
||||
DEBUG and warn $clone->{renewed};
|
||||
return $clone;
|
||||
}
|
||||
|
||||
sub renewed { return $_[0]->{renewed} || 0 }
|
||||
|
||||
*new_sequence = \&renew;
|
||||
|
||||
sub needs_lines { 0 }
|
||||
|
||||
sub perlio_ok {
|
||||
return eval { require PerlIO::encoding } ? 1 : 0;
|
||||
}
|
||||
|
||||
# (Temporary|legacy) methods
|
||||
|
||||
sub toUnicode { shift->decode(@_) }
|
||||
sub fromUnicode { shift->encode(@_) }
|
||||
|
||||
#
|
||||
# Needs to be overloaded or just croak
|
||||
#
|
||||
|
||||
sub encode {
|
||||
my $obj = shift;
|
||||
my $class = ref($obj) ? ref($obj) : $obj;
|
||||
Carp::croak( $class . "->encode() not defined!" );
|
||||
}
|
||||
|
||||
sub decode {
|
||||
my $obj = shift;
|
||||
my $class = ref($obj) ? ref($obj) : $obj;
|
||||
Carp::croak( $class . "->encode() not defined!" );
|
||||
}
|
||||
|
||||
sub DESTROY { }
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::Encoding - Encode Implementation Base Class
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
package Encode::MyEncoding;
|
||||
use parent qw(Encode::Encoding);
|
||||
|
||||
__PACKAGE__->Define(qw(myCanonical myAlias));
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
As mentioned in L<Encode>, encodings are (in the current
|
||||
implementation at least) defined as objects. The mapping of encoding
|
||||
name to object is via the C<%Encode::Encoding> hash. Though you can
|
||||
directly manipulate this hash, it is strongly encouraged to use this
|
||||
base class module and add encode() and decode() methods.
|
||||
|
||||
=head2 Methods you should implement
|
||||
|
||||
You are strongly encouraged to implement methods below, at least
|
||||
either encode() or decode().
|
||||
|
||||
=over 4
|
||||
|
||||
=item -E<gt>encode($string [,$check])
|
||||
|
||||
MUST return the octet sequence representing I<$string>.
|
||||
|
||||
=over 2
|
||||
|
||||
=item *
|
||||
|
||||
If I<$check> is true, it SHOULD modify I<$string> in place to remove
|
||||
the converted part (i.e. the whole string unless there is an error).
|
||||
If perlio_ok() is true, SHOULD becomes MUST.
|
||||
|
||||
=item *
|
||||
|
||||
If an error occurs, it SHOULD return the octet sequence for the
|
||||
fragment of string that has been converted and modify $string in-place
|
||||
to remove the converted part leaving it starting with the problem
|
||||
fragment. If perlio_ok() is true, SHOULD becomes MUST.
|
||||
|
||||
=item *
|
||||
|
||||
If I<$check> is false then C<encode> MUST make a "best effort" to
|
||||
convert the string - for example, by using a replacement character.
|
||||
|
||||
=back
|
||||
|
||||
=item -E<gt>decode($octets [,$check])
|
||||
|
||||
MUST return the string that I<$octets> represents.
|
||||
|
||||
=over 2
|
||||
|
||||
=item *
|
||||
|
||||
If I<$check> is true, it SHOULD modify I<$octets> in place to remove
|
||||
the converted part (i.e. the whole sequence unless there is an
|
||||
error). If perlio_ok() is true, SHOULD becomes MUST.
|
||||
|
||||
=item *
|
||||
|
||||
If an error occurs, it SHOULD return the fragment of string that has
|
||||
been converted and modify $octets in-place to remove the converted
|
||||
part leaving it starting with the problem fragment. If perlio_ok() is
|
||||
true, SHOULD becomes MUST.
|
||||
|
||||
=item *
|
||||
|
||||
If I<$check> is false then C<decode> should make a "best effort" to
|
||||
convert the string - for example by using Unicode's "\x{FFFD}" as a
|
||||
replacement character.
|
||||
|
||||
=back
|
||||
|
||||
=back
|
||||
|
||||
If you want your encoding to work with L<encoding> pragma, you should
|
||||
also implement the method below.
|
||||
|
||||
=over 4
|
||||
|
||||
=item -E<gt>cat_decode($destination, $octets, $offset, $terminator [,$check])
|
||||
|
||||
MUST decode I<$octets> with I<$offset> and concatenate it to I<$destination>.
|
||||
Decoding will terminate when $terminator (a string) appears in output.
|
||||
I<$offset> will be modified to the last $octets position at end of decode.
|
||||
Returns true if $terminator appears output, else returns false.
|
||||
|
||||
=back
|
||||
|
||||
=head2 Other methods defined in Encode::Encodings
|
||||
|
||||
You do not have to override methods shown below unless you have to.
|
||||
|
||||
=over 4
|
||||
|
||||
=item -E<gt>name
|
||||
|
||||
Predefined As:
|
||||
|
||||
sub name { return shift->{'Name'} }
|
||||
|
||||
MUST return the string representing the canonical name of the encoding.
|
||||
|
||||
=item -E<gt>mime_name
|
||||
|
||||
Predefined As:
|
||||
|
||||
sub mime_name{
|
||||
return Encode::MIME::Name::get_mime_name(shift->name);
|
||||
}
|
||||
|
||||
MUST return the string representing the IANA charset name of the encoding.
|
||||
|
||||
=item -E<gt>renew
|
||||
|
||||
Predefined As:
|
||||
|
||||
sub renew {
|
||||
my $self = shift;
|
||||
my $clone = bless { %$self } => ref($self);
|
||||
$clone->{renewed}++;
|
||||
return $clone;
|
||||
}
|
||||
|
||||
This method reconstructs the encoding object if necessary. If you need
|
||||
to store the state during encoding, this is where you clone your object.
|
||||
|
||||
PerlIO ALWAYS calls this method to make sure it has its own private
|
||||
encoding object.
|
||||
|
||||
=item -E<gt>renewed
|
||||
|
||||
Predefined As:
|
||||
|
||||
sub renewed { $_[0]->{renewed} || 0 }
|
||||
|
||||
Tells whether the object is renewed (and how many times). Some
|
||||
modules emit C<Use of uninitialized value in null operation> warning
|
||||
unless the value is numeric so return 0 for false.
|
||||
|
||||
=item -E<gt>perlio_ok()
|
||||
|
||||
Predefined As:
|
||||
|
||||
sub perlio_ok {
|
||||
return eval { require PerlIO::encoding } ? 1 : 0;
|
||||
}
|
||||
|
||||
If your encoding does not support PerlIO for some reasons, just;
|
||||
|
||||
sub perlio_ok { 0 }
|
||||
|
||||
=item -E<gt>needs_lines()
|
||||
|
||||
Predefined As:
|
||||
|
||||
sub needs_lines { 0 };
|
||||
|
||||
If your encoding can work with PerlIO but needs line buffering, you
|
||||
MUST define this method so it returns true. 7bit ISO-2022 encodings
|
||||
are one example that needs this. When this method is missing, false
|
||||
is assumed.
|
||||
|
||||
=back
|
||||
|
||||
=head2 Example: Encode::ROT13
|
||||
|
||||
package Encode::ROT13;
|
||||
use strict;
|
||||
use parent qw(Encode::Encoding);
|
||||
|
||||
__PACKAGE__->Define('rot13');
|
||||
|
||||
sub encode($$;$){
|
||||
my ($obj, $str, $chk) = @_;
|
||||
$str =~ tr/A-Za-z/N-ZA-Mn-za-m/;
|
||||
$_[1] = '' if $chk; # this is what in-place edit means
|
||||
return $str;
|
||||
}
|
||||
|
||||
# Jr pna or ynml yvxr guvf;
|
||||
*decode = \&encode;
|
||||
|
||||
1;
|
||||
|
||||
=head1 Why the heck Encode API is different?
|
||||
|
||||
It should be noted that the I<$check> behaviour is different from the
|
||||
outer public API. The logic is that the "unchecked" case is useful
|
||||
when the encoding is part of a stream which may be reporting errors
|
||||
(e.g. STDERR). In such cases, it is desirable to get everything
|
||||
through somehow without causing additional errors which obscure the
|
||||
original one. Also, the encoding is best placed to know what the
|
||||
correct replacement character is, so if that is the desired behaviour
|
||||
then letting low level code do it is the most efficient.
|
||||
|
||||
By contrast, if I<$check> is true, the scheme above allows the
|
||||
encoding to do as much as it can and tell the layer above how much
|
||||
that was. What is lacking at present is a mechanism to report what
|
||||
went wrong. The most likely interface will be an additional method
|
||||
call to the object, or perhaps (to avoid forcing per-stream objects
|
||||
on otherwise stateless encodings) an additional parameter.
|
||||
|
||||
It is also highly desirable that encoding classes inherit from
|
||||
C<Encode::Encoding> as a base class. This allows that class to define
|
||||
additional behaviour for all encoding objects.
|
||||
|
||||
package Encode::MyEncoding;
|
||||
use parent qw(Encode::Encoding);
|
||||
|
||||
__PACKAGE__->Define(qw(myCanonical myAlias));
|
||||
|
||||
to create an object with C<< bless {Name => ...}, $class >>, and call
|
||||
define_encoding. They inherit their C<name> method from
|
||||
C<Encode::Encoding>.
|
||||
|
||||
=head2 Compiled Encodings
|
||||
|
||||
For the sake of speed and efficiency, most of the encodings are now
|
||||
supported via a I<compiled form>: XS modules generated from UCM
|
||||
files. Encode provides the enc2xs tool to achieve that. Please see
|
||||
L<enc2xs> for more details.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<perlmod>, L<enc2xs>
|
||||
|
||||
=begin future
|
||||
|
||||
=over 4
|
||||
|
||||
=item Scheme 1
|
||||
|
||||
The fixup routine gets passed the remaining fragment of string being
|
||||
processed. It modifies it in place to remove bytes/characters it can
|
||||
understand and returns a string used to represent them. For example:
|
||||
|
||||
sub fixup {
|
||||
my $ch = substr($_[0],0,1,'');
|
||||
return sprintf("\x{%02X}",ord($ch);
|
||||
}
|
||||
|
||||
This scheme is close to how the underlying C code for Encode works,
|
||||
but gives the fixup routine very little context.
|
||||
|
||||
=item Scheme 2
|
||||
|
||||
The fixup routine gets passed the original string, an index into
|
||||
it of the problem area, and the output string so far. It appends
|
||||
what it wants to the output string and returns a new index into the
|
||||
original string. For example:
|
||||
|
||||
sub fixup {
|
||||
# my ($s,$i,$d) = @_;
|
||||
my $ch = substr($_[0],$_[1],1);
|
||||
$_[2] .= sprintf("\x{%02X}",ord($ch);
|
||||
return $_[1]+1;
|
||||
}
|
||||
|
||||
This scheme gives maximal control to the fixup routine but is more
|
||||
complicated to code, and may require that the internals of Encode be tweaked to
|
||||
keep the original string intact.
|
||||
|
||||
=item Other Schemes
|
||||
|
||||
Hybrids of the above.
|
||||
|
||||
Multiple return values rather than in-place modifications.
|
||||
|
||||
Index into the string could be C<pos($str)> allowing C<s/\G...//>.
|
||||
|
||||
=back
|
||||
|
||||
=end future
|
||||
|
||||
=cut
|
||||
289
database/perl/lib/Encode/GSM0338.pm
Normal file
289
database/perl/lib/Encode/GSM0338.pm
Normal file
@@ -0,0 +1,289 @@
|
||||
#
|
||||
# $Id: GSM0338.pm,v 2.9 2020/12/02 01:28:17 dankogai Exp dankogai $
|
||||
#
|
||||
package Encode::GSM0338;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
use Carp;
|
||||
|
||||
use vars qw($VERSION);
|
||||
$VERSION = do { my @r = ( q$Revision: 2.9 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use Encode qw(:fallbacks);
|
||||
|
||||
use parent qw(Encode::Encoding);
|
||||
__PACKAGE__->Define('gsm0338');
|
||||
|
||||
use utf8;
|
||||
|
||||
# Mapping table according to 3GPP TS 23.038 version 16.0.0 Release 16 and ETSI TS 123 038 V16.0.0 (2020-07)
|
||||
# https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf (page 20 and 22)
|
||||
our %UNI2GSM = (
|
||||
"\x{000A}" => "\x0A", # LINE FEED
|
||||
"\x{000C}" => "\x1B\x0A", # FORM FEED
|
||||
"\x{000D}" => "\x0D", # CARRIAGE RETURN
|
||||
"\x{0020}" => "\x20", # SPACE
|
||||
"\x{0021}" => "\x21", # EXCLAMATION MARK
|
||||
"\x{0022}" => "\x22", # QUOTATION MARK
|
||||
"\x{0023}" => "\x23", # NUMBER SIGN
|
||||
"\x{0024}" => "\x02", # DOLLAR SIGN
|
||||
"\x{0025}" => "\x25", # PERCENT SIGN
|
||||
"\x{0026}" => "\x26", # AMPERSAND
|
||||
"\x{0027}" => "\x27", # APOSTROPHE
|
||||
"\x{0028}" => "\x28", # LEFT PARENTHESIS
|
||||
"\x{0029}" => "\x29", # RIGHT PARENTHESIS
|
||||
"\x{002A}" => "\x2A", # ASTERISK
|
||||
"\x{002B}" => "\x2B", # PLUS SIGN
|
||||
"\x{002C}" => "\x2C", # COMMA
|
||||
"\x{002D}" => "\x2D", # HYPHEN-MINUS
|
||||
"\x{002E}" => "\x2E", # FULL STOP
|
||||
"\x{002F}" => "\x2F", # SOLIDUS
|
||||
"\x{0030}" => "\x30", # DIGIT ZERO
|
||||
"\x{0031}" => "\x31", # DIGIT ONE
|
||||
"\x{0032}" => "\x32", # DIGIT TWO
|
||||
"\x{0033}" => "\x33", # DIGIT THREE
|
||||
"\x{0034}" => "\x34", # DIGIT FOUR
|
||||
"\x{0035}" => "\x35", # DIGIT FIVE
|
||||
"\x{0036}" => "\x36", # DIGIT SIX
|
||||
"\x{0037}" => "\x37", # DIGIT SEVEN
|
||||
"\x{0038}" => "\x38", # DIGIT EIGHT
|
||||
"\x{0039}" => "\x39", # DIGIT NINE
|
||||
"\x{003A}" => "\x3A", # COLON
|
||||
"\x{003B}" => "\x3B", # SEMICOLON
|
||||
"\x{003C}" => "\x3C", # LESS-THAN SIGN
|
||||
"\x{003D}" => "\x3D", # EQUALS SIGN
|
||||
"\x{003E}" => "\x3E", # GREATER-THAN SIGN
|
||||
"\x{003F}" => "\x3F", # QUESTION MARK
|
||||
"\x{0040}" => "\x00", # COMMERCIAL AT
|
||||
"\x{0041}" => "\x41", # LATIN CAPITAL LETTER A
|
||||
"\x{0042}" => "\x42", # LATIN CAPITAL LETTER B
|
||||
"\x{0043}" => "\x43", # LATIN CAPITAL LETTER C
|
||||
"\x{0044}" => "\x44", # LATIN CAPITAL LETTER D
|
||||
"\x{0045}" => "\x45", # LATIN CAPITAL LETTER E
|
||||
"\x{0046}" => "\x46", # LATIN CAPITAL LETTER F
|
||||
"\x{0047}" => "\x47", # LATIN CAPITAL LETTER G
|
||||
"\x{0048}" => "\x48", # LATIN CAPITAL LETTER H
|
||||
"\x{0049}" => "\x49", # LATIN CAPITAL LETTER I
|
||||
"\x{004A}" => "\x4A", # LATIN CAPITAL LETTER J
|
||||
"\x{004B}" => "\x4B", # LATIN CAPITAL LETTER K
|
||||
"\x{004C}" => "\x4C", # LATIN CAPITAL LETTER L
|
||||
"\x{004D}" => "\x4D", # LATIN CAPITAL LETTER M
|
||||
"\x{004E}" => "\x4E", # LATIN CAPITAL LETTER N
|
||||
"\x{004F}" => "\x4F", # LATIN CAPITAL LETTER O
|
||||
"\x{0050}" => "\x50", # LATIN CAPITAL LETTER P
|
||||
"\x{0051}" => "\x51", # LATIN CAPITAL LETTER Q
|
||||
"\x{0052}" => "\x52", # LATIN CAPITAL LETTER R
|
||||
"\x{0053}" => "\x53", # LATIN CAPITAL LETTER S
|
||||
"\x{0054}" => "\x54", # LATIN CAPITAL LETTER T
|
||||
"\x{0055}" => "\x55", # LATIN CAPITAL LETTER U
|
||||
"\x{0056}" => "\x56", # LATIN CAPITAL LETTER V
|
||||
"\x{0057}" => "\x57", # LATIN CAPITAL LETTER W
|
||||
"\x{0058}" => "\x58", # LATIN CAPITAL LETTER X
|
||||
"\x{0059}" => "\x59", # LATIN CAPITAL LETTER Y
|
||||
"\x{005A}" => "\x5A", # LATIN CAPITAL LETTER Z
|
||||
"\x{005B}" => "\x1B\x3C", # LEFT SQUARE BRACKET
|
||||
"\x{005C}" => "\x1B\x2F", # REVERSE SOLIDUS
|
||||
"\x{005D}" => "\x1B\x3E", # RIGHT SQUARE BRACKET
|
||||
"\x{005E}" => "\x1B\x14", # CIRCUMFLEX ACCENT
|
||||
"\x{005F}" => "\x11", # LOW LINE
|
||||
"\x{0061}" => "\x61", # LATIN SMALL LETTER A
|
||||
"\x{0062}" => "\x62", # LATIN SMALL LETTER B
|
||||
"\x{0063}" => "\x63", # LATIN SMALL LETTER C
|
||||
"\x{0064}" => "\x64", # LATIN SMALL LETTER D
|
||||
"\x{0065}" => "\x65", # LATIN SMALL LETTER E
|
||||
"\x{0066}" => "\x66", # LATIN SMALL LETTER F
|
||||
"\x{0067}" => "\x67", # LATIN SMALL LETTER G
|
||||
"\x{0068}" => "\x68", # LATIN SMALL LETTER H
|
||||
"\x{0069}" => "\x69", # LATIN SMALL LETTER I
|
||||
"\x{006A}" => "\x6A", # LATIN SMALL LETTER J
|
||||
"\x{006B}" => "\x6B", # LATIN SMALL LETTER K
|
||||
"\x{006C}" => "\x6C", # LATIN SMALL LETTER L
|
||||
"\x{006D}" => "\x6D", # LATIN SMALL LETTER M
|
||||
"\x{006E}" => "\x6E", # LATIN SMALL LETTER N
|
||||
"\x{006F}" => "\x6F", # LATIN SMALL LETTER O
|
||||
"\x{0070}" => "\x70", # LATIN SMALL LETTER P
|
||||
"\x{0071}" => "\x71", # LATIN SMALL LETTER Q
|
||||
"\x{0072}" => "\x72", # LATIN SMALL LETTER R
|
||||
"\x{0073}" => "\x73", # LATIN SMALL LETTER S
|
||||
"\x{0074}" => "\x74", # LATIN SMALL LETTER T
|
||||
"\x{0075}" => "\x75", # LATIN SMALL LETTER U
|
||||
"\x{0076}" => "\x76", # LATIN SMALL LETTER V
|
||||
"\x{0077}" => "\x77", # LATIN SMALL LETTER W
|
||||
"\x{0078}" => "\x78", # LATIN SMALL LETTER X
|
||||
"\x{0079}" => "\x79", # LATIN SMALL LETTER Y
|
||||
"\x{007A}" => "\x7A", # LATIN SMALL LETTER Z
|
||||
"\x{007B}" => "\x1B\x28", # LEFT CURLY BRACKET
|
||||
"\x{007C}" => "\x1B\x40", # VERTICAL LINE
|
||||
"\x{007D}" => "\x1B\x29", # RIGHT CURLY BRACKET
|
||||
"\x{007E}" => "\x1B\x3D", # TILDE
|
||||
"\x{00A1}" => "\x40", # INVERTED EXCLAMATION MARK
|
||||
"\x{00A3}" => "\x01", # POUND SIGN
|
||||
"\x{00A4}" => "\x24", # CURRENCY SIGN
|
||||
"\x{00A5}" => "\x03", # YEN SIGN
|
||||
"\x{00A7}" => "\x5F", # SECTION SIGN
|
||||
"\x{00BF}" => "\x60", # INVERTED QUESTION MARK
|
||||
"\x{00C4}" => "\x5B", # LATIN CAPITAL LETTER A WITH DIAERESIS
|
||||
"\x{00C5}" => "\x0E", # LATIN CAPITAL LETTER A WITH RING ABOVE
|
||||
"\x{00C6}" => "\x1C", # LATIN CAPITAL LETTER AE
|
||||
"\x{00C7}" => "\x09", # LATIN CAPITAL LETTER C WITH CEDILLA
|
||||
"\x{00C9}" => "\x1F", # LATIN CAPITAL LETTER E WITH ACUTE
|
||||
"\x{00D1}" => "\x5D", # LATIN CAPITAL LETTER N WITH TILDE
|
||||
"\x{00D6}" => "\x5C", # LATIN CAPITAL LETTER O WITH DIAERESIS
|
||||
"\x{00D8}" => "\x0B", # LATIN CAPITAL LETTER O WITH STROKE
|
||||
"\x{00DC}" => "\x5E", # LATIN CAPITAL LETTER U WITH DIAERESIS
|
||||
"\x{00DF}" => "\x1E", # LATIN SMALL LETTER SHARP S
|
||||
"\x{00E0}" => "\x7F", # LATIN SMALL LETTER A WITH GRAVE
|
||||
"\x{00E4}" => "\x7B", # LATIN SMALL LETTER A WITH DIAERESIS
|
||||
"\x{00E5}" => "\x0F", # LATIN SMALL LETTER A WITH RING ABOVE
|
||||
"\x{00E6}" => "\x1D", # LATIN SMALL LETTER AE
|
||||
"\x{00E8}" => "\x04", # LATIN SMALL LETTER E WITH GRAVE
|
||||
"\x{00E9}" => "\x05", # LATIN SMALL LETTER E WITH ACUTE
|
||||
"\x{00EC}" => "\x07", # LATIN SMALL LETTER I WITH GRAVE
|
||||
"\x{00F1}" => "\x7D", # LATIN SMALL LETTER N WITH TILDE
|
||||
"\x{00F2}" => "\x08", # LATIN SMALL LETTER O WITH GRAVE
|
||||
"\x{00F6}" => "\x7C", # LATIN SMALL LETTER O WITH DIAERESIS
|
||||
"\x{00F8}" => "\x0C", # LATIN SMALL LETTER O WITH STROKE
|
||||
"\x{00F9}" => "\x06", # LATIN SMALL LETTER U WITH GRAVE
|
||||
"\x{00FC}" => "\x7E", # LATIN SMALL LETTER U WITH DIAERESIS
|
||||
"\x{0393}" => "\x13", # GREEK CAPITAL LETTER GAMMA
|
||||
"\x{0394}" => "\x10", # GREEK CAPITAL LETTER DELTA
|
||||
"\x{0398}" => "\x19", # GREEK CAPITAL LETTER THETA
|
||||
"\x{039B}" => "\x14", # GREEK CAPITAL LETTER LAMDA
|
||||
"\x{039E}" => "\x1A", # GREEK CAPITAL LETTER XI
|
||||
"\x{03A0}" => "\x16", # GREEK CAPITAL LETTER PI
|
||||
"\x{03A3}" => "\x18", # GREEK CAPITAL LETTER SIGMA
|
||||
"\x{03A6}" => "\x12", # GREEK CAPITAL LETTER PHI
|
||||
"\x{03A8}" => "\x17", # GREEK CAPITAL LETTER PSI
|
||||
"\x{03A9}" => "\x15", # GREEK CAPITAL LETTER OMEGA
|
||||
"\x{20AC}" => "\x1B\x65", # EURO SIGN
|
||||
);
|
||||
our %GSM2UNI = reverse %UNI2GSM;
|
||||
our $ESC = "\x1b";
|
||||
|
||||
sub decode ($$;$) {
|
||||
my ( $obj, $bytes, $chk ) = @_;
|
||||
return undef unless defined $bytes;
|
||||
my $str = substr($bytes, 0, 0); # to propagate taintedness;
|
||||
while ( length $bytes ) {
|
||||
my $seq = '';
|
||||
my $c;
|
||||
do {
|
||||
$c = substr( $bytes, 0, 1, '' );
|
||||
$seq .= $c;
|
||||
} while ( length $bytes and $c eq $ESC );
|
||||
my $u =
|
||||
exists $GSM2UNI{$seq}
|
||||
? $GSM2UNI{$seq}
|
||||
: ($chk && ref $chk eq 'CODE')
|
||||
? $chk->( unpack 'C*', $seq )
|
||||
: "\x{FFFD}";
|
||||
if ( not exists $GSM2UNI{$seq} and $chk and not ref $chk ) {
|
||||
if ( substr($seq, 0, 1) eq $ESC and ($chk & Encode::STOP_AT_PARTIAL) ) {
|
||||
$bytes .= $seq;
|
||||
last;
|
||||
}
|
||||
croak join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) . ' does not map to Unicode' if $chk & Encode::DIE_ON_ERR;
|
||||
carp join( '', map { sprintf "\\x%02X", $_ } unpack 'C*', $seq ) . ' does not map to Unicode' if $chk & Encode::WARN_ON_ERR;
|
||||
if ($chk & Encode::RETURN_ON_ERR) {
|
||||
$bytes .= $seq;
|
||||
last;
|
||||
}
|
||||
}
|
||||
$str .= $u;
|
||||
}
|
||||
$_[1] = $bytes if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
|
||||
return $str;
|
||||
}
|
||||
|
||||
sub encode($$;$) {
|
||||
my ( $obj, $str, $chk ) = @_;
|
||||
return undef unless defined $str;
|
||||
my $bytes = substr($str, 0, 0); # to propagate taintedness
|
||||
while ( length $str ) {
|
||||
my $u = substr( $str, 0, 1, '' );
|
||||
my $c;
|
||||
my $seq =
|
||||
exists $UNI2GSM{$u}
|
||||
? $UNI2GSM{$u}
|
||||
: ($chk && ref $chk eq 'CODE')
|
||||
? $chk->( ord($u) )
|
||||
: $UNI2GSM{'?'};
|
||||
if ( not exists $UNI2GSM{$u} and $chk and not ref $chk ) {
|
||||
croak sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) if $chk & Encode::DIE_ON_ERR;
|
||||
carp sprintf( "\\x{%04x} does not map to %s", ord($u), $obj->name ) if $chk & Encode::WARN_ON_ERR;
|
||||
if ($chk & Encode::RETURN_ON_ERR) {
|
||||
$str .= $u;
|
||||
last;
|
||||
}
|
||||
}
|
||||
$bytes .= $seq;
|
||||
}
|
||||
$_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
|
||||
return $bytes;
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::GSM0338 -- ETSI GSM 03.38 Encoding
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$gsm0338 = encode("gsm0338", $unicode); # loads Encode::GSM0338 implicitly
|
||||
$unicode = decode("gsm0338", $gsm0338); # ditto
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
GSM0338 is for GSM handsets. Though it shares alphanumerals with ASCII,
|
||||
control character ranges and other parts are mapped very differently,
|
||||
mainly to store Greek characters. There are also escape sequences
|
||||
(starting with 0x1B) to cover e.g. the Euro sign.
|
||||
|
||||
This was once handled by L<Encode::Bytes> but because of all those
|
||||
unusual specifications, Encode 2.20 has relocated the support to
|
||||
this module.
|
||||
|
||||
This module implements only I<GSM 7 bit Default Alphabet> and
|
||||
I<GSM 7 bit default alphabet extension table> according to standard
|
||||
3GPP TS 23.038 version 16. Therefore I<National Language Single Shift>
|
||||
and I<National Language Locking Shift> are not implemented nor supported.
|
||||
|
||||
=head2 Septets
|
||||
|
||||
This modules operates with octets (like any other Encode module) and not
|
||||
with packed septets (unlike other GSM standards). Therefore for processing
|
||||
binary SMS or parts of GSM TPDU payload (3GPP TS 23.040) it is needed to do
|
||||
conversion between octets and packed septets. For this purpose perl's C<pack>
|
||||
and C<unpack> functions may be useful:
|
||||
|
||||
$bytes = substr(pack('(b*)*', unpack '(A7)*', unpack 'b*', $septets), 0, $num_of_septets);
|
||||
$unicode = decode('GSM0338', $bytes);
|
||||
|
||||
$bytes = encode('GSM0338', $unicode);
|
||||
$septets = pack 'b*', join '', map { substr $_, 0, 7 } unpack '(A8)*', unpack 'b*', $bytes;
|
||||
$num_of_septets = length $bytes;
|
||||
|
||||
Please note that for correct decoding of packed septets it is required to
|
||||
know number of septets packed in binary buffer as binary buffer is always
|
||||
padded with zero bits and 7 zero bits represents character C<@>. Number
|
||||
of septets is also stored in TPDU payload when dealing with 3GPP TS 23.040.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Encode::GSM0338 2.7 and older versions (part of Encode 3.06) incorrectly
|
||||
handled zero bytes (character C<@>). This was fixed in Encode::GSM0338
|
||||
version 2.8 (part of Encode 3.07).
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<3GPP TS 23.038|https://www.3gpp.org/dynareport/23038.htm>
|
||||
|
||||
L<ETSI TS 123 038 V16.0.0 (2020-07)|https://www.etsi.org/deliver/etsi_ts/123000_123099/123038/16.00.00_60/ts_123038v160000p.pdf>
|
||||
|
||||
L<Encode>
|
||||
|
||||
=cut
|
||||
356
database/perl/lib/Encode/Guess.pm
Normal file
356
database/perl/lib/Encode/Guess.pm
Normal file
@@ -0,0 +1,356 @@
|
||||
package Encode::Guess;
|
||||
use strict;
|
||||
use warnings;
|
||||
use Encode qw(:fallbacks find_encoding);
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
my $Canon = 'Guess';
|
||||
use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
|
||||
our %DEF_SUSPECTS = map { $_ => find_encoding($_) } qw(ascii utf8);
|
||||
my $obj = bless {
|
||||
Name => $Canon,
|
||||
Suspects => {%DEF_SUSPECTS},
|
||||
} => __PACKAGE__;
|
||||
Encode::define_encoding($obj, $Canon);
|
||||
|
||||
use parent qw(Encode::Encoding);
|
||||
sub needs_lines { 1 }
|
||||
sub perlio_ok { 0 }
|
||||
|
||||
our @EXPORT = qw(guess_encoding);
|
||||
our $NoUTFAutoGuess = 0;
|
||||
our $UTF8_BOM = pack( "C3", 0xef, 0xbb, 0xbf );
|
||||
|
||||
sub import { # Exporter not used so we do it on our own
|
||||
my $callpkg = caller;
|
||||
for my $item (@EXPORT) {
|
||||
no strict 'refs';
|
||||
*{"$callpkg\::$item"} = \&{"$item"};
|
||||
}
|
||||
set_suspects(@_);
|
||||
}
|
||||
|
||||
sub set_suspects {
|
||||
my $class = shift;
|
||||
my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
|
||||
$self->{Suspects} = {%DEF_SUSPECTS};
|
||||
$self->add_suspects(@_);
|
||||
}
|
||||
|
||||
sub add_suspects {
|
||||
my $class = shift;
|
||||
my $self = ref($class) ? $class : $Encode::Encoding{$Canon};
|
||||
for my $c (@_) {
|
||||
my $e = find_encoding($c) or die "Unknown encoding: $c";
|
||||
$self->{Suspects}{ $e->name } = $e;
|
||||
DEBUG and warn "Added: ", $e->name;
|
||||
}
|
||||
}
|
||||
|
||||
sub decode($$;$) {
|
||||
my ( $obj, $octet, $chk ) = @_;
|
||||
my $guessed = guess( $obj, $octet );
|
||||
unless ( ref($guessed) ) {
|
||||
require Carp;
|
||||
Carp::croak($guessed);
|
||||
}
|
||||
my $utf8 = $guessed->decode( $octet, $chk || 0 );
|
||||
$_[1] = $octet if $chk;
|
||||
return $utf8;
|
||||
}
|
||||
|
||||
sub guess_encoding {
|
||||
guess( $Encode::Encoding{$Canon}, @_ );
|
||||
}
|
||||
|
||||
sub guess {
|
||||
my $class = shift;
|
||||
my $obj = ref($class) ? $class : $Encode::Encoding{$Canon};
|
||||
my $octet = shift;
|
||||
|
||||
# sanity check
|
||||
return "Empty string, empty guess" unless defined $octet and length $octet;
|
||||
|
||||
# cheat 0: utf8 flag;
|
||||
if ( Encode::is_utf8($octet) ) {
|
||||
return find_encoding('utf8') unless $NoUTFAutoGuess;
|
||||
Encode::_utf8_off($octet);
|
||||
}
|
||||
|
||||
# cheat 1: BOM
|
||||
use Encode::Unicode;
|
||||
unless ($NoUTFAutoGuess) {
|
||||
my $BOM = pack( 'C3', unpack( "C3", $octet ) );
|
||||
return find_encoding('utf8')
|
||||
if ( defined $BOM and $BOM eq $UTF8_BOM );
|
||||
$BOM = unpack( 'N', $octet );
|
||||
return find_encoding('UTF-32')
|
||||
if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe0000 ) );
|
||||
$BOM = unpack( 'n', $octet );
|
||||
return find_encoding('UTF-16')
|
||||
if ( defined $BOM and ( $BOM == 0xFeFF or $BOM == 0xFFFe ) );
|
||||
if ( $octet =~ /\x00/o )
|
||||
{ # if \x00 found, we assume UTF-(16|32)(BE|LE)
|
||||
my $utf;
|
||||
my ( $be, $le ) = ( 0, 0 );
|
||||
if ( $octet =~ /\x00\x00/o ) { # UTF-32(BE|LE) assumed
|
||||
$utf = "UTF-32";
|
||||
for my $char ( unpack( 'N*', $octet ) ) {
|
||||
$char & 0x0000ffff and $be++;
|
||||
$char & 0xffff0000 and $le++;
|
||||
}
|
||||
}
|
||||
else { # UTF-16(BE|LE) assumed
|
||||
$utf = "UTF-16";
|
||||
for my $char ( unpack( 'n*', $octet ) ) {
|
||||
$char & 0x00ff and $be++;
|
||||
$char & 0xff00 and $le++;
|
||||
}
|
||||
}
|
||||
DEBUG and warn "$utf, be == $be, le == $le";
|
||||
$be == $le
|
||||
and return
|
||||
"Encodings ambiguous between $utf BE and LE ($be, $le)";
|
||||
$utf .= ( $be > $le ) ? 'BE' : 'LE';
|
||||
return find_encoding($utf);
|
||||
}
|
||||
}
|
||||
my %try = %{ $obj->{Suspects} };
|
||||
for my $c (@_) {
|
||||
my $e = find_encoding($c) or die "Unknown encoding: $c";
|
||||
$try{ $e->name } = $e;
|
||||
DEBUG and warn "Added: ", $e->name;
|
||||
}
|
||||
my $nline = 1;
|
||||
for my $line ( split /\r\n?|\n/, $octet ) {
|
||||
|
||||
# cheat 2 -- \e in the string
|
||||
if ( $line =~ /\e/o ) {
|
||||
my @keys = keys %try;
|
||||
delete @try{qw/utf8 ascii/};
|
||||
for my $k (@keys) {
|
||||
ref( $try{$k} ) eq 'Encode::XS' and delete $try{$k};
|
||||
}
|
||||
}
|
||||
my %ok = %try;
|
||||
|
||||
# warn join(",", keys %try);
|
||||
for my $k ( keys %try ) {
|
||||
my $scratch = $line;
|
||||
$try{$k}->decode( $scratch, FB_QUIET );
|
||||
if ( $scratch eq '' ) {
|
||||
DEBUG and warn sprintf( "%4d:%-24s ok\n", $nline, $k );
|
||||
}
|
||||
else {
|
||||
use bytes ();
|
||||
DEBUG
|
||||
and warn sprintf( "%4d:%-24s not ok; %d bytes left\n",
|
||||
$nline, $k, bytes::length($scratch) );
|
||||
delete $ok{$k};
|
||||
}
|
||||
}
|
||||
%ok or return "No appropriate encodings found!";
|
||||
if ( scalar( keys(%ok) ) == 1 ) {
|
||||
my ($retval) = values(%ok);
|
||||
return $retval;
|
||||
}
|
||||
%try = %ok;
|
||||
$nline++;
|
||||
}
|
||||
$try{ascii}
|
||||
or return "Encodings too ambiguous: " . join( " or ", keys %try );
|
||||
return $try{ascii};
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::Guess -- Guesses encoding from data
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
# if you are sure $data won't contain anything bogus
|
||||
|
||||
use Encode;
|
||||
use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
|
||||
my $utf8 = decode("Guess", $data);
|
||||
my $data = encode("Guess", $utf8); # this doesn't work!
|
||||
|
||||
# more elaborate way
|
||||
use Encode::Guess;
|
||||
my $enc = guess_encoding($data, qw/euc-jp shiftjis 7bit-jis/);
|
||||
ref($enc) or die "Can't guess: $enc"; # trap error this way
|
||||
$utf8 = $enc->decode($data);
|
||||
# or
|
||||
$utf8 = decode($enc->name, $data)
|
||||
|
||||
=head1 ABSTRACT
|
||||
|
||||
Encode::Guess enables you to guess in what encoding a given data is
|
||||
encoded, or at least tries to.
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
By default, it checks only ascii, utf8 and UTF-16/32 with BOM.
|
||||
|
||||
use Encode::Guess; # ascii/utf8/BOMed UTF
|
||||
|
||||
To use it more practically, you have to give the names of encodings to
|
||||
check (I<suspects> as follows). The name of suspects can either be
|
||||
canonical names or aliases.
|
||||
|
||||
CAVEAT: Unlike UTF-(16|32), BOM in utf8 is NOT AUTOMATICALLY STRIPPED.
|
||||
|
||||
# tries all major Japanese Encodings as well
|
||||
use Encode::Guess qw/euc-jp shiftjis 7bit-jis/;
|
||||
|
||||
If the C<$Encode::Guess::NoUTFAutoGuess> variable is set to a true
|
||||
value, no heuristics will be applied to UTF8/16/32, and the result
|
||||
will be limited to the suspects and C<ascii>.
|
||||
|
||||
=over 4
|
||||
|
||||
=item Encode::Guess->set_suspects
|
||||
|
||||
You can also change the internal suspects list via C<set_suspects>
|
||||
method.
|
||||
|
||||
use Encode::Guess;
|
||||
Encode::Guess->set_suspects(qw/euc-jp shiftjis 7bit-jis/);
|
||||
|
||||
=item Encode::Guess->add_suspects
|
||||
|
||||
Or you can use C<add_suspects> method. The difference is that
|
||||
C<set_suspects> flushes the current suspects list while
|
||||
C<add_suspects> adds.
|
||||
|
||||
use Encode::Guess;
|
||||
Encode::Guess->add_suspects(qw/euc-jp shiftjis 7bit-jis/);
|
||||
# now the suspects are euc-jp,shiftjis,7bit-jis, AND
|
||||
# euc-kr,euc-cn, and big5-eten
|
||||
Encode::Guess->add_suspects(qw/euc-kr euc-cn big5-eten/);
|
||||
|
||||
=item Encode::decode("Guess" ...)
|
||||
|
||||
When you are content with suspects list, you can now
|
||||
|
||||
my $utf8 = Encode::decode("Guess", $data);
|
||||
|
||||
=item Encode::Guess->guess($data)
|
||||
|
||||
But it will croak if:
|
||||
|
||||
=over
|
||||
|
||||
=item *
|
||||
|
||||
Two or more suspects remain
|
||||
|
||||
=item *
|
||||
|
||||
No suspects left
|
||||
|
||||
=back
|
||||
|
||||
So you should instead try this;
|
||||
|
||||
my $decoder = Encode::Guess->guess($data);
|
||||
|
||||
On success, $decoder is an object that is documented in
|
||||
L<Encode::Encoding>. So you can now do this;
|
||||
|
||||
my $utf8 = $decoder->decode($data);
|
||||
|
||||
On failure, $decoder now contains an error message so the whole thing
|
||||
would be as follows;
|
||||
|
||||
my $decoder = Encode::Guess->guess($data);
|
||||
die $decoder unless ref($decoder);
|
||||
my $utf8 = $decoder->decode($data);
|
||||
|
||||
=item guess_encoding($data, [, I<list of suspects>])
|
||||
|
||||
You can also try C<guess_encoding> function which is exported by
|
||||
default. It takes $data to check and it also takes the list of
|
||||
suspects by option. The optional suspect list is I<not reflected> to
|
||||
the internal suspects list.
|
||||
|
||||
my $decoder = guess_encoding($data, qw/euc-jp euc-kr euc-cn/);
|
||||
die $decoder unless ref($decoder);
|
||||
my $utf8 = $decoder->decode($data);
|
||||
# check only ascii, utf8 and UTF-(16|32) with BOM
|
||||
my $decoder = guess_encoding($data);
|
||||
|
||||
=back
|
||||
|
||||
=head1 CAVEATS
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
Because of the algorithm used, ISO-8859 series and other single-byte
|
||||
encodings do not work well unless either one of ISO-8859 is the only
|
||||
one suspect (besides ascii and utf8).
|
||||
|
||||
use Encode::Guess;
|
||||
# perhaps ok
|
||||
my $decoder = guess_encoding($data, 'latin1');
|
||||
# definitely NOT ok
|
||||
my $decoder = guess_encoding($data, qw/latin1 greek/);
|
||||
|
||||
The reason is that Encode::Guess guesses encoding by trial and error.
|
||||
It first splits $data into lines and tries to decode the line for each
|
||||
suspect. It keeps it going until all but one encoding is eliminated
|
||||
out of suspects list. ISO-8859 series is just too successful for most
|
||||
cases (because it fills almost all code points in \x00-\xff).
|
||||
|
||||
=item *
|
||||
|
||||
Do not mix national standard encodings and the corresponding vendor
|
||||
encodings.
|
||||
|
||||
# a very bad idea
|
||||
my $decoder
|
||||
= guess_encoding($data, qw/shiftjis MacJapanese cp932/);
|
||||
|
||||
The reason is that vendor encoding is usually a superset of national
|
||||
standard so it becomes too ambiguous for most cases.
|
||||
|
||||
=item *
|
||||
|
||||
On the other hand, mixing various national standard encodings
|
||||
automagically works unless $data is too short to allow for guessing.
|
||||
|
||||
# This is ok if $data is long enough
|
||||
my $decoder =
|
||||
guess_encoding($data, qw/euc-cn
|
||||
euc-jp shiftjis 7bit-jis
|
||||
euc-kr
|
||||
big5-eten/);
|
||||
|
||||
=item *
|
||||
|
||||
DO NOT PUT TOO MANY SUSPECTS! Don't you try something like this!
|
||||
|
||||
my $decoder = guess_encoding($data,
|
||||
Encode->encodings(":all"));
|
||||
|
||||
=back
|
||||
|
||||
It is, after all, just a guess. You should alway be explicit when it
|
||||
comes to encodings. But there are some, especially Japanese,
|
||||
environment that guess-coding is a must. Use this module with care.
|
||||
|
||||
=head1 TO DO
|
||||
|
||||
Encode::Guess does not work on EBCDIC platforms.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>, L<Encode::Encoding>
|
||||
|
||||
=cut
|
||||
|
||||
95
database/perl/lib/Encode/JP.pm
Normal file
95
database/perl/lib/Encode/JP.pm
Normal file
@@ -0,0 +1,95 @@
|
||||
package Encode::JP;
|
||||
BEGIN {
|
||||
if ( ord("A") == 193 ) {
|
||||
die "Encode::JP not supported on EBCDIC\n";
|
||||
}
|
||||
}
|
||||
use strict;
|
||||
use warnings;
|
||||
use Encode;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use XSLoader;
|
||||
XSLoader::load( __PACKAGE__, $VERSION );
|
||||
|
||||
use Encode::JP::JIS7;
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::JP - Japanese Encodings
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$euc_jp = encode("euc-jp", $utf8); # loads Encode::JP implicitly
|
||||
$utf8 = decode("euc-jp", $euc_jp); # ditto
|
||||
|
||||
=head1 ABSTRACT
|
||||
|
||||
This module implements Japanese charset encodings. Encodings
|
||||
supported are as follows.
|
||||
|
||||
Canonical Alias Description
|
||||
--------------------------------------------------------------------
|
||||
euc-jp /\beuc.*jp$/i EUC (Extended Unix Character)
|
||||
/\bjp.*euc/i
|
||||
/\bujis$/i
|
||||
shiftjis /\bshift.*jis$/i Shift JIS (aka MS Kanji)
|
||||
/\bsjis$/i
|
||||
7bit-jis /\bjis$/i 7bit JIS
|
||||
iso-2022-jp ISO-2022-JP [RFC1468]
|
||||
= 7bit JIS with all Halfwidth Kana
|
||||
converted to Fullwidth
|
||||
iso-2022-jp-1 ISO-2022-JP-1 [RFC2237]
|
||||
= ISO-2022-JP with JIS X 0212-1990
|
||||
support. See below
|
||||
MacJapanese Shift JIS + Apple vendor mappings
|
||||
cp932 /\bwindows-31j$/i Code Page 932
|
||||
= Shift JIS + MS/IBM vendor mappings
|
||||
jis0201-raw JIS0201, raw format
|
||||
jis0208-raw JIS0201, raw format
|
||||
jis0212-raw JIS0201, raw format
|
||||
--------------------------------------------------------------------
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
To find out how to use this module in detail, see L<Encode>.
|
||||
|
||||
=head1 Note on ISO-2022-JP(-1)?
|
||||
|
||||
ISO-2022-JP-1 (RFC2237) is a superset of ISO-2022-JP (RFC1468) which
|
||||
adds support for JIS X 0212-1990. That means you can use the same
|
||||
code to decode to utf8 but not vice versa.
|
||||
|
||||
$utf8 = decode('iso-2022-jp-1', $stream);
|
||||
|
||||
and
|
||||
|
||||
$utf8 = decode('iso-2022-jp', $stream);
|
||||
|
||||
yield the same result but
|
||||
|
||||
$with_0212 = encode('iso-2022-jp-1', $utf8);
|
||||
|
||||
is now different from
|
||||
|
||||
$without_0212 = encode('iso-2022-jp', $utf8 );
|
||||
|
||||
In the latter case, characters that map to 0212 are first converted
|
||||
to U+3013 (0xA2AE in EUC-JP; a white square also known as 'Tofu' or
|
||||
'geta mark') then fed to the decoding engine. U+FFFD is not used,
|
||||
in order to preserve text layout as much as possible.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
The ASCII region (0x00-0x7f) is preserved for all encodings, even
|
||||
though this conflicts with mappings by the Unicode Consortium.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>
|
||||
|
||||
=cut
|
||||
176
database/perl/lib/Encode/JP/H2Z.pm
Normal file
176
database/perl/lib/Encode/JP/H2Z.pm
Normal file
@@ -0,0 +1,176 @@
|
||||
#
|
||||
# $Id: H2Z.pm,v 2.2 2006/06/03 20:28:48 dankogai Exp $
|
||||
#
|
||||
|
||||
package Encode::JP::H2Z;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
our $RCSID = q$Id: H2Z.pm,v 2.2 2006/06/03 20:28:48 dankogai Exp $;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use Encode::CJKConstants qw(:all);
|
||||
|
||||
use vars qw(%_D2Z $_PAT_D2Z
|
||||
%_Z2D $_PAT_Z2D
|
||||
%_H2Z $_PAT_H2Z
|
||||
%_Z2H $_PAT_Z2H);
|
||||
|
||||
%_H2Z = (
|
||||
"\x8e\xa1" => "\xa1\xa3", #<23><>
|
||||
"\x8e\xa2" => "\xa1\xd6", #<23><>
|
||||
"\x8e\xa3" => "\xa1\xd7", #<23><>
|
||||
"\x8e\xa4" => "\xa1\xa2", #<23><>
|
||||
"\x8e\xa5" => "\xa1\xa6", #<23><>
|
||||
"\x8e\xa6" => "\xa5\xf2", #<23><>
|
||||
"\x8e\xa7" => "\xa5\xa1", #<23><>
|
||||
"\x8e\xa8" => "\xa5\xa3", #<23><>
|
||||
"\x8e\xa9" => "\xa5\xa5", #<23><>
|
||||
"\x8e\xaa" => "\xa5\xa7", #<23><>
|
||||
"\x8e\xab" => "\xa5\xa9", #<23><>
|
||||
"\x8e\xac" => "\xa5\xe3", #<23><>
|
||||
"\x8e\xad" => "\xa5\xe5", #<23><>
|
||||
"\x8e\xae" => "\xa5\xe7", #<23><>
|
||||
"\x8e\xaf" => "\xa5\xc3", #<23><>
|
||||
"\x8e\xb0" => "\xa1\xbc", #<23><>
|
||||
"\x8e\xb1" => "\xa5\xa2", #<23><>
|
||||
"\x8e\xb2" => "\xa5\xa4", #<23><>
|
||||
"\x8e\xb3" => "\xa5\xa6", #<23><>
|
||||
"\x8e\xb4" => "\xa5\xa8", #<23><>
|
||||
"\x8e\xb5" => "\xa5\xaa", #<23><>
|
||||
"\x8e\xb6" => "\xa5\xab", #<23><>
|
||||
"\x8e\xb7" => "\xa5\xad", #<23><>
|
||||
"\x8e\xb8" => "\xa5\xaf", #<23><>
|
||||
"\x8e\xb9" => "\xa5\xb1", #<23><>
|
||||
"\x8e\xba" => "\xa5\xb3", #<23><>
|
||||
"\x8e\xbb" => "\xa5\xb5", #<23><>
|
||||
"\x8e\xbc" => "\xa5\xb7", #<23><>
|
||||
"\x8e\xbd" => "\xa5\xb9", #<23><>
|
||||
"\x8e\xbe" => "\xa5\xbb", #<23><>
|
||||
"\x8e\xbf" => "\xa5\xbd", #<23><>
|
||||
"\x8e\xc0" => "\xa5\xbf", #<23><>
|
||||
"\x8e\xc1" => "\xa5\xc1", #<23><>
|
||||
"\x8e\xc2" => "\xa5\xc4", #<23><>
|
||||
"\x8e\xc3" => "\xa5\xc6", #<23><>
|
||||
"\x8e\xc4" => "\xa5\xc8", #<23><>
|
||||
"\x8e\xc5" => "\xa5\xca", #<23><>
|
||||
"\x8e\xc6" => "\xa5\xcb", #<23><>
|
||||
"\x8e\xc7" => "\xa5\xcc", #<23><>
|
||||
"\x8e\xc8" => "\xa5\xcd", #<23><>
|
||||
"\x8e\xc9" => "\xa5\xce", #<23><>
|
||||
"\x8e\xca" => "\xa5\xcf", #<23><>
|
||||
"\x8e\xcb" => "\xa5\xd2", #<23><>
|
||||
"\x8e\xcc" => "\xa5\xd5", #<23><>
|
||||
"\x8e\xcd" => "\xa5\xd8", #<23><>
|
||||
"\x8e\xce" => "\xa5\xdb", #<23><>
|
||||
"\x8e\xcf" => "\xa5\xde", #<23><>
|
||||
"\x8e\xd0" => "\xa5\xdf", #<23><>
|
||||
"\x8e\xd1" => "\xa5\xe0", #<23><>
|
||||
"\x8e\xd2" => "\xa5\xe1", #<23><>
|
||||
"\x8e\xd3" => "\xa5\xe2", #<23><>
|
||||
"\x8e\xd4" => "\xa5\xe4", #<23><>
|
||||
"\x8e\xd5" => "\xa5\xe6", #<23><>
|
||||
"\x8e\xd6" => "\xa5\xe8", #<23><>
|
||||
"\x8e\xd7" => "\xa5\xe9", #<23><>
|
||||
"\x8e\xd8" => "\xa5\xea", #<23><>
|
||||
"\x8e\xd9" => "\xa5\xeb", #<23><>
|
||||
"\x8e\xda" => "\xa5\xec", #<23><>
|
||||
"\x8e\xdb" => "\xa5\xed", #<23><>
|
||||
"\x8e\xdc" => "\xa5\xef", #<23><>
|
||||
"\x8e\xdd" => "\xa5\xf3", #<23><>
|
||||
"\x8e\xde" => "\xa1\xab", #<23><>
|
||||
"\x8e\xdf" => "\xa1\xac", #<23><>
|
||||
);
|
||||
|
||||
%_D2Z = (
|
||||
"\x8e\xb6\x8e\xde" => "\xa5\xac", #<23><>
|
||||
"\x8e\xb7\x8e\xde" => "\xa5\xae", #<23><>
|
||||
"\x8e\xb8\x8e\xde" => "\xa5\xb0", #<23><>
|
||||
"\x8e\xb9\x8e\xde" => "\xa5\xb2", #<23><>
|
||||
"\x8e\xba\x8e\xde" => "\xa5\xb4", #<23><>
|
||||
"\x8e\xbb\x8e\xde" => "\xa5\xb6", #<23><>
|
||||
"\x8e\xbc\x8e\xde" => "\xa5\xb8", #<23><>
|
||||
"\x8e\xbd\x8e\xde" => "\xa5\xba", #<23><>
|
||||
"\x8e\xbe\x8e\xde" => "\xa5\xbc", #<23><>
|
||||
"\x8e\xbf\x8e\xde" => "\xa5\xbe", #<23><>
|
||||
"\x8e\xc0\x8e\xde" => "\xa5\xc0", #<23><>
|
||||
"\x8e\xc1\x8e\xde" => "\xa5\xc2", #<23><>
|
||||
"\x8e\xc2\x8e\xde" => "\xa5\xc5", #<23><>
|
||||
"\x8e\xc3\x8e\xde" => "\xa5\xc7", #<23><>
|
||||
"\x8e\xc4\x8e\xde" => "\xa5\xc9", #<23><>
|
||||
"\x8e\xca\x8e\xde" => "\xa5\xd0", #<23><>
|
||||
"\x8e\xcb\x8e\xde" => "\xa5\xd3", #<23><>
|
||||
"\x8e\xcc\x8e\xde" => "\xa5\xd6", #<23><>
|
||||
"\x8e\xcd\x8e\xde" => "\xa5\xd9", #<23><>
|
||||
"\x8e\xce\x8e\xde" => "\xa5\xdc", #<23><>
|
||||
"\x8e\xca\x8e\xdf" => "\xa5\xd1", #<23><>
|
||||
"\x8e\xcb\x8e\xdf" => "\xa5\xd4", #<23><>
|
||||
"\x8e\xcc\x8e\xdf" => "\xa5\xd7", #<23><>
|
||||
"\x8e\xcd\x8e\xdf" => "\xa5\xda", #<23><>
|
||||
"\x8e\xce\x8e\xdf" => "\xa5\xdd", #<23><>
|
||||
"\x8e\xb3\x8e\xde" => "\xa5\xf4", #<23><>
|
||||
);
|
||||
|
||||
# init only once;
|
||||
|
||||
#$_PAT_D2Z = join("|", keys %_D2Z);
|
||||
#$_PAT_H2Z = join("|", keys %_H2Z);
|
||||
|
||||
%_Z2H = reverse %_H2Z;
|
||||
%_Z2D = reverse %_D2Z;
|
||||
|
||||
#$_PAT_Z2H = join("|", keys %_Z2H);
|
||||
#$_PAT_Z2D = join("|", keys %_Z2D);
|
||||
|
||||
sub h2z {
|
||||
no warnings qw(uninitialized);
|
||||
my $r_str = shift;
|
||||
my ($keep_dakuten) = @_;
|
||||
my $n = 0;
|
||||
unless ($keep_dakuten) {
|
||||
$n = (
|
||||
$$r_str =~ s(
|
||||
($RE{EUC_KANA}
|
||||
(?:\x8e[\xde\xdf])?)
|
||||
){
|
||||
my $str = $1;
|
||||
$_D2Z{$str} || $_H2Z{$str} ||
|
||||
# in case dakuten and handakuten are side-by-side!
|
||||
$_H2Z{substr($str,0,2)} . $_H2Z{substr($str,2,2)};
|
||||
}eogx
|
||||
);
|
||||
}
|
||||
else {
|
||||
$n = (
|
||||
$$r_str =~ s(
|
||||
($RE{EUC_KANA})
|
||||
){
|
||||
$_H2Z{$1};
|
||||
}eogx
|
||||
);
|
||||
}
|
||||
$n;
|
||||
}
|
||||
|
||||
sub z2h {
|
||||
my $r_str = shift;
|
||||
my $n = (
|
||||
$$r_str =~ s(
|
||||
($RE{EUC_C}|$RE{EUC_0212}|$RE{EUC_KANA})
|
||||
){
|
||||
$_Z2D{$1} || $_Z2H{$1} || $1;
|
||||
}eogx
|
||||
);
|
||||
$n;
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::JP::H2Z -- internally used by Encode::JP::2022_JP*
|
||||
|
||||
=cut
|
||||
168
database/perl/lib/Encode/JP/JIS7.pm
Normal file
168
database/perl/lib/Encode/JP/JIS7.pm
Normal file
@@ -0,0 +1,168 @@
|
||||
package Encode::JP::JIS7;
|
||||
use strict;
|
||||
use warnings;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.8 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use Encode qw(:fallbacks);
|
||||
|
||||
for my $name ( '7bit-jis', 'iso-2022-jp', 'iso-2022-jp-1' ) {
|
||||
my $h2z = ( $name eq '7bit-jis' ) ? 0 : 1;
|
||||
my $jis0212 = ( $name eq 'iso-2022-jp' ) ? 0 : 1;
|
||||
|
||||
my $obj = bless {
|
||||
Name => $name,
|
||||
h2z => $h2z,
|
||||
jis0212 => $jis0212,
|
||||
} => __PACKAGE__;
|
||||
Encode::define_encoding($obj, $name);
|
||||
}
|
||||
|
||||
use parent qw(Encode::Encoding);
|
||||
|
||||
# we override this to 1 so PerlIO works
|
||||
sub needs_lines { 1 }
|
||||
|
||||
use Encode::CJKConstants qw(:all);
|
||||
|
||||
#
|
||||
# decode is identical for all 2022 variants
|
||||
#
|
||||
|
||||
sub decode($$;$) {
|
||||
my ( $obj, $str, $chk ) = @_;
|
||||
return undef unless defined $str;
|
||||
my $residue = '';
|
||||
if ($chk) {
|
||||
$str =~ s/([^\x00-\x7f].*)$//so and $residue = $1;
|
||||
}
|
||||
$residue .= jis_euc( \$str );
|
||||
$_[1] = $residue if $chk;
|
||||
return Encode::decode( 'euc-jp', $str, FB_PERLQQ );
|
||||
}
|
||||
|
||||
#
|
||||
# encode is different
|
||||
#
|
||||
|
||||
sub encode($$;$) {
|
||||
require Encode::JP::H2Z;
|
||||
my ( $obj, $utf8, $chk ) = @_;
|
||||
return undef unless defined $utf8;
|
||||
|
||||
# empty the input string in the stack so perlio is ok
|
||||
$_[1] = '' if $chk;
|
||||
my ( $h2z, $jis0212 ) = @$obj{qw(h2z jis0212)};
|
||||
my $octet = Encode::encode( 'euc-jp', $utf8, $chk || 0 );
|
||||
$h2z and &Encode::JP::H2Z::h2z( \$octet );
|
||||
euc_jis( \$octet, $jis0212 );
|
||||
return $octet;
|
||||
}
|
||||
|
||||
#
|
||||
# cat_decode
|
||||
#
|
||||
my $re_scan_jis_g = qr{
|
||||
\G ( ($RE{JIS_0212}) | $RE{JIS_0208} |
|
||||
($RE{ISO_ASC}) | ($RE{JIS_KANA}) | )
|
||||
([^\e]*)
|
||||
}x;
|
||||
|
||||
sub cat_decode { # ($obj, $dst, $src, $pos, $trm, $chk)
|
||||
my ( $obj, undef, undef, $pos, $trm ) = @_; # currently ignores $chk
|
||||
my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
|
||||
local ${^ENCODING};
|
||||
use bytes;
|
||||
my $opos = pos($$rsrc);
|
||||
pos($$rsrc) = $pos;
|
||||
while ( $$rsrc =~ /$re_scan_jis_g/gc ) {
|
||||
my ( $esc, $esc_0212, $esc_asc, $esc_kana, $chunk ) =
|
||||
( $1, $2, $3, $4, $5 );
|
||||
|
||||
unless ($chunk) { $esc or last; next; }
|
||||
|
||||
if ( $esc && !$esc_asc ) {
|
||||
$chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
|
||||
if ($esc_kana) {
|
||||
$chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
|
||||
}
|
||||
elsif ($esc_0212) {
|
||||
$chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
|
||||
}
|
||||
$chunk = Encode::decode( 'euc-jp', $chunk, 0 );
|
||||
}
|
||||
elsif ( ( my $npos = index( $chunk, $trm ) ) >= 0 ) {
|
||||
$$rdst .= substr( $chunk, 0, $npos + length($trm) );
|
||||
$$rpos += length($esc) + $npos + length($trm);
|
||||
pos($$rsrc) = $opos;
|
||||
return 1;
|
||||
}
|
||||
$$rdst .= $chunk;
|
||||
$$rpos = pos($$rsrc);
|
||||
}
|
||||
$$rpos = pos($$rsrc);
|
||||
pos($$rsrc) = $opos;
|
||||
return '';
|
||||
}
|
||||
|
||||
# JIS<->EUC
|
||||
my $re_scan_jis = qr{
|
||||
(?:($RE{JIS_0212})|$RE{JIS_0208}|($RE{ISO_ASC})|($RE{JIS_KANA}))([^\e]*)
|
||||
}x;
|
||||
|
||||
sub jis_euc {
|
||||
local ${^ENCODING};
|
||||
my $r_str = shift;
|
||||
$$r_str =~ s($re_scan_jis)
|
||||
{
|
||||
my ($esc_0212, $esc_asc, $esc_kana, $chunk) =
|
||||
($1, $2, $3, $4);
|
||||
if (!$esc_asc) {
|
||||
$chunk =~ tr/\x21-\x7e/\xa1-\xfe/;
|
||||
if ($esc_kana) {
|
||||
$chunk =~ s/([\xa1-\xdf])/\x8e$1/og;
|
||||
}
|
||||
elsif ($esc_0212) {
|
||||
$chunk =~ s/([\xa1-\xfe][\xa1-\xfe])/\x8f$1/og;
|
||||
}
|
||||
}
|
||||
$chunk;
|
||||
}geox;
|
||||
my ($residue) = ( $$r_str =~ s/(\e.*)$//so );
|
||||
return $residue;
|
||||
}
|
||||
|
||||
sub euc_jis {
|
||||
no warnings qw(uninitialized);
|
||||
local ${^ENCODING};
|
||||
my $r_str = shift;
|
||||
my $jis0212 = shift;
|
||||
$$r_str =~ s{
|
||||
((?:$RE{EUC_C})+|(?:$RE{EUC_KANA})+|(?:$RE{EUC_0212})+)
|
||||
}{
|
||||
my $chunk = $1;
|
||||
my $esc =
|
||||
( $chunk =~ tr/\x8E//d ) ? $ESC{KANA} :
|
||||
( $chunk =~ tr/\x8F//d ) ? $ESC{JIS_0212} :
|
||||
$ESC{JIS_0208};
|
||||
if ($esc eq $ESC{JIS_0212} && !$jis0212){
|
||||
# fallback to '?'
|
||||
$chunk =~ tr/\xA1-\xFE/\x3F/;
|
||||
}else{
|
||||
$chunk =~ tr/\xA1-\xFE/\x21-\x7E/;
|
||||
}
|
||||
$esc . $chunk . $ESC{ASC};
|
||||
}geox;
|
||||
$$r_str =~ s/\Q$ESC{ASC}\E
|
||||
(\Q$ESC{KANA}\E|\Q$ESC{JIS_0212}\E|\Q$ESC{JIS_0208}\E)/$1/gox;
|
||||
$$r_str;
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::JP::JIS7 -- internally used by Encode::JP
|
||||
|
||||
=cut
|
||||
69
database/perl/lib/Encode/KR.pm
Normal file
69
database/perl/lib/Encode/KR.pm
Normal file
@@ -0,0 +1,69 @@
|
||||
package Encode::KR;
|
||||
BEGIN {
|
||||
if ( ord("A") == 193 ) {
|
||||
die "Encode::KR not supported on EBCDIC\n";
|
||||
}
|
||||
}
|
||||
use strict;
|
||||
use warnings;
|
||||
use Encode;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
use XSLoader;
|
||||
XSLoader::load( __PACKAGE__, $VERSION );
|
||||
|
||||
use Encode::KR::2022_KR;
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::KR - Korean Encodings
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$euc_kr = encode("euc-kr", $utf8); # loads Encode::KR implicitly
|
||||
$utf8 = decode("euc-kr", $euc_kr); # ditto
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module implements Korean charset encodings. Encodings supported
|
||||
are as follows.
|
||||
|
||||
|
||||
Canonical Alias Description
|
||||
--------------------------------------------------------------------
|
||||
euc-kr /\beuc.*kr$/i EUC (Extended Unix Character)
|
||||
/\bkr.*euc$/i
|
||||
ksc5601-raw Korean standard code set (as is)
|
||||
cp949 /(?:x-)?uhc$/i
|
||||
/(?:x-)?windows-949$/i
|
||||
/\bks_c_5601-1987$/i
|
||||
Code Page 949 (EUC-KR + 8,822
|
||||
(additional Hangul syllables)
|
||||
MacKorean EUC-KR + Apple Vendor Mappings
|
||||
johab JOHAB A supplementary encoding defined in
|
||||
Annex 3 of KS X 1001:1998
|
||||
iso-2022-kr iso-2022-kr [RFC1557]
|
||||
--------------------------------------------------------------------
|
||||
|
||||
To find how to use this module in detail, see L<Encode>.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
When you see C<charset=ks_c_5601-1987> on mails and web pages, they really
|
||||
mean "cp949" encodings. To fix that, the following aliases are set;
|
||||
|
||||
qr/(?:x-)?uhc$/i => '"cp949"'
|
||||
qr/(?:x-)?windows-949$/i => '"cp949"'
|
||||
qr/ks_c_5601-1987$/i => '"cp949"'
|
||||
|
||||
The ASCII region (0x00-0x7f) is preserved for all encodings, even
|
||||
though this conflicts with mappings by the Unicode Consortium.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>
|
||||
|
||||
=cut
|
||||
83
database/perl/lib/Encode/KR/2022_KR.pm
Normal file
83
database/perl/lib/Encode/KR/2022_KR.pm
Normal file
@@ -0,0 +1,83 @@
|
||||
package Encode::KR::2022_KR;
|
||||
use strict;
|
||||
use warnings;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.4 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use Encode qw(:fallbacks);
|
||||
|
||||
use parent qw(Encode::Encoding);
|
||||
__PACKAGE__->Define('iso-2022-kr');
|
||||
|
||||
sub needs_lines { 1 }
|
||||
|
||||
sub perlio_ok {
|
||||
return 0; # for the time being
|
||||
}
|
||||
|
||||
sub decode {
|
||||
my ( $obj, $str, $chk ) = @_;
|
||||
return undef unless defined $str;
|
||||
my $res = $str;
|
||||
my $residue = iso_euc( \$res );
|
||||
|
||||
# This is for PerlIO
|
||||
$_[1] = $residue if $chk;
|
||||
return Encode::decode( 'euc-kr', $res, FB_PERLQQ );
|
||||
}
|
||||
|
||||
sub encode {
|
||||
my ( $obj, $utf8, $chk ) = @_;
|
||||
return undef unless defined $utf8;
|
||||
|
||||
# empty the input string in the stack so perlio is ok
|
||||
$_[1] = '' if $chk;
|
||||
my $octet = Encode::encode( 'euc-kr', $utf8, FB_PERLQQ );
|
||||
euc_iso( \$octet );
|
||||
return $octet;
|
||||
}
|
||||
|
||||
use Encode::CJKConstants qw(:all);
|
||||
|
||||
# ISO<->EUC
|
||||
|
||||
sub iso_euc {
|
||||
my $r_str = shift;
|
||||
$$r_str =~ s/$RE{'2022_KR'}//gox; # remove the designator
|
||||
$$r_str =~ s{ # replace characters in GL
|
||||
\x0e # between SO(\x0e) and SI(\x0f)
|
||||
([^\x0f]*) # with characters in GR
|
||||
\x0f
|
||||
}
|
||||
{
|
||||
my $out= $1;
|
||||
$out =~ tr/\x21-\x7e/\xa1-\xfe/;
|
||||
$out;
|
||||
}geox;
|
||||
my ($residue) = ( $$r_str =~ s/(\e.*)$//so );
|
||||
return $residue;
|
||||
}
|
||||
|
||||
sub euc_iso {
|
||||
no warnings qw(uninitialized);
|
||||
my $r_str = shift;
|
||||
substr( $$r_str, 0, 0 ) =
|
||||
$ESC{'2022_KR'}; # put the designator at the beg.
|
||||
$$r_str =~
|
||||
s{ # move KS X 1001 characters in GR to GL
|
||||
($RE{EUC_C}+) # and enclose them with SO and SI
|
||||
}{
|
||||
my $str = $1;
|
||||
$str =~ tr/\xA1-\xFE/\x21-\x7E/;
|
||||
"\x0e" . $str . "\x0f";
|
||||
}geox;
|
||||
$$r_str;
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::KR::2022_KR -- internally used by Encode::KR
|
||||
|
||||
=cut
|
||||
427
database/perl/lib/Encode/MIME/Header.pm
Normal file
427
database/perl/lib/Encode/MIME/Header.pm
Normal file
@@ -0,0 +1,427 @@
|
||||
package Encode::MIME::Header;
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.28 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use Carp ();
|
||||
use Encode ();
|
||||
use MIME::Base64 ();
|
||||
|
||||
my %seed = (
|
||||
decode_b => 1, # decodes 'B' encoding ?
|
||||
decode_q => 1, # decodes 'Q' encoding ?
|
||||
encode => 'B', # encode with 'B' or 'Q' ?
|
||||
charset => 'UTF-8', # encode charset
|
||||
bpl => 75, # bytes per line
|
||||
);
|
||||
|
||||
my @objs;
|
||||
|
||||
push @objs, bless {
|
||||
%seed,
|
||||
Name => 'MIME-Header',
|
||||
} => __PACKAGE__;
|
||||
|
||||
push @objs, bless {
|
||||
%seed,
|
||||
decode_q => 0,
|
||||
Name => 'MIME-B',
|
||||
} => __PACKAGE__;
|
||||
|
||||
push @objs, bless {
|
||||
%seed,
|
||||
decode_b => 0,
|
||||
encode => 'Q',
|
||||
Name => 'MIME-Q',
|
||||
} => __PACKAGE__;
|
||||
|
||||
Encode::define_encoding($_, $_->{Name}) foreach @objs;
|
||||
|
||||
use parent qw(Encode::Encoding);
|
||||
|
||||
sub needs_lines { 1 }
|
||||
sub perlio_ok { 0 }
|
||||
|
||||
# RFC 2047 and RFC 2231 grammar
|
||||
my $re_charset = qr/[!"#\$%&'+\-0-9A-Z\\\^_`a-z\{\|\}~]+/;
|
||||
my $re_language = qr/[A-Za-z]{1,8}(?:-[0-9A-Za-z]{1,8})*/;
|
||||
my $re_encoding = qr/[QqBb]/;
|
||||
my $re_encoded_text = qr/[^\?]*/;
|
||||
my $re_encoded_word = qr/=\?$re_charset(?:\*$re_language)?\?$re_encoding\?$re_encoded_text\?=/;
|
||||
my $re_capture_encoded_word = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding\?$re_encoded_text)\?=/;
|
||||
my $re_capture_encoded_word_split = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding)\?($re_encoded_text)\?=/;
|
||||
|
||||
# in strict mode check also for valid base64 characters and also for valid quoted printable codes
|
||||
my $re_encoding_strict_b = qr/[Bb]/;
|
||||
my $re_encoding_strict_q = qr/[Qq]/;
|
||||
my $re_encoded_text_strict_b = qr/[0-9A-Za-z\+\/]*={0,2}/;
|
||||
my $re_encoded_text_strict_q = qr/(?:[\x21-\x3C\x3E\x40-\x7E]|=[0-9A-Fa-f]{2})*/; # NOTE: first part are printable US-ASCII except ?, =, SPACE and TAB
|
||||
my $re_encoded_word_strict = qr/=\?$re_charset(?:\*$re_language)?\?(?:$re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/;
|
||||
my $re_capture_encoded_word_strict = qr/=\?($re_charset)((?:\*$re_language)?)\?($re_encoding_strict_b\?$re_encoded_text_strict_b|$re_encoding_strict_q\?$re_encoded_text_strict_q)\?=/;
|
||||
|
||||
my $re_newline = qr/(?:\r\n|[\r\n])/;
|
||||
|
||||
# in strict mode encoded words must be always separated by spaces or tabs (or folded newline)
|
||||
# except in comments when separator between words and comment round brackets can be omitted
|
||||
my $re_word_begin_strict = qr/(?:(?:[ \t]|\A)\(?|(?:[^\\]|\A)\)\()/;
|
||||
my $re_word_sep_strict = qr/(?:$re_newline?[ \t])+/;
|
||||
my $re_word_end_strict = qr/(?:\)\(|\)?(?:$re_newline?[ \t]|\z))/;
|
||||
|
||||
my $re_match = qr/()((?:$re_encoded_word\s*)*$re_encoded_word)()/;
|
||||
my $re_match_strict = qr/($re_word_begin_strict)((?:$re_encoded_word_strict$re_word_sep_strict)*$re_encoded_word_strict)(?=$re_word_end_strict)/;
|
||||
|
||||
my $re_capture = qr/$re_capture_encoded_word(?:\s*)?/;
|
||||
my $re_capture_strict = qr/$re_capture_encoded_word_strict$re_word_sep_strict?/;
|
||||
|
||||
our $STRICT_DECODE = 0;
|
||||
|
||||
sub decode($$;$) {
|
||||
my ($obj, $str, $chk) = @_;
|
||||
return undef unless defined $str;
|
||||
|
||||
my $re_match_decode = $STRICT_DECODE ? $re_match_strict : $re_match;
|
||||
my $re_capture_decode = $STRICT_DECODE ? $re_capture_strict : $re_capture;
|
||||
|
||||
my $stop = 0;
|
||||
my $output = substr($str, 0, 0); # to propagate taintedness
|
||||
|
||||
# decode each line separately, match whole continuous folded line at one call
|
||||
1 while not $stop and $str =~ s{^((?:[^\r\n]*(?:$re_newline[ \t])?)*)($re_newline)?}{
|
||||
|
||||
my $line = $1;
|
||||
my $sep = defined $2 ? $2 : '';
|
||||
|
||||
$stop = 1 unless length($line) or length($sep);
|
||||
|
||||
# NOTE: this code partially could break $chk support
|
||||
# in non strict mode concat consecutive encoded mime words with same charset, language and encoding
|
||||
# fixes breaking inside multi-byte characters
|
||||
1 while not $STRICT_DECODE and $line =~ s/$re_capture_encoded_word_split\s*=\?\1\2\?\3\?($re_encoded_text)\?=/=\?$1$2\?$3\?$4$5\?=/so;
|
||||
|
||||
# process sequence of encoded MIME words at once
|
||||
1 while not $stop and $line =~ s{^(.*?)$re_match_decode}{
|
||||
|
||||
my $begin = $1 . $2;
|
||||
my $words = $3;
|
||||
|
||||
$begin =~ tr/\r\n//d;
|
||||
$output .= $begin;
|
||||
|
||||
# decode one MIME word
|
||||
1 while not $stop and $words =~ s{^(.*?)($re_capture_decode)}{
|
||||
|
||||
$output .= $1;
|
||||
my $orig = $2;
|
||||
my $charset = $3;
|
||||
my ($mime_enc, $text) = split /\?/, $5;
|
||||
|
||||
$text =~ tr/\r\n//d;
|
||||
|
||||
my $enc = Encode::find_mime_encoding($charset);
|
||||
|
||||
# in non strict mode allow also perl encoding aliases
|
||||
if ( not defined $enc and not $STRICT_DECODE ) {
|
||||
# make sure that decoded string will be always strict UTF-8
|
||||
$charset = 'UTF-8' if lc($charset) eq 'utf8';
|
||||
$enc = Encode::find_encoding($charset);
|
||||
}
|
||||
|
||||
if ( not defined $enc ) {
|
||||
Carp::croak qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
|
||||
Carp::carp qq(Unknown charset "$charset") if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
|
||||
$stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
|
||||
$output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
|
||||
$stop ? $orig : '';
|
||||
} else {
|
||||
if ( uc($mime_enc) eq 'B' and $obj->{decode_b} ) {
|
||||
my $decoded = _decode_b($enc, $text, $chk);
|
||||
$stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
|
||||
$output .= (defined $decoded ? $decoded : $text) unless $stop;
|
||||
$stop ? $orig : '';
|
||||
} elsif ( uc($mime_enc) eq 'Q' and $obj->{decode_q} ) {
|
||||
my $decoded = _decode_q($enc, $text, $chk);
|
||||
$stop = 1 if not defined $decoded and not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
|
||||
$output .= (defined $decoded ? $decoded : $text) unless $stop;
|
||||
$stop ? $orig : '';
|
||||
} else {
|
||||
Carp::croak qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::DIE_ON_ERR;
|
||||
Carp::carp qq(MIME "$mime_enc" unsupported) if not ref $chk and $chk and $chk & Encode::WARN_ON_ERR;
|
||||
$stop = 1 if not ref $chk and $chk and $chk & Encode::RETURN_ON_ERR;
|
||||
$output .= ($output =~ /(?:\A|[ \t])$/ ? '' : ' ') . $orig unless $stop; # $orig mime word is separated by whitespace
|
||||
$stop ? $orig : '';
|
||||
}
|
||||
}
|
||||
|
||||
}se;
|
||||
|
||||
if ( not $stop ) {
|
||||
$output .= $words;
|
||||
$words = '';
|
||||
}
|
||||
|
||||
$words;
|
||||
|
||||
}se;
|
||||
|
||||
if ( not $stop ) {
|
||||
$line =~ tr/\r\n//d;
|
||||
$output .= $line . $sep;
|
||||
$line = '';
|
||||
$sep = '';
|
||||
}
|
||||
|
||||
$line . $sep;
|
||||
|
||||
}se;
|
||||
|
||||
$_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
|
||||
return $output;
|
||||
}
|
||||
|
||||
sub _decode_b {
|
||||
my ($enc, $text, $chk) = @_;
|
||||
# MIME::Base64::decode ignores everything after a '=' padding character
|
||||
# in non strict mode split string after each sequence of padding characters and decode each substring
|
||||
my $octets = $STRICT_DECODE ?
|
||||
MIME::Base64::decode($text) :
|
||||
join('', map { MIME::Base64::decode($_) } split /(?<==)(?=[^=])/, $text);
|
||||
return _decode_octets($enc, $octets, $chk);
|
||||
}
|
||||
|
||||
sub _decode_q {
|
||||
my ($enc, $text, $chk) = @_;
|
||||
$text =~ s/_/ /go;
|
||||
$text =~ s/=([0-9A-Fa-f]{2})/pack('C', hex($1))/ego;
|
||||
return _decode_octets($enc, $text, $chk);
|
||||
}
|
||||
|
||||
sub _decode_octets {
|
||||
my ($enc, $octets, $chk) = @_;
|
||||
$chk = 0 unless defined $chk;
|
||||
$chk &= ~Encode::LEAVE_SRC if not ref $chk and $chk;
|
||||
my $output = $enc->decode($octets, $chk);
|
||||
return undef if not ref $chk and $chk and $octets ne '';
|
||||
return $output;
|
||||
}
|
||||
|
||||
sub encode($$;$) {
|
||||
my ($obj, $str, $chk) = @_;
|
||||
return undef unless defined $str;
|
||||
my $output = $obj->_fold_line($obj->_encode_string($str, $chk));
|
||||
$_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
|
||||
return $output . substr($str, 0, 0); # to propagate taintedness
|
||||
}
|
||||
|
||||
sub _fold_line {
|
||||
my ($obj, $line) = @_;
|
||||
my $bpl = $obj->{bpl};
|
||||
my $output = '';
|
||||
|
||||
while ( length($line) ) {
|
||||
if ( $line =~ s/^(.{0,$bpl})(\s|\z)// ) {
|
||||
$output .= $1;
|
||||
$output .= "\r\n" . $2 if length($line);
|
||||
} elsif ( $line =~ s/(\s)(.*)$// ) {
|
||||
$output .= $line;
|
||||
$line = $2;
|
||||
$output .= "\r\n" . $1 if length($line);
|
||||
} else {
|
||||
$output .= $line;
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
||||
return $output;
|
||||
}
|
||||
|
||||
sub _encode_string {
|
||||
my ($obj, $str, $chk) = @_;
|
||||
my $wordlen = $obj->{bpl} > 76 ? 76 : $obj->{bpl};
|
||||
my $enc = Encode::find_mime_encoding($obj->{charset});
|
||||
my $enc_chk = $chk;
|
||||
$enc_chk = 0 unless defined $enc_chk;
|
||||
$enc_chk |= Encode::LEAVE_SRC if not ref $enc_chk and $enc_chk;
|
||||
my @result = ();
|
||||
my $octets = '';
|
||||
while ( length( my $chr = substr($str, 0, 1, '') ) ) {
|
||||
my $seq = $enc->encode($chr, $enc_chk);
|
||||
if ( not length($seq) ) {
|
||||
substr($str, 0, 0, $chr);
|
||||
last;
|
||||
}
|
||||
if ( $obj->_encoded_word_len($octets . $seq) > $wordlen ) {
|
||||
push @result, $obj->_encode_word($octets);
|
||||
$octets = '';
|
||||
}
|
||||
$octets .= $seq;
|
||||
}
|
||||
length($octets) and push @result, $obj->_encode_word($octets);
|
||||
$_[1] = $str if not ref $chk and $chk and !($chk & Encode::LEAVE_SRC);
|
||||
return join(' ', @result);
|
||||
}
|
||||
|
||||
sub _encode_word {
|
||||
my ($obj, $octets) = @_;
|
||||
my $charset = $obj->{charset};
|
||||
my $encode = $obj->{encode};
|
||||
my $text = $encode eq 'B' ? _encode_b($octets) : _encode_q($octets);
|
||||
return "=?$charset?$encode?$text?=";
|
||||
}
|
||||
|
||||
sub _encoded_word_len {
|
||||
my ($obj, $octets) = @_;
|
||||
my $charset = $obj->{charset};
|
||||
my $encode = $obj->{encode};
|
||||
my $text_len = $encode eq 'B' ? _encoded_b_len($octets) : _encoded_q_len($octets);
|
||||
return length("=?$charset?$encode??=") + $text_len;
|
||||
}
|
||||
|
||||
sub _encode_b {
|
||||
my ($octets) = @_;
|
||||
return MIME::Base64::encode($octets, '');
|
||||
}
|
||||
|
||||
sub _encoded_b_len {
|
||||
my ($octets) = @_;
|
||||
return ( length($octets) + 2 ) / 3 * 4;
|
||||
}
|
||||
|
||||
my $re_invalid_q_char = qr/[^0-9A-Za-z !*+\-\/]/;
|
||||
|
||||
sub _encode_q {
|
||||
my ($octets) = @_;
|
||||
$octets =~ s{($re_invalid_q_char)}{
|
||||
join('', map { sprintf('=%02X', $_) } unpack('C*', $1))
|
||||
}egox;
|
||||
$octets =~ s/ /_/go;
|
||||
return $octets;
|
||||
}
|
||||
|
||||
sub _encoded_q_len {
|
||||
my ($octets) = @_;
|
||||
my $invalid_count = () = $octets =~ /$re_invalid_q_char/sgo;
|
||||
return ( $invalid_count * 3 ) + ( length($octets) - $invalid_count );
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::MIME::Header -- MIME encoding for an unstructured email header
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw(encode decode);
|
||||
|
||||
my $mime_str = encode("MIME-Header", "Sample:Text \N{U+263A}");
|
||||
# $mime_str is "=?UTF-8?B?U2FtcGxlOlRleHQg4pi6?="
|
||||
|
||||
my $mime_q_str = encode("MIME-Q", "Sample:Text \N{U+263A}");
|
||||
# $mime_q_str is "=?UTF-8?Q?Sample=3AText_=E2=98=BA?="
|
||||
|
||||
my $str = decode("MIME-Header",
|
||||
"=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\r\n " .
|
||||
"=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?="
|
||||
);
|
||||
# $str is "If you can read this you understand the example."
|
||||
|
||||
use Encode qw(decode :fallbacks);
|
||||
use Encode::MIME::Header;
|
||||
local $Encode::MIME::Header::STRICT_DECODE = 1;
|
||||
my $strict_string = decode("MIME-Header", $mime_string, FB_CROAK);
|
||||
# use strict decoding and croak on errors
|
||||
|
||||
=head1 ABSTRACT
|
||||
|
||||
This module implements L<RFC 2047|https://tools.ietf.org/html/rfc2047> MIME
|
||||
encoding for an unstructured field body of the email header. It can also be
|
||||
used for L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token. However,
|
||||
it cannot be used directly for the whole header with the field name or for the
|
||||
structured header fields like From, To, Cc, Message-Id, etc... There are 3
|
||||
encoding names supported by this module: C<MIME-Header>, C<MIME-B> and
|
||||
C<MIME-Q>.
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Decode method takes an unstructured field body of the email header (or
|
||||
L<RFC 822|https://tools.ietf.org/html/rfc822> 'text' token) as its input and
|
||||
decodes each MIME encoded-word from input string to a sequence of bytes
|
||||
according to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and
|
||||
L<RFC 2231|https://tools.ietf.org/html/rfc2231>. Subsequently, each sequence
|
||||
of bytes with the corresponding MIME charset is decoded with
|
||||
L<the Encode module|Encode> and finally, one output string is returned. Text
|
||||
parts of the input string which do not contain MIME encoded-word stay
|
||||
unmodified in the output string. Folded newlines between two consecutive MIME
|
||||
encoded-words are discarded, others are preserved in the output string.
|
||||
C<MIME-B> can decode Base64 variant, C<MIME-Q> can decode Quoted-Printable
|
||||
variant and C<MIME-Header> can decode both of them. If L<Encode module|Encode>
|
||||
does not support particular MIME charset or chosen variant then an action based
|
||||
on L<CHECK flags|Encode/Handling Malformed Data> is performed (by default, the
|
||||
MIME encoded-word is not decoded).
|
||||
|
||||
Encode method takes a scalar string as its input and uses
|
||||
L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for encoding it to UTF-8
|
||||
bytes. Then a sequence of UTF-8 bytes is encoded into MIME encoded-words
|
||||
(C<MIME-Header> and C<MIME-B> use a Base64 variant while C<MIME-Q> uses a
|
||||
Quoted-Printable variant) where each MIME encoded-word is limited to 75
|
||||
characters. MIME encoded-words are separated by C<CRLF SPACE> and joined to
|
||||
one output string. Output string is suitable for unstructured field body of
|
||||
the email header.
|
||||
|
||||
Both encode and decode methods propagate
|
||||
L<CHECK flags|Encode/Handling Malformed Data> when encoding and decoding the
|
||||
MIME charset.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Versions prior to 2.22 (part of Encode 2.83) have a malfunctioning decoder
|
||||
and encoder. The MIME encoder infamously inserted additional spaces or
|
||||
discarded white spaces between consecutive MIME encoded-words, which led to
|
||||
invalid MIME headers produced by this module. The MIME decoder had a tendency
|
||||
to discard white spaces, incorrectly interpret data or attempt to decode Base64
|
||||
MIME encoded-words as Quoted-Printable. These problems were fixed in version
|
||||
2.22. It is highly recommended not to use any version prior 2.22!
|
||||
|
||||
Versions prior to 2.24 (part of Encode 2.87) ignored
|
||||
L<CHECK flags|Encode/Handling Malformed Data>. The MIME encoder used
|
||||
L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder for input Unicode
|
||||
strings which could lead to invalid UTF-8 sequences. MIME decoder used also
|
||||
L<not strict utf8|Encode/UTF-8 vs. utf8 vs. UTF8> decoder and additionally
|
||||
called the decode method with a C<Encode::FB_PERLQQ> flag (thus user-specified
|
||||
L<CHECK flags|Encode/Handling Malformed Data> were ignored). Moreover, it
|
||||
automatically croaked when a MIME encoded-word contained unknown encoding.
|
||||
Since version 2.24, this module uses
|
||||
L<strict UTF-8|Encode/UTF-8 vs. utf8 vs. UTF8> encoder and decoder. And
|
||||
L<CHECK flags|Encode/Handling Malformed Data> are correctly propagated.
|
||||
|
||||
Since version 2.22 (part of Encode 2.83), the MIME encoder should be fully
|
||||
compliant to L<RFC 2047|https://tools.ietf.org/html/rfc2047> and
|
||||
L<RFC 2231|https://tools.ietf.org/html/rfc2231>. Due to the aforementioned
|
||||
bugs in previous versions of the MIME encoder, there is a I<less strict>
|
||||
compatible mode for the MIME decoder which is used by default. It should be
|
||||
able to decode MIME encoded-words encoded by pre 2.22 versions of this module.
|
||||
However, note that this is not correct according to
|
||||
L<RFC 2047|https://tools.ietf.org/html/rfc2047>.
|
||||
|
||||
In default I<not strict> mode the MIME decoder attempts to decode every substring
|
||||
which looks like a MIME encoded-word. Therefore, the MIME encoded-words do not
|
||||
need to be separated by white space. To enforce a correct I<strict> mode, set
|
||||
variable C<$Encode::MIME::Header::STRICT_DECODE> to 1 e.g. by localizing:
|
||||
|
||||
use Encode::MIME::Header;
|
||||
local $Encode::MIME::Header::STRICT_DECODE = 1;
|
||||
|
||||
=head1 AUTHORS
|
||||
|
||||
Pali E<lt>pali@cpan.orgE<gt>
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>,
|
||||
L<RFC 822|https://tools.ietf.org/html/rfc822>,
|
||||
L<RFC 2047|https://tools.ietf.org/html/rfc2047>,
|
||||
L<RFC 2231|https://tools.ietf.org/html/rfc2231>
|
||||
|
||||
=cut
|
||||
133
database/perl/lib/Encode/MIME/Header/ISO_2022_JP.pm
Normal file
133
database/perl/lib/Encode/MIME/Header/ISO_2022_JP.pm
Normal file
@@ -0,0 +1,133 @@
|
||||
package Encode::MIME::Header::ISO_2022_JP;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
use parent qw(Encode::MIME::Header);
|
||||
|
||||
my $obj =
|
||||
bless { decode_b => '1', decode_q => '1', encode => 'B', bpl => 76, Name => 'MIME-Header-ISO_2022_JP' } =>
|
||||
__PACKAGE__;
|
||||
Encode::define_encoding($obj, 'MIME-Header-ISO_2022_JP');
|
||||
|
||||
use constant HEAD => '=?ISO-2022-JP?B?';
|
||||
use constant TAIL => '?=';
|
||||
|
||||
use Encode::CJKConstants qw(%RE);
|
||||
|
||||
our $VERSION = do { my @r = ( q$Revision: 1.9 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
# I owe the below codes totally to
|
||||
# Jcode by Dan Kogai & http://www.din.or.jp/~ohzaki/perl.htm#JP_Base64
|
||||
|
||||
sub encode {
|
||||
my $self = shift;
|
||||
my $str = shift;
|
||||
return undef unless defined $str;
|
||||
|
||||
utf8::encode($str) if ( Encode::is_utf8($str) );
|
||||
Encode::from_to( $str, 'utf8', 'euc-jp' );
|
||||
|
||||
my ($trailing_crlf) = ( $str =~ /(\n|\r|\x0d\x0a)$/o );
|
||||
|
||||
$str = _mime_unstructured_header( $str, $self->{bpl} );
|
||||
|
||||
not $trailing_crlf and $str =~ s/(\n|\r|\x0d\x0a)$//o;
|
||||
|
||||
return $str;
|
||||
}
|
||||
|
||||
sub _mime_unstructured_header {
|
||||
my ( $oldheader, $bpl ) = @_;
|
||||
my $crlf = $oldheader =~ /\n$/;
|
||||
my ( $header, @words, @wordstmp, $i ) = ('');
|
||||
|
||||
$oldheader =~ s/\s+$//;
|
||||
|
||||
@wordstmp = split /\s+/, $oldheader;
|
||||
|
||||
for ( $i = 0 ; $i < $#wordstmp ; $i++ ) {
|
||||
if ( $wordstmp[$i] !~ /^[\x21-\x7E]+$/
|
||||
and $wordstmp[ $i + 1 ] !~ /^[\x21-\x7E]+$/ )
|
||||
{
|
||||
$wordstmp[ $i + 1 ] = "$wordstmp[$i] $wordstmp[$i + 1]";
|
||||
}
|
||||
else {
|
||||
push( @words, $wordstmp[$i] );
|
||||
}
|
||||
}
|
||||
|
||||
push( @words, $wordstmp[-1] );
|
||||
|
||||
for my $word (@words) {
|
||||
if ( $word =~ /^[\x21-\x7E]+$/ ) {
|
||||
$header =~ /(?:.*\n)*(.*)/;
|
||||
if ( length($1) + length($word) > $bpl ) {
|
||||
$header .= "\n $word";
|
||||
}
|
||||
else {
|
||||
$header .= $word;
|
||||
}
|
||||
}
|
||||
else {
|
||||
$header = _add_encoded_word( $word, $header, $bpl );
|
||||
}
|
||||
|
||||
$header =~ /(?:.*\n)*(.*)/;
|
||||
|
||||
if ( length($1) == $bpl ) {
|
||||
$header .= "\n ";
|
||||
}
|
||||
else {
|
||||
$header .= ' ';
|
||||
}
|
||||
}
|
||||
|
||||
$header =~ s/\n? $//mg;
|
||||
|
||||
$crlf ? "$header\n" : $header;
|
||||
}
|
||||
|
||||
sub _add_encoded_word {
|
||||
my ( $str, $line, $bpl ) = @_;
|
||||
my $result = '';
|
||||
|
||||
while ( length($str) ) {
|
||||
my $target = $str;
|
||||
$str = '';
|
||||
|
||||
if (
|
||||
length($line) + 22 +
|
||||
( $target =~ /^(?:$RE{EUC_0212}|$RE{EUC_C})/o ) * 8 > $bpl )
|
||||
{
|
||||
$line =~ s/[ \t\n\r]*$/\n/;
|
||||
$result .= $line;
|
||||
$line = ' ';
|
||||
}
|
||||
|
||||
while (1) {
|
||||
my $iso_2022_jp = $target;
|
||||
Encode::from_to( $iso_2022_jp, 'euc-jp', 'iso-2022-jp' );
|
||||
|
||||
my $encoded =
|
||||
HEAD . MIME::Base64::encode_base64( $iso_2022_jp, '' ) . TAIL;
|
||||
|
||||
if ( length($encoded) + length($line) > $bpl ) {
|
||||
$target =~
|
||||
s/($RE{EUC_0212}|$RE{EUC_KANA}|$RE{EUC_C}|$RE{ASCII})$//o;
|
||||
$str = $1 . $str;
|
||||
}
|
||||
else {
|
||||
$line .= $encoded;
|
||||
last;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
$result . $line;
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
103
database/perl/lib/Encode/MIME/Name.pm
Normal file
103
database/perl/lib/Encode/MIME/Name.pm
Normal file
@@ -0,0 +1,103 @@
|
||||
package Encode::MIME::Name;
|
||||
use strict;
|
||||
use warnings;
|
||||
our $VERSION = do { my @r = ( q$Revision: 1.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
# NOTE: This table must be 1:1 mapping
|
||||
our %MIME_NAME_OF = (
|
||||
'AdobeStandardEncoding' => 'Adobe-Standard-Encoding',
|
||||
'AdobeSymbol' => 'Adobe-Symbol-Encoding',
|
||||
'ascii' => 'US-ASCII',
|
||||
'big5-hkscs' => 'Big5-HKSCS',
|
||||
'cp1026' => 'IBM1026',
|
||||
'cp1047' => 'IBM1047',
|
||||
'cp1250' => 'windows-1250',
|
||||
'cp1251' => 'windows-1251',
|
||||
'cp1252' => 'windows-1252',
|
||||
'cp1253' => 'windows-1253',
|
||||
'cp1254' => 'windows-1254',
|
||||
'cp1255' => 'windows-1255',
|
||||
'cp1256' => 'windows-1256',
|
||||
'cp1257' => 'windows-1257',
|
||||
'cp1258' => 'windows-1258',
|
||||
'cp37' => 'IBM037',
|
||||
'cp424' => 'IBM424',
|
||||
'cp437' => 'IBM437',
|
||||
'cp500' => 'IBM500',
|
||||
'cp775' => 'IBM775',
|
||||
'cp850' => 'IBM850',
|
||||
'cp852' => 'IBM852',
|
||||
'cp855' => 'IBM855',
|
||||
'cp857' => 'IBM857',
|
||||
'cp860' => 'IBM860',
|
||||
'cp861' => 'IBM861',
|
||||
'cp862' => 'IBM862',
|
||||
'cp863' => 'IBM863',
|
||||
'cp864' => 'IBM864',
|
||||
'cp865' => 'IBM865',
|
||||
'cp866' => 'IBM866',
|
||||
'cp869' => 'IBM869',
|
||||
'cp936' => 'GBK',
|
||||
'euc-cn' => 'EUC-CN',
|
||||
'euc-jp' => 'EUC-JP',
|
||||
'euc-kr' => 'EUC-KR',
|
||||
#'gb2312-raw' => 'GB2312', # no, you're wrong, I18N::Charset
|
||||
'hp-roman8' => 'hp-roman8',
|
||||
'hz' => 'HZ-GB-2312',
|
||||
'iso-2022-jp' => 'ISO-2022-JP',
|
||||
'iso-2022-jp-1' => 'ISO-2022-JP-1',
|
||||
'iso-2022-kr' => 'ISO-2022-KR',
|
||||
'iso-8859-1' => 'ISO-8859-1',
|
||||
'iso-8859-10' => 'ISO-8859-10',
|
||||
'iso-8859-13' => 'ISO-8859-13',
|
||||
'iso-8859-14' => 'ISO-8859-14',
|
||||
'iso-8859-15' => 'ISO-8859-15',
|
||||
'iso-8859-16' => 'ISO-8859-16',
|
||||
'iso-8859-2' => 'ISO-8859-2',
|
||||
'iso-8859-3' => 'ISO-8859-3',
|
||||
'iso-8859-4' => 'ISO-8859-4',
|
||||
'iso-8859-5' => 'ISO-8859-5',
|
||||
'iso-8859-6' => 'ISO-8859-6',
|
||||
'iso-8859-7' => 'ISO-8859-7',
|
||||
'iso-8859-8' => 'ISO-8859-8',
|
||||
'iso-8859-9' => 'ISO-8859-9',
|
||||
#'jis0201-raw' => 'JIS_X0201',
|
||||
#'jis0208-raw' => 'JIS_C6226-1983',
|
||||
#'jis0212-raw' => 'JIS_X0212-1990',
|
||||
'koi8-r' => 'KOI8-R',
|
||||
'koi8-u' => 'KOI8-U',
|
||||
#'ksc5601-raw' => 'KS_C_5601-1987',
|
||||
'shiftjis' => 'Shift_JIS',
|
||||
'UTF-16' => 'UTF-16',
|
||||
'UTF-16BE' => 'UTF-16BE',
|
||||
'UTF-16LE' => 'UTF-16LE',
|
||||
'UTF-32' => 'UTF-32',
|
||||
'UTF-32BE' => 'UTF-32BE',
|
||||
'UTF-32LE' => 'UTF-32LE',
|
||||
'UTF-7' => 'UTF-7',
|
||||
'utf-8-strict' => 'UTF-8',
|
||||
'viscii' => 'VISCII',
|
||||
);
|
||||
|
||||
# NOTE: %MIME_NAME_OF is still 1:1 mapping
|
||||
our %ENCODE_NAME_OF = map { uc $MIME_NAME_OF{$_} => $_ } keys %MIME_NAME_OF;
|
||||
|
||||
# Add additional 1:N mapping
|
||||
$MIME_NAME_OF{'utf8'} = 'UTF-8';
|
||||
|
||||
sub get_mime_name($) { $MIME_NAME_OF{$_[0]} };
|
||||
|
||||
sub get_encode_name($) { $ENCODE_NAME_OF{uc $_[0]} };
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::MIME::NAME -- internally used by Encode
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<I18N::Charset>
|
||||
|
||||
=cut
|
||||
190
database/perl/lib/Encode/Makefile_PL.e2x
Normal file
190
database/perl/lib/Encode/Makefile_PL.e2x
Normal file
@@ -0,0 +1,190 @@
|
||||
#
|
||||
# This file is auto-generated by:
|
||||
# enc2xs version $_Version_
|
||||
# $_Now_
|
||||
#
|
||||
use 5.7.2;
|
||||
use strict;
|
||||
use ExtUtils::MakeMaker;
|
||||
use Config;
|
||||
|
||||
# Please edit the following to the taste!
|
||||
my $name = '$_Name_';
|
||||
my %tables = (
|
||||
$_Name__t => [ $_TableFiles_ ],
|
||||
);
|
||||
|
||||
#### DO NOT EDIT BEYOND THIS POINT!
|
||||
require File::Spec;
|
||||
my ($enc2xs, $encode_h) = ();
|
||||
my @path_ext = ('');
|
||||
@path_ext = split(';', $ENV{PATHEXT}) if $^O eq 'MSWin32';
|
||||
PATHLOOP:
|
||||
for my $d (@Config{qw/bin sitebin vendorbin/},
|
||||
(split /$Config{path_sep}/o, $ENV{PATH})){
|
||||
for my $f (qw/enc2xs enc2xs5.7.3/){
|
||||
my $path = File::Spec->catfile($d, $f);
|
||||
for my $ext (@path_ext) {
|
||||
my $bin = "$path$ext";
|
||||
-r "$bin" and $enc2xs = $bin and last PATHLOOP;
|
||||
}
|
||||
}
|
||||
}
|
||||
$enc2xs or die "enc2xs not found!";
|
||||
print "enc2xs is $enc2xs\n";
|
||||
my %encode_h = ();
|
||||
for my $d (@INC){
|
||||
my $dir = File::Spec->catfile($d, "Encode");
|
||||
my $file = File::Spec->catfile($dir, "encode.h");
|
||||
-f $file and $encode_h{$dir} = -M $file;
|
||||
}
|
||||
%encode_h or die "encode.h not found!";
|
||||
# find the latest one
|
||||
($encode_h) = sort {$encode_h{$b} <=> $encode_h{$a}} keys %encode_h;
|
||||
print "encode.h is at $encode_h\n";
|
||||
|
||||
WriteMakefile(
|
||||
INC => "-I$encode_h",
|
||||
#### END_OF_HEADER -- DO NOT EDIT THIS LINE BY HAND! ####
|
||||
NAME => 'Encode::'.$name,
|
||||
VERSION_FROM => "$name.pm",
|
||||
OBJECT => '$(O_FILES)',
|
||||
'dist' => {
|
||||
COMPRESS => 'gzip -9f',
|
||||
SUFFIX => 'gz',
|
||||
DIST_DEFAULT => 'all tardist',
|
||||
},
|
||||
MAN3PODS => {},
|
||||
PREREQ_PM => {
|
||||
'Encode' => "1.41",
|
||||
},
|
||||
# OS 390 winges about line numbers > 64K ???
|
||||
XSOPT => '-nolinenumbers',
|
||||
);
|
||||
|
||||
package MY;
|
||||
|
||||
sub post_initialize
|
||||
{
|
||||
my ($self) = @_;
|
||||
my %o;
|
||||
my $x = $self->{'OBJ_EXT'};
|
||||
# Add the table O_FILES
|
||||
foreach my $e (keys %tables)
|
||||
{
|
||||
$o{$e.$x} = 1;
|
||||
}
|
||||
$o{"$name$x"} = 1;
|
||||
$self->{'O_FILES'} = [sort keys %o];
|
||||
my @files = ("$name.xs");
|
||||
$self->{'C'} = ["$name.c"];
|
||||
# The next two lines to make MacPerl Happy -- dankogai via pudge
|
||||
$self->{SOURCE} .= " $name.c"
|
||||
if $^O eq 'MacOS' && $self->{SOURCE} !~ /\b$name\.c\b/;
|
||||
# $self->{'H'} = [$self->catfile($self->updir,'encode.h')];
|
||||
my %xs;
|
||||
foreach my $table (sort keys %tables) {
|
||||
push (@{$self->{'C'}},"$table.c");
|
||||
# Do NOT add $table.h etc. to H_FILES unless we own up as to how they
|
||||
# get built.
|
||||
foreach my $ext (qw($(OBJ_EXT) .c .h .exh .fnm)) {
|
||||
push (@files,$table.$ext);
|
||||
}
|
||||
}
|
||||
$self->{'XS'} = { "$name.xs" => "$name.c" };
|
||||
$self->{'clean'}{'FILES'} .= join(' ',@files);
|
||||
open(XS,">$name.xs") || die "Cannot open $name.xs:$!";
|
||||
print XS <<'END';
|
||||
#include <EXTERN.h>
|
||||
#include <perl.h>
|
||||
#include <XSUB.h>
|
||||
#include "encode.h"
|
||||
END
|
||||
foreach my $table (sort keys %tables) {
|
||||
print XS qq[#include "${table}.h"\n];
|
||||
}
|
||||
print XS <<"END";
|
||||
|
||||
static void
|
||||
Encode_XSEncoding(pTHX_ encode_t *enc)
|
||||
{
|
||||
dSP;
|
||||
HV *stash = gv_stashpv("Encode::XS", TRUE);
|
||||
SV *iv = newSViv(PTR2IV(enc));
|
||||
SV *sv = sv_bless(newRV_noinc(iv),stash);
|
||||
int i = 0;
|
||||
/* with the SvLEN() == 0 hack, PVX won't be freed. We cast away name's
|
||||
constness, in the hope that perl won't mess with it. */
|
||||
assert(SvTYPE(iv) >= SVt_PV); assert(SvLEN(iv) == 0);
|
||||
SvFLAGS(iv) |= SVp_POK;
|
||||
SvPVX(iv) = (char*) enc->name[0];
|
||||
PUSHMARK(sp);
|
||||
XPUSHs(sv);
|
||||
while (enc->name[i])
|
||||
{
|
||||
const char *name = enc->name[i++];
|
||||
XPUSHs(sv_2mortal(newSVpvn(name,strlen(name))));
|
||||
}
|
||||
PUTBACK;
|
||||
call_pv("Encode::define_encoding",G_DISCARD);
|
||||
SvREFCNT_dec(sv);
|
||||
}
|
||||
|
||||
MODULE = Encode::$name PACKAGE = Encode::$name
|
||||
PROTOTYPES: DISABLE
|
||||
BOOT:
|
||||
{
|
||||
END
|
||||
foreach my $table (sort keys %tables) {
|
||||
print XS qq[#include "${table}.exh"\n];
|
||||
}
|
||||
print XS "}\n";
|
||||
close(XS);
|
||||
return "# Built $name.xs\n\n";
|
||||
}
|
||||
|
||||
sub postamble
|
||||
{
|
||||
my $self = shift;
|
||||
my $dir = "."; # $self->catdir('Encode');
|
||||
my $str = "# $name\$(OBJ_EXT) depends on .h and .exh files not .c files - but all written by enc2xs\n";
|
||||
$str .= "$name.c : $name.xs ";
|
||||
foreach my $table (sort keys %tables)
|
||||
{
|
||||
$str .= " $table.c";
|
||||
}
|
||||
$str .= "\n\n";
|
||||
$str .= "$name\$(OBJ_EXT) : $name.c\n\n";
|
||||
|
||||
foreach my $table (sort keys %tables)
|
||||
{
|
||||
my $numlines = 1;
|
||||
my $lengthsofar = length($str);
|
||||
my $continuator = '';
|
||||
$str .= "$table.c : Makefile.PL";
|
||||
foreach my $file (@{$tables{$table}})
|
||||
{
|
||||
$str .= $continuator.' '.$self->catfile($dir,$file);
|
||||
if ( length($str)-$lengthsofar > 128*$numlines )
|
||||
{
|
||||
$continuator .= " \\\n\t";
|
||||
$numlines++;
|
||||
} else {
|
||||
$continuator = '';
|
||||
}
|
||||
}
|
||||
my $plib = $self->{PERL_CORE} ? '"-I$(PERL_LIB)"' : '';
|
||||
my $ucopts = '-"Q"';
|
||||
$str .=
|
||||
qq{\n\t\$(PERL) $plib $enc2xs $ucopts -o \$\@ -f $table.fnm\n\n};
|
||||
open (FILELIST, ">$table.fnm")
|
||||
|| die "Could not open $table.fnm: $!";
|
||||
foreach my $file (@{$tables{$table}})
|
||||
{
|
||||
print FILELIST $self->catfile($dir,$file) . "\n";
|
||||
}
|
||||
close(FILELIST);
|
||||
}
|
||||
return $str;
|
||||
}
|
||||
|
||||
167
database/perl/lib/Encode/PerlIO.pod
Normal file
167
database/perl/lib/Encode/PerlIO.pod
Normal file
@@ -0,0 +1,167 @@
|
||||
=head1 NAME
|
||||
|
||||
Encode::PerlIO -- a detailed document on Encode and PerlIO
|
||||
|
||||
=head1 Overview
|
||||
|
||||
It is very common to want to do encoding transformations when
|
||||
reading or writing files, network connections, pipes etc.
|
||||
If Perl is configured to use the new 'perlio' IO system then
|
||||
C<Encode> provides a "layer" (see L<PerlIO>) which can transform
|
||||
data as it is read or written.
|
||||
|
||||
Here is how the blind poet would modernise the encoding:
|
||||
|
||||
use Encode;
|
||||
open(my $iliad,'<:encoding(iso-8859-7)','iliad.greek');
|
||||
open(my $utf8,'>:utf8','iliad.utf8');
|
||||
my @epic = <$iliad>;
|
||||
print $utf8 @epic;
|
||||
close($utf8);
|
||||
close($illiad);
|
||||
|
||||
In addition, the new IO system can also be configured to read/write
|
||||
UTF-8 encoded characters (as noted above, this is efficient):
|
||||
|
||||
open(my $fh,'>:utf8','anything');
|
||||
print $fh "Any \x{0021} string \N{SMILEY FACE}\n";
|
||||
|
||||
Either of the above forms of "layer" specifications can be made the default
|
||||
for a lexical scope with the C<use open ...> pragma. See L<open>.
|
||||
|
||||
Once a handle is open, its layers can be altered using C<binmode>.
|
||||
|
||||
Without any such configuration, or if Perl itself is built using the
|
||||
system's own IO, then write operations assume that the file handle
|
||||
accepts only I<bytes> and will C<die> if a character larger than 255 is
|
||||
written to the handle. When reading, each octet from the handle becomes
|
||||
a byte-in-a-character. Note that this default is the same behaviour
|
||||
as bytes-only languages (including Perl before v5.6) would have,
|
||||
and is sufficient to handle native 8-bit encodings e.g. iso-8859-1,
|
||||
EBCDIC etc. and any legacy mechanisms for handling other encodings
|
||||
and binary data.
|
||||
|
||||
In other cases, it is the program's responsibility to transform
|
||||
characters into bytes using the API above before doing writes, and to
|
||||
transform the bytes read from a handle into characters before doing
|
||||
"character operations" (e.g. C<lc>, C</\W+/>, ...).
|
||||
|
||||
You can also use PerlIO to convert larger amounts of data you don't
|
||||
want to bring into memory. For example, to convert between ISO-8859-1
|
||||
(Latin 1) and UTF-8 (or UTF-EBCDIC in EBCDIC machines):
|
||||
|
||||
open(F, "<:encoding(iso-8859-1)", "data.txt") or die $!;
|
||||
open(G, ">:utf8", "data.utf") or die $!;
|
||||
while (<F>) { print G }
|
||||
|
||||
# Could also do "print G <F>" but that would pull
|
||||
# the whole file into memory just to write it out again.
|
||||
|
||||
More examples:
|
||||
|
||||
open(my $f, "<:encoding(cp1252)")
|
||||
open(my $g, ">:encoding(iso-8859-2)")
|
||||
open(my $h, ">:encoding(latin9)") # iso-8859-15
|
||||
|
||||
See also L<encoding> for how to change the default encoding of the
|
||||
data in your script.
|
||||
|
||||
=head1 How does it work?
|
||||
|
||||
Here is a crude diagram of how filehandle, PerlIO, and Encode
|
||||
interact.
|
||||
|
||||
filehandle <-> PerlIO PerlIO <-> scalar (read/printed)
|
||||
\ /
|
||||
Encode
|
||||
|
||||
When PerlIO receives data from either direction, it fills a buffer
|
||||
(currently with 1024 bytes) and passes the buffer to Encode.
|
||||
Encode tries to convert the valid part and passes it back to PerlIO,
|
||||
leaving invalid parts (usually a partial character) in the buffer.
|
||||
PerlIO then appends more data to the buffer, calls Encode again,
|
||||
and so on until the data stream ends.
|
||||
|
||||
To do so, PerlIO always calls (de|en)code methods with CHECK set to 1.
|
||||
This ensures that the method stops at the right place when it
|
||||
encounters partial character. The following is what happens when
|
||||
PerlIO and Encode tries to encode (from utf8) more than 1024 bytes
|
||||
and the buffer boundary happens to be in the middle of a character.
|
||||
|
||||
A B C .... ~ \x{3000} ....
|
||||
41 42 43 .... 7E e3 80 80 ....
|
||||
<- buffer --------------->
|
||||
<< encoded >>>>>>>>>>
|
||||
<- next buffer ------
|
||||
|
||||
Encode converts from the beginning to \x7E, leaving \xe3 in the buffer
|
||||
because it is invalid (partial character).
|
||||
|
||||
Unfortunately, this scheme does not work well with escape-based
|
||||
encodings such as ISO-2022-JP.
|
||||
|
||||
=head1 Line Buffering
|
||||
|
||||
Now let's see what happens when you try to decode from ISO-2022-JP and
|
||||
the buffer ends in the middle of a character.
|
||||
|
||||
JIS208-ESC \x{5f3e}
|
||||
A B C .... ~ \e $ B |DAN | ....
|
||||
41 42 43 .... 7E 1b 24 41 43 46 ....
|
||||
<- buffer --------------------------->
|
||||
<< encoded >>>>>>>>>>>>>>>>>>>>>>>
|
||||
|
||||
As you see, the next buffer begins with \x43. But \x43 is 'C' in
|
||||
ASCII, which is wrong in this case because we are now in JISX 0208
|
||||
area so it has to convert \x43\x46, not \x43. Unlike utf8 and EUC,
|
||||
in escape-based encodings you can't tell if a given octet is a whole
|
||||
character or just part of it.
|
||||
|
||||
Fortunately PerlIO also supports line buffer if you tell PerlIO to use
|
||||
one instead of fixed buffer. Since ISO-2022-JP is guaranteed to revert to ASCII at the end of the line, partial
|
||||
character will never happen when line buffer is used.
|
||||
|
||||
To tell PerlIO to use line buffer, implement -E<gt>needs_lines method
|
||||
for your encoding object. See L<Encode::Encoding> for details.
|
||||
|
||||
Thanks to these efforts most encodings that come with Encode support
|
||||
PerlIO but that still leaves following encodings.
|
||||
|
||||
iso-2022-kr
|
||||
MIME-B
|
||||
MIME-Header
|
||||
MIME-Q
|
||||
|
||||
Fortunately iso-2022-kr is hardly used (according to Jungshik) and
|
||||
MIME-* are very unlikely to be fed to PerlIO because they are for mail
|
||||
headers. See L<Encode::MIME::Header> for details.
|
||||
|
||||
=head2 How can I tell whether my encoding fully supports PerlIO ?
|
||||
|
||||
As of this writing, any encoding whose class belongs to Encode::XS and
|
||||
Encode::Unicode works. The Encode module has a C<perlio_ok> method
|
||||
which you can use before applying PerlIO encoding to the filehandle.
|
||||
Here is an example:
|
||||
|
||||
my $use_perlio = perlio_ok($enc);
|
||||
my $layer = $use_perlio ? "<:raw" : "<:encoding($enc)";
|
||||
open my $fh, $layer, $file or die "$file : $!";
|
||||
while(<$fh>){
|
||||
$_ = decode($enc, $_) unless $use_perlio;
|
||||
# ....
|
||||
}
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode::Encoding>,
|
||||
L<Encode::Supported>,
|
||||
L<Encode::PerlIO>,
|
||||
L<encoding>,
|
||||
L<perlebcdic>,
|
||||
L<perlfunc/open>,
|
||||
L<perlunicode>,
|
||||
L<utf8>,
|
||||
the Perl Unicode Mailing List E<lt>perl-unicode@perl.orgE<gt>
|
||||
|
||||
=cut
|
||||
|
||||
31
database/perl/lib/Encode/README.e2x
Normal file
31
database/perl/lib/Encode/README.e2x
Normal file
@@ -0,0 +1,31 @@
|
||||
Encode::$_Name_ version 0.1
|
||||
========
|
||||
|
||||
NAME
|
||||
Encode::$_Name_ - <describe encoding>
|
||||
|
||||
SYNOPSIS
|
||||
use Encode::$_Name_;
|
||||
#<put more words here>
|
||||
ABSTRACT
|
||||
<fill this in>
|
||||
INSTALLATION
|
||||
|
||||
To install this module type the following:
|
||||
|
||||
perl Makefile.PL
|
||||
make
|
||||
make test
|
||||
make install
|
||||
|
||||
DEPENDENCIES
|
||||
|
||||
This module requires perl version 5.7.3 or later.
|
||||
|
||||
COPYRIGHT AND LICENCE
|
||||
|
||||
Copyright (C) 2002 Your Name <your@address.domain>
|
||||
|
||||
This library is free software; you can redistribute it and/or modify
|
||||
it under the same terms as Perl itself.
|
||||
|
||||
901
database/perl/lib/Encode/Supported.pod
Normal file
901
database/perl/lib/Encode/Supported.pod
Normal file
@@ -0,0 +1,901 @@
|
||||
=head1 NAME
|
||||
|
||||
Encode::Supported -- Encodings supported by Encode
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
=head2 Encoding Names
|
||||
|
||||
Encoding names are case insensitive. White space in names
|
||||
is ignored. In addition, an encoding may have aliases.
|
||||
Each encoding has one "canonical" name. The "canonical"
|
||||
name is chosen from the names of the encoding by picking
|
||||
the first in the following sequence (with a few exceptions).
|
||||
|
||||
=over 2
|
||||
|
||||
=item *
|
||||
|
||||
The name used by the Perl community. That includes 'utf8' and 'ascii'.
|
||||
Unlike aliases, canonical names directly reach the method so such
|
||||
frequently used words like 'utf8' don't need to do alias lookups.
|
||||
|
||||
=item *
|
||||
|
||||
The MIME name as defined in IETF RFCs. This includes all "iso-"s.
|
||||
|
||||
=item *
|
||||
|
||||
The name in the IANA registry.
|
||||
|
||||
=item *
|
||||
|
||||
The name used by the organization that defined it.
|
||||
|
||||
=back
|
||||
|
||||
In case I<de jure> canonical names differ from that of the Encode
|
||||
module, they are always aliased if it ever be implemented. So you can
|
||||
safely tell if a given encoding is implemented or not just by passing
|
||||
the canonical name.
|
||||
|
||||
Because of all the alias issues, and because in the general case
|
||||
encodings have state, "Encode" uses an encoding object internally
|
||||
once an operation is in progress.
|
||||
|
||||
=head1 Supported Encodings
|
||||
|
||||
As of Perl 5.8.0, at least the following encodings are recognized.
|
||||
Note that unless otherwise specified, they are all case insensitive
|
||||
(via alias) and all occurrence of spaces are replaced with '-'.
|
||||
In other words, "ISO 8859 1" and "iso-8859-1" are identical.
|
||||
|
||||
Encodings are categorized and implemented in several different modules
|
||||
but you don't have to C<use Encode::XX> to make them available for
|
||||
most cases. Encode.pm will automatically load those modules on demand.
|
||||
|
||||
=head2 Built-in Encodings
|
||||
|
||||
The following encodings are always available.
|
||||
|
||||
Canonical Aliases Comments & References
|
||||
----------------------------------------------------------------
|
||||
ascii US-ascii ISO-646-US [ECMA]
|
||||
ascii-ctrl Special Encoding
|
||||
iso-8859-1 latin1 [ISO]
|
||||
null Special Encoding
|
||||
utf8 UTF-8 [RFC2279]
|
||||
----------------------------------------------------------------
|
||||
|
||||
I<null> and I<ascii-ctrl> are special. "null" fails for all character
|
||||
so when you set fallback mode to PERLQQ, HTMLCREF or XMLCREF, ALL
|
||||
CHARACTERS will fall back to character references. Ditto for
|
||||
"ascii-ctrl" except for control characters. For fallback modes, see
|
||||
L<Encode>.
|
||||
|
||||
=head2 Encode::Unicode -- other Unicode encodings
|
||||
|
||||
Unicode coding schemes other than native utf8 are supported by
|
||||
Encode::Unicode, which will be autoloaded on demand.
|
||||
|
||||
----------------------------------------------------------------
|
||||
UCS-2BE UCS-2, iso-10646-1 [IANA, UC]
|
||||
UCS-2LE [UC]
|
||||
UTF-16 [UC]
|
||||
UTF-16BE [UC]
|
||||
UTF-16LE [UC]
|
||||
UTF-32 [UC]
|
||||
UTF-32BE UCS-4 [UC]
|
||||
UTF-32LE [UC]
|
||||
UTF-7 [RFC2152]
|
||||
----------------------------------------------------------------
|
||||
|
||||
To find how (UCS-2|UTF-(16|32))(LE|BE)? differ from one another,
|
||||
see L<Encode::Unicode>.
|
||||
|
||||
UTF-7 is a special encoding which "re-encodes" UTF-16BE into a 7-bit
|
||||
encoding. It is implemented separately by Encode::Unicode::UTF7.
|
||||
|
||||
=head2 Encode::Byte -- Extended ASCII
|
||||
|
||||
Encode::Byte implements most single-byte encodings except for
|
||||
Symbols and EBCDIC. The following encodings are based on single-byte
|
||||
encodings implemented as extended ASCII. Most of them map
|
||||
\x80-\xff (upper half) to non-ASCII characters.
|
||||
|
||||
=over 2
|
||||
|
||||
=item ISO-8859 and corresponding vendor mappings
|
||||
|
||||
Since there are so many, they are presented in table format with
|
||||
languages and corresponding encoding names by vendors. Note that
|
||||
the table is sorted in order of ISO-8859 and the corresponding vendor
|
||||
mappings are slightly different from that of ISO. See
|
||||
L<http://czyborra.com/charsets/iso8859.html> for details.
|
||||
|
||||
Lang/Regions ISO/Other Std. DOS Windows Macintosh Others
|
||||
----------------------------------------------------------------
|
||||
N. America (ASCII) cp437 AdobeStandardEncoding
|
||||
cp863 (DOSCanadaF)
|
||||
W. Europe iso-8859-1 cp850 cp1252 MacRoman nextstep
|
||||
hp-roman8
|
||||
cp860 (DOSPortuguese)
|
||||
Cntrl. Europe iso-8859-2 cp852 cp1250 MacCentralEurRoman
|
||||
MacCroatian
|
||||
MacRomanian
|
||||
MacRumanian
|
||||
Latin3[1] iso-8859-3
|
||||
Latin4[2] iso-8859-4
|
||||
Cyrillics iso-8859-5 cp855 cp1251 MacCyrillic
|
||||
(See also next section) cp866 MacUkrainian
|
||||
Arabic iso-8859-6 cp864 cp1256 MacArabic
|
||||
cp1006 MacFarsi
|
||||
Greek iso-8859-7 cp737 cp1253 MacGreek
|
||||
cp869 (DOSGreek2)
|
||||
Hebrew iso-8859-8 cp862 cp1255 MacHebrew
|
||||
Turkish iso-8859-9 cp857 cp1254 MacTurkish
|
||||
Nordics iso-8859-10 cp865
|
||||
cp861 MacIcelandic
|
||||
MacSami
|
||||
Thai iso-8859-11[3] cp874 MacThai
|
||||
(iso-8859-12 is nonexistent. Reserved for Indics?)
|
||||
Baltics iso-8859-13 cp775 cp1257
|
||||
Celtics iso-8859-14
|
||||
Latin9 [4] iso-8859-15
|
||||
Latin10 iso-8859-16
|
||||
Vietnamese viscii cp1258 MacVietnamese
|
||||
----------------------------------------------------------------
|
||||
|
||||
[1] Esperanto, Maltese, and Turkish. Turkish is now on 8859-9.
|
||||
[2] Baltics. Now on 8859-10, except for Latvian.
|
||||
[3] TIS 620 + Non-Breaking Space (0xA0 / U+00A0)
|
||||
[4] Nicknamed Latin0; the Euro sign as well as French and Finnish
|
||||
letters that are missing from 8859-1 were added.
|
||||
|
||||
All cp* are also available as ibm-*, ms-*, and windows-* . See also
|
||||
L<http://czyborra.com/charsets/codepages.html>.
|
||||
|
||||
Macintosh encodings don't seem to be registered in such entities as
|
||||
IANA. "Canonical" names in Encode are based upon Apple's Tech Note
|
||||
1150. See L<http://developer.apple.com/technotes/tn/tn1150.html>
|
||||
for details.
|
||||
|
||||
=item KOI8 - De Facto Standard for the Cyrillic world
|
||||
|
||||
Though ISO-8859 does have ISO-8859-5, the KOI8 series is far more
|
||||
popular in the Net. L<Encode> comes with the following KOI charsets.
|
||||
For gory details, see L<http://czyborra.com/charsets/cyrillic.html>
|
||||
|
||||
----------------------------------------------------------------
|
||||
koi8-f
|
||||
koi8-r cp878 [RFC1489]
|
||||
koi8-u [RFC2319]
|
||||
----------------------------------------------------------------
|
||||
|
||||
=back
|
||||
|
||||
=head2 gsm0338 - Hentai Latin 1
|
||||
|
||||
GSM0338 is for GSM handsets. Though it shares alphanumerals with
|
||||
ASCII, control character ranges and other parts are mapped very
|
||||
differently, mainly to store Greek characters. There are also escape
|
||||
sequences (starting with 0x1B) to cover e.g. the Euro sign.
|
||||
|
||||
This was once handled by L<Encode::Bytes> but because of all those
|
||||
unusual specifications, Encode 2.20 has relocated the support to
|
||||
L<Encode::GSM0338>. See L<Encode::GSM0338> for details.
|
||||
|
||||
=over 2
|
||||
|
||||
=item gsm0338 support before 2.19
|
||||
|
||||
Some special cases like a trailing 0x00 byte or a lone 0x1B byte are not
|
||||
well-defined and decode() will return an empty string for them.
|
||||
One possible workaround is
|
||||
|
||||
$gsm =~ s/\x00\z/\x00\x00/;
|
||||
$uni = decode("gsm0338", $gsm);
|
||||
$uni .= "\xA0" if $gsm =~ /\x1B\z/;
|
||||
|
||||
Note that the Encode implementation of GSM0338 does not implement the
|
||||
reuse of Latin capital letters as Greek capital letters (for example,
|
||||
the 0x5A is U+005A (LATIN CAPITAL LETTER Z), not U+0396 (GREEK CAPITAL
|
||||
LETTER ZETA).
|
||||
|
||||
The GSM0338 is also covered in Encode::Byte even though it is not
|
||||
an "extended ASCII" encoding.
|
||||
|
||||
=back
|
||||
|
||||
=head2 CJK: Chinese, Japanese, Korean (Multibyte)
|
||||
|
||||
Note that Vietnamese is listed above. Also read "Encoding vs Charset"
|
||||
below. Also note that these are implemented in distinct modules by
|
||||
countries, due to the size concerns (simplified Chinese is mapped
|
||||
to 'CN', continental China, while traditional Chinese is mapped to
|
||||
'TW', Taiwan). Please refer to their respective documentation pages.
|
||||
|
||||
=over 2
|
||||
|
||||
=item Encode::CN -- Continental China
|
||||
|
||||
Standard DOS/Win Macintosh Comment/Reference
|
||||
----------------------------------------------------------------
|
||||
euc-cn [1] MacChineseSimp
|
||||
(gbk) cp936 [2]
|
||||
gb12345-raw { GB12345 without CES }
|
||||
gb2312-raw { GB2312 without CES }
|
||||
hz
|
||||
iso-ir-165
|
||||
----------------------------------------------------------------
|
||||
|
||||
[1] GB2312 is aliased to this. See L<Microsoft-related naming mess>
|
||||
[2] gbk is aliased to this. See L<Microsoft-related naming mess>
|
||||
|
||||
=item Encode::JP -- Japan
|
||||
|
||||
Standard DOS/Win Macintosh Comment/Reference
|
||||
----------------------------------------------------------------
|
||||
euc-jp
|
||||
shiftjis cp932 macJapanese
|
||||
7bit-jis
|
||||
iso-2022-jp [RFC1468]
|
||||
iso-2022-jp-1 [RFC2237]
|
||||
jis0201-raw { JIS X 0201 (roman + halfwidth kana) without CES }
|
||||
jis0208-raw { JIS X 0208 (Kanji + fullwidth kana) without CES }
|
||||
jis0212-raw { JIS X 0212 (Extended Kanji) without CES }
|
||||
----------------------------------------------------------------
|
||||
|
||||
=item Encode::KR -- Korea
|
||||
|
||||
Standard DOS/Win Macintosh Comment/Reference
|
||||
----------------------------------------------------------------
|
||||
euc-kr MacKorean [RFC1557]
|
||||
cp949 [1]
|
||||
iso-2022-kr [RFC1557]
|
||||
johab [KS X 1001:1998, Annex 3]
|
||||
ksc5601-raw { KSC5601 without CES }
|
||||
----------------------------------------------------------------
|
||||
|
||||
[1] ks_c_5601-1987, (x-)?windows-949, and uhc are aliased to this.
|
||||
See below.
|
||||
|
||||
=item Encode::TW -- Taiwan
|
||||
|
||||
Standard DOS/Win Macintosh Comment/Reference
|
||||
----------------------------------------------------------------
|
||||
big5-eten cp950 MacChineseTrad {big5 aliased to big5-eten}
|
||||
big5-hkscs
|
||||
----------------------------------------------------------------
|
||||
|
||||
=item Encode::HanExtra -- More Chinese via CPAN
|
||||
|
||||
Due to the size concerns, additional Chinese encodings below are
|
||||
distributed separately on CPAN, under the name Encode::HanExtra.
|
||||
|
||||
Standard DOS/Win Macintosh Comment/Reference
|
||||
----------------------------------------------------------------
|
||||
big5ext CMEX's Big5e Extension
|
||||
big5plus CMEX's Big5+ Extension
|
||||
cccii Chinese Character Code for Information Interchange
|
||||
euc-tw EUC (Extended Unix Character)
|
||||
gb18030 GBK with Traditional Characters
|
||||
----------------------------------------------------------------
|
||||
|
||||
=item Encode::JIS2K -- JIS X 0213 encodings via CPAN
|
||||
|
||||
Due to size concerns, additional Japanese encodings below are
|
||||
distributed separately on CPAN, under the name Encode::JIS2K.
|
||||
|
||||
Standard DOS/Win Macintosh Comment/Reference
|
||||
----------------------------------------------------------------
|
||||
euc-jisx0213
|
||||
shiftjisx0123
|
||||
iso-2022-jp-3
|
||||
jis0213-1-raw
|
||||
jis0213-2-raw
|
||||
----------------------------------------------------------------
|
||||
|
||||
=back
|
||||
|
||||
=head2 Miscellaneous encodings
|
||||
|
||||
=over 2
|
||||
|
||||
=item Encode::EBCDIC
|
||||
|
||||
See L<perlebcdic> for details.
|
||||
|
||||
----------------------------------------------------------------
|
||||
cp37
|
||||
cp500
|
||||
cp875
|
||||
cp1026
|
||||
cp1047
|
||||
posix-bc
|
||||
----------------------------------------------------------------
|
||||
|
||||
=item Encode::Symbols
|
||||
|
||||
For symbols and dingbats.
|
||||
|
||||
----------------------------------------------------------------
|
||||
symbol
|
||||
dingbats
|
||||
MacDingbats
|
||||
AdobeZdingbat
|
||||
AdobeSymbol
|
||||
----------------------------------------------------------------
|
||||
|
||||
=item Encode::MIME::Header
|
||||
|
||||
Strictly speaking, MIME header encoding documented in RFC 2047 is more
|
||||
of encapsulation than encoding. However, their support in modern
|
||||
world is imperative so they are supported.
|
||||
|
||||
----------------------------------------------------------------
|
||||
MIME-Header [RFC2047]
|
||||
MIME-B [RFC2047]
|
||||
MIME-Q [RFC2047]
|
||||
----------------------------------------------------------------
|
||||
|
||||
=item Encode::Guess
|
||||
|
||||
This one is not a name of encoding but a utility that lets you pick up
|
||||
the most appropriate encoding for a data out of given I<suspects>. See
|
||||
L<Encode::Guess> for details.
|
||||
|
||||
=back
|
||||
|
||||
=head1 Unsupported encodings
|
||||
|
||||
The following encodings are not supported as yet; some because they
|
||||
are rarely used, some because of technical difficulties. They may
|
||||
be supported by external modules via CPAN in the future, however.
|
||||
|
||||
=over 2
|
||||
|
||||
=item ISO-2022-JP-2 [RFC1554]
|
||||
|
||||
Not very popular yet. Needs Unicode Database or equivalent to
|
||||
implement encode() (because it includes JIS X 0208/0212, KSC5601, and
|
||||
GB2312 simultaneously, whose code points in Unicode overlap. So you
|
||||
need to lookup the database to determine to what character set a given
|
||||
Unicode character should belong).
|
||||
|
||||
=item ISO-2022-CN [RFC1922]
|
||||
|
||||
Not very popular. Needs CNS 11643-1 and -2 which are not available in
|
||||
this module. CNS 11643 is supported (via euc-tw) in Encode::HanExtra.
|
||||
Audrey Tang may add support for this encoding in her module in future.
|
||||
|
||||
=item Various HP-UX encodings
|
||||
|
||||
The following are unsupported due to the lack of mapping data.
|
||||
|
||||
'8' - arabic8, greek8, hebrew8, kana8, thai8, and turkish8
|
||||
'15' - japanese15, korean15, and roi15
|
||||
|
||||
=item Cyrillic encoding ISO-IR-111
|
||||
|
||||
Anton Tagunov doubts its usefulness.
|
||||
|
||||
=item ISO-8859-8-1 [Hebrew]
|
||||
|
||||
None of the Encode team knows Hebrew enough (ISO-8859-8, cp1255 and
|
||||
MacHebrew are supported because and just because there were mappings
|
||||
available at L<http://www.unicode.org/>). Contributions welcome.
|
||||
|
||||
=item ISIRI 3342, Iran System, ISIRI 2900 [Farsi]
|
||||
|
||||
Ditto.
|
||||
|
||||
=item Thai encoding TCVN
|
||||
|
||||
Ditto.
|
||||
|
||||
=item Vietnamese encodings VPS
|
||||
|
||||
Though Jungshik Shin has reported that Mozilla supports this encoding,
|
||||
it was too late before 5.8.0 for us to add it. In the future, it
|
||||
may be available via a separate module. See
|
||||
L<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf>
|
||||
and
|
||||
L<http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut>
|
||||
if you are interested in helping us.
|
||||
|
||||
=item Various Mac encodings
|
||||
|
||||
The following are unsupported due to the lack of mapping data.
|
||||
|
||||
MacArmenian, MacBengali, MacBurmese, MacEthiopic
|
||||
MacExtArabic, MacGeorgian, MacKannada, MacKhmer
|
||||
MacLaotian, MacMalayalam, MacMongolian, MacOriya
|
||||
MacSinhalese, MacTamil, MacTelugu, MacTibetan
|
||||
MacVietnamese
|
||||
|
||||
The rest which are already available are based upon the vendor mappings
|
||||
at L<http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/> .
|
||||
|
||||
=item (Mac) Indic encodings
|
||||
|
||||
The maps for the following are available at L<http://www.unicode.org/>
|
||||
but remain unsupported because those encodings need an algorithmical
|
||||
approach, currently unsupported by F<enc2xs>:
|
||||
|
||||
MacDevanagari
|
||||
MacGurmukhi
|
||||
MacGujarati
|
||||
|
||||
For details, please see C<Unicode mapping issues and notes:> at
|
||||
L<http://www.unicode.org/Public/MAPPINGS/VENDORS/APPLE/DEVANAGA.TXT> .
|
||||
|
||||
I believe this issue is prevalent not only for Mac Indics but also in
|
||||
other Indic encodings, but the above were the only Indic encodings
|
||||
maps that I could find at L<http://www.unicode.org/> .
|
||||
|
||||
=back
|
||||
|
||||
=head1 Encoding vs. Charset -- terminology
|
||||
|
||||
We are used to using the term (character) I<encoding> and I<character
|
||||
set> interchangeably. But just as confusing the terms byte and
|
||||
character is dangerous and the terms should be differentiated when
|
||||
needed, we need to differentiate I<encoding> and I<character set>.
|
||||
|
||||
To understand that, here is a description of how we make computers
|
||||
grok our characters.
|
||||
|
||||
=over 2
|
||||
|
||||
=item *
|
||||
|
||||
First we start with which characters to include. We call this
|
||||
collection of characters I<character repertoire>.
|
||||
|
||||
=item *
|
||||
|
||||
Then we have to give each character a unique ID so your computer can
|
||||
tell the difference between 'a' and 'A'. This itemized character
|
||||
repertoire is now a I<character set>.
|
||||
|
||||
=item *
|
||||
|
||||
If your computer can grow the character set without further
|
||||
processing, you can go ahead and use it. This is called a I<coded
|
||||
character set> (CCS) or I<raw character encoding>. ASCII is used this
|
||||
way for most cases.
|
||||
|
||||
=item *
|
||||
|
||||
But in many cases, especially multi-byte CJK encodings, you have to
|
||||
tweak a little more. Your network connection may not accept any data
|
||||
with the Most Significant Bit set, and your computer may not be able to
|
||||
tell if a given byte is a whole character or just half of it. So you
|
||||
have to I<encode> the character set to use it.
|
||||
|
||||
A I<character encoding scheme> (CES) determines how to encode a given
|
||||
character set, or a set of multiple character sets. 7bit ISO-2022 is
|
||||
an example of a CES. You switch between character sets via I<escape
|
||||
sequences>.
|
||||
|
||||
=back
|
||||
|
||||
Technically, or mathematically, speaking, a character set encoded in
|
||||
such a CES that maps character by character may form a CCS. EUC is such
|
||||
an example. The CES of EUC is as follows:
|
||||
|
||||
=over 2
|
||||
|
||||
=item *
|
||||
|
||||
Map ASCII unchanged.
|
||||
|
||||
=item *
|
||||
|
||||
Map such a character set that consists of 94 or 96 powered by N
|
||||
members by adding 0x80 to each byte.
|
||||
|
||||
=item *
|
||||
|
||||
You can also use 0x8e and 0x8f to indicate that the following sequence of
|
||||
characters belongs to yet another character set. To each following byte
|
||||
is added the value 0x80.
|
||||
|
||||
=back
|
||||
|
||||
By carefully looking at the encoded byte sequence, you can find that the
|
||||
byte sequence conforms a unique number. In that sense, EUC is a CCS
|
||||
generated by a CES above from up to four CCS (complicated?). UTF-8
|
||||
falls into this category. See L<perlUnicode/"UTF-8"> to find out how
|
||||
UTF-8 maps Unicode to a byte sequence.
|
||||
|
||||
You may also have found out by now why 7bit ISO-2022 cannot comprise
|
||||
a CCS. If you look at a byte sequence \x21\x21, you can't tell if
|
||||
it is two !'s or IDEOGRAPHIC SPACE. EUC maps the latter to \xA1\xA1
|
||||
so you have no trouble differentiating between "!!". and S<" ">.
|
||||
|
||||
=head1 Encoding Classification (by Anton Tagunov and Dan Kogai)
|
||||
|
||||
This section tries to classify the supported encodings by their
|
||||
applicability for information exchange over the Internet and to
|
||||
choose the most suitable aliases to name them in the context of
|
||||
such communication.
|
||||
|
||||
=over 2
|
||||
|
||||
=item *
|
||||
|
||||
To (en|de)code encodings marked by C<(**)>, you need
|
||||
C<Encode::HanExtra>, available from CPAN.
|
||||
|
||||
=back
|
||||
|
||||
Encoding names
|
||||
|
||||
US-ASCII UTF-8 ISO-8859-* KOI8-R
|
||||
Shift_JIS EUC-JP ISO-2022-JP ISO-2022-JP-1
|
||||
EUC-KR Big5 GB2312
|
||||
|
||||
are registered with IANA as preferred MIME names and may
|
||||
be used over the Internet.
|
||||
|
||||
C<Shift_JIS> has been officialized by JIS X 0208:1997.
|
||||
L<Microsoft-related naming mess> gives details.
|
||||
|
||||
C<GB2312> is the IANA name for C<EUC-CN>.
|
||||
See L<Microsoft-related naming mess> for details.
|
||||
|
||||
C<GB_2312-80> I<raw> encoding is available as C<gb2312-raw>
|
||||
with Encode. See L<Encode::CN> for details.
|
||||
|
||||
EUC-CN
|
||||
KOI8-U [RFC2319]
|
||||
|
||||
have not been registered with IANA (as of March 2002) but
|
||||
seem to be supported by major web browsers.
|
||||
The IANA name for C<EUC-CN> is C<GB2312>.
|
||||
|
||||
KS_C_5601-1987
|
||||
|
||||
is heavily misused.
|
||||
See L<Microsoft-related naming mess> for details.
|
||||
|
||||
C<KS_C_5601-1987> I<raw> encoding is available as C<kcs5601-raw>
|
||||
with Encode. See L<Encode::KR> for details.
|
||||
|
||||
UTF-16 UTF-16BE UTF-16LE
|
||||
|
||||
are IANA-registered C<charset>s. See [RFC 2781] for details.
|
||||
Jungshik Shin reports that UTF-16 with a BOM is well accepted
|
||||
by MS IE 5/6 and NS 4/6. Beware however that
|
||||
|
||||
=over 2
|
||||
|
||||
=item *
|
||||
|
||||
C<UTF-16> support in any software you're going to be
|
||||
using/interoperating with has probably been less tested
|
||||
then C<UTF-8> support
|
||||
|
||||
=item *
|
||||
|
||||
C<UTF-8> coded data seamlessly passes traditional
|
||||
command piping (C<cat>, C<more>, etc.) while C<UTF-16> coded
|
||||
data is likely to cause confusion (with its zero bytes,
|
||||
for example)
|
||||
|
||||
=item *
|
||||
|
||||
it is beyond the power of words to describe the way HTML browsers
|
||||
encode non-C<ASCII> form data. To get a general impression, visit
|
||||
L<http://www.alanflavell.org.uk/charset/form-i18n.html>.
|
||||
While encoding of form data has stabilized for C<UTF-8> encoded pages
|
||||
(at least IE 5/6, NS 6, and Opera 6 behave consistently), be sure to
|
||||
expect fun (and cross-browser discrepancies) with C<UTF-16> encoded
|
||||
pages!
|
||||
|
||||
=back
|
||||
|
||||
The rule of thumb is to use C<UTF-8> unless you know what
|
||||
you're doing and unless you really benefit from using C<UTF-16>.
|
||||
|
||||
ISO-IR-165 [RFC1345]
|
||||
VISCII
|
||||
GB 12345
|
||||
GB 18030 (**) (see links below)
|
||||
EUC-TW (**)
|
||||
|
||||
are totally valid encodings but not registered at IANA.
|
||||
The names under which they are listed here are probably the
|
||||
most widely-known names for these encodings and are recommended
|
||||
names.
|
||||
|
||||
BIG5PLUS (**)
|
||||
|
||||
is a proprietary name.
|
||||
|
||||
=head2 Microsoft-related naming mess
|
||||
|
||||
Microsoft products misuse the following names:
|
||||
|
||||
=over 2
|
||||
|
||||
=item KS_C_5601-1987
|
||||
|
||||
Microsoft extension to C<EUC-KR>.
|
||||
|
||||
Proper names: C<CP949>, C<UHC>, C<x-windows-949> (as used by Mozilla).
|
||||
|
||||
See L<http://lists.w3.org/Archives/Public/ietf-charsets/2001AprJun/0033.html>
|
||||
for details.
|
||||
|
||||
Encode aliases C<KS_C_5601-1987> to C<cp949> to reflect this common
|
||||
misusage. I<Raw> C<KS_C_5601-1987> encoding is available as
|
||||
C<kcs5601-raw>.
|
||||
|
||||
See L<Encode::KR> for details.
|
||||
|
||||
=item GB2312
|
||||
|
||||
Microsoft extension to C<EUC-CN>.
|
||||
|
||||
Proper names: C<CP936>, C<GBK>.
|
||||
|
||||
C<GB2312> has been registered in the C<EUC-CN> meaning at
|
||||
IANA. This has partially repaired the situation: Microsoft's
|
||||
C<GB2312> has become a superset of the official C<GB2312>.
|
||||
|
||||
Encode aliases C<GB2312> to C<euc-cn> in full agreement with
|
||||
IANA registration. C<cp936> is supported separately.
|
||||
I<Raw> C<GB_2312-80> encoding is available as C<gb2312-raw>.
|
||||
|
||||
See L<Encode::CN> for details.
|
||||
|
||||
=item Big5
|
||||
|
||||
Microsoft extension to C<Big5>.
|
||||
|
||||
Proper name: C<CP950>.
|
||||
|
||||
Encode separately supports C<Big5> and C<cp950>.
|
||||
|
||||
=item Shift_JIS
|
||||
|
||||
Microsoft's understanding of C<Shift_JIS>.
|
||||
|
||||
JIS has not endorsed the full Microsoft standard however.
|
||||
The official C<Shift_JIS> includes only JIS X 0201 and JIS X 0208
|
||||
character sets, while Microsoft has always used C<Shift_JIS>
|
||||
to encode a wider character repertoire. See C<IANA> registration for
|
||||
C<Windows-31J>.
|
||||
|
||||
As a historical predecessor, Microsoft's variant
|
||||
probably has more rights for the name, though it may be objected
|
||||
that Microsoft shouldn't have used JIS as part of the name
|
||||
in the first place.
|
||||
|
||||
Unambiguous name: C<CP932>. C<IANA> name (also used by Mozilla, and
|
||||
provided as an alias by Encode): C<Windows-31J>.
|
||||
|
||||
Encode separately supports C<Shift_JIS> and C<cp932>.
|
||||
|
||||
=back
|
||||
|
||||
=head1 Glossary
|
||||
|
||||
=over 2
|
||||
|
||||
=item character repertoire
|
||||
|
||||
A collection of unique characters. A I<character> set in the strictest
|
||||
sense. At this stage, characters are not numbered.
|
||||
|
||||
=item coded character set (CCS)
|
||||
|
||||
A character set that is mapped in a way computers can use directly.
|
||||
Many character encodings, including EUC, fall in this category.
|
||||
|
||||
=item character encoding scheme (CES)
|
||||
|
||||
An algorithm to map a character set to a byte sequence. You don't
|
||||
have to be able to tell which character set a given byte sequence
|
||||
belongs. 7-bit ISO-2022 is a CES but it cannot be a CCS. EUC is an
|
||||
example of being both a CCS and CES.
|
||||
|
||||
=item charset (in MIME context)
|
||||
|
||||
has long been used in the meaning of C<encoding>, CES.
|
||||
|
||||
While the word combination C<character set> has lost this meaning
|
||||
in MIME context since [RFC 2130], the C<charset> abbreviation has
|
||||
retained it. This is how [RFC 2277] and [RFC 2278] bless C<charset>:
|
||||
|
||||
This document uses the term "charset" to mean a set of rules for
|
||||
mapping from a sequence of octets to a sequence of characters, such
|
||||
as the combination of a coded character set and a character encoding
|
||||
scheme; this is also what is used as an identifier in MIME "charset="
|
||||
parameters, and registered in the IANA charset registry ... (Note
|
||||
that this is NOT a term used by other standards bodies, such as ISO).
|
||||
[RFC 2277]
|
||||
|
||||
=item EUC
|
||||
|
||||
Extended Unix Character. See ISO-2022.
|
||||
|
||||
=item ISO-2022
|
||||
|
||||
A CES that was carefully designed to coexist with ASCII. There are a 7
|
||||
bit version and an 8 bit version.
|
||||
|
||||
The 7 bit version switches character set via escape sequence so it
|
||||
cannot form a CCS. Since this is more difficult to handle in programs
|
||||
than the 8 bit version, the 7 bit version is not very popular except for
|
||||
iso-2022-jp, the I<de facto> standard CES for e-mails.
|
||||
|
||||
The 8 bit version can form a CCS. EUC and ISO-8859 are two examples
|
||||
thereof. Pre-5.6 perl could use them as string literals.
|
||||
|
||||
=item UCS
|
||||
|
||||
Short for I<Universal Character Set>. When you say just UCS, it means
|
||||
I<Unicode>.
|
||||
|
||||
=item UCS-2
|
||||
|
||||
ISO/IEC 10646 encoding form: Universal Character Set coded in two
|
||||
octets.
|
||||
|
||||
=item Unicode
|
||||
|
||||
A character set that aims to include all character repertoires of the
|
||||
world. Many character sets in various national as well as industrial
|
||||
standards have become, in a way, just subsets of Unicode.
|
||||
|
||||
=item UTF
|
||||
|
||||
Short for I<Unicode Transformation Format>. Determines how to map a
|
||||
Unicode character into a byte sequence.
|
||||
|
||||
=item UTF-16
|
||||
|
||||
A UTF in 16-bit encoding. Can either be in big endian or little
|
||||
endian. The big endian version is called UTF-16BE (equal to UCS-2 +
|
||||
surrogate support) and the little endian version is called UTF-16LE.
|
||||
|
||||
=back
|
||||
|
||||
=head1 See Also
|
||||
|
||||
L<Encode>,
|
||||
L<Encode::Byte>,
|
||||
L<Encode::CN>, L<Encode::JP>, L<Encode::KR>, L<Encode::TW>,
|
||||
L<Encode::EBCDIC>, L<Encode::Symbol>
|
||||
L<Encode::MIME::Header>, L<Encode::Guess>
|
||||
|
||||
=head1 References
|
||||
|
||||
=over 2
|
||||
|
||||
=item ECMA
|
||||
|
||||
European Computer Manufacturers Association
|
||||
L<http://www.ecma.ch>
|
||||
|
||||
=over 2
|
||||
|
||||
=item ECMA-035 (eq C<ISO-2022>)
|
||||
|
||||
L<http://www.ecma.ch/ecma1/STAND/ECMA-035.HTM>
|
||||
|
||||
The specification of ISO-2022 is available from the link above.
|
||||
|
||||
=back
|
||||
|
||||
=item IANA
|
||||
|
||||
Internet Assigned Numbers Authority
|
||||
L<http://www.iana.org/>
|
||||
|
||||
=over 2
|
||||
|
||||
=item Assigned Charset Names by IANA
|
||||
|
||||
L<http://www.iana.org/assignments/character-sets>
|
||||
|
||||
Most of the C<canonical names> in Encode derive from this list
|
||||
so you can directly apply the string you have extracted from MIME
|
||||
header of mails and web pages.
|
||||
|
||||
=back
|
||||
|
||||
=item ISO
|
||||
|
||||
International Organization for Standardization
|
||||
L<http://www.iso.ch/>
|
||||
|
||||
=item RFC
|
||||
|
||||
Request For Comments -- need I say more?
|
||||
L<http://www.rfc-editor.org/>, L<http://www.ietf.org/rfc.html>,
|
||||
L<http://www.faqs.org/rfcs/>
|
||||
|
||||
=item UC
|
||||
|
||||
Unicode Consortium
|
||||
L<http://www.unicode.org/>
|
||||
|
||||
=over 2
|
||||
|
||||
=item Unicode Glossary
|
||||
|
||||
L<http://www.unicode.org/glossary/>
|
||||
|
||||
The glossary of this document is based upon this site.
|
||||
|
||||
=back
|
||||
|
||||
=back
|
||||
|
||||
=head2 Other Notable Sites
|
||||
|
||||
=over 2
|
||||
|
||||
=item czyborra.com
|
||||
|
||||
L<http://czyborra.com/>
|
||||
|
||||
Contains a lot of useful information, especially gory details of ISO
|
||||
vs. vendor mappings.
|
||||
|
||||
=item CJK.inf
|
||||
|
||||
L<http://examples.oreilly.com/cjkvinfo/doc/cjk.inf>
|
||||
|
||||
Somewhat obsolete (last update in 1996), but still useful. Also try
|
||||
|
||||
L<ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf>
|
||||
|
||||
You will find brief info on C<EUC-CN>, C<GBK> and mostly on C<GB 18030>.
|
||||
|
||||
=item Jungshik Shin's Hangul FAQ
|
||||
|
||||
L<http://jshin.net/faq>
|
||||
|
||||
And especially its subject 8.
|
||||
|
||||
L<http://jshin.net/faq/qa8.html>
|
||||
|
||||
A comprehensive overview of the Korean (C<KS *>) standards.
|
||||
|
||||
=item debian.org: "Introduction to i18n"
|
||||
|
||||
A brief description for most of the mentioned CJK encodings is
|
||||
contained in
|
||||
L<http://www.debian.org/doc/manuals/intro-i18n/ch-codes.en.html>
|
||||
|
||||
=back
|
||||
|
||||
=head2 Offline sources
|
||||
|
||||
=over 2
|
||||
|
||||
=item C<CJKV Information Processing> by Ken Lunde
|
||||
|
||||
CJKV Information Processing
|
||||
1999 O'Reilly & Associates, ISBN : 1-56592-224-7
|
||||
|
||||
The modern successor of C<CJK.inf>.
|
||||
|
||||
Features a comprehensive coverage of CJKV character sets and
|
||||
encodings along with many other issues faced by anyone trying
|
||||
to better support CJKV languages/scripts in all the areas of
|
||||
information processing.
|
||||
|
||||
To purchase this book, visit
|
||||
L<http://oreilly.com/catalog/9780596514471/>
|
||||
or your favourite bookstore.
|
||||
|
||||
=back
|
||||
|
||||
=cut
|
||||
44
database/perl/lib/Encode/Symbol.pm
Normal file
44
database/perl/lib/Encode/Symbol.pm
Normal file
@@ -0,0 +1,44 @@
|
||||
package Encode::Symbol;
|
||||
use strict;
|
||||
use warnings;
|
||||
use Encode;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.2 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use XSLoader;
|
||||
XSLoader::load( __PACKAGE__, $VERSION );
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::Symbol - Symbol Encodings
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$symbol = encode("symbol", $utf8); # loads Encode::Symbol implicitly
|
||||
$utf8 = decode("", $symbol); # ditto
|
||||
|
||||
=head1 ABSTRACT
|
||||
|
||||
This module implements symbol and dingbats encodings. Encodings
|
||||
supported are as follows.
|
||||
|
||||
Canonical Alias Description
|
||||
--------------------------------------------------------------------
|
||||
symbol
|
||||
dingbats
|
||||
AdobeZDingbat
|
||||
AdobeSymbol
|
||||
MacDingbats
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
To find out how to use this module in detail, see L<Encode>.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>
|
||||
|
||||
=cut
|
||||
75
database/perl/lib/Encode/TW.pm
Normal file
75
database/perl/lib/Encode/TW.pm
Normal file
@@ -0,0 +1,75 @@
|
||||
package Encode::TW;
|
||||
BEGIN {
|
||||
if ( ord("A") == 193 ) {
|
||||
die "Encode::TW not supported on EBCDIC\n";
|
||||
}
|
||||
}
|
||||
use strict;
|
||||
use warnings;
|
||||
use Encode;
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.3 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
use XSLoader;
|
||||
XSLoader::load( __PACKAGE__, $VERSION );
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::TW - Taiwan-based Chinese Encodings
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$big5 = encode("big5", $utf8); # loads Encode::TW implicitly
|
||||
$utf8 = decode("big5", $big5); # ditto
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module implements tradition Chinese charset encodings as used
|
||||
in Taiwan and Hong Kong.
|
||||
Encodings supported are as follows.
|
||||
|
||||
Canonical Alias Description
|
||||
--------------------------------------------------------------------
|
||||
big5-eten /\bbig-?5$/i Big5 encoding (with ETen extensions)
|
||||
/\bbig5-?et(en)?$/i
|
||||
/\btca-?big5$/i
|
||||
big5-hkscs /\bbig5-?hk(scs)?$/i
|
||||
/\bhk(scs)?-?big5$/i
|
||||
Big5 + Cantonese characters in Hong Kong
|
||||
MacChineseTrad Big5 + Apple Vendor Mappings
|
||||
cp950 Code Page 950
|
||||
= Big5 + Microsoft vendor mappings
|
||||
--------------------------------------------------------------------
|
||||
|
||||
To find out how to use this module in detail, see L<Encode>.
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
Due to size concerns, C<EUC-TW> (Extended Unix Character), C<CCCII>
|
||||
(Chinese Character Code for Information Interchange), C<BIG5PLUS>
|
||||
(CMEX's Big5+) and C<BIG5EXT> (CMEX's Big5e) are distributed separately
|
||||
on CPAN, under the name L<Encode::HanExtra>. That module also contains
|
||||
extra China-based encodings.
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Since the original C<big5> encoding (1984) is not supported anywhere
|
||||
(glibc and DOS-based systems uses C<big5> to mean C<big5-eten>; Microsoft
|
||||
uses C<big5> to mean C<cp950>), a conscious decision was made to alias
|
||||
C<big5> to C<big5-eten>, which is the de facto superset of the original
|
||||
big5.
|
||||
|
||||
The C<CNS11643> encoding files are not complete. For common C<CNS11643>
|
||||
manipulation, please use C<EUC-TW> in L<Encode::HanExtra>, which contains
|
||||
planes 1-7.
|
||||
|
||||
The ASCII region (0x00-0x7f) is preserved for all encodings, even
|
||||
though this conflicts with mappings by the Unicode Consortium.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>
|
||||
|
||||
=cut
|
||||
272
database/perl/lib/Encode/Unicode.pm
Normal file
272
database/perl/lib/Encode/Unicode.pm
Normal file
@@ -0,0 +1,272 @@
|
||||
package Encode::Unicode;
|
||||
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.18 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
|
||||
use XSLoader;
|
||||
XSLoader::load( __PACKAGE__, $VERSION );
|
||||
|
||||
#
|
||||
# Object Generator 8 transcoders all at once!
|
||||
#
|
||||
|
||||
use Encode ();
|
||||
|
||||
our %BOM_Unknown = map { $_ => 1 } qw(UTF-16 UTF-32);
|
||||
|
||||
for my $name (
|
||||
qw(UTF-16 UTF-16BE UTF-16LE
|
||||
UTF-32 UTF-32BE UTF-32LE
|
||||
UCS-2BE UCS-2LE)
|
||||
)
|
||||
{
|
||||
my ( $size, $endian, $ucs2, $mask );
|
||||
$name =~ /^(\w+)-(\d+)(\w*)$/o;
|
||||
if ( $ucs2 = ( $1 eq 'UCS' ) ) {
|
||||
$size = 2;
|
||||
}
|
||||
else {
|
||||
$size = $2 / 8;
|
||||
}
|
||||
$endian = ( $3 eq 'BE' ) ? 'n' : ( $3 eq 'LE' ) ? 'v' : '';
|
||||
$size == 4 and $endian = uc($endian);
|
||||
|
||||
my $obj = bless {
|
||||
Name => $name,
|
||||
size => $size,
|
||||
endian => $endian,
|
||||
ucs2 => $ucs2,
|
||||
} => __PACKAGE__;
|
||||
Encode::define_encoding($obj, $name);
|
||||
}
|
||||
|
||||
use parent qw(Encode::Encoding);
|
||||
|
||||
sub renew {
|
||||
my $self = shift;
|
||||
$BOM_Unknown{ $self->name } or return $self;
|
||||
my $clone = bless {%$self} => ref($self);
|
||||
$clone->{renewed}++; # so the caller knows it is renewed.
|
||||
return $clone;
|
||||
}
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::Unicode -- Various Unicode Transformation Formats
|
||||
|
||||
=cut
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$ucs2 = encode("UCS-2BE", $utf8);
|
||||
$utf8 = decode("UCS-2BE", $ucs2);
|
||||
|
||||
=head1 ABSTRACT
|
||||
|
||||
This module implements all Character Encoding Schemes of Unicode that
|
||||
are officially documented by Unicode Consortium (except, of course,
|
||||
for UTF-8, which is a native format in perl).
|
||||
|
||||
=over 4
|
||||
|
||||
=item L<http://www.unicode.org/glossary/> says:
|
||||
|
||||
I<Character Encoding Scheme> A character encoding form plus byte
|
||||
serialization. There are Seven character encoding schemes in Unicode:
|
||||
UTF-8, UTF-16, UTF-16BE, UTF-16LE, UTF-32 (UCS-4), UTF-32BE (UCS-4BE) and
|
||||
UTF-32LE (UCS-4LE), and UTF-7.
|
||||
|
||||
Since UTF-7 is a 7-bit (re)encoded version of UTF-16BE, It is not part of
|
||||
Unicode's Character Encoding Scheme. It is separately implemented in
|
||||
Encode::Unicode::UTF7. For details see L<Encode::Unicode::UTF7>.
|
||||
|
||||
=item Quick Reference
|
||||
|
||||
Decodes from ord(N) Encodes chr(N) to...
|
||||
octet/char BOM S.P d800-dfff ord > 0xffff \x{1abcd} ==
|
||||
---------------+-----------------+------------------------------
|
||||
UCS-2BE 2 N N is bogus Not Available
|
||||
UCS-2LE 2 N N bogus Not Available
|
||||
UTF-16 2/4 Y Y is S.P S.P BE/LE
|
||||
UTF-16BE 2/4 N Y S.P S.P 0xd82a,0xdfcd
|
||||
UTF-16LE 2/4 N Y S.P S.P 0x2ad8,0xcddf
|
||||
UTF-32 4 Y - is bogus As is BE/LE
|
||||
UTF-32BE 4 N - bogus As is 0x0001abcd
|
||||
UTF-32LE 4 N - bogus As is 0xcdab0100
|
||||
UTF-8 1-4 - - bogus >= 4 octets \xf0\x9a\af\8d
|
||||
---------------+-----------------+------------------------------
|
||||
|
||||
=back
|
||||
|
||||
=head1 Size, Endianness, and BOM
|
||||
|
||||
You can categorize these CES by 3 criteria: size of each character,
|
||||
endianness, and Byte Order Mark.
|
||||
|
||||
=head2 by size
|
||||
|
||||
UCS-2 is a fixed-length encoding with each character taking 16 bits.
|
||||
It B<does not> support I<surrogate pairs>. When a surrogate pair
|
||||
is encountered during decode(), its place is filled with \x{FFFD}
|
||||
if I<CHECK> is 0, or the routine croaks if I<CHECK> is 1. When a
|
||||
character whose ord value is larger than 0xFFFF is encountered,
|
||||
its place is filled with \x{FFFD} if I<CHECK> is 0, or the routine
|
||||
croaks if I<CHECK> is 1.
|
||||
|
||||
UTF-16 is almost the same as UCS-2 but it supports I<surrogate pairs>.
|
||||
When it encounters a high surrogate (0xD800-0xDBFF), it fetches the
|
||||
following low surrogate (0xDC00-0xDFFF) and C<desurrogate>s them to
|
||||
form a character. Bogus surrogates result in death. When \x{10000}
|
||||
or above is encountered during encode(), it C<ensurrogate>s them and
|
||||
pushes the surrogate pair to the output stream.
|
||||
|
||||
UTF-32 (UCS-4) is a fixed-length encoding with each character taking 32 bits.
|
||||
Since it is 32-bit, there is no need for I<surrogate pairs>.
|
||||
|
||||
=head2 by endianness
|
||||
|
||||
The first (and now failed) goal of Unicode was to map all character
|
||||
repertoires into a fixed-length integer so that programmers are happy.
|
||||
Since each character is either a I<short> or I<long> in C, you have to
|
||||
pay attention to the endianness of each platform when you pass data
|
||||
to one another.
|
||||
|
||||
Anything marked as BE is Big Endian (or network byte order) and LE is
|
||||
Little Endian (aka VAX byte order). For anything not marked either
|
||||
BE or LE, a character called Byte Order Mark (BOM) indicating the
|
||||
endianness is prepended to the string.
|
||||
|
||||
CAVEAT: Though BOM in utf8 (\xEF\xBB\xBF) is valid, it is meaningless
|
||||
and as of this writing Encode suite just leave it as is (\x{FeFF}).
|
||||
|
||||
=over 4
|
||||
|
||||
=item BOM as integer when fetched in network byte order
|
||||
|
||||
16 32 bits/char
|
||||
-------------------------
|
||||
BE 0xFeFF 0x0000FeFF
|
||||
LE 0xFFFe 0xFFFe0000
|
||||
-------------------------
|
||||
|
||||
=back
|
||||
|
||||
This modules handles the BOM as follows.
|
||||
|
||||
=over 4
|
||||
|
||||
=item *
|
||||
|
||||
When BE or LE is explicitly stated as the name of encoding, BOM is
|
||||
simply treated as a normal character (ZERO WIDTH NO-BREAK SPACE).
|
||||
|
||||
=item *
|
||||
|
||||
When BE or LE is omitted during decode(), it checks if BOM is at the
|
||||
beginning of the string; if one is found, the endianness is set to
|
||||
what the BOM says.
|
||||
|
||||
=item *
|
||||
|
||||
Default Byte Order
|
||||
|
||||
When no BOM is found, Encode 2.76 and blow croaked. Since Encode
|
||||
2.77, it falls back to BE accordingly to RFC2781 and the Unicode
|
||||
Standard version 8.0
|
||||
|
||||
=item *
|
||||
|
||||
When BE or LE is omitted during encode(), it returns a BE-encoded
|
||||
string with BOM prepended. So when you want to encode a whole text
|
||||
file, make sure you encode() the whole text at once, not line by line
|
||||
or each line, not file, will have a BOM prepended.
|
||||
|
||||
=item *
|
||||
|
||||
C<UCS-2> is an exception. Unlike others, this is an alias of UCS-2BE.
|
||||
UCS-2 is already registered by IANA and others that way.
|
||||
|
||||
=back
|
||||
|
||||
=head1 Surrogate Pairs
|
||||
|
||||
To say the least, surrogate pairs were the biggest mistake of the
|
||||
Unicode Consortium. But according to the late Douglas Adams in I<The
|
||||
Hitchhiker's Guide to the Galaxy> Trilogy, C<In the beginning the
|
||||
Universe was created. This has made a lot of people very angry and
|
||||
been widely regarded as a bad move>. Their mistake was not of this
|
||||
magnitude so let's forgive them.
|
||||
|
||||
(I don't dare make any comparison with Unicode Consortium and the
|
||||
Vogons here ;) Or, comparing Encode to Babel Fish is completely
|
||||
appropriate -- if you can only stick this into your ear :)
|
||||
|
||||
Surrogate pairs were born when the Unicode Consortium finally
|
||||
admitted that 16 bits were not big enough to hold all the world's
|
||||
character repertoires. But they already made UCS-2 16-bit. What
|
||||
do we do?
|
||||
|
||||
Back then, the range 0xD800-0xDFFF was not allocated. Let's split
|
||||
that range in half and use the first half to represent the C<upper
|
||||
half of a character> and the second half to represent the C<lower
|
||||
half of a character>. That way, you can represent 1024 * 1024 =
|
||||
1048576 more characters. Now we can store character ranges up to
|
||||
\x{10ffff} even with 16-bit encodings. This pair of half-character is
|
||||
now called a I<surrogate pair> and UTF-16 is the name of the encoding
|
||||
that embraces them.
|
||||
|
||||
Here is a formula to ensurrogate a Unicode character \x{10000} and
|
||||
above;
|
||||
|
||||
$hi = ($uni - 0x10000) / 0x400 + 0xD800;
|
||||
$lo = ($uni - 0x10000) % 0x400 + 0xDC00;
|
||||
|
||||
And to desurrogate;
|
||||
|
||||
$uni = 0x10000 + ($hi - 0xD800) * 0x400 + ($lo - 0xDC00);
|
||||
|
||||
Note this move has made \x{D800}-\x{DFFF} into a forbidden zone but
|
||||
perl does not prohibit the use of characters within this range. To perl,
|
||||
every one of \x{0000_0000} up to \x{ffff_ffff} (*) is I<a character>.
|
||||
|
||||
(*) or \x{ffff_ffff_ffff_ffff} if your perl is compiled with 64-bit
|
||||
integer support!
|
||||
|
||||
=head1 Error Checking
|
||||
|
||||
Unlike most encodings which accept various ways to handle errors,
|
||||
Unicode encodings simply croaks.
|
||||
|
||||
% perl -MEncode -e'$_ = "\xfe\xff\xd8\xd9\xda\xdb\0\n"' \
|
||||
-e'Encode::from_to($_, "utf16","shift_jis", 0); print'
|
||||
UTF-16:Malformed LO surrogate d8d9 at /path/to/Encode.pm line 184.
|
||||
% perl -MEncode -e'$a = "BOM missing"' \
|
||||
-e' Encode::from_to($a, "utf16", "shift_jis", 0); print'
|
||||
UTF-16:Unrecognised BOM 424f at /path/to/Encode.pm line 184.
|
||||
|
||||
Unlike other encodings where mappings are not one-to-one against
|
||||
Unicode, UTFs are supposed to map 100% against one another. So Encode
|
||||
is more strict on UTFs.
|
||||
|
||||
Consider that "division by zero" of Encode :)
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>, L<Encode::Unicode::UTF7>, L<http://www.unicode.org/glossary/>,
|
||||
L<http://www.unicode.org/unicode/faq/utf_bom.html>,
|
||||
|
||||
RFC 2781 L<http://www.ietf.org/rfc/rfc2781.txt>,
|
||||
|
||||
The whole Unicode standard L<http://www.unicode.org/unicode/uni2book/u2.html>
|
||||
|
||||
Ch. 15, pp. 403 of C<Programming Perl (3rd Edition)>
|
||||
by Larry Wall, Tom Christiansen, Jon Orwant;
|
||||
O'Reilly & Associates; ISBN 0-596-00027-8
|
||||
|
||||
=cut
|
||||
133
database/perl/lib/Encode/Unicode/UTF7.pm
Normal file
133
database/perl/lib/Encode/Unicode/UTF7.pm
Normal file
@@ -0,0 +1,133 @@
|
||||
#
|
||||
# $Id: UTF7.pm,v 2.10 2017/06/10 17:23:50 dankogai Exp $
|
||||
#
|
||||
package Encode::Unicode::UTF7;
|
||||
use strict;
|
||||
use warnings;
|
||||
use parent qw(Encode::Encoding);
|
||||
__PACKAGE__->Define('UTF-7');
|
||||
our $VERSION = do { my @r = ( q$Revision: 2.10 $ =~ /\d+/g ); sprintf "%d." . "%02d" x $#r, @r };
|
||||
use MIME::Base64;
|
||||
use Encode qw(find_encoding);
|
||||
|
||||
#
|
||||
# Algorithms taken from Unicode::String by Gisle Aas
|
||||
#
|
||||
|
||||
our $OPTIONAL_DIRECT_CHARS = 1;
|
||||
my $specials = quotemeta "\'(),-./:?";
|
||||
$OPTIONAL_DIRECT_CHARS
|
||||
and $specials .= quotemeta "!\"#$%&*;<=>@[]^_`{|}";
|
||||
|
||||
# \s will not work because it matches U+3000 DEOGRAPHIC SPACE
|
||||
# We use qr/[\n\r\t\ ] instead
|
||||
my $re_asis = qr/(?:[\n\r\t\ A-Za-z0-9$specials])/;
|
||||
my $re_encoded = qr/(?:[^\n\r\t\ A-Za-z0-9$specials])/;
|
||||
my $e_utf16 = find_encoding("UTF-16BE");
|
||||
|
||||
sub needs_lines { 1 }
|
||||
|
||||
sub encode($$;$) {
|
||||
my ( $obj, $str, $chk ) = @_;
|
||||
return undef unless defined $str;
|
||||
my $len = length($str);
|
||||
pos($str) = 0;
|
||||
my $bytes = substr($str, 0, 0); # to propagate taintedness
|
||||
while ( pos($str) < $len ) {
|
||||
if ( $str =~ /\G($re_asis+)/ogc ) {
|
||||
my $octets = $1;
|
||||
utf8::downgrade($octets);
|
||||
$bytes .= $octets;
|
||||
}
|
||||
elsif ( $str =~ /\G($re_encoded+)/ogsc ) {
|
||||
if ( $1 eq "+" ) {
|
||||
$bytes .= "+-";
|
||||
}
|
||||
else {
|
||||
my $s = $1;
|
||||
my $base64 = encode_base64( $e_utf16->encode($s), '' );
|
||||
$base64 =~ s/=+$//;
|
||||
$bytes .= "+$base64-";
|
||||
}
|
||||
}
|
||||
else {
|
||||
die "This should not happen! (pos=" . pos($str) . ")";
|
||||
}
|
||||
}
|
||||
$_[1] = '' if $chk;
|
||||
return $bytes;
|
||||
}
|
||||
|
||||
sub decode($$;$) {
|
||||
use re 'taint';
|
||||
my ( $obj, $bytes, $chk ) = @_;
|
||||
return undef unless defined $bytes;
|
||||
my $len = length($bytes);
|
||||
my $str = substr($bytes, 0, 0); # to propagate taintedness;
|
||||
pos($bytes) = 0;
|
||||
no warnings 'uninitialized';
|
||||
while ( pos($bytes) < $len ) {
|
||||
if ( $bytes =~ /\G([^+]+)/ogc ) {
|
||||
$str .= $1;
|
||||
}
|
||||
elsif ( $bytes =~ /\G\+-/ogc ) {
|
||||
$str .= "+";
|
||||
}
|
||||
elsif ( $bytes =~ /\G\+([A-Za-z0-9+\/]+)-?/ogsc ) {
|
||||
my $base64 = $1;
|
||||
my $pad = length($base64) % 4;
|
||||
$base64 .= "=" x ( 4 - $pad ) if $pad;
|
||||
$str .= $e_utf16->decode( decode_base64($base64) );
|
||||
}
|
||||
elsif ( $bytes =~ /\G\+/ogc ) {
|
||||
$^W and warn "Bad UTF7 data escape";
|
||||
$str .= "+";
|
||||
}
|
||||
else {
|
||||
die "This should not happen " . pos($bytes);
|
||||
}
|
||||
}
|
||||
$_[1] = '' if $chk;
|
||||
return $str;
|
||||
}
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::Unicode::UTF7 -- UTF-7 encoding
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Encode qw/encode decode/;
|
||||
$utf7 = encode("UTF-7", $utf8);
|
||||
$utf8 = decode("UTF-7", $ucs2);
|
||||
|
||||
=head1 ABSTRACT
|
||||
|
||||
This module implements UTF-7 encoding documented in RFC 2152. UTF-7,
|
||||
as its name suggests, is a 7-bit re-encoded version of UTF-16BE. It
|
||||
is designed to be MTA-safe and expected to be a standard way to
|
||||
exchange Unicoded mails via mails. But with the advent of UTF-8 and
|
||||
8-bit compliant MTAs, UTF-7 is hardly ever used.
|
||||
|
||||
UTF-7 was not supported by Encode until version 1.95 because of that.
|
||||
But Unicode::String, a module by Gisle Aas which adds Unicode supports
|
||||
to non-utf8-savvy perl did support UTF-7, the UTF-7 support was added
|
||||
so Encode can supersede Unicode::String 100%.
|
||||
|
||||
=head1 In Practice
|
||||
|
||||
When you want to encode Unicode for mails and web pages, however, do
|
||||
not use UTF-7 unless you are sure your recipients and readers can
|
||||
handle it. Very few MUAs and WWW Browsers support these days (only
|
||||
Mozilla seems to support one). For general cases, use UTF-8 for
|
||||
message body and MIME-Header for header instead.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>, L<Encode::Unicode>, L<Unicode::String>
|
||||
|
||||
RFC 2781 L<http://www.ietf.org/rfc/rfc2152.txt>
|
||||
|
||||
=cut
|
||||
23
database/perl/lib/Encode/_PM.e2x
Normal file
23
database/perl/lib/Encode/_PM.e2x
Normal file
@@ -0,0 +1,23 @@
|
||||
package Encode::$_Name_;
|
||||
our $VERSION = "0.01";
|
||||
|
||||
use Encode;
|
||||
use XSLoader;
|
||||
XSLoader::load(__PACKAGE__,$VERSION);
|
||||
|
||||
1;
|
||||
__END__
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Encode::$_Name_ - New Encoding
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
You got to fill this in!
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<Encode>
|
||||
|
||||
=cut
|
||||
9
database/perl/lib/Encode/_T.e2x
Normal file
9
database/perl/lib/Encode/_T.e2x
Normal file
@@ -0,0 +1,9 @@
|
||||
use strict;
|
||||
# Adjust the number here!
|
||||
use Test::More tests => 2;
|
||||
|
||||
BEGIN {
|
||||
use_ok('Encode');
|
||||
use_ok('Encode::$_Name_');
|
||||
}
|
||||
# Add more test here!
|
||||
1358
database/perl/lib/Encode/encode.h
Normal file
1358
database/perl/lib/Encode/encode.h
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user