Initial Commit
This commit is contained in:
269
database/perl/vendor/lib/Text/Soundex.pm
vendored
Normal file
269
database/perl/vendor/lib/Text/Soundex.pm
vendored
Normal file
@@ -0,0 +1,269 @@
|
||||
# -*- perl -*-
|
||||
|
||||
# (c) Copyright 1998-2007 by Mark Mielke
|
||||
#
|
||||
# Freedom to use these sources for whatever you want, as long as credit
|
||||
# is given where credit is due, is hereby granted. You may make modifications
|
||||
# where you see fit but leave this copyright somewhere visible. As well, try
|
||||
# to initial any changes you make so that if I like the changes I can
|
||||
# incorporate them into later versions.
|
||||
#
|
||||
# - Mark Mielke <mark@mielke.cc>
|
||||
#
|
||||
|
||||
package Text::Soundex;
|
||||
require 5.006;
|
||||
|
||||
use Exporter ();
|
||||
use XSLoader ();
|
||||
|
||||
use strict;
|
||||
|
||||
use if $] > 5.016, 'deprecate';
|
||||
|
||||
our $VERSION = '3.05';
|
||||
our @EXPORT_OK = qw(soundex soundex_unicode soundex_nara soundex_nara_unicode
|
||||
$soundex_nocode);
|
||||
our @EXPORT = qw(soundex soundex_nara $soundex_nocode);
|
||||
our @ISA = qw(Exporter);
|
||||
|
||||
our $nocode;
|
||||
|
||||
# Previous releases of Text::Soundex made $nocode available as $soundex_nocode.
|
||||
# For now, this part of the interface is exported and maintained.
|
||||
# In the feature, $soundex_nocode will be deprecated.
|
||||
*Text::Soundex::soundex_nocode = \$nocode;
|
||||
|
||||
sub soundex_noxs
|
||||
{
|
||||
# Original Soundex algorithm
|
||||
|
||||
my @results = map {
|
||||
my $code = uc($_);
|
||||
$code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
|
||||
|
||||
if (length($code)) {
|
||||
my $firstchar = substr($code, 0, 1);
|
||||
$code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
|
||||
[0000000000000000111111112222222222222222333344555566]s;
|
||||
($code = substr($code, 1)) =~ tr/0//d;
|
||||
substr($firstchar . $code . '000', 0, 4);
|
||||
} else {
|
||||
$nocode;
|
||||
}
|
||||
} @_;
|
||||
|
||||
wantarray ? @results : $results[0];
|
||||
}
|
||||
|
||||
sub soundex_nara
|
||||
{
|
||||
# US census (NARA) algorithm.
|
||||
|
||||
my @results = map {
|
||||
my $code = uc($_);
|
||||
$code =~ tr/AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr//cd;
|
||||
|
||||
if (length($code)) {
|
||||
my $firstchar = substr($code, 0, 1);
|
||||
$code =~ tr[AaEeHhIiOoUuWwYyBbFfPpVvCcGgJjKkQqSsXxZzDdTtLlMmNnRr]
|
||||
[0000990000009900111111112222222222222222333344555566]s;
|
||||
$code =~ s/(.)9\1/$1/gs;
|
||||
($code = substr($code, 1)) =~ tr/09//d;
|
||||
substr($firstchar . $code . '000', 0, 4);
|
||||
} else {
|
||||
$nocode
|
||||
}
|
||||
} @_;
|
||||
|
||||
wantarray ? @results : $results[0];
|
||||
}
|
||||
|
||||
sub soundex_unicode
|
||||
{
|
||||
require Text::Unidecode unless defined &Text::Unidecode::unidecode;
|
||||
soundex(Text::Unidecode::unidecode(@_));
|
||||
}
|
||||
|
||||
sub soundex_nara_unicode
|
||||
{
|
||||
require Text::Unidecode unless defined &Text::Unidecode::unidecode;
|
||||
soundex_nara(Text::Unidecode::unidecode(@_));
|
||||
}
|
||||
|
||||
eval { XSLoader::load(__PACKAGE__, $VERSION) };
|
||||
|
||||
if (defined(&soundex_xs)) {
|
||||
*soundex = \&soundex_xs;
|
||||
} else {
|
||||
*soundex = \&soundex_noxs;
|
||||
*soundex_xs = sub {
|
||||
require Carp;
|
||||
Carp::croak("XS implementation of Text::Soundex::soundex_xs() ".
|
||||
"could not be loaded");
|
||||
};
|
||||
}
|
||||
|
||||
1;
|
||||
|
||||
__END__
|
||||
|
||||
# Implementation of the soundex algorithm.
|
||||
#
|
||||
# Some of this documention was written by Mike Stok.
|
||||
#
|
||||
# Examples:
|
||||
#
|
||||
# Euler, Ellery -> E460
|
||||
# Gauss, Ghosh -> G200
|
||||
# Hilbert, Heilbronn -> H416
|
||||
# Knuth, Kant -> K530
|
||||
# Lloyd, Ladd -> L300
|
||||
# Lukasiewicz, Lissajous -> L222
|
||||
#
|
||||
|
||||
=head1 NAME
|
||||
|
||||
Text::Soundex - Implementation of the soundex algorithm.
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use Text::Soundex;
|
||||
|
||||
# Original algorithm.
|
||||
$code = soundex($name); # Get the soundex code for a name.
|
||||
@codes = soundex(@names); # Get the list of codes for a list of names.
|
||||
|
||||
# American Soundex variant (NARA) - Used for US census data.
|
||||
$code = soundex_nara($name); # Get the soundex code for a name.
|
||||
@codes = soundex_nara(@names); # Get the list of codes for a list of names.
|
||||
|
||||
# Redefine the value that soundex() will return if the input string
|
||||
# contains no identifiable sounds within it.
|
||||
$Text::Soundex::nocode = 'Z000';
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
Soundex is a phonetic algorithm for indexing names by sound, as
|
||||
pronounced in English. The goal is for names with the same
|
||||
pronunciation to be encoded to the same representation so that they
|
||||
can be matched despite minor differences in spelling. Soundex is the
|
||||
most widely known of all phonetic algorithms and is often used
|
||||
(incorrectly) as a synonym for "phonetic algorithm". Improvements to
|
||||
Soundex are the basis for many modern phonetic algorithms. (Wikipedia,
|
||||
2007)
|
||||
|
||||
This module implements the original soundex algorithm developed by
|
||||
Robert Russell and Margaret Odell, patented in 1918 and 1922, as well
|
||||
as a variation called "American Soundex" used for US census data, and
|
||||
current maintained by the National Archives and Records Administration
|
||||
(NARA).
|
||||
|
||||
The soundex algorithm may be recognized from Donald Knuth's
|
||||
B<The Art of Computer Programming>. The algorithm described by
|
||||
Knuth is the NARA algorithm.
|
||||
|
||||
The value returned for strings which have no soundex encoding is
|
||||
defined using C<$Text::Soundex::nocode>. The default value is C<undef>,
|
||||
however values such as C<'Z000'> are commonly used alternatives.
|
||||
|
||||
For backward compatibility with older versions of this module the
|
||||
C<$Text::Soundex::nocode> is exported into the caller's namespace as
|
||||
C<$soundex_nocode>.
|
||||
|
||||
In scalar context, C<soundex()> returns the soundex code of its first
|
||||
argument. In list context, a list is returned in which each element is the
|
||||
soundex code for the corresponding argument passed to C<soundex()>. For
|
||||
example, the following code assigns @codes the value C<('M200', 'S320')>:
|
||||
|
||||
@codes = soundex qw(Mike Stok);
|
||||
|
||||
To use C<Text::Soundex> to generate codes that can be used to search one
|
||||
of the publically available US Censuses, a variant of the soundex
|
||||
algorithm must be used:
|
||||
|
||||
use Text::Soundex;
|
||||
$code = soundex_nara($name);
|
||||
|
||||
An example of where these algorithm differ follows:
|
||||
|
||||
use Text::Soundex;
|
||||
print soundex("Ashcraft"), "\n"; # prints: A226
|
||||
print soundex_nara("Ashcraft"), "\n"; # prints: A261
|
||||
|
||||
=head1 EXAMPLES
|
||||
|
||||
Donald Knuth's examples of names and the soundex codes they map to
|
||||
are listed below:
|
||||
|
||||
Euler, Ellery -> E460
|
||||
Gauss, Ghosh -> G200
|
||||
Hilbert, Heilbronn -> H416
|
||||
Knuth, Kant -> K530
|
||||
Lloyd, Ladd -> L300
|
||||
Lukasiewicz, Lissajous -> L222
|
||||
|
||||
so:
|
||||
|
||||
$code = soundex 'Knuth'; # $code contains 'K530'
|
||||
@list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'
|
||||
|
||||
=head1 LIMITATIONS
|
||||
|
||||
As the soundex algorithm was originally used a B<long> time ago in the US
|
||||
it considers only the English alphabet and pronunciation. In particular,
|
||||
non-ASCII characters will be ignored. The recommended method of dealing
|
||||
with characters that have accents, or other unicode characters, is to use
|
||||
the Text::Unidecode module available from CPAN. Either use the module
|
||||
explicitly:
|
||||
|
||||
use Text::Soundex;
|
||||
use Text::Unidecode;
|
||||
|
||||
print soundex(unidecode("Fran\xE7ais")), "\n"; # Prints "F652\n"
|
||||
|
||||
Or use the convenient wrapper routine:
|
||||
|
||||
use Text::Soundex 'soundex_unicode';
|
||||
|
||||
print soundex_unicode("Fran\xE7ais"), "\n"; # Prints "F652\n"
|
||||
|
||||
Since the soundex algorithm maps a large space (strings of arbitrary
|
||||
length) onto a small space (single letter plus 3 digits) no inference
|
||||
can be made about the similarity of two strings which end up with the
|
||||
same soundex code. For example, both C<Hilbert> and C<Heilbronn> end
|
||||
up with a soundex code of C<H416>.
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
This software is copyright (c) 1998-2003 by Mark Mielke.
|
||||
|
||||
This is free software; you can redistribute it and/or modify it under
|
||||
the same terms as the Perl 5 programming language system itself.
|
||||
|
||||
=head1 MAINTAINER
|
||||
|
||||
This module is currently maintain by Mark Mielke (C<mark@mielke.cc>).
|
||||
|
||||
=head1 HISTORY
|
||||
|
||||
Version 3 is a significant update to provide support for versions of
|
||||
Perl later than Perl 5.004. Specifically, the XS version of the
|
||||
soundex() subroutine understands strings that are encoded using UTF-8
|
||||
(unicode strings).
|
||||
|
||||
Version 2 of this module was a re-write by Mark Mielke (C<mark@mielke.cc>)
|
||||
to improve the speed of the subroutines. The XS version of the soundex()
|
||||
subroutine was introduced in 2.00.
|
||||
|
||||
Version 1 of this module was written by Mike Stok (C<mike@stok.co.uk>)
|
||||
and was included into the Perl core library set.
|
||||
|
||||
Dave Carlsen (C<dcarlsen@csranet.com>) made the request for the NARA
|
||||
algorithm to be included. The NARA soundex page can be viewed at:
|
||||
C<http://www.archives.gov/research/census/soundex.html>
|
||||
|
||||
Ian Phillips (C<ian@pipex.net>) and Rich Pinder (C<rpinder@hsc.usc.edu>)
|
||||
supplied ideas and spotted mistakes for v1.x.
|
||||
|
||||
=cut
|
||||
Reference in New Issue
Block a user