summaryrefslogtreecommitdiff
path: root/ext/Text
diff options
context:
space:
mode:
authorSteve Peters <steve@fisharerojo.org>2006-10-03 14:52:45 +0000
committerSteve Peters <steve@fisharerojo.org>2006-10-03 14:52:45 +0000
commit11f885b578514fcbf59f44ca49ae6a8299238c7d (patch)
tree51373d85574be174a02aa1f749d8e664d2a6270f /ext/Text
parent88d2d28a53c95ab805c353b18117eeb69c6573c2 (diff)
downloadperl-11f885b578514fcbf59f44ca49ae6a8299238c7d.tar.gz
Move Text::Soundex from lib/ to ext/ and upgrade it to
Text-Soundex-3.02. p4raw-id: //depot/perl@28927
Diffstat (limited to 'ext/Text')
-rw-r--r--ext/Text/Soundex/Changes39
-rw-r--r--ext/Text/Soundex/Makefile.PL11
-rw-r--r--ext/Text/Soundex/README161
-rw-r--r--ext/Text/Soundex/Soundex.pm150
-rw-r--r--ext/Text/Soundex/Soundex.xs157
-rwxr-xr-xext/Text/Soundex/t/Soundex.t143
6 files changed, 661 insertions, 0 deletions
diff --git a/ext/Text/Soundex/Changes b/ext/Text/Soundex/Changes
new file mode 100644
index 0000000000..41c78b15fe
--- /dev/null
+++ b/ext/Text/Soundex/Changes
@@ -0,0 +1,39 @@
+Revision history for Perl extension Text::Soundex.
+
+3.02 Sun Feb 02 02:54:00 EST 2003 <mark@mielke.cc>
+
+The U8 type was over-used in 3.00 and 3.01. Now, "U8 *" is used only as a
+pointer into the UTF-8 string. Also, unicode now works properly on
+Perl 5.6.x as the utf8_to_uv() function is used instead of utf8n_to_uvchr()
+when compiled under a version of Perl earlier than 5.8.0.
+
+3.01 Sun Jan 26 16:30:00 EST 2003 <mark@mielke.cc>
+
+A bug with non-UTF 8 strings that contain non-ASCII alphabetic characters
+was fixed. The soundex_unicode() and soundex_nara_unicode() wrapper
+routines were included and the documentation refers the user to the
+excellent Text::Unidecode module to perform soundex encodings using
+unicode strings. The Perl versions of the routines have been further
+optimized, and correct a border case involving non-alphabetic characters
+at the beginning of the string.
+
+3.00 Sun Jan 26 04:08:00 EST 2003 <mark@mielke.cc>
+
+Updated documentation, simplified the Perl interface, and updated
+the XS code to be faster, and to properly work with UTF-8 strings.
+UNICODE characters outside the ASCII range (0x00 - 0x7F) are
+considered to be non-alphabetic for the purposes of the soundex
+algorithms.
+
+2.10 Sun Feb 15 15:29:38 EST 1998 <mark@mielke.cc>
+
+I've put in a version of my XS code and fully integrated it with the
+existing 100% perl mechanism. The change should be virtually transparent
+to the user. XS code is approx 7.5 times faster.
+ - Mark Mielke
+
+2.00 Thu Jan 1 16:22:11 1998 <mike@stok.co.uk>
+
+Incorporated Mark Mielke's rewritten version of the main soundex routine
+and made the test.pl file simpler.
+
diff --git a/ext/Text/Soundex/Makefile.PL b/ext/Text/Soundex/Makefile.PL
new file mode 100644
index 0000000000..ea757f175f
--- /dev/null
+++ b/ext/Text/Soundex/Makefile.PL
@@ -0,0 +1,11 @@
+use ExtUtils::MakeMaker;
+WriteMakefile(
+ NAME => "Text::Soundex",
+ VERSION_FROM => 'Soundex.pm',
+ 'dist' => {
+ COMPRESS => 'gzip -9f',
+ SUFFIX => 'gz',
+ DIST_DEFAULT => 'all tardist',
+ },
+ MAN3PODS => {},
+);
diff --git a/ext/Text/Soundex/README b/ext/Text/Soundex/README
new file mode 100644
index 0000000000..7fcf22e945
--- /dev/null
+++ b/ext/Text/Soundex/README
@@ -0,0 +1,161 @@
+Text::Soundex Version 3.02
+
+NOTE: Users of Text::Soundex Version 2.x should consult the 'History'
+ section at the end of this document before installing this module.
+ The interface has been simplified, and existing code that takes
+ advantages of Version 2.x features may need to be altered to function
+ properly.
+
+This is a perl 5 module implementing the Soundex algorithm described by
+Knuth. The algorithm is used quite often for locating a person by name
+where the actual spelling of the name is not known.
+
+This version directly supercedes the version of Text::Soundex that can be
+found in the core from Perl 5.8.0 and down. (This version is a drop-in
+replacement)
+
+The algorithm used by soundex() is NOT fully compatible with the
+algorithm used to index names for US Censuses. Use the soundex_nara()
+subroutine to return codes for this purpose.
+
+Basic Usage:
+
+ Soundex is used to do a one way transformation of a name, converting
+ a character string given as input into a set of codes representing
+ the identifiable sounds those characters might make in the output.
+
+ For example:
+
+ use Text::Soundex;
+
+ print soundex("Mark"), "\n"; # prints: M620
+ print soundex("Marc"), "\n"; # prints: M620
+
+ print soundex("Hansen"), "\n"; # prints: H525
+ print soundex("Hanson"), "\n"; # prints: H525
+ print soundex("Henson"), "\n"; # prints: H525
+
+ In many situations, code such as the following:
+
+ if ($name1 eq $name2) {
+ ...
+ }
+
+ Can be substituted with:
+
+ if (soundex($name1) eq soundex($name2)) {
+ ...
+ }
+
+Installation:
+
+ Once the archive has been unpacked then the following steps are needed
+ to build, test and install the module (to be done in the directory which
+ contains the Makefile.PL)
+
+ perl Makefile.PL
+ make
+ make test
+
+ If the make test succeeds then the next step may need to be run as root
+ (on a Unix-like system) or with special privileges on other systems.
+
+ make install
+
+ If you do not want to use the XS code (for whatever reason) do the following
+ instead of the above:
+
+ perl Makefile.PL --no-xs
+ make
+ make test
+ make install
+
+ If any of the tests report 'not ok' and you are running perl 5.6.0 or later
+ then please contact Mark Mielke <mark@mielke.cc>
+
+History:
+
+ Version 3.02:
+ 3.01 and 3.00 used the 'U8' type incorrectly causing some strict
+ compilers to complain or refuse to compile the XS code. Also, unicode
+ support did not work properly for Perl 5.6.x. Both of these problems
+ are now fixed.
+
+ Version 3.01:
+ A bug with non-UTF 8 strings that contain non-ASCII alphabetic characters
+ was fixed. The soundex_unicode() and soundex_nara_unicode() wrapper
+ routines were included and the documentation refers the user to the
+ excellent Text::Unidecode module to perform soundex encodings using
+ unicode strings. The Perl versions of the routines have been further
+ optimized, and correct a border case involving non-alphabetic characters
+ at the beginning of the string.
+
+ Version 3.00:
+ Support for UTF-8 strings (unicode strings) is now in place. Note
+ that this allows UTF-8 strings to be passed to the XS version of
+ the soundex() routine. The Soundex algorithm treats characters
+ outside the ascii range (0x00 - 0x7F) as if they were not
+ alphabetical.
+
+ The interface has been simplified. In order to explicitly use the
+ non-XS implementation of soundex():
+
+ use Text::Soundex ();
+ $code = Text::Soundex::soundex_noxs($name);
+
+ In order to use the NARA soundex algorithm:
+
+ use Text::Soundex 'soundex_nara';
+ $code = soundex_nara($name);
+
+ Use of the ':NARA-Ruleset' import directive is now obsolete. To
+ emulate the old behaviour:
+
+ use Text::Soundex ();
+ *soundex = \&Text::Soundex::soundex_nara;
+ $code = soundex($name);
+
+ Version 2.20:
+ This version includes support for the algorithm used to index
+ the U.S. Federal Censuses. There is a slight descrepancy in the
+ definition for a soundex code which is not commonly known or
+ recognized involved similar sounding letters being seperated
+ by the characters H or W. This is defined as the NARA ruleset,
+ as this descrepency was discovered by them. (Calling it "the
+ US Census ruleset" was too unwieldy...)
+
+ NARA can be found at:
+ http://www.nara.gov/genealogy/
+
+ The algorithm requested by NARA can be found at:
+ http://home.utah-inter.net/kinsearch/Soundex.html
+
+ Ways to use it in your code:
+
+ Transparently change existing code like this:
+ =============================================
+ use Text::Soundex qw(:NARA-Ruleset);
+
+ ... soundex(...) ...
+
+ --
+
+ Make the change visibly distinct like this:
+ ===========================================
+ use Text::Soundex qw(soundex_nara);
+
+ ... soundex_nara(...) ...
+
+ Version 2.00:
+ This version is a full re-write of the 1.0 engine by Mark Mielke.
+ The goal was for speed... and this was achieved. There is an optional
+ XS module which can be used completely transparently by the user
+ which offers a further speed increase of a factor of more than 7.5X.
+
+ Version 1.00:
+ This version can be found in the perl core distribution from at
+ least Perl 5.8.0 and down. It was written by Mike Stok. It can be
+ identified by the fact that it does not contain a $VERSION
+ in the beginning of the module, and as well it uses an RCS
+ tag with a version of 1.x. This version, before some perl5'ish
+ packaging was introduced, was actually written for perl4.
diff --git a/ext/Text/Soundex/Soundex.pm b/ext/Text/Soundex/Soundex.pm
new file mode 100644
index 0000000000..64a9e6507d
--- /dev/null
+++ b/ext/Text/Soundex/Soundex.pm
@@ -0,0 +1,150 @@
+package Text::Soundex;
+require 5.000;
+require Exporter;
+
+@ISA = qw(Exporter);
+@EXPORT = qw(&soundex $soundex_nocode);
+
+$VERSION = '1.01';
+
+# $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $
+#
+# Implementation of soundex algorithm as described by Knuth in volume
+# 3 of The Art of Computer Programming, with ideas stolen from Ian
+# Phillipps <ian@pipex.net>.
+#
+# Mike Stok <Mike.Stok@meiko.concord.ma.us>, 2 March 1994.
+#
+# Knuth's test cases are:
+#
+# Euler, Ellery -> E460
+# Gauss, Ghosh -> G200
+# Hilbert, Heilbronn -> H416
+# Knuth, Kant -> K530
+# Lloyd, Ladd -> L300
+# Lukasiewicz, Lissajous -> L222
+#
+# $Log: soundex.pl,v $
+# Revision 1.2 1994/03/24 00:30:27 mike
+# Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu>
+# in the way I handles leasing characters which were different but had
+# the same soundex code. This showed up comparing it with Oracle's
+# soundex output.
+#
+# Revision 1.1 1994/03/02 13:01:30 mike
+# Initial revision
+#
+#
+##############################################################################
+
+# $soundex_nocode is used to indicate a string doesn't have a soundex
+# code, I like undef other people may want to set it to 'Z000'.
+
+$soundex_nocode = undef;
+
+sub soundex
+{
+ local (@s, $f, $fc, $_) = @_;
+
+ push @s, '' unless @s; # handle no args as a single empty string
+
+ foreach (@s)
+ {
+ $_ = uc $_;
+ tr/A-Z//cd;
+
+ if ($_ eq '')
+ {
+ $_ = $soundex_nocode;
+ }
+ else
+ {
+ ($f) = /^(.)/;
+ tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/;
+ ($fc) = /^(.)/;
+ s/^$fc+//;
+ tr///cs;
+ tr/0//d;
+ $_ = $f . $_ . '000';
+ s/^(.{4}).*/$1/;
+ }
+ }
+
+ wantarray ? @s : shift @s;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth
+
+=head1 SYNOPSIS
+
+ use Text::Soundex;
+
+ $code = soundex $string; # get soundex code for a string
+ @codes = soundex @list; # get list of codes for list of strings
+
+ # set value to be returned for strings without soundex code
+
+ $soundex_nocode = 'Z000';
+
+=head1 DESCRIPTION
+
+This module implements the soundex algorithm as described by Donald Knuth
+in Volume 3 of B<The Art of Computer Programming>. The algorithm is
+intended to hash words (in particular surnames) into a small space using a
+simple model which approximates the sound of the word when spoken by an English
+speaker. Each word is reduced to a four character string, the first
+character being an upper case letter and the remaining three being digits.
+
+If there is no soundex code representation for a string then the value of
+C<$soundex_nocode> is returned. This is initially set to C<undef>, but
+many people seem to prefer an I<unlikely> value like C<Z000>
+(how unlikely this is depends on the data set being dealt with.) Any value
+can be assigned to C<$soundex_nocode>.
+
+In scalar context C<soundex> returns the soundex code of its first
+argument, and in list context a list is returned in which each element is the
+soundex code for the corresponding argument passed to C<soundex> e.g.
+
+ @codes = soundex qw(Mike Stok);
+
+leaves C<@codes> containing C<('M200', 'S320')>.
+
+=head1 EXAMPLES
+
+Knuth's examples of various names and the soundex codes they map to
+are listed below:
+
+ Euler, Ellery -> E460
+ Gauss, Ghosh -> G200
+ Hilbert, Heilbronn -> H416
+ Knuth, Kant -> K530
+ Lloyd, Ladd -> L300
+ Lukasiewicz, Lissajous -> L222
+
+so:
+
+ $code = soundex 'Knuth'; # $code contains 'K530'
+ @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200'
+
+=head1 LIMITATIONS
+
+As the soundex algorithm was originally used a B<long> time ago in the US
+it considers only the English alphabet and pronunciation.
+
+As it is mapping a large space (arbitrary length strings) onto a small
+space (single letter plus 3 digits) no inference can be made about the
+similarity of two strings which end up with the same soundex code. For
+example, both C<Hilbert> and C<Heilbronn> end up with a soundex code
+of C<H416>.
+
+=head1 AUTHOR
+
+This code was implemented by Mike Stok (C<stok@cybercom.net>) from the
+description given by Knuth. Ian Phillipps (C<ian@pipex.net>) and Rich Pinder
+(C<rpinder@hsc.usc.edu>) supplied ideas and spotted mistakes.
diff --git a/ext/Text/Soundex/Soundex.xs b/ext/Text/Soundex/Soundex.xs
new file mode 100644
index 0000000000..9f5d809441
--- /dev/null
+++ b/ext/Text/Soundex/Soundex.xs
@@ -0,0 +1,157 @@
+/* -*- c -*- */
+
+/* (c) Copyright 1998-2003 by Mark Mielke
+ *
+ * Freedom to use these sources for whatever you want, as long as credit
+ * is given where credit is due, is hereby granted. You may make modifications
+ * where you see fit but leave this copyright somewhere visible. As well try
+ * to initial any changes you make so that if i like the changes i can
+ * incorporate them into any later versions of mine.
+ *
+ * - Mark Mielke <mark@mielke.cc>
+ */
+
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+
+#define SOUNDEX_ACCURACY (4) /* The maximum code length... (should be>=2) */
+
+#if !(PERL_REVISION >= 5 && PERL_VERSION >= 8)
+# define utf8n_to_uvchr utf8_to_uv
+#endif
+
+static char *soundex_table =
+ /*ABCDEFGHIJKLMNOPQRSTUVWXYZ*/
+ "01230120022455012623010202";
+
+static SV *sv_soundex (source)
+ SV *source;
+{
+ char *source_p;
+ char *source_end;
+
+ {
+ STRLEN source_len;
+ source_p = SvPV(source, source_len);
+ source_end = &source_p[source_len];
+ }
+
+ while (source_p != source_end)
+ {
+ if ((*source_p & ~((UV) 0x7F)) == 0 && isalpha(*source_p))
+ {
+ SV *code = newSV(SOUNDEX_ACCURACY);
+ char *code_p = SvPVX(code);
+ char *code_end = &code_p[SOUNDEX_ACCURACY];
+ char code_last;
+
+ SvCUR_set(code, SOUNDEX_ACCURACY);
+ SvPOK_only(code);
+
+ code_last = soundex_table[(*code_p++ = toupper(*source_p++)) - 'A'];
+
+ while (source_p != source_end && code_p != code_end)
+ {
+ char c = *source_p++;
+
+ if ((c & ~((UV) 0x7F)) == 0 && isalpha(c))
+ {
+ *code_p = soundex_table[toupper(c) - 'A'];
+ if (*code_p != code_last && (code_last = *code_p) != '0')
+ code_p++;
+ }
+ }
+
+ while (code_p != code_end)
+ *code_p++ = '0';
+
+ *code_end = '\0';
+
+ return code;
+ }
+
+ source_p++;
+ }
+
+ return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE));
+}
+
+static SV *sv_soundex_utf8 (source)
+ SV *source;
+{
+ U8 *source_p;
+ U8 *source_end;
+
+ {
+ STRLEN source_len;
+ source_p = (U8 *) SvPV(source, source_len);
+ source_end = &source_p[source_len];
+ }
+
+ while (source_p < source_end)
+ {
+ STRLEN offset;
+ UV c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0);
+ source_p = (offset >= 1) ? &source_p[offset] : source_end;
+
+ if ((c & ~((UV) 0x7F)) == 0 && isalpha(c))
+ {
+ SV *code = newSV(SOUNDEX_ACCURACY);
+ char *code_p = SvPVX(code);
+ char *code_end = &code_p[SOUNDEX_ACCURACY];
+ char code_last;
+
+ SvCUR_set(code, SOUNDEX_ACCURACY);
+ SvPOK_only(code);
+
+ code_last = soundex_table[(*code_p++ = toupper(c)) - 'A'];
+
+ while (source_p != source_end && code_p != code_end)
+ {
+ c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0);
+ source_p = (offset >= 1) ? &source_p[offset] : source_end;
+
+ if ((c & ~((UV) 0x7F)) == 0 && isalpha(c))
+ {
+ *code_p = soundex_table[toupper(c) - 'A'];
+ if (*code_p != code_last && (code_last = *code_p) != '0')
+ code_p++;
+ }
+ }
+
+ while (code_p != code_end)
+ *code_p++ = '0';
+
+ *code_end = '\0';
+
+ return code;
+ }
+
+ source_p++;
+ }
+
+ return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE));
+}
+
+MODULE = Text::Soundex PACKAGE = Text::Soundex
+
+PROTOTYPES: DISABLE
+
+void
+soundex_xs (...)
+PPCODE:
+{
+ int i;
+ for (i = 0; i < items; i++)
+ {
+ SV *sv = ST(i);
+
+ if (DO_UTF8(sv))
+ sv = sv_soundex_utf8(sv);
+ else
+ sv = sv_soundex(sv);
+
+ PUSHs(sv_2mortal(sv));
+ }
+}
diff --git a/ext/Text/Soundex/t/Soundex.t b/ext/Text/Soundex/t/Soundex.t
new file mode 100755
index 0000000000..d35f264c7a
--- /dev/null
+++ b/ext/Text/Soundex/t/Soundex.t
@@ -0,0 +1,143 @@
+#!./perl
+#
+# $Id: soundex.t,v 1.2 1994/03/24 00:30:27 mike Exp $
+#
+# test module for soundex.pl
+#
+# $Log: soundex.t,v $
+# Revision 1.2 1994/03/24 00:30:27 mike
+# Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu>
+# in the way I handles leasing characters which were different but had
+# the same soundex code. This showed up comparing it with Oracle's
+# soundex output.
+#
+# Revision 1.1 1994/03/02 13:03:02 mike
+# Initial revision
+#
+#
+
+BEGIN {
+ chdir 't' if -d 't';
+ @INC = '../lib';
+}
+
+use Text::Soundex;
+
+$test = 0;
+print "1..13\n";
+
+while (<DATA>)
+{
+ chop;
+ next if /^\s*;?#/;
+ next if /^\s*$/;
+
+ ++$test;
+ $bad = 0;
+
+ if (/^eval\s+/)
+ {
+ ($try = $_) =~ s/^eval\s+//;
+
+ eval ($try);
+ if ($@)
+ {
+ $bad++;
+ print "not ok $test\n";
+ print "# eval '$try' returned $@";
+ }
+ }
+ elsif (/^\(/)
+ {
+ ($in, $out) = split (':');
+
+ $try = "\@expect = $out; \@got = &soundex $in;";
+ eval ($try);
+
+ if (@expect != @got)
+ {
+ $bad++;
+ print "not ok $test\n";
+ print "# expected ", scalar @expect, " results, got ", scalar @got, "\n";
+ print "# expected (", join (', ', @expect),
+ ") got (", join (', ', @got), ")\n";
+ }
+ else
+ {
+ while (@got)
+ {
+ $expect = shift @expect;
+ $got = shift @got;
+
+ if ($expect ne $got)
+ {
+ $bad++;
+ print "not ok $test\n";
+ print "# expected $expect, got $got\n";
+ }
+ }
+ }
+ }
+ else
+ {
+ ($in, $out) = split (':');
+
+ $try = "\$expect = $out; \$got = &soundex ($in);";
+ eval ($try);
+
+ if ($expect ne $got)
+ {
+ $bad++;
+ print "not ok $test\n";
+ print "# expected $expect, got $got\n";
+ }
+ }
+
+ print "ok $test\n" unless $bad;
+}
+
+__END__
+#
+# 1..6
+#
+# Knuth's test cases, scalar in, scalar out
+#
+'Euler':'E460'
+'Gauss':'G200'
+'Hilbert':'H416'
+'Knuth':'K530'
+'Lloyd':'L300'
+'Lukasiewicz':'L222'
+#
+# 7..8
+#
+# check default bad code
+#
+'2 + 2 = 4':undef
+undef:undef
+#
+# 9
+#
+# check array in, array out
+#
+('Ellery', 'Ghosh', 'Heilbronn', 'Kant', 'Ladd', 'Lissajous'):('E460', 'G200', 'H416', 'K530', 'L300', 'L222')
+#
+# 10
+#
+# check array with explicit undef
+#
+('Mike', undef, 'Stok'):('M200', undef, 'S320')
+#
+# 11..12
+#
+# check setting $Text::Soundex::noCode
+#
+eval $soundex_nocode = 'Z000';
+('Mike', undef, 'Stok'):('M200', 'Z000', 'S320')
+#
+# 13
+#
+# a subtle difference between me & oracle, spotted by Rich Pinder
+# <rpinder@hsc.usc.edu>
+#
+CZARKOWSKA:C622