diff options
author | Steve Peters <steve@fisharerojo.org> | 2006-10-03 14:52:45 +0000 |
---|---|---|
committer | Steve Peters <steve@fisharerojo.org> | 2006-10-03 14:52:45 +0000 |
commit | 11f885b578514fcbf59f44ca49ae6a8299238c7d (patch) | |
tree | 51373d85574be174a02aa1f749d8e664d2a6270f /ext/Text | |
parent | 88d2d28a53c95ab805c353b18117eeb69c6573c2 (diff) | |
download | perl-11f885b578514fcbf59f44ca49ae6a8299238c7d.tar.gz |
Move Text::Soundex from lib/ to ext/ and upgrade it to
Text-Soundex-3.02.
p4raw-id: //depot/perl@28927
Diffstat (limited to 'ext/Text')
-rw-r--r-- | ext/Text/Soundex/Changes | 39 | ||||
-rw-r--r-- | ext/Text/Soundex/Makefile.PL | 11 | ||||
-rw-r--r-- | ext/Text/Soundex/README | 161 | ||||
-rw-r--r-- | ext/Text/Soundex/Soundex.pm | 150 | ||||
-rw-r--r-- | ext/Text/Soundex/Soundex.xs | 157 | ||||
-rwxr-xr-x | ext/Text/Soundex/t/Soundex.t | 143 |
6 files changed, 661 insertions, 0 deletions
diff --git a/ext/Text/Soundex/Changes b/ext/Text/Soundex/Changes new file mode 100644 index 0000000000..41c78b15fe --- /dev/null +++ b/ext/Text/Soundex/Changes @@ -0,0 +1,39 @@ +Revision history for Perl extension Text::Soundex. + +3.02 Sun Feb 02 02:54:00 EST 2003 <mark@mielke.cc> + +The U8 type was over-used in 3.00 and 3.01. Now, "U8 *" is used only as a +pointer into the UTF-8 string. Also, unicode now works properly on +Perl 5.6.x as the utf8_to_uv() function is used instead of utf8n_to_uvchr() +when compiled under a version of Perl earlier than 5.8.0. + +3.01 Sun Jan 26 16:30:00 EST 2003 <mark@mielke.cc> + +A bug with non-UTF 8 strings that contain non-ASCII alphabetic characters +was fixed. The soundex_unicode() and soundex_nara_unicode() wrapper +routines were included and the documentation refers the user to the +excellent Text::Unidecode module to perform soundex encodings using +unicode strings. The Perl versions of the routines have been further +optimized, and correct a border case involving non-alphabetic characters +at the beginning of the string. + +3.00 Sun Jan 26 04:08:00 EST 2003 <mark@mielke.cc> + +Updated documentation, simplified the Perl interface, and updated +the XS code to be faster, and to properly work with UTF-8 strings. +UNICODE characters outside the ASCII range (0x00 - 0x7F) are +considered to be non-alphabetic for the purposes of the soundex +algorithms. + +2.10 Sun Feb 15 15:29:38 EST 1998 <mark@mielke.cc> + +I've put in a version of my XS code and fully integrated it with the +existing 100% perl mechanism. The change should be virtually transparent +to the user. XS code is approx 7.5 times faster. + - Mark Mielke + +2.00 Thu Jan 1 16:22:11 1998 <mike@stok.co.uk> + +Incorporated Mark Mielke's rewritten version of the main soundex routine +and made the test.pl file simpler. + diff --git a/ext/Text/Soundex/Makefile.PL b/ext/Text/Soundex/Makefile.PL new file mode 100644 index 0000000000..ea757f175f --- /dev/null +++ b/ext/Text/Soundex/Makefile.PL @@ -0,0 +1,11 @@ +use ExtUtils::MakeMaker; +WriteMakefile( + NAME => "Text::Soundex", + VERSION_FROM => 'Soundex.pm', + 'dist' => { + COMPRESS => 'gzip -9f', + SUFFIX => 'gz', + DIST_DEFAULT => 'all tardist', + }, + MAN3PODS => {}, +); diff --git a/ext/Text/Soundex/README b/ext/Text/Soundex/README new file mode 100644 index 0000000000..7fcf22e945 --- /dev/null +++ b/ext/Text/Soundex/README @@ -0,0 +1,161 @@ +Text::Soundex Version 3.02 + +NOTE: Users of Text::Soundex Version 2.x should consult the 'History' + section at the end of this document before installing this module. + The interface has been simplified, and existing code that takes + advantages of Version 2.x features may need to be altered to function + properly. + +This is a perl 5 module implementing the Soundex algorithm described by +Knuth. The algorithm is used quite often for locating a person by name +where the actual spelling of the name is not known. + +This version directly supercedes the version of Text::Soundex that can be +found in the core from Perl 5.8.0 and down. (This version is a drop-in +replacement) + +The algorithm used by soundex() is NOT fully compatible with the +algorithm used to index names for US Censuses. Use the soundex_nara() +subroutine to return codes for this purpose. + +Basic Usage: + + Soundex is used to do a one way transformation of a name, converting + a character string given as input into a set of codes representing + the identifiable sounds those characters might make in the output. + + For example: + + use Text::Soundex; + + print soundex("Mark"), "\n"; # prints: M620 + print soundex("Marc"), "\n"; # prints: M620 + + print soundex("Hansen"), "\n"; # prints: H525 + print soundex("Hanson"), "\n"; # prints: H525 + print soundex("Henson"), "\n"; # prints: H525 + + In many situations, code such as the following: + + if ($name1 eq $name2) { + ... + } + + Can be substituted with: + + if (soundex($name1) eq soundex($name2)) { + ... + } + +Installation: + + Once the archive has been unpacked then the following steps are needed + to build, test and install the module (to be done in the directory which + contains the Makefile.PL) + + perl Makefile.PL + make + make test + + If the make test succeeds then the next step may need to be run as root + (on a Unix-like system) or with special privileges on other systems. + + make install + + If you do not want to use the XS code (for whatever reason) do the following + instead of the above: + + perl Makefile.PL --no-xs + make + make test + make install + + If any of the tests report 'not ok' and you are running perl 5.6.0 or later + then please contact Mark Mielke <mark@mielke.cc> + +History: + + Version 3.02: + 3.01 and 3.00 used the 'U8' type incorrectly causing some strict + compilers to complain or refuse to compile the XS code. Also, unicode + support did not work properly for Perl 5.6.x. Both of these problems + are now fixed. + + Version 3.01: + A bug with non-UTF 8 strings that contain non-ASCII alphabetic characters + was fixed. The soundex_unicode() and soundex_nara_unicode() wrapper + routines were included and the documentation refers the user to the + excellent Text::Unidecode module to perform soundex encodings using + unicode strings. The Perl versions of the routines have been further + optimized, and correct a border case involving non-alphabetic characters + at the beginning of the string. + + Version 3.00: + Support for UTF-8 strings (unicode strings) is now in place. Note + that this allows UTF-8 strings to be passed to the XS version of + the soundex() routine. The Soundex algorithm treats characters + outside the ascii range (0x00 - 0x7F) as if they were not + alphabetical. + + The interface has been simplified. In order to explicitly use the + non-XS implementation of soundex(): + + use Text::Soundex (); + $code = Text::Soundex::soundex_noxs($name); + + In order to use the NARA soundex algorithm: + + use Text::Soundex 'soundex_nara'; + $code = soundex_nara($name); + + Use of the ':NARA-Ruleset' import directive is now obsolete. To + emulate the old behaviour: + + use Text::Soundex (); + *soundex = \&Text::Soundex::soundex_nara; + $code = soundex($name); + + Version 2.20: + This version includes support for the algorithm used to index + the U.S. Federal Censuses. There is a slight descrepancy in the + definition for a soundex code which is not commonly known or + recognized involved similar sounding letters being seperated + by the characters H or W. This is defined as the NARA ruleset, + as this descrepency was discovered by them. (Calling it "the + US Census ruleset" was too unwieldy...) + + NARA can be found at: + http://www.nara.gov/genealogy/ + + The algorithm requested by NARA can be found at: + http://home.utah-inter.net/kinsearch/Soundex.html + + Ways to use it in your code: + + Transparently change existing code like this: + ============================================= + use Text::Soundex qw(:NARA-Ruleset); + + ... soundex(...) ... + + -- + + Make the change visibly distinct like this: + =========================================== + use Text::Soundex qw(soundex_nara); + + ... soundex_nara(...) ... + + Version 2.00: + This version is a full re-write of the 1.0 engine by Mark Mielke. + The goal was for speed... and this was achieved. There is an optional + XS module which can be used completely transparently by the user + which offers a further speed increase of a factor of more than 7.5X. + + Version 1.00: + This version can be found in the perl core distribution from at + least Perl 5.8.0 and down. It was written by Mike Stok. It can be + identified by the fact that it does not contain a $VERSION + in the beginning of the module, and as well it uses an RCS + tag with a version of 1.x. This version, before some perl5'ish + packaging was introduced, was actually written for perl4. diff --git a/ext/Text/Soundex/Soundex.pm b/ext/Text/Soundex/Soundex.pm new file mode 100644 index 0000000000..64a9e6507d --- /dev/null +++ b/ext/Text/Soundex/Soundex.pm @@ -0,0 +1,150 @@ +package Text::Soundex; +require 5.000; +require Exporter; + +@ISA = qw(Exporter); +@EXPORT = qw(&soundex $soundex_nocode); + +$VERSION = '1.01'; + +# $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $ +# +# Implementation of soundex algorithm as described by Knuth in volume +# 3 of The Art of Computer Programming, with ideas stolen from Ian +# Phillipps <ian@pipex.net>. +# +# Mike Stok <Mike.Stok@meiko.concord.ma.us>, 2 March 1994. +# +# Knuth's test cases are: +# +# Euler, Ellery -> E460 +# Gauss, Ghosh -> G200 +# Hilbert, Heilbronn -> H416 +# Knuth, Kant -> K530 +# Lloyd, Ladd -> L300 +# Lukasiewicz, Lissajous -> L222 +# +# $Log: soundex.pl,v $ +# Revision 1.2 1994/03/24 00:30:27 mike +# Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> +# in the way I handles leasing characters which were different but had +# the same soundex code. This showed up comparing it with Oracle's +# soundex output. +# +# Revision 1.1 1994/03/02 13:01:30 mike +# Initial revision +# +# +############################################################################## + +# $soundex_nocode is used to indicate a string doesn't have a soundex +# code, I like undef other people may want to set it to 'Z000'. + +$soundex_nocode = undef; + +sub soundex +{ + local (@s, $f, $fc, $_) = @_; + + push @s, '' unless @s; # handle no args as a single empty string + + foreach (@s) + { + $_ = uc $_; + tr/A-Z//cd; + + if ($_ eq '') + { + $_ = $soundex_nocode; + } + else + { + ($f) = /^(.)/; + tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/; + ($fc) = /^(.)/; + s/^$fc+//; + tr///cs; + tr/0//d; + $_ = $f . $_ . '000'; + s/^(.{4}).*/$1/; + } + } + + wantarray ? @s : shift @s; +} + +1; + +__END__ + +=head1 NAME + +Text::Soundex - Implementation of the Soundex Algorithm as Described by Knuth + +=head1 SYNOPSIS + + use Text::Soundex; + + $code = soundex $string; # get soundex code for a string + @codes = soundex @list; # get list of codes for list of strings + + # set value to be returned for strings without soundex code + + $soundex_nocode = 'Z000'; + +=head1 DESCRIPTION + +This module implements the soundex algorithm as described by Donald Knuth +in Volume 3 of B<The Art of Computer Programming>. The algorithm is +intended to hash words (in particular surnames) into a small space using a +simple model which approximates the sound of the word when spoken by an English +speaker. Each word is reduced to a four character string, the first +character being an upper case letter and the remaining three being digits. + +If there is no soundex code representation for a string then the value of +C<$soundex_nocode> is returned. This is initially set to C<undef>, but +many people seem to prefer an I<unlikely> value like C<Z000> +(how unlikely this is depends on the data set being dealt with.) Any value +can be assigned to C<$soundex_nocode>. + +In scalar context C<soundex> returns the soundex code of its first +argument, and in list context a list is returned in which each element is the +soundex code for the corresponding argument passed to C<soundex> e.g. + + @codes = soundex qw(Mike Stok); + +leaves C<@codes> containing C<('M200', 'S320')>. + +=head1 EXAMPLES + +Knuth's examples of various names and the soundex codes they map to +are listed below: + + Euler, Ellery -> E460 + Gauss, Ghosh -> G200 + Hilbert, Heilbronn -> H416 + Knuth, Kant -> K530 + Lloyd, Ladd -> L300 + Lukasiewicz, Lissajous -> L222 + +so: + + $code = soundex 'Knuth'; # $code contains 'K530' + @list = soundex qw(Lloyd Gauss); # @list contains 'L300', 'G200' + +=head1 LIMITATIONS + +As the soundex algorithm was originally used a B<long> time ago in the US +it considers only the English alphabet and pronunciation. + +As it is mapping a large space (arbitrary length strings) onto a small +space (single letter plus 3 digits) no inference can be made about the +similarity of two strings which end up with the same soundex code. For +example, both C<Hilbert> and C<Heilbronn> end up with a soundex code +of C<H416>. + +=head1 AUTHOR + +This code was implemented by Mike Stok (C<stok@cybercom.net>) from the +description given by Knuth. Ian Phillipps (C<ian@pipex.net>) and Rich Pinder +(C<rpinder@hsc.usc.edu>) supplied ideas and spotted mistakes. diff --git a/ext/Text/Soundex/Soundex.xs b/ext/Text/Soundex/Soundex.xs new file mode 100644 index 0000000000..9f5d809441 --- /dev/null +++ b/ext/Text/Soundex/Soundex.xs @@ -0,0 +1,157 @@ +/* -*- c -*- */ + +/* (c) Copyright 1998-2003 by Mark Mielke + * + * Freedom to use these sources for whatever you want, as long as credit + * is given where credit is due, is hereby granted. You may make modifications + * where you see fit but leave this copyright somewhere visible. As well try + * to initial any changes you make so that if i like the changes i can + * incorporate them into any later versions of mine. + * + * - Mark Mielke <mark@mielke.cc> + */ + +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" + +#define SOUNDEX_ACCURACY (4) /* The maximum code length... (should be>=2) */ + +#if !(PERL_REVISION >= 5 && PERL_VERSION >= 8) +# define utf8n_to_uvchr utf8_to_uv +#endif + +static char *soundex_table = + /*ABCDEFGHIJKLMNOPQRSTUVWXYZ*/ + "01230120022455012623010202"; + +static SV *sv_soundex (source) + SV *source; +{ + char *source_p; + char *source_end; + + { + STRLEN source_len; + source_p = SvPV(source, source_len); + source_end = &source_p[source_len]; + } + + while (source_p != source_end) + { + if ((*source_p & ~((UV) 0x7F)) == 0 && isalpha(*source_p)) + { + SV *code = newSV(SOUNDEX_ACCURACY); + char *code_p = SvPVX(code); + char *code_end = &code_p[SOUNDEX_ACCURACY]; + char code_last; + + SvCUR_set(code, SOUNDEX_ACCURACY); + SvPOK_only(code); + + code_last = soundex_table[(*code_p++ = toupper(*source_p++)) - 'A']; + + while (source_p != source_end && code_p != code_end) + { + char c = *source_p++; + + if ((c & ~((UV) 0x7F)) == 0 && isalpha(c)) + { + *code_p = soundex_table[toupper(c) - 'A']; + if (*code_p != code_last && (code_last = *code_p) != '0') + code_p++; + } + } + + while (code_p != code_end) + *code_p++ = '0'; + + *code_end = '\0'; + + return code; + } + + source_p++; + } + + return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE)); +} + +static SV *sv_soundex_utf8 (source) + SV *source; +{ + U8 *source_p; + U8 *source_end; + + { + STRLEN source_len; + source_p = (U8 *) SvPV(source, source_len); + source_end = &source_p[source_len]; + } + + while (source_p < source_end) + { + STRLEN offset; + UV c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); + source_p = (offset >= 1) ? &source_p[offset] : source_end; + + if ((c & ~((UV) 0x7F)) == 0 && isalpha(c)) + { + SV *code = newSV(SOUNDEX_ACCURACY); + char *code_p = SvPVX(code); + char *code_end = &code_p[SOUNDEX_ACCURACY]; + char code_last; + + SvCUR_set(code, SOUNDEX_ACCURACY); + SvPOK_only(code); + + code_last = soundex_table[(*code_p++ = toupper(c)) - 'A']; + + while (source_p != source_end && code_p != code_end) + { + c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); + source_p = (offset >= 1) ? &source_p[offset] : source_end; + + if ((c & ~((UV) 0x7F)) == 0 && isalpha(c)) + { + *code_p = soundex_table[toupper(c) - 'A']; + if (*code_p != code_last && (code_last = *code_p) != '0') + code_p++; + } + } + + while (code_p != code_end) + *code_p++ = '0'; + + *code_end = '\0'; + + return code; + } + + source_p++; + } + + return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE)); +} + +MODULE = Text::Soundex PACKAGE = Text::Soundex + +PROTOTYPES: DISABLE + +void +soundex_xs (...) +PPCODE: +{ + int i; + for (i = 0; i < items; i++) + { + SV *sv = ST(i); + + if (DO_UTF8(sv)) + sv = sv_soundex_utf8(sv); + else + sv = sv_soundex(sv); + + PUSHs(sv_2mortal(sv)); + } +} diff --git a/ext/Text/Soundex/t/Soundex.t b/ext/Text/Soundex/t/Soundex.t new file mode 100755 index 0000000000..d35f264c7a --- /dev/null +++ b/ext/Text/Soundex/t/Soundex.t @@ -0,0 +1,143 @@ +#!./perl +# +# $Id: soundex.t,v 1.2 1994/03/24 00:30:27 mike Exp $ +# +# test module for soundex.pl +# +# $Log: soundex.t,v $ +# Revision 1.2 1994/03/24 00:30:27 mike +# Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> +# in the way I handles leasing characters which were different but had +# the same soundex code. This showed up comparing it with Oracle's +# soundex output. +# +# Revision 1.1 1994/03/02 13:03:02 mike +# Initial revision +# +# + +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; +} + +use Text::Soundex; + +$test = 0; +print "1..13\n"; + +while (<DATA>) +{ + chop; + next if /^\s*;?#/; + next if /^\s*$/; + + ++$test; + $bad = 0; + + if (/^eval\s+/) + { + ($try = $_) =~ s/^eval\s+//; + + eval ($try); + if ($@) + { + $bad++; + print "not ok $test\n"; + print "# eval '$try' returned $@"; + } + } + elsif (/^\(/) + { + ($in, $out) = split (':'); + + $try = "\@expect = $out; \@got = &soundex $in;"; + eval ($try); + + if (@expect != @got) + { + $bad++; + print "not ok $test\n"; + print "# expected ", scalar @expect, " results, got ", scalar @got, "\n"; + print "# expected (", join (', ', @expect), + ") got (", join (', ', @got), ")\n"; + } + else + { + while (@got) + { + $expect = shift @expect; + $got = shift @got; + + if ($expect ne $got) + { + $bad++; + print "not ok $test\n"; + print "# expected $expect, got $got\n"; + } + } + } + } + else + { + ($in, $out) = split (':'); + + $try = "\$expect = $out; \$got = &soundex ($in);"; + eval ($try); + + if ($expect ne $got) + { + $bad++; + print "not ok $test\n"; + print "# expected $expect, got $got\n"; + } + } + + print "ok $test\n" unless $bad; +} + +__END__ +# +# 1..6 +# +# Knuth's test cases, scalar in, scalar out +# +'Euler':'E460' +'Gauss':'G200' +'Hilbert':'H416' +'Knuth':'K530' +'Lloyd':'L300' +'Lukasiewicz':'L222' +# +# 7..8 +# +# check default bad code +# +'2 + 2 = 4':undef +undef:undef +# +# 9 +# +# check array in, array out +# +('Ellery', 'Ghosh', 'Heilbronn', 'Kant', 'Ladd', 'Lissajous'):('E460', 'G200', 'H416', 'K530', 'L300', 'L222') +# +# 10 +# +# check array with explicit undef +# +('Mike', undef, 'Stok'):('M200', undef, 'S320') +# +# 11..12 +# +# check setting $Text::Soundex::noCode +# +eval $soundex_nocode = 'Z000'; +('Mike', undef, 'Stok'):('M200', 'Z000', 'S320') +# +# 13 +# +# a subtle difference between me & oracle, spotted by Rich Pinder +# <rpinder@hsc.usc.edu> +# +CZARKOWSKA:C622 |