diff options
Diffstat (limited to 'lib/Text')
-rw-r--r-- | lib/Text/Abbrev.pm | 37 | ||||
-rw-r--r-- | lib/Text/ParseWords.pm | 170 | ||||
-rw-r--r-- | lib/Text/Soundex.pm | 82 | ||||
-rw-r--r-- | lib/Text/Tabs.pm | 47 |
4 files changed, 336 insertions, 0 deletions
diff --git a/lib/Text/Abbrev.pm b/lib/Text/Abbrev.pm new file mode 100644 index 0000000000..77370d37c3 --- /dev/null +++ b/lib/Text/Abbrev.pm @@ -0,0 +1,37 @@ +package Text::Abbrev; +require 5.000; +require Exporter; + +@ISA = qw(Exporter); +@EXPORT = qw(abbrev); + +# Usage: +# &abbrev(*foo,LIST); +# ... +# $long = $foo{$short}; + +sub abbrev { + local(*domain) = shift; + @cmp = @_; + %domain = (); + foreach $name (@_) { + @extra = split(//,$name); + $abbrev = shift(@extra); + $len = 1; + foreach $cmp (@cmp) { + next if $cmp eq $name; + while (substr($cmp,0,$len) eq $abbrev) { + $abbrev .= shift(@extra); + ++$len; + } + } + $domain{$abbrev} = $name; + while (@extra) { + $abbrev .= shift(@extra); + $domain{$abbrev} = $name; + } + } +} + +1; + diff --git a/lib/Text/ParseWords.pm b/lib/Text/ParseWords.pm new file mode 100644 index 0000000000..89278501d1 --- /dev/null +++ b/lib/Text/ParseWords.pm @@ -0,0 +1,170 @@ +package Text::ParseWords; + +require 5.000; +require Exporter; +require AutoLoader; +use Carp; + +@ISA = qw(Exporter AutoLoader); +@EXPORT = qw(shellwords quotewords); +@EXPORT_OK = qw(old_shellwords); + +# This code needs updating to use new Perl 5 features (regexp etc). + +# ParseWords.pm +# +# Usage: +# use ParseWords; +# @words = "ewords($delim, $keep, @lines); +# @words = &shellwords(@lines); +# @words = &old_shellwords(@lines); + +# Hal Pomeranz (pomeranz@netcom.com), 23 March 1994 +# Permission to use and distribute under the same terms as Perl. +# No warranty expressed or implied. + +# Basically an update and generalization of the old shellwords.pl. +# Much code shamelessly stolen from the old version (author unknown). +# +# "ewords() accepts a delimiter (which can be a regular expression) +# and a list of lines and then breaks those lines up into a list of +# words ignoring delimiters that appear inside quotes. +# +# The $keep argument is a boolean flag. If true, the quotes are kept +# with each word, otherwise quotes are stripped in the splitting process. +# $keep also defines whether unprotected backslashes are retained. +# + +1; +__END__ + + +sub shellwords { + + # A &shellwords() replacement is included to demonstrate the new package. + # This version differs from the original in that it will _NOT_ default + # to using $_ if no arguments are given. I personally find the old behavior + # to be a mis-feature. + + local(@lines) = @_; + $lines[$#lines] =~ s/\s+$//; + "ewords('\s+', 0, @lines); +} + + + +sub quotewords { + +# "ewords() works by simply jamming all of @lines into a single +# string in $_ and then pulling off words a bit at a time until $_ +# is exhausted. +# +# The inner "for" loop builds up each word (or $field) one $snippet +# at a time. A $snippet is a quoted string, a backslashed character, +# or an unquoted string. We fall out of the "for" loop when we reach +# the end of $_ or when we hit a delimiter. Falling out of the "for" +# loop, we push the $field we've been building up onto the list of +# @words we'll be returning, and then loop back and pull another word +# off of $_. +# +# The first two cases inside the "for" loop deal with quoted strings. +# The first case matches a double quoted string, removes it from $_, +# and assigns the double quoted string to $snippet in the body of the +# conditional. The second case handles single quoted strings. In +# the third case we've found a quote at the current beginning of $_, +# but it didn't match the quoted string regexps in the first two cases, +# so it must be an unbalanced quote and we croak with an error (which can +# be caught by eval()). +# +# The next case handles backslashed characters, and the next case is the +# exit case on reaching the end of the string or finding a delimiter. +# +# Otherwise, we've found an unquoted thing and we pull of characters one +# at a time until we reach something that could start another $snippet-- +# a quote of some sort, a backslash, or the delimiter. This one character +# at a time behavior was necessary if the delimiter was going to be a +# regexp (love to hear it if you can figure out a better way). + + local($delim, $keep, @lines) = @_; + local(@words,$snippet,$field,$_); + + $_ = join('', @lines); + while ($_) { + $field = ''; + for (;;) { + $snippet = ''; + if (s/^"(([^"\\]|\\[\\"])*)"//) { + $snippet = $1; + $snippet = "\"$snippet\"" if ($keep); + } + elsif (s/^'(([^'\\]|\\[\\'])*)'//) { + $snippet = $1; + $snippet = "'$snippet'" if ($keep); + } + elsif (/^["']/) { + croak "Unmatched quote"; + } + elsif (s/^\\(.)//) { + $snippet = $1; + $snippet = "\\$snippet" if ($keep); + } + elsif (!$_ || s/^$delim//) { + last; + } + else { + while ($_ && !(/^$delim/ || /^['"\\]/)) { + $snippet .= substr($_, 0, 1); + substr($_, 0, 1) = ''; + } + } + $field .= $snippet; + } + push(@words, $field); + } + @words; +} + + +sub old_shellwords { + + # Usage: + # use ParseWords; + # @words = old_shellwords($line); + # or + # @words = old_shellwords(@lines); + + local($_) = join('', @_); + my(@words,$snippet,$field); + + s/^\s+//; + while ($_ ne '') { + $field = ''; + for (;;) { + if (s/^"(([^"\\]|\\.)*)"//) { + ($snippet = $1) =~ s#\\(.)#$1#g; + } + elsif (/^"/) { + croak "Unmatched double quote: $_"; + } + elsif (s/^'(([^'\\]|\\.)*)'//) { + ($snippet = $1) =~ s#\\(.)#$1#g; + } + elsif (/^'/) { + croak "Unmatched single quote: $_"; + } + elsif (s/^\\(.)//) { + $snippet = $1; + } + elsif (s/^([^\s\\'"]+)//) { + $snippet = $1; + } + else { + s/^\s+//; + last; + } + $field .= $snippet; + } + push(@words, $field); + } + @words; +} diff --git a/lib/Text/Soundex.pm b/lib/Text/Soundex.pm new file mode 100644 index 0000000000..655152347c --- /dev/null +++ b/lib/Text/Soundex.pm @@ -0,0 +1,82 @@ +package Text::Soundex; +require 5.000; +require Exporter; + +@ISA = qw(Exporter); +@EXPORT = qw(&soundex $soundex_nocode); + +# $Id: soundex.pl,v 1.2 1994/03/24 00:30:27 mike Exp $ +# +# Implementation of soundex algorithm as described by Knuth in volume +# 3 of The Art of Computer Programming, with ideas stolen from Ian +# Phillips <ian@pipex.net>. +# +# Mike Stok <Mike.Stok@meiko.concord.ma.us>, 2 March 1994. +# +# Knuth's test cases are: +# +# Euler, Ellery -> E460 +# Gauss, Ghosh -> G200 +# Hilbert, Heilbronn -> H416 +# Knuth, Kant -> K530 +# Lloyd, Ladd -> L300 +# Lukasiewicz, Lissajous -> L222 +# +# $Log: soundex.pl,v $ +# Revision 1.2 1994/03/24 00:30:27 mike +# Subtle bug (any excuse :-) spotted by Rich Pinder <rpinder@hsc.usc.edu> +# in the way I handles leasing characters which were different but had +# the same soundex code. This showed up comparing it with Oracle's +# soundex output. +# +# Revision 1.1 1994/03/02 13:01:30 mike +# Initial revision +# +# +############################################################################## + +# $soundex_nocode is used to indicate a string doesn't have a soundex +# code, I like undef other people may want to set it to 'Z000'. + +$soundex_nocode = undef; + +# soundex +# +# usage: +# +# @codes = &soundex (@wordList); +# $code = &soundex ($word); +# +# This strenuously avoids 0 + +sub soundex +{ + local (@s, $f, $fc, $_) = @_; + + foreach (@s) + { + tr/a-z/A-Z/; + tr/A-Z//cd; + + if ($_ eq '') + { + $_ = $soundex_nocode; + } + else + { + ($f) = /^(.)/; + tr/AEHIOUWYBFPVCGJKQSXZDTLMNR/00000000111122222222334556/; + ($fc) = /^(.)/; + s/^$fc+//; + tr///cs; + tr/0//d; + $_ = $f . $_ . '000'; + s/^(.{4}).*/$1/; + } + } + + wantarray ? @s : shift @s; +} + +1; + diff --git a/lib/Text/Tabs.pm b/lib/Text/Tabs.pm new file mode 100644 index 0000000000..8ca833f8e8 --- /dev/null +++ b/lib/Text/Tabs.pm @@ -0,0 +1,47 @@ +# +# expand and unexpand tabs as per the unix expand and +# unexpand programs. +# +# expand and unexpand operate on arrays of lines. Do not +# feed strings that contain newlines to them. +# +# David Muir Sharnoff <muir@idiom.com> +# + +package Tabs; + +require Exporter; + +@ISA = (Exporter); +@EXPORT = qw(expand unexpand $tabstop); + +$tabstop = 8; + +sub expand +{ + my @l = @_; + for $_ (@l) { + 1 while s/^([^\t]*)(\t+)/ + $1 . (" " x + ($tabstop * length($2) + - (length($1) % $tabstop))) + /e; + } + return @l; +} + +sub unexpand +{ + my @l = &expand(@_); + my @e; + for $x (@l) { + @e = split(/(.{$tabstop})/,$x); + for $_ (@e) { + s/ +$/\t/; + } + $x = join('',@e); + } + return @l; +} + +1; |