diff options
author | Nicholas Clark <nick@ccl4.org> | 2009-10-02 16:39:27 +0100 |
---|---|---|
committer | Nicholas Clark <nick@ccl4.org> | 2009-10-02 16:39:27 +0100 |
commit | 204606f4ac32e12078eeffffcd758292ce910d1b (patch) | |
tree | 30674ad2259f7bc6020d91e080f25960cf988115 /cpan/Text-ParseWords | |
parent | e916ef552ee31bfadefeb6b7752fce6b84326b26 (diff) | |
download | perl-204606f4ac32e12078eeffffcd758292ce910d1b.tar.gz |
Move Text::ParseWords from ext/ to cpan/
Diffstat (limited to 'cpan/Text-ParseWords')
-rw-r--r-- | cpan/Text-ParseWords/lib/Text/ParseWords.pm | 294 | ||||
-rw-r--r-- | cpan/Text-ParseWords/t/ParseWords.t | 122 | ||||
-rw-r--r-- | cpan/Text-ParseWords/t/taint.t | 13 |
3 files changed, 429 insertions, 0 deletions
diff --git a/cpan/Text-ParseWords/lib/Text/ParseWords.pm b/cpan/Text-ParseWords/lib/Text/ParseWords.pm new file mode 100644 index 0000000000..f1b5937904 --- /dev/null +++ b/cpan/Text-ParseWords/lib/Text/ParseWords.pm @@ -0,0 +1,294 @@ +package Text::ParseWords; + +use strict; +require 5.006; +our $VERSION = "3.27"; + + +use Exporter; +our @ISA = qw(Exporter); +our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line); +our @EXPORT_OK = qw(old_shellwords); +our $PERL_SINGLE_QUOTE; + + +sub shellwords { + my (@lines) = @_; + my @allwords; + + foreach my $line (@lines) { + $line =~ s/^\s+//; + my @words = parse_line('\s+', 0, $line); + pop @words if (@words and !defined $words[-1]); + return() unless (@words || !length($line)); + push(@allwords, @words); + } + return(@allwords); +} + + + +sub quotewords { + my($delim, $keep, @lines) = @_; + my($line, @words, @allwords); + + foreach $line (@lines) { + @words = parse_line($delim, $keep, $line); + return() unless (@words || !length($line)); + push(@allwords, @words); + } + return(@allwords); +} + + + +sub nested_quotewords { + my($delim, $keep, @lines) = @_; + my($i, @allwords); + + for ($i = 0; $i < @lines; $i++) { + @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]); + return() unless (@{$allwords[$i]} || !length($lines[$i])); + } + return(@allwords); +} + + + +sub parse_line { + my($delimiter, $keep, $line) = @_; + my($word, @pieces); + + no warnings 'uninitialized'; # we will be testing undef strings + + while (length($line)) { + # This pattern is optimised to be stack conservative on older perls. + # Do not refactor without being careful and testing it on very long strings. + # See Perl bug #42980 for an example of a stack busting input. + $line =~ s/^ + (?: + # double quoted string + (") # $quote + ((?>[^\\"]*(?:\\.[^\\"]*)*))" # $quoted + | # --OR-- + # singe quoted string + (') # $quote + ((?>[^\\']*(?:\\.[^\\']*)*))' # $quoted + | # --OR-- + # unquoted string + ( # $unquoted + (?:\\.|[^\\"'])*? + ) + # followed by + ( # $delim + \Z(?!\n) # EOL + | # --OR-- + (?-x:$delimiter) # delimiter + | # --OR-- + (?!^)(?=["']) # a quote + ) + )//xs or return; # extended layout + my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6); + + + return() unless( defined($quote) || length($unquoted) || length($delim)); + + if ($keep) { + $quoted = "$quote$quoted$quote"; + } + else { + $unquoted =~ s/\\(.)/$1/sg; + if (defined $quote) { + $quoted =~ s/\\(.)/$1/sg if ($quote eq '"'); + $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'"); + } + } + $word .= substr($line, 0, 0); # leave results tainted + $word .= defined $quote ? $quoted : $unquoted; + + if (length($delim)) { + push(@pieces, $word); + push(@pieces, $delim) if ($keep eq 'delimiters'); + undef $word; + } + if (!length($line)) { + push(@pieces, $word); + } + } + return(@pieces); +} + + + +sub old_shellwords { + + # Usage: + # use ParseWords; + # @words = old_shellwords($line); + # or + # @words = old_shellwords(@lines); + # or + # @words = old_shellwords(); # defaults to $_ (and clobbers it) + + no warnings 'uninitialized'; # we will be testing undef strings + local *_ = \join('', @_) if @_; + my (@words, $snippet); + + s/\A\s+//; + while ($_ ne '') { + my $field = substr($_, 0, 0); # leave results tainted + for (;;) { + if (s/\A"(([^"\\]|\\.)*)"//s) { + ($snippet = $1) =~ s#\\(.)#$1#sg; + } + elsif (/\A"/) { + require Carp; + Carp::carp("Unmatched double quote: $_"); + return(); + } + elsif (s/\A'(([^'\\]|\\.)*)'//s) { + ($snippet = $1) =~ s#\\(.)#$1#sg; + } + elsif (/\A'/) { + require Carp; + Carp::carp("Unmatched single quote: $_"); + return(); + } + elsif (s/\A\\(.?)//s) { + $snippet = $1; + } + elsif (s/\A([^\s\\'"]+)//) { + $snippet = $1; + } + else { + s/\A\s+//; + last; + } + $field .= $snippet; + } + push(@words, $field); + } + return @words; +} + +1; + +__END__ + +=head1 NAME + +Text::ParseWords - parse text into an array of tokens or array of arrays + +=head1 SYNOPSIS + + use Text::ParseWords; + @lists = nested_quotewords($delim, $keep, @lines); + @words = quotewords($delim, $keep, @lines); + @words = shellwords(@lines); + @words = parse_line($delim, $keep, $line); + @words = old_shellwords(@lines); # DEPRECATED! + +=head1 DESCRIPTION + +The &nested_quotewords() and "ewords() functions accept a delimiter +(which can be a regular expression) +and a list of lines and then breaks those lines up into a list of +words ignoring delimiters that appear inside quotes. "ewords() +returns all of the tokens in a single long list, while &nested_quotewords() +returns a list of token lists corresponding to the elements of @lines. +&parse_line() does tokenizing on a single string. The &*quotewords() +functions simply call &parse_line(), so if you're only splitting +one line you can call &parse_line() directly and save a function +call. + +The $keep argument is a boolean flag. If true, then the tokens are +split on the specified delimiter, but all other characters (quotes, +backslashes, etc.) are kept in the tokens. If $keep is false then the +&*quotewords() functions remove all quotes and backslashes that are +not themselves backslash-escaped or inside of single quotes (i.e., +"ewords() tries to interpret these characters just like the Bourne +shell). NB: these semantics are significantly different from the +original version of this module shipped with Perl 5.000 through 5.004. +As an additional feature, $keep may be the keyword "delimiters" which +causes the functions to preserve the delimiters in each string as +tokens in the token lists, in addition to preserving quote and +backslash characters. + +&shellwords() is written as a special case of "ewords(), and it +does token parsing with whitespace as a delimiter-- similar to most +Unix shells. + +=head1 EXAMPLES + +The sample program: + + use Text::ParseWords; + @words = quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you}); + $i = 0; + foreach (@words) { + print "$i: <$_>\n"; + $i++; + } + +produces: + + 0: <this> + 1: <is> + 2: <a test> + 3: <of quotewords> + 4: <"for> + 5: <you> + +demonstrating: + +=over 4 + +=item 0 + +a simple word + +=item 1 + +multiple spaces are skipped because of our $delim + +=item 2 + +use of quotes to include a space in a word + +=item 3 + +use of a backslash to include a space in a word + +=item 4 + +use of a backslash to remove the special meaning of a double-quote + +=item 5 + +another simple word (note the lack of effect of the +backslashed double-quote) + +=back + +Replacing C<quotewords('\s+', 0, q{this is...})> +with C<shellwords(q{this is...})> +is a simpler way to accomplish the same thing. + +=head1 AUTHORS + +Maintainer: Alexandr Ciornii <alexchornyATgmail.com>. + +Previous maintainer: Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original +author unknown). Much of the code for &parse_line() (including the +primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>. + +Examples section another documentation provided by John Heidemann +<johnh@ISI.EDU> + +Bug reports, patches, and nagging provided by lots of folks-- thanks +everybody! Special thanks to Michael Schwern <schwern@envirolink.org> +for assuring me that a &nested_quotewords() would be useful, and to +Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about +error-checking (sort of-- you had to be there). + +=cut diff --git a/cpan/Text-ParseWords/t/ParseWords.t b/cpan/Text-ParseWords/t/ParseWords.t new file mode 100644 index 0000000000..905ea00864 --- /dev/null +++ b/cpan/Text-ParseWords/t/ParseWords.t @@ -0,0 +1,122 @@ +#!./perl
+
+use warnings;
+use Text::ParseWords;
+use Test::More tests => 27;
+
+@words = shellwords(qq(foo "bar quiz" zoo));
+is($words[0], 'foo');
+is($words[1], 'bar quiz');
+is($words[2], 'zoo');
+
+{
+ # Gonna get some undefined things back
+ no warnings 'uninitialized' ;
+
+ # Test quotewords() with other parameters and null last field
+ @words = quotewords(':+', 1, 'foo:::"bar:foo":zoo zoo:');
+ is(join(";", @words), qq(foo;"bar:foo";zoo zoo;));
+}
+
+# Test $keep eq 'delimiters' and last field zero
+@words = quotewords('\s+', 'delimiters', '4 3 2 1 0');
+is(join(";", @words), qq(4; ;3; ;2; ;1; ;0));
+
+# Big ol' nasty test (thanks, Joerk!)
+$string = 'aaaa"bbbbb" cc\\ cc \\\\\\"dddd" eee\\\\\\"ffff" "gg"';
+
+# First with $keep == 1
+$result = join('|', parse_line('\s+', 1, $string));
+is($result, 'aaaa"bbbbb"|cc\\ cc|\\\\\\"dddd" eee\\\\\\"ffff"|"gg"');
+
+# Now, $keep == 0
+$result = join('|', parse_line('\s+', 0, $string));
+is($result, 'aaaabbbbb|cc cc|\\"dddd eee\\"ffff|gg');
+
+# Now test single quote behavior
+$string = 'aaaa"bbbbb" cc\\ cc \\\\\\"dddd\' eee\\\\\\"ffff\' gg';
+$result = join('|', parse_line('\s+', 0, $string));
+is($result, 'aaaabbbbb|cc cc|\\"dddd eee\\\\\\"ffff|gg');
+
+# Make sure @nested_quotewords does the right thing
+@lists = nested_quotewords('\s+', 0, 'a b c', '1 2 3', 'x y z');
+is (@lists, 3);
+is (@{$lists[0]}, 3);
+is (@{$lists[1]}, 3);
+is (@{$lists[2]}, 3);
+
+# Now test error return
+$string = 'foo bar baz"bach blech boop';
+
+@words = shellwords($string);
+is(@words, 0);
+
+@words = parse_line('s+', 0, $string);
+is(@words, 0);
+
+@words = quotewords('s+', 0, $string);
+is(@words, 0);
+
+{
+ # Gonna get some more undefined things back
+ no warnings 'uninitialized' ;
+
+ @words = nested_quotewords('s+', 0, $string);
+ is(@words, 0);
+
+ # Now test empty fields
+ $result = join('|', parse_line(':', 0, 'foo::0:"":::'));
+ is($result, 'foo||0||||');
+
+ # Test for 0 in quotes without $keep
+ $result = join('|', parse_line(':', 0, ':"0":'));
+ is($result, '|0|');
+
+ # Test for \001 in quoted string
+ $result = join('|', parse_line(':', 0, ':"' . "\001" . '":'));
+ is($result, "|\1|");
+
+}
+
+# Now test perlish single quote behavior
+$Text::ParseWords::PERL_SINGLE_QUOTE = 1;
+$string = 'aaaa"bbbbb" cc\ cc \\\\\"dddd\' eee\\\\\"\\\'ffff\' gg';
+$result = join('|', parse_line('\s+', 0, $string));
+is($result, 'aaaabbbbb|cc cc|\"dddd eee\\\\"\'ffff|gg');
+
+# test whitespace in the delimiters
+@words = quotewords(' ', 1, '4 3 2 1 0');
+is(join(";", @words), qq(4;3;2;1;0));
+
+# [perl #30442] Text::ParseWords does not handle backslashed newline inside quoted text
+$string = qq{"field1" "field2\\\nstill field2" "field3"};
+
+$result = join('|', parse_line("\t", 1, $string));
+is($result, qq{"field1"|"field2\\\nstill field2"|"field3"});
+
+$result = join('|', parse_line("\t", 0, $string));
+is($result, "field1|field2\nstill field2|field3");
+
+SKIP: { # unicode
+ skip "No unicode",1 if $]<5.008;
+ $string = qq{"field1"\x{1234}"field2\\\x{1234}still field2"\x{1234}"field3"};
+ $result = join('|', parse_line("\x{1234}", 0, $string));
+ is($result, "field1|field2\x{1234}still field2|field3",'Unicode');
+}
+
+# missing quote after matching regex used to hang after change #22997
+"1234" =~ /(1)(2)(3)(4)/;
+$string = qq{"missing quote};
+$result = join('|', shellwords($string));
+is($result, "");
+
+# make sure shellwords strips out leading whitespace and trailng undefs
+# from parse_line, so it's behavior is more like /bin/sh
+$result = join('|', shellwords(" aa \\ \\ bb ", " \\ ", "cc dd ee\\ "));
+is($result, "aa| | bb| |cc|dd|ee ");
+
+$SIG{ALRM} = sub {die "Timeout!"};
+alarm(3);
+@words = Text::ParseWords::old_shellwords("foo\\");
+is(@words, 1);
+alarm(0);
diff --git a/cpan/Text-ParseWords/t/taint.t b/cpan/Text-ParseWords/t/taint.t new file mode 100644 index 0000000000..959ba79700 --- /dev/null +++ b/cpan/Text-ParseWords/t/taint.t @@ -0,0 +1,13 @@ +#!./perl -Tw +# [perl #33173] shellwords.pl and tainting + +use Text::ParseWords qw(shellwords old_shellwords); +use Scalar::Util qw(tainted); + +print "1..2\n"; + +print "not " if grep { not tainted($_) } shellwords("$0$^X"); +print "ok 1\n"; + +print "not " if grep { not tainted($_) } old_shellwords("$0$^X"); +print "ok 2\n"; |