summaryrefslogtreecommitdiff
path: root/cpan/Text-ParseWords
diff options
context:
space:
mode:
authorNicholas Clark <nick@ccl4.org>2009-10-02 16:39:27 +0100
committerNicholas Clark <nick@ccl4.org>2009-10-02 16:39:27 +0100
commit204606f4ac32e12078eeffffcd758292ce910d1b (patch)
tree30674ad2259f7bc6020d91e080f25960cf988115 /cpan/Text-ParseWords
parente916ef552ee31bfadefeb6b7752fce6b84326b26 (diff)
downloadperl-204606f4ac32e12078eeffffcd758292ce910d1b.tar.gz
Move Text::ParseWords from ext/ to cpan/
Diffstat (limited to 'cpan/Text-ParseWords')
-rw-r--r--cpan/Text-ParseWords/lib/Text/ParseWords.pm294
-rw-r--r--cpan/Text-ParseWords/t/ParseWords.t122
-rw-r--r--cpan/Text-ParseWords/t/taint.t13
3 files changed, 429 insertions, 0 deletions
diff --git a/cpan/Text-ParseWords/lib/Text/ParseWords.pm b/cpan/Text-ParseWords/lib/Text/ParseWords.pm
new file mode 100644
index 0000000000..f1b5937904
--- /dev/null
+++ b/cpan/Text-ParseWords/lib/Text/ParseWords.pm
@@ -0,0 +1,294 @@
+package Text::ParseWords;
+
+use strict;
+require 5.006;
+our $VERSION = "3.27";
+
+
+use Exporter;
+our @ISA = qw(Exporter);
+our @EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
+our @EXPORT_OK = qw(old_shellwords);
+our $PERL_SINGLE_QUOTE;
+
+
+sub shellwords {
+ my (@lines) = @_;
+ my @allwords;
+
+ foreach my $line (@lines) {
+ $line =~ s/^\s+//;
+ my @words = parse_line('\s+', 0, $line);
+ pop @words if (@words and !defined $words[-1]);
+ return() unless (@words || !length($line));
+ push(@allwords, @words);
+ }
+ return(@allwords);
+}
+
+
+
+sub quotewords {
+ my($delim, $keep, @lines) = @_;
+ my($line, @words, @allwords);
+
+ foreach $line (@lines) {
+ @words = parse_line($delim, $keep, $line);
+ return() unless (@words || !length($line));
+ push(@allwords, @words);
+ }
+ return(@allwords);
+}
+
+
+
+sub nested_quotewords {
+ my($delim, $keep, @lines) = @_;
+ my($i, @allwords);
+
+ for ($i = 0; $i < @lines; $i++) {
+ @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
+ return() unless (@{$allwords[$i]} || !length($lines[$i]));
+ }
+ return(@allwords);
+}
+
+
+
+sub parse_line {
+ my($delimiter, $keep, $line) = @_;
+ my($word, @pieces);
+
+ no warnings 'uninitialized'; # we will be testing undef strings
+
+ while (length($line)) {
+ # This pattern is optimised to be stack conservative on older perls.
+ # Do not refactor without being careful and testing it on very long strings.
+ # See Perl bug #42980 for an example of a stack busting input.
+ $line =~ s/^
+ (?:
+ # double quoted string
+ (") # $quote
+ ((?>[^\\"]*(?:\\.[^\\"]*)*))" # $quoted
+ | # --OR--
+ # singe quoted string
+ (') # $quote
+ ((?>[^\\']*(?:\\.[^\\']*)*))' # $quoted
+ | # --OR--
+ # unquoted string
+ ( # $unquoted
+ (?:\\.|[^\\"'])*?
+ )
+ # followed by
+ ( # $delim
+ \Z(?!\n) # EOL
+ | # --OR--
+ (?-x:$delimiter) # delimiter
+ | # --OR--
+ (?!^)(?=["']) # a quote
+ )
+ )//xs or return; # extended layout
+ my ($quote, $quoted, $unquoted, $delim) = (($1 ? ($1,$2) : ($3,$4)), $5, $6);
+
+
+ return() unless( defined($quote) || length($unquoted) || length($delim));
+
+ if ($keep) {
+ $quoted = "$quote$quoted$quote";
+ }
+ else {
+ $unquoted =~ s/\\(.)/$1/sg;
+ if (defined $quote) {
+ $quoted =~ s/\\(.)/$1/sg if ($quote eq '"');
+ $quoted =~ s/\\([\\'])/$1/g if ( $PERL_SINGLE_QUOTE && $quote eq "'");
+ }
+ }
+ $word .= substr($line, 0, 0); # leave results tainted
+ $word .= defined $quote ? $quoted : $unquoted;
+
+ if (length($delim)) {
+ push(@pieces, $word);
+ push(@pieces, $delim) if ($keep eq 'delimiters');
+ undef $word;
+ }
+ if (!length($line)) {
+ push(@pieces, $word);
+ }
+ }
+ return(@pieces);
+}
+
+
+
+sub old_shellwords {
+
+ # Usage:
+ # use ParseWords;
+ # @words = old_shellwords($line);
+ # or
+ # @words = old_shellwords(@lines);
+ # or
+ # @words = old_shellwords(); # defaults to $_ (and clobbers it)
+
+ no warnings 'uninitialized'; # we will be testing undef strings
+ local *_ = \join('', @_) if @_;
+ my (@words, $snippet);
+
+ s/\A\s+//;
+ while ($_ ne '') {
+ my $field = substr($_, 0, 0); # leave results tainted
+ for (;;) {
+ if (s/\A"(([^"\\]|\\.)*)"//s) {
+ ($snippet = $1) =~ s#\\(.)#$1#sg;
+ }
+ elsif (/\A"/) {
+ require Carp;
+ Carp::carp("Unmatched double quote: $_");
+ return();
+ }
+ elsif (s/\A'(([^'\\]|\\.)*)'//s) {
+ ($snippet = $1) =~ s#\\(.)#$1#sg;
+ }
+ elsif (/\A'/) {
+ require Carp;
+ Carp::carp("Unmatched single quote: $_");
+ return();
+ }
+ elsif (s/\A\\(.?)//s) {
+ $snippet = $1;
+ }
+ elsif (s/\A([^\s\\'"]+)//) {
+ $snippet = $1;
+ }
+ else {
+ s/\A\s+//;
+ last;
+ }
+ $field .= $snippet;
+ }
+ push(@words, $field);
+ }
+ return @words;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Text::ParseWords - parse text into an array of tokens or array of arrays
+
+=head1 SYNOPSIS
+
+ use Text::ParseWords;
+ @lists = nested_quotewords($delim, $keep, @lines);
+ @words = quotewords($delim, $keep, @lines);
+ @words = shellwords(@lines);
+ @words = parse_line($delim, $keep, $line);
+ @words = old_shellwords(@lines); # DEPRECATED!
+
+=head1 DESCRIPTION
+
+The &nested_quotewords() and &quotewords() functions accept a delimiter
+(which can be a regular expression)
+and a list of lines and then breaks those lines up into a list of
+words ignoring delimiters that appear inside quotes. &quotewords()
+returns all of the tokens in a single long list, while &nested_quotewords()
+returns a list of token lists corresponding to the elements of @lines.
+&parse_line() does tokenizing on a single string. The &*quotewords()
+functions simply call &parse_line(), so if you're only splitting
+one line you can call &parse_line() directly and save a function
+call.
+
+The $keep argument is a boolean flag. If true, then the tokens are
+split on the specified delimiter, but all other characters (quotes,
+backslashes, etc.) are kept in the tokens. If $keep is false then the
+&*quotewords() functions remove all quotes and backslashes that are
+not themselves backslash-escaped or inside of single quotes (i.e.,
+&quotewords() tries to interpret these characters just like the Bourne
+shell). NB: these semantics are significantly different from the
+original version of this module shipped with Perl 5.000 through 5.004.
+As an additional feature, $keep may be the keyword "delimiters" which
+causes the functions to preserve the delimiters in each string as
+tokens in the token lists, in addition to preserving quote and
+backslash characters.
+
+&shellwords() is written as a special case of &quotewords(), and it
+does token parsing with whitespace as a delimiter-- similar to most
+Unix shells.
+
+=head1 EXAMPLES
+
+The sample program:
+
+ use Text::ParseWords;
+ @words = quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
+ $i = 0;
+ foreach (@words) {
+ print "$i: <$_>\n";
+ $i++;
+ }
+
+produces:
+
+ 0: <this>
+ 1: <is>
+ 2: <a test>
+ 3: <of quotewords>
+ 4: <"for>
+ 5: <you>
+
+demonstrating:
+
+=over 4
+
+=item 0
+
+a simple word
+
+=item 1
+
+multiple spaces are skipped because of our $delim
+
+=item 2
+
+use of quotes to include a space in a word
+
+=item 3
+
+use of a backslash to include a space in a word
+
+=item 4
+
+use of a backslash to remove the special meaning of a double-quote
+
+=item 5
+
+another simple word (note the lack of effect of the
+backslashed double-quote)
+
+=back
+
+Replacing C<quotewords('\s+', 0, q{this is...})>
+with C<shellwords(q{this is...})>
+is a simpler way to accomplish the same thing.
+
+=head1 AUTHORS
+
+Maintainer: Alexandr Ciornii <alexchornyATgmail.com>.
+
+Previous maintainer: Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
+author unknown). Much of the code for &parse_line() (including the
+primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
+
+Examples section another documentation provided by John Heidemann
+<johnh@ISI.EDU>
+
+Bug reports, patches, and nagging provided by lots of folks-- thanks
+everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
+for assuring me that a &nested_quotewords() would be useful, and to
+Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
+error-checking (sort of-- you had to be there).
+
+=cut
diff --git a/cpan/Text-ParseWords/t/ParseWords.t b/cpan/Text-ParseWords/t/ParseWords.t
new file mode 100644
index 0000000000..905ea00864
--- /dev/null
+++ b/cpan/Text-ParseWords/t/ParseWords.t
@@ -0,0 +1,122 @@
+#!./perl
+
+use warnings;
+use Text::ParseWords;
+use Test::More tests => 27;
+
+@words = shellwords(qq(foo "bar quiz" zoo));
+is($words[0], 'foo');
+is($words[1], 'bar quiz');
+is($words[2], 'zoo');
+
+{
+ # Gonna get some undefined things back
+ no warnings 'uninitialized' ;
+
+ # Test quotewords() with other parameters and null last field
+ @words = quotewords(':+', 1, 'foo:::"bar:foo":zoo zoo:');
+ is(join(";", @words), qq(foo;"bar:foo";zoo zoo;));
+}
+
+# Test $keep eq 'delimiters' and last field zero
+@words = quotewords('\s+', 'delimiters', '4 3 2 1 0');
+is(join(";", @words), qq(4; ;3; ;2; ;1; ;0));
+
+# Big ol' nasty test (thanks, Joerk!)
+$string = 'aaaa"bbbbb" cc\\ cc \\\\\\"dddd" eee\\\\\\"ffff" "gg"';
+
+# First with $keep == 1
+$result = join('|', parse_line('\s+', 1, $string));
+is($result, 'aaaa"bbbbb"|cc\\ cc|\\\\\\"dddd" eee\\\\\\"ffff"|"gg"');
+
+# Now, $keep == 0
+$result = join('|', parse_line('\s+', 0, $string));
+is($result, 'aaaabbbbb|cc cc|\\"dddd eee\\"ffff|gg');
+
+# Now test single quote behavior
+$string = 'aaaa"bbbbb" cc\\ cc \\\\\\"dddd\' eee\\\\\\"ffff\' gg';
+$result = join('|', parse_line('\s+', 0, $string));
+is($result, 'aaaabbbbb|cc cc|\\"dddd eee\\\\\\"ffff|gg');
+
+# Make sure @nested_quotewords does the right thing
+@lists = nested_quotewords('\s+', 0, 'a b c', '1 2 3', 'x y z');
+is (@lists, 3);
+is (@{$lists[0]}, 3);
+is (@{$lists[1]}, 3);
+is (@{$lists[2]}, 3);
+
+# Now test error return
+$string = 'foo bar baz"bach blech boop';
+
+@words = shellwords($string);
+is(@words, 0);
+
+@words = parse_line('s+', 0, $string);
+is(@words, 0);
+
+@words = quotewords('s+', 0, $string);
+is(@words, 0);
+
+{
+ # Gonna get some more undefined things back
+ no warnings 'uninitialized' ;
+
+ @words = nested_quotewords('s+', 0, $string);
+ is(@words, 0);
+
+ # Now test empty fields
+ $result = join('|', parse_line(':', 0, 'foo::0:"":::'));
+ is($result, 'foo||0||||');
+
+ # Test for 0 in quotes without $keep
+ $result = join('|', parse_line(':', 0, ':"0":'));
+ is($result, '|0|');
+
+ # Test for \001 in quoted string
+ $result = join('|', parse_line(':', 0, ':"' . "\001" . '":'));
+ is($result, "|\1|");
+
+}
+
+# Now test perlish single quote behavior
+$Text::ParseWords::PERL_SINGLE_QUOTE = 1;
+$string = 'aaaa"bbbbb" cc\ cc \\\\\"dddd\' eee\\\\\"\\\'ffff\' gg';
+$result = join('|', parse_line('\s+', 0, $string));
+is($result, 'aaaabbbbb|cc cc|\"dddd eee\\\\"\'ffff|gg');
+
+# test whitespace in the delimiters
+@words = quotewords(' ', 1, '4 3 2 1 0');
+is(join(";", @words), qq(4;3;2;1;0));
+
+# [perl #30442] Text::ParseWords does not handle backslashed newline inside quoted text
+$string = qq{"field1" "field2\\\nstill field2" "field3"};
+
+$result = join('|', parse_line("\t", 1, $string));
+is($result, qq{"field1"|"field2\\\nstill field2"|"field3"});
+
+$result = join('|', parse_line("\t", 0, $string));
+is($result, "field1|field2\nstill field2|field3");
+
+SKIP: { # unicode
+ skip "No unicode",1 if $]<5.008;
+ $string = qq{"field1"\x{1234}"field2\\\x{1234}still field2"\x{1234}"field3"};
+ $result = join('|', parse_line("\x{1234}", 0, $string));
+ is($result, "field1|field2\x{1234}still field2|field3",'Unicode');
+}
+
+# missing quote after matching regex used to hang after change #22997
+"1234" =~ /(1)(2)(3)(4)/;
+$string = qq{"missing quote};
+$result = join('|', shellwords($string));
+is($result, "");
+
+# make sure shellwords strips out leading whitespace and trailng undefs
+# from parse_line, so it's behavior is more like /bin/sh
+$result = join('|', shellwords(" aa \\ \\ bb ", " \\ ", "cc dd ee\\ "));
+is($result, "aa| | bb| |cc|dd|ee ");
+
+$SIG{ALRM} = sub {die "Timeout!"};
+alarm(3);
+@words = Text::ParseWords::old_shellwords("foo\\");
+is(@words, 1);
+alarm(0);
diff --git a/cpan/Text-ParseWords/t/taint.t b/cpan/Text-ParseWords/t/taint.t
new file mode 100644
index 0000000000..959ba79700
--- /dev/null
+++ b/cpan/Text-ParseWords/t/taint.t
@@ -0,0 +1,13 @@
+#!./perl -Tw
+# [perl #33173] shellwords.pl and tainting
+
+use Text::ParseWords qw(shellwords old_shellwords);
+use Scalar::Util qw(tainted);
+
+print "1..2\n";
+
+print "not " if grep { not tainted($_) } shellwords("$0$^X");
+print "ok 1\n";
+
+print "not " if grep { not tainted($_) } old_shellwords("$0$^X");
+print "ok 2\n";