summaryrefslogtreecommitdiff
path: root/lib/Text
diff options
context:
space:
mode:
authorGurusamy Sarathy <gsar@cpan.org>1998-05-14 09:31:34 +0000
committerGurusamy Sarathy <gsar@cpan.org>1998-05-14 09:31:34 +0000
commit9b599b2a63d2324ddacddd9710c41b795a95070d (patch)
tree4180f11ca1ddccb984799ab74df847e9f64f1213 /lib/Text
parent491527d0220de34ec13035d557e288c9952d1007 (diff)
downloadperl-9b599b2a63d2324ddacddd9710c41b795a95070d.tar.gz
[win32] merge change#887 from maintbranch
p4raw-link: @887 on //depot/maint-5.004/perl: 6cdf74fe31f049dc2164dbb9e6242179d4b8ee1f p4raw-id: //depot/win32/perl@937
Diffstat (limited to 'lib/Text')
-rw-r--r--lib/Text/ParseWords.pm293
-rw-r--r--lib/Text/Wrap.pm154
2 files changed, 247 insertions, 200 deletions
diff --git a/lib/Text/ParseWords.pm b/lib/Text/ParseWords.pm
index 62da1d273f..d3a89f03b8 100644
--- a/lib/Text/ParseWords.pm
+++ b/lib/Text/ParseWords.pm
@@ -1,140 +1,93 @@
package Text::ParseWords;
-require 5.000;
-use Carp;
+use vars qw($VERSION @ISA @EXPORT);
+$VERSION = "3.0";
-require AutoLoader;
-*AUTOLOAD = \&AutoLoader::AUTOLOAD;
+require 5.000;
-require Exporter;
+use Exporter;
@ISA = qw(Exporter);
-@EXPORT = qw(shellwords quotewords);
+@EXPORT = qw(shellwords quotewords nested_quotewords parse_line);
@EXPORT_OK = qw(old_shellwords);
-=head1 NAME
-
-Text::ParseWords - parse text into an array of tokens
-
-=head1 SYNOPSIS
-
- use Text::ParseWords;
- @words = &quotewords($delim, $keep, @lines);
- @words = &shellwords(@lines);
- @words = &old_shellwords(@lines);
-
-=head1 DESCRIPTION
-&quotewords() accepts a delimiter (which can be a regular expression)
-and a list of lines and then breaks those lines up into a list of
-words ignoring delimiters that appear inside quotes.
-
-The $keep argument is a boolean flag. If true, the quotes are kept
-with each word, otherwise quotes are stripped in the splitting process.
-$keep also defines whether unprotected backslashes are retained.
-
-A &shellwords() replacement is included to demonstrate the new package.
-This version differs from the original in that it will _NOT_ default
-to using $_ if no arguments are given. I personally find the old behavior
-to be a mis-feature.
-
-&quotewords() works by simply jamming all of @lines into a single
-string in $_ and then pulling off words a bit at a time until $_
-is exhausted.
+sub shellwords {
+ local(@lines) = @_;
+ $lines[$#lines] =~ s/\s+$//;
+ return(quotewords('\s+', 0, @lines));
+}
-=head1 AUTHORS
-Hal Pomeranz (pomeranz@netcom.com), 23 March 1994
-Basically an update and generalization of the old shellwords.pl.
-Much code shamelessly stolen from the old version (author unknown).
+sub quotewords {
+ my($delim, $keep, @lines) = @_;
+ my($line, @words, @allwords);
+
+
+ foreach $line (@lines) {
+ @words = parse_line($delim, $keep, $line);
+ return() unless (@words || !length($line));
+ push(@allwords, @words);
+ }
+ return(@allwords);
+}
-=cut
-1;
-__END__
-sub shellwords {
- local(@lines) = @_;
- $lines[$#lines] =~ s/\s+$//;
- &quotewords('\s+', 0, @lines);
+sub nested_quotewords {
+ my($delim, $keep, @lines) = @_;
+ my($i, @allwords);
+
+ for ($i = 0; $i < @lines; $i++) {
+ @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]);
+ return() unless (@{$allwords[$i]} || !length($lines[$i]));
+ }
+ return(@allwords);
}
-sub quotewords {
-
-# The inner "for" loop builds up each word (or $field) one $snippet
-# at a time. A $snippet is a quoted string, a backslashed character,
-# or an unquoted string. We fall out of the "for" loop when we reach
-# the end of $_ or when we hit a delimiter. Falling out of the "for"
-# loop, we push the $field we've been building up onto the list of
-# @words we'll be returning, and then loop back and pull another word
-# off of $_.
-#
-# The first two cases inside the "for" loop deal with quoted strings.
-# The first case matches a double quoted string, removes it from $_,
-# and assigns the double quoted string to $snippet in the body of the
-# conditional. The second case handles single quoted strings. In
-# the third case we've found a quote at the current beginning of $_,
-# but it didn't match the quoted string regexps in the first two cases,
-# so it must be an unbalanced quote and we croak with an error (which can
-# be caught by eval()).
-#
-# The next case handles backslashed characters, and the next case is the
-# exit case on reaching the end of the string or finding a delimiter.
-#
-# Otherwise, we've found an unquoted thing and we pull of characters one
-# at a time until we reach something that could start another $snippet--
-# a quote of some sort, a backslash, or the delimiter. This one character
-# at a time behavior was necessary if the delimiter was going to be a
-# regexp (love to hear it if you can figure out a better way).
-
- my ($delim, $keep, @lines) = @_;
- my (@words, $snippet, $field);
-
- local $_ = join ('', @lines);
-
- while (length) {
- $field = '';
+sub parse_line {
+ my($delimiter, $keep, $line) = @_;
+ my($quote, $quoted, $unquoted, $delim, $word, @pieces);
- for (;;) {
- $snippet = '';
+ while (length($line)) {
+ ($quote, $quoted, $unquoted, $delim) =
+ $line =~ m/^(["']) # a $quote
+ ((?:\\.|[^\1\\])*?) # and $quoted text
+ \1 # followed by the same quote
+ | # --OR--
+ ^((?:\\.|[^\\"'])*?) # an $unquoted text
+ (\Z(?!\n)|$delimiter|(?!^)(?=["']))
+ # plus EOL, delimiter, or quote
+ /x; # extended layout
- if (s/^"([^"\\]*(\\.[^"\\]*)*)"//) {
- $snippet = $1;
- $snippet = qq|"$snippet"| if $keep;
- }
- elsif (s/^'([^'\\]*(\\.[^'\\]*)*)'//) {
- $snippet = $1;
- $snippet = "'$snippet'" if $keep;
- }
- elsif (/^["']/) {
- croak 'Unmatched quote';
- }
- elsif (s/^\\(.)//) {
- $snippet = $1;
- $snippet = "\\$snippet" if $keep;
- }
- elsif (!length || s/^$delim//) {
- last;
- }
- else {
- while (length && !(/^$delim/ || /^['"\\]/)) {
- $snippet .= substr ($_, 0, 1);
- substr($_, 0, 1) = '';
- }
- }
+ return() unless(length($&));
+ $line = $';
- $field .= $snippet;
+ if ($keep) {
+ $quoted = "$quote$quoted$quote";
+ }
+ else {
+ $unquoted =~ s/\\(.)/$1/g;
+ $quoted =~ s/\\(.)/$1/g if ($quote eq '"');
+ }
+ $word .= ($quote) ? $quoted : $unquoted;
+
+ if (length($delim)) {
+ push(@pieces, $word);
+ push(@pieces, $delim) if ($keep eq 'delimiters');
+ undef $word;
+ }
+ if (!length($line)) {
+ push(@pieces, $word);
}
-
- push @words, $field;
}
-
- return @words;
+ return(@pieces);
}
+
sub old_shellwords {
# Usage:
@@ -154,13 +107,13 @@ sub old_shellwords {
($snippet = $1) =~ s#\\(.)#$1#g;
}
elsif (/^"/) {
- croak "Unmatched double quote: $_";
+ return();
}
elsif (s/^'(([^'\\]|\\.)*)'//) {
($snippet = $1) =~ s#\\(.)#$1#g;
}
elsif (/^'/) {
- croak "Unmatched single quote: $_";
+ return();
}
elsif (s/^\\(.)//) {
$snippet = $1;
@@ -178,3 +131,117 @@ sub old_shellwords {
}
@words;
}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Text::ParseWords - parse text into an array of tokens or array of arrays
+
+=head1 SYNOPSIS
+
+ use Text::ParseWords;
+ @lists = &nested_quotewords($delim, $keep, @lines);
+ @words = &quotewords($delim, $keep, @lines);
+ @words = &shellwords(@lines);
+ @words = &parse_line($delim, $keep, $line);
+ @words = &old_shellwords(@lines); # DEPRECATED!
+
+=head1 DESCRIPTION
+
+The &nested_quotewords() and &quotewords() functions accept a delimiter
+(which can be a regular expression)
+and a list of lines and then breaks those lines up into a list of
+words ignoring delimiters that appear inside quotes. &quotewords()
+returns all of the tokens in a single long list, while &nested_quotewords()
+returns a list of token lists corresponding to the elements of @lines.
+&parse_line() does tokenizing on a single string. The &*quotewords()
+functions simply call &parse_lines(), so if you're only splitting
+one line you can call &parse_lines() directly and save a function
+call.
+
+The $keep argument is a boolean flag. If true, then the tokens are
+split on the specified delimiter, but all other characters (quotes,
+backslashes, etc.) are kept in the tokens. If $keep is false then the
+&*quotewords() functions remove all quotes and backslashes that are
+not themselves backslash-escaped or inside of single quotes (i.e.,
+&quotewords() tries to interpret these characters just like the Bourne
+shell). NB: these semantics are significantly different from the
+original version of this module shipped with Perl 5.000 through 5.004.
+As an additional feature, $keep may be the keyword "delimiters" which
+causes the functions to preserve the delimiters in each string as
+tokens in the token lists, in addition to preserving quote and
+backslash characters.
+
+&shellwords() is written as a special case of &quotewords(), and it
+does token parsing with whitespace as a delimiter-- similar to most
+Unix shells.
+
+=head1 EXAMPLES
+
+The sample program:
+
+ use Text::ParseWords;
+ @words = &quotewords('\s+', 0, q{this is "a test" of\ quotewords \"for you});
+ $i = 0;
+ foreach (@words) {
+ print "$i: <$_>\n";
+ $i++;
+ }
+
+produces:
+
+ 0: <this>
+ 1: <is>
+ 2: <a test>
+ 3: <of quotewords>
+ 4: <"for>
+ 5: <you>
+
+demonstrating:
+
+=over 4
+
+=item 0
+a simple word
+
+=item 1
+multiple spaces are skipped because of our $delim
+
+=item 2
+use of quotes to include a space in a word
+
+=item 3
+use of a backslash to include a space in a word
+
+=item 4
+use of a backslash to remove the special meaning of a double-quote
+
+=item 5
+another simple word (note the lack of effect of the
+backslashed double-quote)
+
+=back
+
+Replacing C<&quotewords('\s+', 0, q{this is...})>
+with C<&shellwords(q{this is...})>
+is a simpler way to accomplish the same thing.
+
+=head1 AUTHORS
+
+Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original
+author unknown). Much of the code for &parse_line() (including the
+primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>.
+
+Examples section another documentation provided by John Heidemann
+<johnh@ISI.EDU>
+
+Bug reports, patches, and nagging provided by lots of folks-- thanks
+everybody! Special thanks to Michael Schwern <schwern@envirolink.org>
+for assuring me that a &nested_quotewords() would be useful, and to
+Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about
+error-checking (sort of-- you had to be there).
+
+=cut
diff --git a/lib/Text/Wrap.pm b/lib/Text/Wrap.pm
index 0910a2ab34..0fe7fb93c2 100644
--- a/lib/Text/Wrap.pm
+++ b/lib/Text/Wrap.pm
@@ -1,71 +1,74 @@
package Text::Wrap;
-require Exporter;
+use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION $columns $debug);
+use strict;
+use Exporter;
-@ISA = (Exporter);
+$VERSION = "97.02";
+@ISA = qw(Exporter);
@EXPORT = qw(wrap);
-@EXPORT_OK = qw($columns);
+@EXPORT_OK = qw($columns $tabstop fill);
-$VERSION = 97.011701;
+use Text::Tabs qw(expand unexpand $tabstop);
-use vars qw($VERSION $columns $debug);
-use strict;
BEGIN {
- $columns = 76; # <= screen width
- $debug = 0;
+ $columns = 76; # <= screen width
+ $debug = 0;
}
-use Text::Tabs qw(expand unexpand);
-
sub wrap
{
- my ($ip, $xp, @t) = @_;
-
- my $r = "";
- my $t = expand(join(" ",@t));
- my $lead = $ip;
- my $ll = $columns - length(expand($lead)) - 1;
- my $nl = "";
-
- # remove up to a line length of things that aren't
- # new lines and tabs.
-
- if ($t =~ s/^([^\n]{0,$ll})(\s|\Z(?!\n))//xm) {
-
- # accept it.
- $r .= unexpand($lead . $1);
-
- # recompute the leader
- $lead = $xp;
- $ll = $columns - length(expand($lead)) - 1;
- $nl = $2;
-
- # repeat the above until there's none left
- while ($t) {
- if ( $t =~ s/^([^\n]{0,$ll})(\s|\Z(?!\n))//xm ) {
- print "\$2 is '$2'\n" if $debug;
- $nl = $2;
- $r .= unexpand("\n" . $lead . $1);
- } elsif ($t =~ s/^([^\n]{$ll})//) {
- $nl = "\n";
- $r .= unexpand("\n" . $lead . $1);
- }
- }
- $r .= $nl;
- }
+ my ($ip, $xp, @t) = @_;
+
+ my @rv;
+ my $t = expand(join(" ",@t));
+
+ my $lead = $ip;
+ my $ll = $columns - length(expand($lead)) - 1;
+ my $nl = "";
+
+ $t =~ s/^\s+//;
+ while(length($t) > $ll) {
+ # remove up to a line length of things that
+ # aren't new lines and tabs.
+ if ($t =~ s/^([^\n]{0,$ll})(\s|\Z(?!\n))//) {
+ my ($l,$r) = ($1,$2);
+ $l =~ s/\s+$//;
+ print "WRAP $lead$l..($r)\n" if $debug;
+ push @rv, unexpand($lead . $l), "\n";
+
+ } elsif ($t =~ s/^([^\n]{$ll})//) {
+ print "SPLIT $lead$1..\n" if $debug;
+ push @rv, unexpand($lead . $1),"\n";
+ }
+ # recompute the leader
+ $lead = $xp;
+ $ll = $columns - length(expand($lead)) - 1;
+ $t =~ s/^\s+//;
+ }
+ print "TAIL $lead$t\n" if $debug;
+ push @rv, $lead.$t if $t ne "";
+ return join '', @rv;
+}
- die "couldn't wrap '$t'"
- if length($t) > $ll;
- print "-----------$r---------\n" if $debug;
+sub fill
+{
+ my ($ip, $xp, @raw) = @_;
+ my @para;
+ my $pp;
- print "Finish up with '$lead', '$t'\n" if $debug;
+ for $pp (split(/\n\s+/, join("\n",@raw))) {
+ $pp =~ s/\s+/ /g;
+ my $x = wrap($ip, $xp, $pp);
+ push(@para, $x);
+ }
- $r .= $lead . $t if $t ne "";
+ # if paragraph_indent is the same as line_indent,
+ # separate paragraphs with blank lines
- print "-----------$r---------\n" if $debug;;
- return $r;
+ return join ($ip eq $xp ? "\n\n" : "\n", @para);
}
1;
@@ -81,9 +84,13 @@ Text::Wrap - line wrapping to form simple paragraphs
print wrap($initial_tab, $subsequent_tab, @text);
- use Text::Wrap qw(wrap $columns);
+ use Text::Wrap qw(wrap $columns $tabstop fill);
$columns = 132;
+ $tabstop = 4;
+
+ print fill($initial_tab, $subsequent_tab, @text);
+ print fill("", "", `cat book`);
=head1 DESCRIPTION
@@ -93,6 +100,12 @@ Indentation is controlled for the first line ($initial_tab) and
all subsquent lines ($subsequent_tab) independently. $Text::Wrap::columns
should be set to the full width of your output device.
+Text::Wrap::fill() is a simple multi-paragraph formatter. It formats
+each paragraph separately and then joins them together when it's done. It
+will destory any whitespace in the original text. It breaks text into
+paragraphs by looking for whitespace after a newline. In other respects
+it acts like wrap().
+
=head1 EXAMPLE
print wrap("\t","","This is a bit of text that forms
@@ -102,44 +115,11 @@ should be set to the full width of your output device.
It's not clear what the correct behavior should be when Wrap() is
presented with a word that is longer than a line. The previous
-behavior was to die. Now the word is split at line-length.
+behavior was to die. Now the word is now split at line-length.
=head1 AUTHOR
David Muir Sharnoff <muir@idiom.com> with help from Tim Pierce and
-others.
+others. Updated by Jacqui Caren.
=cut
-
-Latest change by Andreas Koenig <k@anna.in-berlin.de> - 1/17/97
-
- print fill($initial_tab, $subsequent_tab, @text);
-
- print fill("", "", `cat book`);
-
-Text::Wrap::fill() is a simple multi-paragraph formatter. It formats
-each paragraph separately and then joins them together when it's done. It
-will destory any whitespace in the original text. It breaks text into
-paragraphs by looking for whitespace after a newline. In other respects
-it acts like wrap().
-
-# Tim Pierce did a faster version of this:
-
-sub fill
-{
- my ($ip, $xp, @raw) = @_;
- my @para;
- my $pp;
-
- for $pp (split(/\n\s+/, join("\n",@raw))) {
- $pp =~ s/\s+/ /g;
- my $x = wrap($ip, $xp, $pp);
- push(@para, $x);
- }
-
- # if paragraph_indent is the same as line_indent,
- # separate paragraphs with blank lines
-
- return join ($ip eq $xp ? "\n\n" : "\n", @para);
-}
-