diff options
author | Gurusamy Sarathy <gsar@cpan.org> | 1998-05-14 09:31:34 +0000 |
---|---|---|
committer | Gurusamy Sarathy <gsar@cpan.org> | 1998-05-14 09:31:34 +0000 |
commit | 9b599b2a63d2324ddacddd9710c41b795a95070d (patch) | |
tree | 4180f11ca1ddccb984799ab74df847e9f64f1213 /lib/Text | |
parent | 491527d0220de34ec13035d557e288c9952d1007 (diff) | |
download | perl-9b599b2a63d2324ddacddd9710c41b795a95070d.tar.gz |
[win32] merge change#887 from maintbranch
p4raw-link: @887 on //depot/maint-5.004/perl: 6cdf74fe31f049dc2164dbb9e6242179d4b8ee1f
p4raw-id: //depot/win32/perl@937
Diffstat (limited to 'lib/Text')
-rw-r--r-- | lib/Text/ParseWords.pm | 293 | ||||
-rw-r--r-- | lib/Text/Wrap.pm | 154 |
2 files changed, 247 insertions, 200 deletions
diff --git a/lib/Text/ParseWords.pm b/lib/Text/ParseWords.pm index 62da1d273f..d3a89f03b8 100644 --- a/lib/Text/ParseWords.pm +++ b/lib/Text/ParseWords.pm @@ -1,140 +1,93 @@ package Text::ParseWords; -require 5.000; -use Carp; +use vars qw($VERSION @ISA @EXPORT); +$VERSION = "3.0"; -require AutoLoader; -*AUTOLOAD = \&AutoLoader::AUTOLOAD; +require 5.000; -require Exporter; +use Exporter; @ISA = qw(Exporter); -@EXPORT = qw(shellwords quotewords); +@EXPORT = qw(shellwords quotewords nested_quotewords parse_line); @EXPORT_OK = qw(old_shellwords); -=head1 NAME - -Text::ParseWords - parse text into an array of tokens - -=head1 SYNOPSIS - - use Text::ParseWords; - @words = "ewords($delim, $keep, @lines); - @words = &shellwords(@lines); - @words = &old_shellwords(@lines); - -=head1 DESCRIPTION -"ewords() accepts a delimiter (which can be a regular expression) -and a list of lines and then breaks those lines up into a list of -words ignoring delimiters that appear inside quotes. - -The $keep argument is a boolean flag. If true, the quotes are kept -with each word, otherwise quotes are stripped in the splitting process. -$keep also defines whether unprotected backslashes are retained. - -A &shellwords() replacement is included to demonstrate the new package. -This version differs from the original in that it will _NOT_ default -to using $_ if no arguments are given. I personally find the old behavior -to be a mis-feature. - -"ewords() works by simply jamming all of @lines into a single -string in $_ and then pulling off words a bit at a time until $_ -is exhausted. +sub shellwords { + local(@lines) = @_; + $lines[$#lines] =~ s/\s+$//; + return(quotewords('\s+', 0, @lines)); +} -=head1 AUTHORS -Hal Pomeranz (pomeranz@netcom.com), 23 March 1994 -Basically an update and generalization of the old shellwords.pl. -Much code shamelessly stolen from the old version (author unknown). +sub quotewords { + my($delim, $keep, @lines) = @_; + my($line, @words, @allwords); + + + foreach $line (@lines) { + @words = parse_line($delim, $keep, $line); + return() unless (@words || !length($line)); + push(@allwords, @words); + } + return(@allwords); +} -=cut -1; -__END__ -sub shellwords { - local(@lines) = @_; - $lines[$#lines] =~ s/\s+$//; - "ewords('\s+', 0, @lines); +sub nested_quotewords { + my($delim, $keep, @lines) = @_; + my($i, @allwords); + + for ($i = 0; $i < @lines; $i++) { + @{$allwords[$i]} = parse_line($delim, $keep, $lines[$i]); + return() unless (@{$allwords[$i]} || !length($lines[$i])); + } + return(@allwords); } -sub quotewords { - -# The inner "for" loop builds up each word (or $field) one $snippet -# at a time. A $snippet is a quoted string, a backslashed character, -# or an unquoted string. We fall out of the "for" loop when we reach -# the end of $_ or when we hit a delimiter. Falling out of the "for" -# loop, we push the $field we've been building up onto the list of -# @words we'll be returning, and then loop back and pull another word -# off of $_. -# -# The first two cases inside the "for" loop deal with quoted strings. -# The first case matches a double quoted string, removes it from $_, -# and assigns the double quoted string to $snippet in the body of the -# conditional. The second case handles single quoted strings. In -# the third case we've found a quote at the current beginning of $_, -# but it didn't match the quoted string regexps in the first two cases, -# so it must be an unbalanced quote and we croak with an error (which can -# be caught by eval()). -# -# The next case handles backslashed characters, and the next case is the -# exit case on reaching the end of the string or finding a delimiter. -# -# Otherwise, we've found an unquoted thing and we pull of characters one -# at a time until we reach something that could start another $snippet-- -# a quote of some sort, a backslash, or the delimiter. This one character -# at a time behavior was necessary if the delimiter was going to be a -# regexp (love to hear it if you can figure out a better way). - - my ($delim, $keep, @lines) = @_; - my (@words, $snippet, $field); - - local $_ = join ('', @lines); - - while (length) { - $field = ''; +sub parse_line { + my($delimiter, $keep, $line) = @_; + my($quote, $quoted, $unquoted, $delim, $word, @pieces); - for (;;) { - $snippet = ''; + while (length($line)) { + ($quote, $quoted, $unquoted, $delim) = + $line =~ m/^(["']) # a $quote + ((?:\\.|[^\1\\])*?) # and $quoted text + \1 # followed by the same quote + | # --OR-- + ^((?:\\.|[^\\"'])*?) # an $unquoted text + (\Z(?!\n)|$delimiter|(?!^)(?=["'])) + # plus EOL, delimiter, or quote + /x; # extended layout - if (s/^"([^"\\]*(\\.[^"\\]*)*)"//) { - $snippet = $1; - $snippet = qq|"$snippet"| if $keep; - } - elsif (s/^'([^'\\]*(\\.[^'\\]*)*)'//) { - $snippet = $1; - $snippet = "'$snippet'" if $keep; - } - elsif (/^["']/) { - croak 'Unmatched quote'; - } - elsif (s/^\\(.)//) { - $snippet = $1; - $snippet = "\\$snippet" if $keep; - } - elsif (!length || s/^$delim//) { - last; - } - else { - while (length && !(/^$delim/ || /^['"\\]/)) { - $snippet .= substr ($_, 0, 1); - substr($_, 0, 1) = ''; - } - } + return() unless(length($&)); + $line = $'; - $field .= $snippet; + if ($keep) { + $quoted = "$quote$quoted$quote"; + } + else { + $unquoted =~ s/\\(.)/$1/g; + $quoted =~ s/\\(.)/$1/g if ($quote eq '"'); + } + $word .= ($quote) ? $quoted : $unquoted; + + if (length($delim)) { + push(@pieces, $word); + push(@pieces, $delim) if ($keep eq 'delimiters'); + undef $word; + } + if (!length($line)) { + push(@pieces, $word); } - - push @words, $field; } - - return @words; + return(@pieces); } + sub old_shellwords { # Usage: @@ -154,13 +107,13 @@ sub old_shellwords { ($snippet = $1) =~ s#\\(.)#$1#g; } elsif (/^"/) { - croak "Unmatched double quote: $_"; + return(); } elsif (s/^'(([^'\\]|\\.)*)'//) { ($snippet = $1) =~ s#\\(.)#$1#g; } elsif (/^'/) { - croak "Unmatched single quote: $_"; + return(); } elsif (s/^\\(.)//) { $snippet = $1; @@ -178,3 +131,117 @@ sub old_shellwords { } @words; } + +1; + +__END__ + +=head1 NAME + +Text::ParseWords - parse text into an array of tokens or array of arrays + +=head1 SYNOPSIS + + use Text::ParseWords; + @lists = &nested_quotewords($delim, $keep, @lines); + @words = "ewords($delim, $keep, @lines); + @words = &shellwords(@lines); + @words = &parse_line($delim, $keep, $line); + @words = &old_shellwords(@lines); # DEPRECATED! + +=head1 DESCRIPTION + +The &nested_quotewords() and "ewords() functions accept a delimiter +(which can be a regular expression) +and a list of lines and then breaks those lines up into a list of +words ignoring delimiters that appear inside quotes. "ewords() +returns all of the tokens in a single long list, while &nested_quotewords() +returns a list of token lists corresponding to the elements of @lines. +&parse_line() does tokenizing on a single string. The &*quotewords() +functions simply call &parse_lines(), so if you're only splitting +one line you can call &parse_lines() directly and save a function +call. + +The $keep argument is a boolean flag. If true, then the tokens are +split on the specified delimiter, but all other characters (quotes, +backslashes, etc.) are kept in the tokens. If $keep is false then the +&*quotewords() functions remove all quotes and backslashes that are +not themselves backslash-escaped or inside of single quotes (i.e., +"ewords() tries to interpret these characters just like the Bourne +shell). NB: these semantics are significantly different from the +original version of this module shipped with Perl 5.000 through 5.004. +As an additional feature, $keep may be the keyword "delimiters" which +causes the functions to preserve the delimiters in each string as +tokens in the token lists, in addition to preserving quote and +backslash characters. + +&shellwords() is written as a special case of "ewords(), and it +does token parsing with whitespace as a delimiter-- similar to most +Unix shells. + +=head1 EXAMPLES + +The sample program: + + use Text::ParseWords; + @words = "ewords('\s+', 0, q{this is "a test" of\ quotewords \"for you}); + $i = 0; + foreach (@words) { + print "$i: <$_>\n"; + $i++; + } + +produces: + + 0: <this> + 1: <is> + 2: <a test> + 3: <of quotewords> + 4: <"for> + 5: <you> + +demonstrating: + +=over 4 + +=item 0 +a simple word + +=item 1 +multiple spaces are skipped because of our $delim + +=item 2 +use of quotes to include a space in a word + +=item 3 +use of a backslash to include a space in a word + +=item 4 +use of a backslash to remove the special meaning of a double-quote + +=item 5 +another simple word (note the lack of effect of the +backslashed double-quote) + +=back + +Replacing C<"ewords('\s+', 0, q{this is...})> +with C<&shellwords(q{this is...})> +is a simpler way to accomplish the same thing. + +=head1 AUTHORS + +Maintainer is Hal Pomeranz <pomeranz@netcom.com>, 1994-1997 (Original +author unknown). Much of the code for &parse_line() (including the +primary regexp) from Joerk Behrends <jbehrends@multimediaproduzenten.de>. + +Examples section another documentation provided by John Heidemann +<johnh@ISI.EDU> + +Bug reports, patches, and nagging provided by lots of folks-- thanks +everybody! Special thanks to Michael Schwern <schwern@envirolink.org> +for assuring me that a &nested_quotewords() would be useful, and to +Jeff Friedl <jfriedl@yahoo-inc.com> for telling me not to worry about +error-checking (sort of-- you had to be there). + +=cut diff --git a/lib/Text/Wrap.pm b/lib/Text/Wrap.pm index 0910a2ab34..0fe7fb93c2 100644 --- a/lib/Text/Wrap.pm +++ b/lib/Text/Wrap.pm @@ -1,71 +1,74 @@ package Text::Wrap; -require Exporter; +use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION $columns $debug); +use strict; +use Exporter; -@ISA = (Exporter); +$VERSION = "97.02"; +@ISA = qw(Exporter); @EXPORT = qw(wrap); -@EXPORT_OK = qw($columns); +@EXPORT_OK = qw($columns $tabstop fill); -$VERSION = 97.011701; +use Text::Tabs qw(expand unexpand $tabstop); -use vars qw($VERSION $columns $debug); -use strict; BEGIN { - $columns = 76; # <= screen width - $debug = 0; + $columns = 76; # <= screen width + $debug = 0; } -use Text::Tabs qw(expand unexpand); - sub wrap { - my ($ip, $xp, @t) = @_; - - my $r = ""; - my $t = expand(join(" ",@t)); - my $lead = $ip; - my $ll = $columns - length(expand($lead)) - 1; - my $nl = ""; - - # remove up to a line length of things that aren't - # new lines and tabs. - - if ($t =~ s/^([^\n]{0,$ll})(\s|\Z(?!\n))//xm) { - - # accept it. - $r .= unexpand($lead . $1); - - # recompute the leader - $lead = $xp; - $ll = $columns - length(expand($lead)) - 1; - $nl = $2; - - # repeat the above until there's none left - while ($t) { - if ( $t =~ s/^([^\n]{0,$ll})(\s|\Z(?!\n))//xm ) { - print "\$2 is '$2'\n" if $debug; - $nl = $2; - $r .= unexpand("\n" . $lead . $1); - } elsif ($t =~ s/^([^\n]{$ll})//) { - $nl = "\n"; - $r .= unexpand("\n" . $lead . $1); - } - } - $r .= $nl; - } + my ($ip, $xp, @t) = @_; + + my @rv; + my $t = expand(join(" ",@t)); + + my $lead = $ip; + my $ll = $columns - length(expand($lead)) - 1; + my $nl = ""; + + $t =~ s/^\s+//; + while(length($t) > $ll) { + # remove up to a line length of things that + # aren't new lines and tabs. + if ($t =~ s/^([^\n]{0,$ll})(\s|\Z(?!\n))//) { + my ($l,$r) = ($1,$2); + $l =~ s/\s+$//; + print "WRAP $lead$l..($r)\n" if $debug; + push @rv, unexpand($lead . $l), "\n"; + + } elsif ($t =~ s/^([^\n]{$ll})//) { + print "SPLIT $lead$1..\n" if $debug; + push @rv, unexpand($lead . $1),"\n"; + } + # recompute the leader + $lead = $xp; + $ll = $columns - length(expand($lead)) - 1; + $t =~ s/^\s+//; + } + print "TAIL $lead$t\n" if $debug; + push @rv, $lead.$t if $t ne ""; + return join '', @rv; +} - die "couldn't wrap '$t'" - if length($t) > $ll; - print "-----------$r---------\n" if $debug; +sub fill +{ + my ($ip, $xp, @raw) = @_; + my @para; + my $pp; - print "Finish up with '$lead', '$t'\n" if $debug; + for $pp (split(/\n\s+/, join("\n",@raw))) { + $pp =~ s/\s+/ /g; + my $x = wrap($ip, $xp, $pp); + push(@para, $x); + } - $r .= $lead . $t if $t ne ""; + # if paragraph_indent is the same as line_indent, + # separate paragraphs with blank lines - print "-----------$r---------\n" if $debug;; - return $r; + return join ($ip eq $xp ? "\n\n" : "\n", @para); } 1; @@ -81,9 +84,13 @@ Text::Wrap - line wrapping to form simple paragraphs print wrap($initial_tab, $subsequent_tab, @text); - use Text::Wrap qw(wrap $columns); + use Text::Wrap qw(wrap $columns $tabstop fill); $columns = 132; + $tabstop = 4; + + print fill($initial_tab, $subsequent_tab, @text); + print fill("", "", `cat book`); =head1 DESCRIPTION @@ -93,6 +100,12 @@ Indentation is controlled for the first line ($initial_tab) and all subsquent lines ($subsequent_tab) independently. $Text::Wrap::columns should be set to the full width of your output device. +Text::Wrap::fill() is a simple multi-paragraph formatter. It formats +each paragraph separately and then joins them together when it's done. It +will destory any whitespace in the original text. It breaks text into +paragraphs by looking for whitespace after a newline. In other respects +it acts like wrap(). + =head1 EXAMPLE print wrap("\t","","This is a bit of text that forms @@ -102,44 +115,11 @@ should be set to the full width of your output device. It's not clear what the correct behavior should be when Wrap() is presented with a word that is longer than a line. The previous -behavior was to die. Now the word is split at line-length. +behavior was to die. Now the word is now split at line-length. =head1 AUTHOR David Muir Sharnoff <muir@idiom.com> with help from Tim Pierce and -others. +others. Updated by Jacqui Caren. =cut - -Latest change by Andreas Koenig <k@anna.in-berlin.de> - 1/17/97 - - print fill($initial_tab, $subsequent_tab, @text); - - print fill("", "", `cat book`); - -Text::Wrap::fill() is a simple multi-paragraph formatter. It formats -each paragraph separately and then joins them together when it's done. It -will destory any whitespace in the original text. It breaks text into -paragraphs by looking for whitespace after a newline. In other respects -it acts like wrap(). - -# Tim Pierce did a faster version of this: - -sub fill -{ - my ($ip, $xp, @raw) = @_; - my @para; - my $pp; - - for $pp (split(/\n\s+/, join("\n",@raw))) { - $pp =~ s/\s+/ /g; - my $x = wrap($ip, $xp, $pp); - push(@para, $x); - } - - # if paragraph_indent is the same as line_indent, - # separate paragraphs with blank lines - - return join ($ip eq $xp ? "\n\n" : "\n", @para); -} - |