diff options
author | Steffen Mueller <smueller@cpan.org> | 2009-08-30 18:15:52 +0200 |
---|---|---|
committer | Steffen Mueller <smueller@cpan.org> | 2009-08-30 19:40:47 +0200 |
commit | 0508fb16c6a57ba6d246b3ddb11c79c68d302f62 (patch) | |
tree | 7b89a4beafe4b3848d23a6dce52e39ef66ae56ca /lib/Text | |
parent | e78bb0d89d491abc767df945e961dd3e6d600be5 (diff) | |
download | perl-0508fb16c6a57ba6d246b3ddb11c79c68d302f62.tar.gz |
Move Text::Balanced from lib to ext
Diffstat (limited to 'lib/Text')
-rw-r--r-- | lib/Text/Balanced.pm | 2306 | ||||
-rw-r--r-- | lib/Text/Balanced/Changes | 342 | ||||
-rw-r--r-- | lib/Text/Balanced/README | 71 | ||||
-rw-r--r-- | lib/Text/Balanced/t/00-load.t | 7 | ||||
-rw-r--r-- | lib/Text/Balanced/t/extbrk.t | 83 | ||||
-rw-r--r-- | lib/Text/Balanced/t/extcbk.t | 102 | ||||
-rw-r--r-- | lib/Text/Balanced/t/extdel.t | 97 | ||||
-rw-r--r-- | lib/Text/Balanced/t/extmul.t | 326 | ||||
-rw-r--r-- | lib/Text/Balanced/t/extqlk.t | 142 | ||||
-rw-r--r-- | lib/Text/Balanced/t/exttag.t | 120 | ||||
-rw-r--r-- | lib/Text/Balanced/t/extvar.t | 160 | ||||
-rw-r--r-- | lib/Text/Balanced/t/gentag.t | 109 |
12 files changed, 0 insertions, 3865 deletions
diff --git a/lib/Text/Balanced.pm b/lib/Text/Balanced.pm deleted file mode 100644 index 16a559b6c8..0000000000 --- a/lib/Text/Balanced.pm +++ /dev/null @@ -1,2306 +0,0 @@ -# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. -# FOR FULL DOCUMENTATION SEE Balanced.pod - -use 5.005; -use strict; - -package Text::Balanced; - -use Exporter; -use SelfLoader; -use vars qw { $VERSION @ISA %EXPORT_TAGS }; - -use version; $VERSION = qv('2.0.0'); -@ISA = qw ( Exporter ); - -%EXPORT_TAGS = ( ALL => [ qw( - &extract_delimited - &extract_bracketed - &extract_quotelike - &extract_codeblock - &extract_variable - &extract_tagged - &extract_multiple - - &gen_delimited_pat - &gen_extract_tagged - - &delimited_pat - ) ] ); - -Exporter::export_ok_tags('ALL'); - -# PROTOTYPES - -sub _match_bracketed($$$$$$); -sub _match_variable($$); -sub _match_codeblock($$$$$$$); -sub _match_quotelike($$$$); - -# HANDLE RETURN VALUES IN VARIOUS CONTEXTS - -sub _failmsg { - my ($message, $pos) = @_; - $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg"; -} - -sub _fail -{ - my ($wantarray, $textref, $message, $pos) = @_; - _failmsg $message, $pos if $message; - return (undef,$$textref,undef) if $wantarray; - return undef; -} - -sub _succeed -{ - $@ = undef; - my ($wantarray,$textref) = splice @_, 0, 2; - my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0); - my ($startlen, $oppos) = @_[5,6]; - my $remainderpos = $_[2]; - if ($wantarray) - { - my @res; - while (my ($from, $len) = splice @_, 0, 2) - { - push @res, substr($$textref,$from,$len); - } - if ($extralen) { # CORRECT FILLET - my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n"); - $res[1] = "$extra$res[1]"; - eval { substr($$textref,$remainderpos,0) = $extra; - substr($$textref,$extrapos,$extralen,"\n")} ; - #REARRANGE HERE DOC AND FILLET IF POSSIBLE - pos($$textref) = $remainderpos-$extralen+1; # RESET \G - } - else { - pos($$textref) = $remainderpos; # RESET \G - } - return @res; - } - else - { - my $match = substr($$textref,$_[0],$_[1]); - substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen; - my $extra = $extralen - ? substr($$textref, $extrapos, $extralen)."\n" : ""; - eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE - pos($$textref) = $_[4]; # RESET \G - return $match; - } -} - -# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING - -sub gen_delimited_pat($;$) # ($delimiters;$escapes) -{ - my ($dels, $escs) = @_; - return "" unless $dels =~ /\S/; - $escs = '\\' unless $escs; - $escs .= substr($escs,-1) x (length($dels)-length($escs)); - my @pat = (); - my $i; - for ($i=0; $i<length $dels; $i++) - { - my $del = quotemeta substr($dels,$i,1); - my $esc = quotemeta substr($escs,$i,1); - if ($del eq $esc) - { - push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; - } - else - { - push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; - } - } - my $pat = join '|', @pat; - return "(?:$pat)"; -} - -*delimited_pat = \&gen_delimited_pat; - - -# THE EXTRACTION FUNCTIONS - -sub extract_delimited (;$$$$) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - my $wantarray = wantarray; - my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; - my $pre = defined $_[2] ? $_[2] : '\s*'; - my $esc = defined $_[3] ? $_[3] : qq{\\}; - my $pat = gen_delimited_pat($del, $esc); - my $startpos = pos $$textref || 0; - return _fail($wantarray, $textref, "Not a delimited pattern", 0) - unless $$textref =~ m/\G($pre)($pat)/gc; - my $prelen = length($1); - my $matchpos = $startpos+$prelen; - my $endpos = pos $$textref; - return _succeed $wantarray, $textref, - $matchpos, $endpos-$matchpos, # MATCH - $endpos, length($$textref)-$endpos, # REMAINDER - $startpos, $prelen; # PREFIX -} - -sub extract_bracketed (;$$$) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - my $ldel = defined $_[1] ? $_[1] : '{([<'; - my $pre = defined $_[2] ? $_[2] : '\s*'; - my $wantarray = wantarray; - my $qdel = ""; - my $quotelike; - $ldel =~ s/'//g and $qdel .= q{'}; - $ldel =~ s/"//g and $qdel .= q{"}; - $ldel =~ s/`//g and $qdel .= q{`}; - $ldel =~ s/q//g and $quotelike = 1; - $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; - my $rdel = $ldel; - unless ($rdel =~ tr/[({</])}>/) - { - return _fail $wantarray, $textref, - "Did not find a suitable bracket in delimiter: \"$_[1]\"", - 0; - } - my $posbug = pos; - $ldel = join('|', map { quotemeta $_ } split('', $ldel)); - $rdel = join('|', map { quotemeta $_ } split('', $rdel)); - pos = $posbug; - - my $startpos = pos $$textref || 0; - my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel); - - return _fail ($wantarray, $textref) unless @match; - - return _succeed ( $wantarray, $textref, - $match[2], $match[5]+2, # MATCH - @match[8,9], # REMAINDER - @match[0,1], # PREFIX - ); -} - -sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel -{ - my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; - my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); - unless ($$textref =~ m/\G$pre/gc) - { - _failmsg "Did not find prefix: /$pre/", $startpos; - return; - } - - $ldelpos = pos $$textref; - - unless ($$textref =~ m/\G($ldel)/gc) - { - _failmsg "Did not find opening bracket after prefix: \"$pre\"", - pos $$textref; - pos $$textref = $startpos; - return; - } - - my @nesting = ( $1 ); - my $textlen = length $$textref; - while (pos $$textref < $textlen) - { - next if $$textref =~ m/\G\\./gcs; - - if ($$textref =~ m/\G($ldel)/gc) - { - push @nesting, $1; - } - elsif ($$textref =~ m/\G($rdel)/gc) - { - my ($found, $brackettype) = ($1, $1); - if ($#nesting < 0) - { - _failmsg "Unmatched closing bracket: \"$found\"", - pos $$textref; - pos $$textref = $startpos; - return; - } - my $expected = pop(@nesting); - $expected =~ tr/({[</)}]>/; - if ($expected ne $brackettype) - { - _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"}, - pos $$textref; - pos $$textref = $startpos; - return; - } - last if $#nesting < 0; - } - elsif ($qdel && $$textref =~ m/\G([$qdel])/gc) - { - $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next; - _failmsg "Unmatched embedded quote ($1)", - pos $$textref; - pos $$textref = $startpos; - return; - } - elsif ($quotelike && _match_quotelike($textref,"",1,0)) - { - next; - } - - else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } - } - if ($#nesting>=0) - { - _failmsg "Unmatched opening bracket(s): " - . join("..",@nesting)."..", - pos $$textref; - pos $$textref = $startpos; - return; - } - - $endpos = pos $$textref; - - return ( - $startpos, $ldelpos-$startpos, # PREFIX - $ldelpos, 1, # OPENING BRACKET - $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS - $endpos-1, 1, # CLOSING BRACKET - $endpos, length($$textref)-$endpos, # REMAINDER - ); -} - -sub _revbracket($) -{ - my $brack = reverse $_[0]; - $brack =~ tr/[({</])}>/; - return $brack; -} - -my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; - -sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - my $ldel = $_[1]; - my $rdel = $_[2]; - my $pre = defined $_[3] ? $_[3] : '\s*'; - my %options = defined $_[4] ? %{$_[4]} : (); - my $omode = defined $options{fail} ? $options{fail} : ''; - my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) - : defined($options{reject}) ? $options{reject} - : '' - ; - my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) - : defined($options{ignore}) ? $options{ignore} - : '' - ; - - if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } - $@ = undef; - - my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); - - return _fail(wantarray, $textref) unless @match; - return _succeed wantarray, $textref, - $match[2], $match[3]+$match[5]+$match[7], # MATCH - @match[8..9,0..1,2..7]; # REM, PRE, BITS -} - -sub _match_tagged # ($$$$$$$) -{ - my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; - my $rdelspec; - - my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); - - unless ($$textref =~ m/\G($pre)/gc) - { - _failmsg "Did not find prefix: /$pre/", pos $$textref; - goto failed; - } - - $opentagpos = pos($$textref); - - unless ($$textref =~ m/\G$ldel/gc) - { - _failmsg "Did not find opening tag: /$ldel/", pos $$textref; - goto failed; - } - - $textpos = pos($$textref); - - if (!defined $rdel) - { - $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]); - unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes) - { - _failmsg "Unable to construct closing tag to match: $rdel", - pos $$textref; - goto failed; - } - } - else - { - $rdelspec = eval "qq{$rdel}" || do { - my $del; - for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',) - { next if $rdel =~ /\Q$_/; $del = $_; last } - unless ($del) { - use Carp; - croak "Can't interpolate right delimiter $rdel" - } - eval "qq$del$rdel$del"; - }; - } - - while (pos($$textref) < length($$textref)) - { - next if $$textref =~ m/\G\\./gc; - - if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) - { - $parapos = pos($$textref) - length($1) - unless defined $parapos; - } - elsif ($$textref =~ m/\G($rdelspec)/gc ) - { - $closetagpos = pos($$textref)-length($1); - goto matched; - } - elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) - { - next; - } - elsif ($bad && $$textref =~ m/\G($bad)/gcs) - { - pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS - goto short if ($omode eq 'PARA' || $omode eq 'MAX'); - _failmsg "Found invalid nested tag: $1", pos $$textref; - goto failed; - } - elsif ($$textref =~ m/\G($ldel)/gc) - { - my $tag = $1; - pos($$textref) -= length($tag); # REWIND TO NESTED TAG - unless (_match_tagged(@_)) # MATCH NESTED TAG - { - goto short if $omode eq 'PARA' || $omode eq 'MAX'; - _failmsg "Found unbalanced nested tag: $tag", - pos $$textref; - goto failed; - } - } - else { $$textref =~ m/./gcs } - } - -short: - $closetagpos = pos($$textref); - goto matched if $omode eq 'MAX'; - goto failed unless $omode eq 'PARA'; - - if (defined $parapos) { pos($$textref) = $parapos } - else { $parapos = pos($$textref) } - - return ( - $startpos, $opentagpos-$startpos, # PREFIX - $opentagpos, $textpos-$opentagpos, # OPENING TAG - $textpos, $parapos-$textpos, # TEXT - $parapos, 0, # NO CLOSING TAG - $parapos, length($$textref)-$parapos, # REMAINDER - ); - -matched: - $endpos = pos($$textref); - return ( - $startpos, $opentagpos-$startpos, # PREFIX - $opentagpos, $textpos-$opentagpos, # OPENING TAG - $textpos, $closetagpos-$textpos, # TEXT - $closetagpos, $endpos-$closetagpos, # CLOSING TAG - $endpos, length($$textref)-$endpos, # REMAINDER - ); - -failed: - _failmsg "Did not find closing tag", pos $$textref unless $@; - pos($$textref) = $startpos; - return; -} - -sub extract_variable (;$$) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - return ("","","") unless defined $$textref; - my $pre = defined $_[1] ? $_[1] : '\s*'; - - my @match = _match_variable($textref,$pre); - - return _fail wantarray, $textref unless @match; - - return _succeed wantarray, $textref, - @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX -} - -sub _match_variable($$) -{ -# $# -# $^ -# $$ - my ($textref, $pre) = @_; - my $startpos = pos($$textref) = pos($$textref)||0; - unless ($$textref =~ m/\G($pre)/gc) - { - _failmsg "Did not find prefix: /$pre/", pos $$textref; - return; - } - my $varpos = pos($$textref); - unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) - { - unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc) - { - _failmsg "Did not find leading dereferencer", pos $$textref; - pos $$textref = $startpos; - return; - } - my $deref = $1; - - unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci - or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0) - or $deref eq '$#' or $deref eq '$$' ) - { - _failmsg "Bad identifier after dereferencer", pos $$textref; - pos $$textref = $startpos; - return; - } - } - - while (1) - { - next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc; - next if _match_codeblock($textref, - qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, - qr/[({[]/, qr/[)}\]]/, - qr/[({[]/, qr/[)}\]]/, 0); - next if _match_codeblock($textref, - qr/\s*/, qr/[{[]/, qr/[}\]]/, - qr/[{[]/, qr/[}\]]/, 0); - next if _match_variable($textref,'\s*->\s*'); - next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; - last; - } - - my $endpos = pos($$textref); - return ($startpos, $varpos-$startpos, - $varpos, $endpos-$varpos, - $endpos, length($$textref)-$endpos - ); -} - -sub extract_codeblock (;$$$$$) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - my $wantarray = wantarray; - my $ldel_inner = defined $_[1] ? $_[1] : '{'; - my $pre = defined $_[2] ? $_[2] : '\s*'; - my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; - my $rd = $_[4]; - my $rdel_inner = $ldel_inner; - my $rdel_outer = $ldel_outer; - my $posbug = pos; - for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } - for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } - for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) - { - $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' - } - pos = $posbug; - - my @match = _match_codeblock($textref, $pre, - $ldel_outer, $rdel_outer, - $ldel_inner, $rdel_inner, - $rd); - return _fail($wantarray, $textref) unless @match; - return _succeed($wantarray, $textref, - @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX - ); - -} - -sub _match_codeblock($$$$$$$) -{ - my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_; - my $startpos = pos($$textref) = pos($$textref) || 0; - unless ($$textref =~ m/\G($pre)/gc) - { - _failmsg qq{Did not match prefix /$pre/ at"} . - substr($$textref,pos($$textref),20) . - q{..."}, - pos $$textref; - return; - } - my $codepos = pos($$textref); - unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER - { - _failmsg qq{Did not find expected opening bracket at "} . - substr($$textref,pos($$textref),20) . - q{..."}, - pos $$textref; - pos $$textref = $startpos; - return; - } - my $closing = $1; - $closing =~ tr/([<{/)]>}/; - my $matched; - my $patvalid = 1; - while (pos($$textref) < length($$textref)) - { - $matched = ''; - if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) - { - $patvalid = 0; - next; - } - - if ($$textref =~ m/\G\s*#.*/gc) - { - next; - } - - if ($$textref =~ m/\G\s*($rdel_outer)/gc) - { - unless ($matched = ($closing && $1 eq $closing) ) - { - next if $1 eq '>'; # MIGHT BE A "LESS THAN" - _failmsg q{Mismatched closing bracket at "} . - substr($$textref,pos($$textref),20) . - qq{...". Expected '$closing'}, - pos $$textref; - } - last; - } - - if (_match_variable($textref,'\s*') || - _match_quotelike($textref,'\s*',$patvalid,$patvalid) ) - { - $patvalid = 0; - next; - } - - - # NEED TO COVER MANY MORE CASES HERE!!! - if ($$textref =~ m#\G\s*(?!$ldel_inner) - ( [-+*x/%^&|.]=? - | [!=]~ - | =(?!>) - | (\*\*|&&|\|\||<<|>>)=? - | split|grep|map|return - | [([] - )#gcx) - { - $patvalid = 1; - next; - } - - if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) ) - { - $patvalid = 1; - next; - } - - if ($$textref =~ m/\G\s*$ldel_outer/gc) - { - _failmsg q{Improperly nested codeblock at "} . - substr($$textref,pos($$textref),20) . - q{..."}, - pos $$textref; - last; - } - - $patvalid = 0; - $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; - } - continue { $@ = undef } - - unless ($matched) - { - _failmsg 'No match found for opening bracket', pos $$textref - unless $@; - return; - } - - my $endpos = pos($$textref); - return ( $startpos, $codepos-$startpos, - $codepos, $endpos-$codepos, - $endpos, length($$textref)-$endpos, - ); -} - - -my %mods = ( - 'none' => '[cgimsox]*', - 'm' => '[cgimsox]*', - 's' => '[cegimsox]*', - 'tr' => '[cds]*', - 'y' => '[cds]*', - 'qq' => '', - 'qx' => '', - 'qw' => '', - 'qr' => '[imsx]*', - 'q' => '', - ); - -sub extract_quotelike (;$$) -{ - my $textref = $_[0] ? \$_[0] : \$_; - my $wantarray = wantarray; - my $pre = defined $_[1] ? $_[1] : '\s*'; - - my @match = _match_quotelike($textref,$pre,1,0); - return _fail($wantarray, $textref) unless @match; - return _succeed($wantarray, $textref, - $match[2], $match[18]-$match[2], # MATCH - @match[18,19], # REMAINDER - @match[0,1], # PREFIX - @match[2..17], # THE BITS - @match[20,21], # ANY FILLET? - ); -}; - -sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match) -{ - my ($textref, $pre, $rawmatch, $qmark) = @_; - - my ($textlen,$startpos, - $oppos, - $preld1pos,$ld1pos,$str1pos,$rd1pos, - $preld2pos,$ld2pos,$str2pos,$rd2pos, - $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); - - unless ($$textref =~ m/\G($pre)/gc) - { - _failmsg qq{Did not find prefix /$pre/ at "} . - substr($$textref, pos($$textref), 20) . - q{..."}, - pos $$textref; - return; - } - $oppos = pos($$textref); - - my $initial = substr($$textref,$oppos,1); - - if ($initial && $initial =~ m|^[\"\'\`]| - || $rawmatch && $initial =~ m|^/| - || $qmark && $initial =~ m|^\?|) - { - unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx) - { - _failmsg qq{Did not find closing delimiter to match '$initial' at "} . - substr($$textref, $oppos, 20) . - q{..."}, - pos $$textref; - pos $$textref = $startpos; - return; - } - $modpos= pos($$textref); - $rd1pos = $modpos-1; - - if ($initial eq '/' || $initial eq '?') - { - $$textref =~ m/\G$mods{none}/gc - } - - my $endpos = pos($$textref); - return ( - $startpos, $oppos-$startpos, # PREFIX - $oppos, 0, # NO OPERATOR - $oppos, 1, # LEFT DEL - $oppos+1, $rd1pos-$oppos-1, # STR/PAT - $rd1pos, 1, # RIGHT DEL - $modpos, 0, # NO 2ND LDEL - $modpos, 0, # NO 2ND STR - $modpos, 0, # NO 2ND RDEL - $modpos, $endpos-$modpos, # MODIFIERS - $endpos, $textlen-$endpos, # REMAINDER - ); - } - - unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc) - { - _failmsg q{No quotelike operator found after prefix at "} . - substr($$textref, pos($$textref), 20) . - q{..."}, - pos $$textref; - pos $$textref = $startpos; - return; - } - - my $op = $1; - $preld1pos = pos($$textref); - if ($op eq '<<') { - $ld1pos = pos($$textref); - my $label; - if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) { - $label = $1; - } - elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) ' - | \G " ([^"\\]* (?:\\.[^"\\]*)*) " - | \G ` ([^`\\]* (?:\\.[^`\\]*)*) ` - }gcsx) { - $label = $+; - } - else { - $label = ""; - } - my $extrapos = pos($$textref); - $$textref =~ m{.*\n}gc; - $str1pos = pos($$textref)--; - unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) { - _failmsg qq{Missing here doc terminator ('$label') after "} . - substr($$textref, $startpos, 20) . - q{..."}, - pos $$textref; - pos $$textref = $startpos; - return; - } - $rd1pos = pos($$textref); - $$textref =~ m{\Q$label\E\n}gc; - $ld2pos = pos($$textref); - return ( - $startpos, $oppos-$startpos, # PREFIX - $oppos, length($op), # OPERATOR - $ld1pos, $extrapos-$ld1pos, # LEFT DEL - $str1pos, $rd1pos-$str1pos, # STR/PAT - $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL - $ld2pos, 0, # NO 2ND LDEL - $ld2pos, 0, # NO 2ND STR - $ld2pos, 0, # NO 2ND RDEL - $ld2pos, 0, # NO MODIFIERS - $ld2pos, $textlen-$ld2pos, # REMAINDER - $extrapos, $str1pos-$extrapos, # FILLETED BIT - ); - } - - $$textref =~ m/\G\s*/gc; - $ld1pos = pos($$textref); - $str1pos = $ld1pos+1; - - unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD - { - _failmsg "No block delimiter found after quotelike $op", - pos $$textref; - pos $$textref = $startpos; - return; - } - pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN - my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); - if ($ldel1 =~ /[[(<{]/) - { - $rdel1 =~ tr/[({</])}>/; - defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1)) - || do { pos $$textref = $startpos; return }; - $ld2pos = pos($$textref); - $rd1pos = $ld2pos-1; - } - else - { - $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs - || do { pos $$textref = $startpos; return }; - $ld2pos = $rd1pos = pos($$textref)-1; - } - - my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; - if ($second_arg) - { - my ($ldel2, $rdel2); - if ($ldel1 =~ /[[(<{]/) - { - unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD - { - _failmsg "Missing second block for quotelike $op", - pos $$textref; - pos $$textref = $startpos; - return; - } - $ldel2 = $rdel2 = "\Q$1"; - $rdel2 =~ tr/[({</])}>/; - } - else - { - $ldel2 = $rdel2 = $ldel1; - } - $str2pos = $ld2pos+1; - - if ($ldel2 =~ /[[(<{]/) - { - pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD - defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2)) - || do { pos $$textref = $startpos; return }; - } - else - { - $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs - || do { pos $$textref = $startpos; return }; - } - $rd2pos = pos($$textref)-1; - } - else - { - $ld2pos = $str2pos = $rd2pos = $rd1pos; - } - - $modpos = pos $$textref; - - $$textref =~ m/\G($mods{$op})/gc; - my $endpos = pos $$textref; - - return ( - $startpos, $oppos-$startpos, # PREFIX - $oppos, length($op), # OPERATOR - $ld1pos, 1, # LEFT DEL - $str1pos, $rd1pos-$str1pos, # STR/PAT - $rd1pos, 1, # RIGHT DEL - $ld2pos, $second_arg, # 2ND LDEL (MAYBE) - $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) - $rd2pos, $second_arg, # 2ND RDEL (MAYBE) - $modpos, $endpos-$modpos, # MODIFIERS - $endpos, $textlen-$endpos, # REMAINDER - ); -} - -my $def_func = -[ - sub { extract_variable($_[0], '') }, - sub { extract_quotelike($_[0],'') }, - sub { extract_codeblock($_[0],'{}','') }, -]; - -sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) -{ - my $textref = defined($_[0]) ? \$_[0] : \$_; - my $posbug = pos; - my ($lastpos, $firstpos); - my @fields = (); - - #for ($$textref) - { - my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; - my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; - my $igunk = $_[3]; - - pos $$textref ||= 0; - - unless (wantarray) - { - use Carp; - carp "extract_multiple reset maximal count to 1 in scalar context" - if $^W && defined($_[2]) && $max > 1; - $max = 1 - } - - my $unkpos; - my $func; - my $class; - - my @class; - foreach $func ( @func ) - { - if (ref($func) eq 'HASH') - { - push @class, (keys %$func)[0]; - $func = (values %$func)[0]; - } - else - { - push @class, undef; - } - } - - FIELD: while (pos($$textref) < length($$textref)) - { - my ($field, $rem); - my @bits; - foreach my $i ( 0..$#func ) - { - my $pref; - $func = $func[$i]; - $class = $class[$i]; - $lastpos = pos $$textref; - if (ref($func) eq 'CODE') - { ($field,$rem,$pref) = @bits = $func->($$textref) } - elsif (ref($func) eq 'Text::Balanced::Extractor') - { @bits = $field = $func->extract($$textref) } - elsif( $$textref =~ m/\G$func/gc ) - { @bits = $field = defined($1) - ? $1 - : substr($$textref, $-[0], $+[0] - $-[0]) - } - $pref ||= ""; - if (defined($field) && length($field)) - { - if (!$igunk) { - $unkpos = $lastpos - if length($pref) && !defined($unkpos); - if (defined $unkpos) - { - push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref; - $firstpos = $unkpos unless defined $firstpos; - undef $unkpos; - last FIELD if @fields == $max; - } - } - push @fields, $class - ? bless (\$field, $class) - : $field; - $firstpos = $lastpos unless defined $firstpos; - $lastpos = pos $$textref; - last FIELD if @fields == $max; - next FIELD; - } - } - if ($$textref =~ /\G(.)/gcs) - { - $unkpos = pos($$textref)-1 - unless $igunk || defined $unkpos; - } - } - - if (defined $unkpos) - { - push @fields, substr($$textref, $unkpos); - $firstpos = $unkpos unless defined $firstpos; - $lastpos = length $$textref; - } - last; - } - - pos $$textref = $lastpos; - return @fields if wantarray; - - $firstpos ||= 0; - eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; - pos $$textref = $firstpos }; - return $fields[0]; -} - - -sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) -{ - my $ldel = $_[0]; - my $rdel = $_[1]; - my $pre = defined $_[2] ? $_[2] : '\s*'; - my %options = defined $_[3] ? %{$_[3]} : (); - my $omode = defined $options{fail} ? $options{fail} : ''; - my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) - : defined($options{reject}) ? $options{reject} - : '' - ; - my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) - : defined($options{ignore}) ? $options{ignore} - : '' - ; - - if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } - - my $posbug = pos; - for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ } - pos = $posbug; - - my $closure = sub - { - my $textref = defined $_[0] ? \$_[0] : \$_; - my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); - - return _fail(wantarray, $textref) unless @match; - return _succeed wantarray, $textref, - $match[2], $match[3]+$match[5]+$match[7], # MATCH - @match[8..9,0..1,2..7]; # REM, PRE, BITS - }; - - bless $closure, 'Text::Balanced::Extractor'; -} - -package Text::Balanced::Extractor; - -sub extract($$) # ($self, $text) -{ - &{$_[0]}($_[1]); -} - -package Text::Balanced::ErrorMsg; - -use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" }; - -1; - -__END__ - -=head1 NAME - -Text::Balanced - Extract delimited text sequences from strings. - - -=head1 SYNOPSIS - - use Text::Balanced qw ( - extract_delimited - extract_bracketed - extract_quotelike - extract_codeblock - extract_variable - extract_tagged - extract_multiple - - gen_delimited_pat - gen_extract_tagged - ); - - # Extract the initial substring of $text that is delimited by - # two (unescaped) instances of the first character in $delim. - - ($extracted, $remainder) = extract_delimited($text,$delim); - - - # Extract the initial substring of $text that is bracketed - # with a delimiter(s) specified by $delim (where the string - # in $delim contains one or more of '(){}[]<>'). - - ($extracted, $remainder) = extract_bracketed($text,$delim); - - - # Extract the initial substring of $text that is bounded by - # an XML tag. - - ($extracted, $remainder) = extract_tagged($text); - - - # Extract the initial substring of $text that is bounded by - # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags - - ($extracted, $remainder) = - extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]}); - - - # Extract the initial substring of $text that represents a - # Perl "quote or quote-like operation" - - ($extracted, $remainder) = extract_quotelike($text); - - - # Extract the initial substring of $text that represents a block - # of Perl code, bracketed by any of character(s) specified by $delim - # (where the string $delim contains one or more of '(){}[]<>'). - - ($extracted, $remainder) = extract_codeblock($text,$delim); - - - # Extract the initial substrings of $text that would be extracted by - # one or more sequential applications of the specified functions - # or regular expressions - - @extracted = extract_multiple($text, - [ \&extract_bracketed, - \&extract_quotelike, - \&some_other_extractor_sub, - qr/[xyz]*/, - 'literal', - ]); - -# Create a string representing an optimized pattern (a la Friedl) -# that matches a substring delimited by any of the specified characters -# (in this case: any type of quote or a slash) - - $patstring = gen_delimited_pat(q{'"`/}); - - -# Generate a reference to an anonymous sub that is just like extract_tagged -# but pre-compiled and optimized for a specific pair of tags, and consequently -# much faster (i.e. 3 times faster). It uses qr// for better performance on -# repeated calls, so it only works under Perl 5.005 or later. - - $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); - - ($extracted, $remainder) = $extract_head->($text); - - -=head1 DESCRIPTION - -The various C<extract_...> subroutines may be used to -extract a delimited substring, possibly after skipping a -specified prefix string. By default, that prefix is -optional whitespace (C</\s*/>), but you can change it to whatever -you wish (see below). - -The substring to be extracted must appear at the -current C<pos> location of the string's variable -(or at index zero, if no C<pos> position is defined). -In other words, the C<extract_...> subroutines I<don't> -extract the first occurrence of a substring anywhere -in a string (like an unanchored regex would). Rather, -they extract an occurrence of the substring appearing -immediately at the current matching position in the -string (like a C<\G>-anchored regex would). - - - -=head2 General behaviour in list contexts - -In a list context, all the subroutines return a list, the first three -elements of which are always: - -=over 4 - -=item [0] - -The extracted string, including the specified delimiters. -If the extraction fails C<undef> is returned. - -=item [1] - -The remainder of the input string (i.e. the characters after the -extracted string). On failure, the entire string is returned. - -=item [2] - -The skipped prefix (i.e. the characters before the extracted string). -On failure, C<undef> is returned. - -=back - -Note that in a list context, the contents of the original input text (the first -argument) are not modified in any way. - -However, if the input text was passed in a variable, that variable's -C<pos> value is updated to point at the first character after the -extracted text. That means that in a list context the various -subroutines can be used much like regular expressions. For example: - - while ( $next = (extract_quotelike($text))[0] ) - { - # process next quote-like (in $next) - } - - -=head2 General behaviour in scalar and void contexts - -In a scalar context, the extracted string is returned, having first been -removed from the input text. Thus, the following code also processes -each quote-like operation, but actually removes them from $text: - - while ( $next = extract_quotelike($text) ) - { - # process next quote-like (in $next) - } - -Note that if the input text is a read-only string (i.e. a literal), -no attempt is made to remove the extracted text. - -In a void context the behaviour of the extraction subroutines is -exactly the same as in a scalar context, except (of course) that the -extracted substring is not returned. - -=head2 A note about prefixes - -Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.) -This can bite you if you're expecting a prefix specification like -'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix -pattern will only succeed if the <H1> tag is on the current line, since -. normally doesn't match newlines. - -To overcome this limitation, you need to turn on /s matching within -the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' - - -=head2 C<extract_delimited> - -The C<extract_delimited> function formalizes the common idiom -of extracting a single-character-delimited substring from the start of -a string. For example, to extract a single-quote delimited string, the -following code is typically used: - - ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; - $extracted = $1; - -but with C<extract_delimited> it can be simplified to: - - ($extracted,$remainder) = extract_delimited($text, "'"); - -C<extract_delimited> takes up to four scalars (the input text, the -delimiters, a prefix pattern to be skipped, and any escape characters) -and extracts the initial substring of the text that -is appropriately delimited. If the delimiter string has multiple -characters, the first one encountered in the text is taken to delimit -the substring. -The third argument specifies a prefix pattern that is to be skipped -(but must be present!) before the substring is extracted. -The final argument specifies the escape character to be used for each -delimiter. - -All arguments are optional. If the escape characters are not specified, -every delimiter is escaped with a backslash (C<\>). -If the prefix is not specified, the -pattern C<'\s*'> - optional whitespace - is used. If the delimiter set -is also not specified, the set C</["'`]/> is used. If the text to be processed -is not specified either, C<$_> is used. - -In list context, C<extract_delimited> returns an array of three -elements, the extracted substring (I<including the surrounding -delimiters>), the remainder of the text, and the skipped prefix (if -any). If a suitable delimited substring is not found, the first -element of the array is the empty string, the second is the complete -original text, and the prefix returned in the third element is an -empty string. - -In a scalar context, just the extracted substring is returned. In -a void context, the extracted substring (and any prefix) are simply -removed from the beginning of the first argument. - -Examples: - - # Remove a single-quoted substring from the very beginning of $text: - - $substring = extract_delimited($text, "'", ''); - - # Remove a single-quoted Pascalish substring (i.e. one in which - # doubling the quote character escapes it) from the very - # beginning of $text: - - $substring = extract_delimited($text, "'", '', "'"); - - # Extract a single- or double- quoted substring from the - # beginning of $text, optionally after some whitespace - # (note the list context to protect $text from modification): - - ($substring) = extract_delimited $text, q{"'}; - - - # Delete the substring delimited by the first '/' in $text: - - $text = join '', (extract_delimited($text,'/','[^/]*')[2,1]; - -Note that this last example is I<not> the same as deleting the first -quote-like pattern. For instance, if C<$text> contained the string: - - "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" - -then after the deletion it would contain: - - "if ('.$UNIXCMD/s) { $cmd = $1; }" - -not: - - "if ('./cmd' =~ ms) { $cmd = $1; }" - - -See L<"extract_quotelike"> for a (partial) solution to this problem. - - -=head2 C<extract_bracketed> - -Like C<"extract_delimited">, the C<extract_bracketed> function takes -up to three optional scalar arguments: a string to extract from, a delimiter -specifier, and a prefix pattern. As before, a missing prefix defaults to -optional whitespace and a missing text defaults to C<$_>. However, a missing -delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). - -C<extract_bracketed> extracts a balanced-bracket-delimited -substring (using any one (or more) of the user-specified delimiter -brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also -respect quoted unbalanced brackets (see below). - -A "delimiter bracket" is a bracket in list of delimiters passed as -C<extract_bracketed>'s second argument. Delimiter brackets are -specified by giving either the left or right (or both!) versions -of the required bracket(s). Note that the order in which -two or more delimiter brackets are specified is not significant. - -A "balanced-bracket-delimited substring" is a substring bounded by -matched brackets, such that any other (left or right) delimiter -bracket I<within> the substring is also matched by an opposite -(right or left) delimiter bracket I<at the same level of nesting>. Any -type of bracket not in the delimiter list is treated as an ordinary -character. - -In other words, each type of bracket specified as a delimiter must be -balanced and correctly nested within the substring, and any other kind of -("non-delimiter") bracket in the substring is ignored. - -For example, given the string: - - $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; - -then a call to C<extract_bracketed> in a list context: - - @result = extract_bracketed( $text, '{}' ); - -would return: - - ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) - -since both sets of C<'{..}'> brackets are properly nested and evenly balanced. -(In a scalar context just the first element of the array would be returned. In -a void context, C<$text> would be replaced by an empty string.) - -Likewise the call in: - - @result = extract_bracketed( $text, '{[' ); - -would return the same result, since all sets of both types of specified -delimiter brackets are correctly nested and balanced. - -However, the call in: - - @result = extract_bracketed( $text, '{([<' ); - -would fail, returning: - - ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); - -because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and -the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would -return an empty string. In a void context, C<$text> would be unchanged.) - -Note that the embedded single-quotes in the string don't help in this -case, since they have not been specified as acceptable delimiters and are -therefore treated as non-delimiter characters (and ignored). - -However, if a particular species of quote character is included in the -delimiter specification, then that type of quote will be correctly handled. -for example, if C<$text> is: - - $text = '<A HREF=">>>>">link</A>'; - -then - - @result = extract_bracketed( $text, '<">' ); - -returns: - - ( '<A HREF=">>>>">', 'link</A>', "" ) - -as expected. Without the specification of C<"> as an embedded quoter: - - @result = extract_bracketed( $text, '<>' ); - -the result would be: - - ( '<A HREF=">', '>>>">link</A>', "" ) - -In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like -quoting (i.e. q{string}, qq{string}, etc) can be specified by including the -letter 'q' as a delimiter. Hence: - - @result = extract_bracketed( $text, '<q>' ); - -would correctly match something like this: - - $text = '<leftop: conj /and/ conj>'; - -See also: C<"extract_quotelike"> and C<"extract_codeblock">. - - -=head2 C<extract_variable> - -C<extract_variable> extracts any valid Perl variable or -variable-involved expression, including scalars, arrays, hashes, array -accesses, hash look-ups, method calls through objects, subroutine calls -through subroutine references, etc. - -The subroutine takes up to two optional arguments: - -=over 4 - -=item 1. - -A string to be processed (C<$_> if the string is omitted or C<undef>) - -=item 2. - -A string specifying a pattern to be matched as a prefix (which is to be -skipped). If omitted, optional whitespace is skipped. - -=back - -On success in a list context, an array of 3 elements is returned. The -elements are: - -=over 4 - -=item [0] - -the extracted variable, or variablish expression - -=item [1] - -the remainder of the input text, - -=item [2] - -the prefix substring (if any), - -=back - -On failure, all of these values (except the remaining text) are C<undef>. - -In a scalar context, C<extract_variable> returns just the complete -substring that matched a variablish expression. C<undef> is returned on -failure. In addition, the original input text has the returned substring -(and any prefix) removed from it. - -In a void context, the input text just has the matched substring (and -any specified prefix) removed. - - -=head2 C<extract_tagged> - -C<extract_tagged> extracts and segments text between (balanced) -specified tags. - -The subroutine takes up to five optional arguments: - -=over 4 - -=item 1. - -A string to be processed (C<$_> if the string is omitted or C<undef>) - -=item 2. - -A string specifying a pattern to be matched as the opening tag. -If the pattern string is omitted (or C<undef>) then a pattern -that matches any standard XML tag is used. - -=item 3. - -A string specifying a pattern to be matched at the closing tag. -If the pattern string is omitted (or C<undef>) then the closing -tag is constructed by inserting a C</> after any leading bracket -characters in the actual opening tag that was matched (I<not> the pattern -that matched the tag). For example, if the opening tag pattern -is specified as C<'{{\w+}}'> and actually matched the opening tag -C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. - -=item 4. - -A string specifying a pattern to be matched as a prefix (which is to be -skipped). If omitted, optional whitespace is skipped. - -=item 5. - -A hash reference containing various parsing options (see below) - -=back - -The various options that can be specified are: - -=over 4 - -=item C<reject =E<gt> $listref> - -The list reference contains one or more strings specifying patterns -that must I<not> appear within the tagged text. - -For example, to extract -an HTML link (which should not contain nested links) use: - - extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); - -=item C<ignore =E<gt> $listref> - -The list reference contains one or more strings specifying patterns -that are I<not> be be treated as nested tags within the tagged text -(even if they would match the start tag pattern). - -For example, to extract an arbitrary XML tag, but ignore "empty" elements: - - extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); - -(also see L<"gen_delimited_pat"> below). - - -=item C<fail =E<gt> $str> - -The C<fail> option indicates the action to be taken if a matching end -tag is not encountered (i.e. before the end of the string or some -C<reject> pattern matches). By default, a failure to match a closing -tag causes C<extract_tagged> to immediately fail. - -However, if the string value associated with <reject> is "MAX", then -C<extract_tagged> returns the complete text up to the point of failure. -If the string is "PARA", C<extract_tagged> returns only the first paragraph -after the tag (up to the first line that is either empty or contains -only whitespace characters). -If the string is "", the the default behaviour (i.e. failure) is reinstated. - -For example, suppose the start tag "/para" introduces a paragraph, which then -continues until the next "/endpara" tag or until another "/para" tag is -encountered: - - $text = "/para line 1\n\nline 3\n/para line 4"; - - extract_tagged($text, '/para', '/endpara', undef, - {reject => '/para', fail => MAX ); - - # EXTRACTED: "/para line 1\n\nline 3\n" - -Suppose instead, that if no matching "/endpara" tag is found, the "/para" -tag refers only to the immediately following paragraph: - - $text = "/para line 1\n\nline 3\n/para line 4"; - - extract_tagged($text, '/para', '/endpara', undef, - {reject => '/para', fail => MAX ); - - # EXTRACTED: "/para line 1\n" - -Note that the specified C<fail> behaviour applies to nested tags as well. - -=back - -On success in a list context, an array of 6 elements is returned. The elements are: - -=over 4 - -=item [0] - -the extracted tagged substring (including the outermost tags), - -=item [1] - -the remainder of the input text, - -=item [2] - -the prefix substring (if any), - -=item [3] - -the opening tag - -=item [4] - -the text between the opening and closing tags - -=item [5] - -the closing tag (or "" if no closing tag was found) - -=back - -On failure, all of these values (except the remaining text) are C<undef>. - -In a scalar context, C<extract_tagged> returns just the complete -substring that matched a tagged text (including the start and end -tags). C<undef> is returned on failure. In addition, the original input -text has the returned substring (and any prefix) removed from it. - -In a void context, the input text just has the matched substring (and -any specified prefix) removed. - - -=head2 C<gen_extract_tagged> - -(Note: This subroutine is only available under Perl5.005) - -C<gen_extract_tagged> generates a new anonymous subroutine which -extracts text between (balanced) specified tags. In other words, -it generates a function identical in function to C<extract_tagged>. - -The difference between C<extract_tagged> and the anonymous -subroutines generated by -C<gen_extract_tagged>, is that those generated subroutines: - -=over 4 - -=item * - -do not have to reparse tag specification or parsing options every time -they are called (whereas C<extract_tagged> has to effectively rebuild -its tag parser on every call); - -=item * - -make use of the new qr// construct to pre-compile the regexes they use -(whereas C<extract_tagged> uses standard string variable interpolation -to create tag-matching patterns). - -=back - -The subroutine takes up to four optional arguments (the same set as -C<extract_tagged> except for the string to be processed). It returns -a reference to a subroutine which in turn takes a single argument (the text to -be extracted from). - -In other words, the implementation of C<extract_tagged> is exactly -equivalent to: - - sub extract_tagged - { - my $text = shift; - $extractor = gen_extract_tagged(@_); - return $extractor->($text); - } - -(although C<extract_tagged> is not currently implemented that way, in order -to preserve pre-5.005 compatibility). - -Using C<gen_extract_tagged> to create extraction functions for specific tags -is a good idea if those functions are going to be called more than once, since -their performance is typically twice as good as the more general-purpose -C<extract_tagged>. - - -=head2 C<extract_quotelike> - -C<extract_quotelike> attempts to recognize, extract, and segment any -one of the various Perl quotes and quotelike operators (see -L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket -delimiters (for the quotelike operators), and trailing modifiers are -all caught. For example, in: - - extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' - - extract_quotelike ' "You said, \"Use sed\"." ' - - extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' - - extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' - -the full Perl quotelike operations are all extracted correctly. - -Note too that, when using the /x modifier on a regex, any comment -containing the current pattern delimiter will cause the regex to be -immediately terminated. In other words: - - 'm / - (?i) # CASE INSENSITIVE - [a-z_] # LEADING ALPHABETIC/UNDERSCORE - [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS - /x' - -will be extracted as if it were: - - 'm / - (?i) # CASE INSENSITIVE - [a-z_] # LEADING ALPHABETIC/' - -This behaviour is identical to that of the actual compiler. - -C<extract_quotelike> takes two arguments: the text to be processed and -a prefix to be matched at the very beginning of the text. If no prefix -is specified, optional whitespace is the default. If no text is given, -C<$_> is used. - -In a list context, an array of 11 elements is returned. The elements are: - -=over 4 - -=item [0] - -the extracted quotelike substring (including trailing modifiers), - -=item [1] - -the remainder of the input text, - -=item [2] - -the prefix substring (if any), - -=item [3] - -the name of the quotelike operator (if any), - -=item [4] - -the left delimiter of the first block of the operation, - -=item [5] - -the text of the first block of the operation -(that is, the contents of -a quote, the regex of a match or substitution or the target list of a -translation), - -=item [6] - -the right delimiter of the first block of the operation, - -=item [7] - -the left delimiter of the second block of the operation -(that is, if it is a C<s>, C<tr>, or C<y>), - -=item [8] - -the text of the second block of the operation -(that is, the replacement of a substitution or the translation list -of a translation), - -=item [9] - -the right delimiter of the second block of the operation (if any), - -=item [10] - -the trailing modifiers on the operation (if any). - -=back - -For each of the fields marked "(if any)" the default value on success is -an empty string. -On failure, all of these values (except the remaining text) are C<undef>. - - -In a scalar context, C<extract_quotelike> returns just the complete substring -that matched a quotelike operation (or C<undef> on failure). In a scalar or -void context, the input text has the same substring (and any specified -prefix) removed. - -Examples: - - # Remove the first quotelike literal that appears in text - - $quotelike = extract_quotelike($text,'.*?'); - - # Replace one or more leading whitespace-separated quotelike - # literals in $_ with "<QLL>" - - do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; - - - # Isolate the search pattern in a quotelike operation from $text - - ($op,$pat) = (extract_quotelike $text)[3,5]; - if ($op =~ /[ms]/) - { - print "search pattern: $pat\n"; - } - else - { - print "$op is not a pattern matching operation\n"; - } - - -=head2 C<extract_quotelike> and "here documents" - -C<extract_quotelike> can successfully extract "here documents" from an input -string, but with an important caveat in list contexts. - -Unlike other types of quote-like literals, a here document is rarely -a contiguous substring. For example, a typical piece of code using -here document might look like this: - - <<'EOMSG' || die; - This is the message. - EOMSG - exit; - -Given this as an input string in a scalar context, C<extract_quotelike> -would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", -leaving the string " || die;\nexit;" in the original variable. In other words, -the two separate pieces of the here document are successfully extracted and -concatenated. - -In a list context, C<extract_quotelike> would return the list - -=over 4 - -=item [0] - -"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, -including fore and aft delimiters), - -=item [1] - -" || die;\nexit;" (i.e. the remainder of the input text, concatenated), - -=item [2] - -"" (i.e. the prefix substring -- trivial in this case), - -=item [3] - -"<<" (i.e. the "name" of the quotelike operator) - -=item [4] - -"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), - -=item [5] - -"This is the message.\n" (i.e. the text of the here document), - -=item [6] - -"EOMSG" (i.e. the right delimiter of the here document), - -=item [7..10] - -"" (a here document has no second left delimiter, second text, second right -delimiter, or trailing modifiers). - -=back - -However, the matching position of the input variable would be set to -"exit;" (i.e. I<after> the closing delimiter of the here document), -which would cause the earlier " || die;\nexit;" to be skipped in any -sequence of code fragment extractions. - -To avoid this problem, when it encounters a here document whilst -extracting from a modifiable string, C<extract_quotelike> silently -rearranges the string to an equivalent piece of Perl: - - <<'EOMSG' - This is the message. - EOMSG - || die; - exit; - -in which the here document I<is> contiguous. It still leaves the -matching position after the here document, but now the rest of the line -on which the here document starts is not skipped. - -To prevent <extract_quotelike> from mucking about with the input in this way -(this is the only case where a list-context C<extract_quotelike> does so), -you can pass the input variable as an interpolated literal: - - $quotelike = extract_quotelike("$var"); - - -=head2 C<extract_codeblock> - -C<extract_codeblock> attempts to recognize and extract a balanced -bracket delimited substring that may contain unbalanced brackets -inside Perl quotes or quotelike operations. That is, C<extract_codeblock> -is like a combination of C<"extract_bracketed"> and -C<"extract_quotelike">. - -C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: -a text to process, a set of delimiter brackets to look for, and a prefix to -match first. It also takes an optional fourth parameter, which allows the -outermost delimiter brackets to be specified separately (see below). - -Omitting the first argument (input text) means process C<$_> instead. -Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. -Omitting the third argument (prefix argument) implies optional whitespace at the start. -Omitting the fourth argument (outermost delimiter brackets) indicates that the -value of the second argument is to be used for the outermost delimiters. - -Once the prefix an dthe outermost opening delimiter bracket have been -recognized, code blocks are extracted by stepping through the input text and -trying the following alternatives in sequence: - -=over 4 - -=item 1. - -Try and match a closing delimiter bracket. If the bracket was the same -species as the last opening bracket, return the substring to that -point. If the bracket was mismatched, return an error. - -=item 2. - -Try to match a quote or quotelike operator. If found, call -C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return -the error it returned. Otherwise go back to step 1. - -=item 3. - -Try to match an opening delimiter bracket. If found, call -C<extract_codeblock> recursively to eat the embedded block. If the -recursive call fails, return an error. Otherwise, go back to step 1. - -=item 4. - -Unconditionally match a bareword or any other single character, and -then go back to step 1. - -=back - - -Examples: - - # Find a while loop in the text - - if ($text =~ s/.*?while\s*\{/{/) - { - $loop = "while " . extract_codeblock($text); - } - - # Remove the first round-bracketed list (which may include - # round- or curly-bracketed code blocks or quotelike operators) - - extract_codeblock $text, "(){}", '[^(]*'; - - -The ability to specify a different outermost delimiter bracket is useful -in some circumstances. For example, in the Parse::RecDescent module, -parser actions which are to be performed only on a successful parse -are specified using a C<E<lt>defer:...E<gt>> directive. For example: - - sentence: subject verb object - <defer: {$::theVerb = $item{verb}} > - -Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code -within the C<E<lt>defer:...E<gt>> directive, but there's a problem. - -A deferred action like this: - - <defer: {if ($count>10) {$count--}} > - -will be incorrectly parsed as: - - <defer: {if ($count> - -because the "less than" operator is interpreted as a closing delimiter. - -But, by extracting the directive using -S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> -the '>' character is only treated as a delimited at the outermost -level of the code block, so the directive is parsed correctly. - -=head2 C<extract_multiple> - -The C<extract_multiple> subroutine takes a string to be processed and a -list of extractors (subroutines or regular expressions) to apply to that string. - -In an array context C<extract_multiple> returns an array of substrings -of the original string, as extracted by the specified extractors. -In a scalar context, C<extract_multiple> returns the first -substring successfully extracted from the original string. In both -scalar and void contexts the original string has the first successfully -extracted substring removed from it. In all contexts -C<extract_multiple> starts at the current C<pos> of the string, and -sets that C<pos> appropriately after it matches. - -Hence, the aim of of a call to C<extract_multiple> in a list context -is to split the processed string into as many non-overlapping fields as -possible, by repeatedly applying each of the specified extractors -to the remainder of the string. Thus C<extract_multiple> is -a generalized form of Perl's C<split> subroutine. - -The subroutine takes up to four optional arguments: - -=over 4 - -=item 1. - -A string to be processed (C<$_> if the string is omitted or C<undef>) - -=item 2. - -A reference to a list of subroutine references and/or qr// objects and/or -literal strings and/or hash references, specifying the extractors -to be used to split the string. If this argument is omitted (or -C<undef>) the list: - - [ - sub { extract_variable($_[0], '') }, - sub { extract_quotelike($_[0],'') }, - sub { extract_codeblock($_[0],'{}','') }, - ] - -is used. - - -=item 3. - -An number specifying the maximum number of fields to return. If this -argument is omitted (or C<undef>), split continues as long as possible. - -If the third argument is I<N>, then extraction continues until I<N> fields -have been successfully extracted, or until the string has been completely -processed. - -Note that in scalar and void contexts the value of this argument is -automatically reset to 1 (under C<-w>, a warning is issued if the argument -has to be reset). - -=item 4. - -A value indicating whether unmatched substrings (see below) within the -text should be skipped or returned as fields. If the value is true, -such substrings are skipped. Otherwise, they are returned. - -=back - -The extraction process works by applying each extractor in -sequence to the text string. - -If the extractor is a subroutine it is called in a list context and is -expected to return a list of a single element, namely the extracted -text. It may optionally also return two further arguments: a string -representing the text left after extraction (like $' for a pattern -match), and a string representing any prefix skipped before the -extraction (like $` in a pattern match). Note that this is designed -to facilitate the use of other Text::Balanced subroutines with -C<extract_multiple>. Note too that the value returned by an extractor -subroutine need not bear any relationship to the corresponding substring -of the original text (see examples below). - -If the extractor is a precompiled regular expression or a string, -it is matched against the text in a scalar context with a leading -'\G' and the gc modifiers enabled. The extracted value is either -$1 if that variable is defined after the match, or else the -complete match (i.e. $&). - -If the extractor is a hash reference, it must contain exactly one element. -The value of that element is one of the -above extractor types (subroutine reference, regular expression, or string). -The key of that element is the name of a class into which the successful -return value of the extractor will be blessed. - -If an extractor returns a defined value, that value is immediately -treated as the next extracted field and pushed onto the list of fields. -If the extractor was specified in a hash reference, the field is also -blessed into the appropriate class, - -If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is -assumed to have failed to extract. -If none of the extractor subroutines succeeds, then one -character is extracted from the start of the text and the extraction -subroutines reapplied. Characters which are thus removed are accumulated and -eventually become the next field (unless the fourth argument is true, in which -case they are discarded). - -For example, the following extracts substrings that are valid Perl variables: - - @fields = extract_multiple($text, - [ sub { extract_variable($_[0]) } ], - undef, 1); - -This example separates a text into fields which are quote delimited, -curly bracketed, and anything else. The delimited and bracketed -parts are also blessed to identify them (the "anything else" is unblessed): - - @fields = extract_multiple($text, - [ - { Delim => sub { extract_delimited($_[0],q{'"}) } }, - { Brack => sub { extract_bracketed($_[0],'{}') } }, - ]); - -This call extracts the next single substring that is a valid Perl quotelike -operator (and removes it from $text): - - $quotelike = extract_multiple($text, - [ - sub { extract_quotelike($_[0]) }, - ], undef, 1); - -Finally, here is yet another way to do comma-separated value parsing: - - @fields = extract_multiple($csv_text, - [ - sub { extract_delimited($_[0],q{'"}) }, - qr/([^,]+)(.*)/, - ], - undef,1); - -The list in the second argument means: -I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. -The undef third argument means: -I<"...as many times as possible...">, -and the true value in the fourth argument means -I<"...discarding anything else that appears (i.e. the commas)">. - -If you wanted the commas preserved as separate fields (i.e. like split -does if your split pattern has capturing parentheses), you would -just make the last parameter undefined (or remove it). - - -=head2 C<gen_delimited_pat> - -The C<gen_delimited_pat> subroutine takes a single (string) argument and - > builds a Friedl-style optimized regex that matches a string delimited -by any one of the characters in the single argument. For example: - - gen_delimited_pat(q{'"}) - -returns the regex: - - (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') - -Note that the specified delimiters are automatically quotemeta'd. - -A typical use of C<gen_delimited_pat> would be to build special purpose tags -for C<extract_tagged>. For example, to properly ignore "empty" XML elements -(which might contain quoted strings): - - my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; - - extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); - - -C<gen_delimited_pat> may also be called with an optional second argument, -which specifies the "escape" character(s) to be used for each delimiter. -For example to match a Pascal-style string (where ' is the delimiter -and '' is a literal ' within the string): - - gen_delimited_pat(q{'},q{'}); - -Different escape characters can be specified for different delimiters. -For example, to specify that '/' is the escape for single quotes -and '%' is the escape for double quotes: - - gen_delimited_pat(q{'"},q{/%}); - -If more delimiters than escape chars are specified, the last escape char -is used for the remaining delimiters. -If no escape char is specified for a given specified delimiter, '\' is used. - -=head2 C<delimited_pat> - -Note that C<gen_delimited_pat> was previously called C<delimited_pat>. -That name may still be used, but is now deprecated. - - -=head1 DIAGNOSTICS - -In a list context, all the functions return C<(undef,$original_text)> -on failure. In a scalar context, failure is indicated by returning C<undef> -(in this case the input text is not modified in any way). - -In addition, on failure in I<any> context, the C<$@> variable is set. -Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed -below. -Accessing C<$@-E<gt>{pos}> returns the offset into the original string at -which the error was detected (although not necessarily where it occurred!) -Printing C<$@> directly produces the error message, with the offset appended. -On success, the C<$@> variable is guaranteed to be C<undef>. - -The available diagnostics are: - -=over 4 - -=item C<Did not find a suitable bracket: "%s"> - -The delimiter provided to C<extract_bracketed> was not one of -C<'()[]E<lt>E<gt>{}'>. - -=item C<Did not find prefix: /%s/> - -A non-optional prefix was specified but wasn't found at the start of the text. - -=item C<Did not find opening bracket after prefix: "%s"> - -C<extract_bracketed> or C<extract_codeblock> was expecting a -particular kind of bracket at the start of the text, and didn't find it. - -=item C<No quotelike operator found after prefix: "%s"> - -C<extract_quotelike> didn't find one of the quotelike operators C<q>, -C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring -it was extracting. - -=item C<Unmatched closing bracket: "%c"> - -C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered -a closing bracket where none was expected. - -=item C<Unmatched opening bracket(s): "%s"> - -C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran -out of characters in the text before closing one or more levels of nested -brackets. - -=item C<Unmatched embedded quote (%s)> - -C<extract_bracketed> attempted to match an embedded quoted substring, but -failed to find a closing quote to match it. - -=item C<Did not find closing delimiter to match '%s'> - -C<extract_quotelike> was unable to find a closing delimiter to match the -one that opened the quote-like operation. - -=item C<Mismatched closing bracket: expected "%c" but found "%s"> - -C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found -a valid bracket delimiter, but it was the wrong species. This usually -indicates a nesting error, but may indicate incorrect quoting or escaping. - -=item C<No block delimiter found after quotelike "%s"> - -C<extract_quotelike> or C<extract_codeblock> found one of the -quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> -without a suitable block after it. - -=item C<Did not find leading dereferencer> - -C<extract_variable> was expecting one of '$', '@', or '%' at the start of -a variable, but didn't find any of them. - -=item C<Bad identifier after dereferencer> - -C<extract_variable> found a '$', '@', or '%' indicating a variable, but that -character was not followed by a legal Perl identifier. - -=item C<Did not find expected opening bracket at %s> - -C<extract_codeblock> failed to find any of the outermost opening brackets -that were specified. - -=item C<Improperly nested codeblock at %s> - -A nested code block was found that started with a delimiter that was specified -as being only to be used as an outermost bracket. - -=item C<Missing second block for quotelike "%s"> - -C<extract_codeblock> or C<extract_quotelike> found one of the -quotelike operators C<s>, C<tr> or C<y> followed by only one block. - -=item C<No match found for opening bracket> - -C<extract_codeblock> failed to find a closing bracket to match the outermost -opening bracket. - -=item C<Did not find opening tag: /%s/> - -C<extract_tagged> did not find a suitable opening tag (after any specified -prefix was removed). - -=item C<Unable to construct closing tag to match: /%s/> - -C<extract_tagged> matched the specified opening tag and tried to -modify the matched text to produce a matching closing tag (because -none was specified). It failed to generate the closing tag, almost -certainly because the opening tag did not start with a -bracket of some kind. - -=item C<Found invalid nested tag: %s> - -C<extract_tagged> found a nested tag that appeared in the "reject" list -(and the failure mode was not "MAX" or "PARA"). - -=item C<Found unbalanced nested tag: %s> - -C<extract_tagged> found a nested opening tag that was not matched by a -corresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). - -=item C<Did not find closing tag> - -C<extract_tagged> reached the end of the text without finding a closing tag -to match the original opening tag (and the failure mode was not -"MAX" or "PARA"). - - - - -=back - - -=head1 AUTHOR - -Damian Conway (damian@conway.org) - - -=head1 BUGS AND IRRITATIONS - -There are undoubtedly serious bugs lurking somewhere in this code, if -only because parts of it give the impression of understanding a great deal -more about Perl than they really do. - -Bug reports and other feedback are most welcome. - - -=head1 COPYRIGHT - - Copyright (c) 1997-2001, Damian Conway. All Rights Reserved. - This module is free software. It may be used, redistributed - and/or modified under the same terms as Perl itself. diff --git a/lib/Text/Balanced/Changes b/lib/Text/Balanced/Changes deleted file mode 100644 index 49957dc61c..0000000000 --- a/lib/Text/Balanced/Changes +++ /dev/null @@ -1,342 +0,0 @@ -Revision history for Perl extension Text::Balanced. - -1.00 Mon Aug 11 12:42:56 1997 - - - original version - - -1.01 Mon Sep 8 18:09:18 EST 1997 - - - changed "quotemeta" to "quotemeta $_" to work - around bug in Perl 5.002 and 5.003 - - -1.10 Tue Sep 30 17:23:23 EST 1997 - - - reworked extract_quotelike to correct handling of some obscure cases - - -1.21 Sat Oct 4 17:21:54 EST 1997 - - - synchronised with Parse::RecDescent distribution (version number - will now reflect that package) - -1.23 Fri Oct 17 10:26:38 EST 1997 - - - changed behaviour in scalar and void contexts. Scalar contexts - now return only the extracted string. Void contexts now remove - the extracted string from the first argument (or $_). - -1.24 - - - changed behaviour in scalar contexts. Scalar contexts - now return the extracted string _and_ remove it from the - first argument (or $_). - - - changed return values on failure (all contexts return undef - for invalid return fields) - - - fixed some lurking bugs with trailing modifier handling - - - added :ALL tag to simplify wholesale importing of functions - - - fixed serious bug with embedded division operators ("/") - This now also allows the ?...? form of pattern matching! - -1.35 Wed Jun 24 09:53:31 1998 - - - fixed handling of :: quantifiers in extract_variable() - - - numerous trivial lexical changes to make xemacs happy - - -1.36 Tue Jul 14 12:26:04 1998 - - - Reinstated POD file missing from previous distribution - - - Added undocumented fourth parameter to extract_codeblock - so as to correctly handle (?) and (s?) modifiers in - RecDescent grammars. - - -1.40 Tue Aug 4 13:54:52 1998 - - - Added (optional) handling of embedded quoted text to - extract_delimited (see revised entry in Balanced.pod) - - - Added extract_tagged which extracts text between arbitrary, - optionally nested start and end tags (see new entry in - Balanced.pod). - - - Added delimited_pat which builds a pattern which matches a - string delimited by any of the delimiters specified (see new - entry in Balanced.pod). - - - Added test.pl - - -1.41 Mon Aug 10 14:51:50 1998 - - - Reinstated change to extract_codeblock from 1.36 which were - mysteriously lost in 1.40 - - -1.50 Thu Aug 27 09:20:19 1998 - - - Improved the structure of the regex generated by - delimited_pat (and used in extract_delimited). It's - considerably more complex, but also more robust and - much faster in the worst case. - - - Altered extract_variable to accept whitespace in variables, - e.g. '$ a -> {'b'} -> [2]' - - - -1.51 Sat Feb 13 10:31:55 1999 - - - Fixed bugs in prefix matching in extract_variable: - * incorrectly used default if '' specified - * now handles $#array correctly - - - Fixed bugs in extract_codeblock: - * Now handles !~ properly - * Now handles embedded comments better. - * Now handles "raw" pattern matches better. - - - Added support for single strings or qr's as - 'reject' and 'ignore' args to extract_tagged() - - - Added gen_extract_tagged() to "precompile" - a specific tag extractor for repeated use - (approximately 3 times faster!) - - -1.52 Thu Mar 4 12:43:38 1999 - - - Added CSV parsing example to documentation of extract_multiple. - - - Fixed a bug with extract_codeblock in "RecDescent" mode - (it would accept "subrule(s?)" and "subrule(?)", but - not "subrule(s)"). Thanks, Jan. - - -1.66 Fri Jul 2 13:29:22 1999 - - - Added ability to use quotelike operators in extract_bracketed - - - Fixed bug under 5.003 ('foreach my $func' not understood) - - - Added escape specification as fourth arg to &extract_delimited - - - Fixed handling of &delimited_pat and &extract_delimited - when delimiter is same as escape - - - Fixed handling of ->, =>, and >> in &extract_code - when delimiters are "<>" - - -1.76 Fri Nov 19 06:51:54 1999 - - - IMPORTANT: Now requires 5.005 or better. - - - IMPORTANT: Made extract methods sensitive to the pos() - value of the text they are parsing. In other words, - all extract subroutines now act like patterns of the form - /\G.../gc. See documentation for details. - - - IMPORTANT: Changed semantics of extract_multiple, in line - with the above change, and to simplify the semantics to - something vaguely predictable. See documentation for details. - - - Added ability to use qr/../'s and raw strings as extractors - in extract_multiple. See documentation. - - - Added fourth argument to extract_codeblock to allow - outermost brackets to be separately specified. See - documentation for details. - - - Reimplemented internals of all extraction subroutines - for significant speed-ups (between 100% and 2000% - improvement). - - - Fixed nasty bug in extract_variable and extract_codeblock - (they were returning prefix as well in scalar context) - - - Allowed read-only strings to be used as arguments in - scalar contexts. - - - Renamed delimited_pat to gen-delimited pat (in line with - gen_extract_tagged). Old name still works, but is now deprecated. - - - Tweaked all extraction subs so they correctly handle - zero-length prefix matches after another zero-length match. - - -1.77 Mon Nov 22 06:08:23 1999 - - - Fixed major bug in extract_codeblock (would not - terminate if there was trailing whitespace) - - - Improved /.../ pattern parsing within codeblocks - - -1.81 Wed Sep 13 11:58:49 2000 - - - Fixed test count in extract_codeblock.t - - - Fixed improbable bug with trailing ->'s in extract_variable - - - Fixed (HT|X)ML tag extraction in extract_tagged (thanks, Tim) - - - Added explanatory note about prefix matching (thanks again, Tim) - - - Added handling of globs and sub refs to extract_variable - - - Pod tweak (thanks Abigail) - - - Allowed right tags to be run-time evaluated, so - extract_tagged($text, '/([a-z]+)', '/end$1') works - as expected. - - - Added optional blessing of matches via extract_multiple - - - Fixed bug in autogeneration of closing tags in extract_tagged - (Thanks, Coke) - - - Fixed bug in interaction between extract_multiple and - gen_extract_tagged (Thanks Anthony) - - -1.82 Sun Jan 14 16:56:04 2001 - - - Fixed nit in extract_variable.t - (tested more cases than it promised to) - - - Fixed bug extracting prefix in extract_quotelike (Thanks Michael) - - - Added handling of Perl 4 package qualifier: $Package'var, etc. - - - Added handling of here docs (see documentation for limitations) - - - Added reporting of failure position via $@->{pos} (see documentation) - - -1.83 Mon Jan 15 12:43:12 2001 - - - Fixed numerous bugs in here doc extraction (many thanks Tim) - - -1.84 Thu Apr 26 11:58:13 2001 - - - Fixed bug in certain extractions not matching strings - with embedded newlines (thanks Robin) - - -1.85 Sun Jun 3 07:47:18 2001 - - - Fixed bug in extract_variable recognizing method calls that - start with an underscore (thanks Jeff) - - -1.86 Mon Sep 3 06:57:08 2001 - - - Revised licence for inclusion in core distribution - - - Consolidated POD in .pm file - - - renamed tests to let DOS cope with them - - -1.87 Thu Nov 15 21:25:35 2001 - - - Made extract_multiple aware of skipped prefixes returned - by subroutine extractors (such as extract_quotelike, etc.) - - - Made extract_variable aware of punctuation variables - - - Corified tests - - -1.89 Sun Nov 18 22:49:50 2001 - - - Fixed extvar.t tests - - -1.90 Tue Mar 25 11:14:38 2003 - - - Fixed subtle bug in gen_extract_tagged (thanks Martin) - - - Doc fix: removed suggestion that extract_tagged defaults - to matching HTML tags - - - Doc fix: clarified general matching behaviour - - - Fixed bug in parsing /.../ after a ( - - - Doc fix: documented extract_variable - - - Fixed extract_variable handling of $h{qr}, $h{tr}, etc. - (thanks, Briac) - - - Fixed incorrect handling of $::var (thanks Tim) - - -1.91 Fri Mar 28 23:19:17 2003 - - - Fixed error count on t/extract_variable.t - - - Fixed bug in extract_codelike when non-standard delimiters used - - -1.94 Sun Apr 13 02:18:41 2003 - - - rereleased in attempt to fix CPAN problems - - -1.95 Mon Apr 28 00:22:04 2003 - - - Constrainted _match_quote to only match at word boundaries - (so "exemplum(hic)" doesn't match "m(hic)") - (thanks Craig) - - - -1.96.0 Mon May 1 21:52:37 2006 - - - Fixed major bug in extract_multiple handling of unknowns - - - Fixed return value on failure (thanks Eric) - - - Fixed bug differentiating heredocs and left-shift operators - (thanks Anthony) - -1.97 Mon May 1 21:58:04 2006 - - - Removed three-part version number and dependency on version.pm - - -1.98 Fri May 5 14:58:49 2006 - - - Reinstated full test suite (thanks Steve!) - - - -1.99.0 Thu Nov 16 07:32:06 2006 - - - Removed reliance on expensive $& variable (thanks John) - - - Made Makefile.PL play nice with core versions (thanks Schwern!) - - -1.99.1 Thu Nov 16 09:29:14 2006 - - - Included dependency on version.pm (thanks Andy) - - - -2.0.0 Wed Dec 20 10:50:24 2006 - - - Added patches from bleadperl version (thanks Rafael!) - - - Fixed bug in second bracketed delimiters (thanks David) diff --git a/lib/Text/Balanced/README b/lib/Text/Balanced/README deleted file mode 100644 index 386bd5ac78..0000000000 --- a/lib/Text/Balanced/README +++ /dev/null @@ -1,71 +0,0 @@ -Text::Balanced version 2.0.0 - - Text::Balanced - Extract delimited text sequences from strings. - -SUMMARY - - Text::Balanced::extract_delimited - - `extract_delimited' extracts the initial substring of a string - which is delimited by a user-specified set of single-character - delimiters, whilst ignoring any backslash-escaped delimiter - characters. - - Text::Balanced::extract_bracketed - - `extract_bracketed' extracts a balanced-bracket-delimited substring - (using any one (or more) of the user-specified delimiter brackets: - '(..)', '{..}', '[..]', or '<..>'). - - Text::Balanced::extract_quotelike - - `extract_quotelike' attempts to recognize and extract any one of the - various Perl quote and quotelike operators (see "perlop(3)"). Embedded - backslashed delimiters, nested bracket delimiters (for the - quotelike operators), and trailing modifiers are all correctly handled. - - Text::Balanced::extract_codeblock - - `extract_codeblock' attempts to recognize and extract a - balanced bracket-delimited substring which may also contain - unbalanced brackets inside Perl quotes or quotelike - operations. That is, `extract_codeblock' is like a combination - of `extract_bracketed' and `extract_quotelike'. - - Text::Balanced::extract_tagged - - `extract_tagged' attempts to recognize and extract a - substring between two arbitrary "tag" patterns (a start tag - and an end tag). - - -INSTALLATION - -To install this module, run the following commands: - - perl Makefile.PL - make - make test - make install - - -Alternatively, to install with Module::Build, you can use the following commands: - - perl Build.PL - ./Build - ./Build test - ./Build install - - - -DEPENDENCIES - -None. - - -COPYRIGHT AND LICENCE - -Copyright (C) 2006, Damian Conway - -This library is free software; you can redistribute it and/or modify -it under the same terms as Perl itself. diff --git a/lib/Text/Balanced/t/00-load.t b/lib/Text/Balanced/t/00-load.t deleted file mode 100644 index a8268a75b1..0000000000 --- a/lib/Text/Balanced/t/00-load.t +++ /dev/null @@ -1,7 +0,0 @@ -use Test::More tests => 1; - -BEGIN { -use_ok( 'Text::Balanced' ); -diag( "Testing Text::Balanced $Text::Balanced::VERSION" ) - unless $ENV{PERL_CORE}; -} diff --git a/lib/Text/Balanced/t/extbrk.t b/lib/Text/Balanced/t/extbrk.t deleted file mode 100644 index e2763e83ae..0000000000 --- a/lib/Text/Balanced/t/extbrk.t +++ /dev/null @@ -1,83 +0,0 @@ -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = qw(../lib); - } -} - -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' - -######################### We start with some black magic to print on failure. - -# Change 1..1 below to 1..last_test_to_print . -# (It may become useful if the test is moved to ./t subdirectory.) - -BEGIN { $| = 1; print "1..19\n"; } -END {print "not ok 1\n" unless $loaded;} -use Text::Balanced qw ( extract_bracketed ); -$loaded = 1; -print "ok 1\n"; -$count=2; -use vars qw( $DEBUG ); -sub debug { print "\t>>>",@_ if $DEBUG } - -######################### End of black magic. - - -$cmd = "print"; -$neg = 0; -while (defined($str = <DATA>)) -{ - chomp $str; - if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; } - elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; } - elsif (!$str || $str =~ /\A#/) { $neg = 0; next } - $str =~ s/\\n/\n/g; - debug "\tUsing: $cmd\n"; - debug "\t on: [$str]\n"; - - $var = eval "() = $cmd"; - debug "\t list got: [$var]\n"; - debug "\t list left: [$str]\n"; - print "not " if (substr($str,pos($str),1) eq ';')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; - - pos $str = 0; - $var = eval $cmd; - $var = "<undef>" unless defined $var; - debug "\t scalar got: [$var]\n"; - debug "\t scalar left: [$str]\n"; - print "not " if ($str =~ '\A;')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; -} - -__DATA__ - -# USING: extract_bracketed($str); -{a nested { and } are okay as are () and <> pairs and escaped \}'s }; -{a nested\n{ and } are okay as are\n() and <> pairs and escaped \}'s }; - -# USING: extract_bracketed($str,'{}'); -{a nested { and } are okay as are unbalanced ( and < pairs and escaped \}'s }; - -# THESE SHOULD FAIL -{an unmatched nested { isn't okay, nor are ( and < }; -{an unbalanced nested [ even with } and ] to match them; - - -# USING: extract_bracketed($str,'<"`q>'); -<a q{uoted} ">" unbalanced right bracket of /(q>)/ either sort (`>>>""">>>>`) is okay >; - -# USING: extract_bracketed($str,'<">'); -<a quoted ">" unbalanced right bracket is okay >; - -# USING: extract_bracketed($str,'<"`>'); -<a quoted ">" unbalanced right bracket of either sort (`>>>""">>>>`) is okay >; - -# THIS SHOULD FAIL -<a misquoted '>' unbalanced right bracket is bad >; diff --git a/lib/Text/Balanced/t/extcbk.t b/lib/Text/Balanced/t/extcbk.t deleted file mode 100644 index 30b7e502cb..0000000000 --- a/lib/Text/Balanced/t/extcbk.t +++ /dev/null @@ -1,102 +0,0 @@ -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = qw(../lib); - } -} - -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' - -######################### We start with some black magic to print on failure. - -# Change 1..1 below to 1..last_test_to_print . -# (It may become useful if the test is moved to ./t subdirectory.) - -BEGIN { $| = 1; print "1..41\n"; } -END {print "not ok 1\n" unless $loaded;} -use Text::Balanced qw ( extract_codeblock ); -$loaded = 1; -print "ok 1\n"; -$count=2; -use vars qw( $DEBUG ); -sub debug { print "\t>>>",@_ if $DEBUG } - -######################### End of black magic. - - -$cmd = "print"; -$neg = 0; -while (defined($str = <DATA>)) -{ - chomp $str; - if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; } - elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; } - elsif (!$str || $str =~ /\A#/) { $neg = 0; next } - $str =~ s/\\n/\n/g; - debug "\tUsing: $cmd\n"; - debug "\t on: [$str]\n"; - - my @res; - $var = eval "\@res = $cmd"; - debug "\t Failed: $@ at " . $@+0 .")" if $@; - debug "\t list got: [" . join("|", map {defined $_ ? $_ : '<undef>'} @res) . "]\n"; - debug "\t list left: [$str]\n"; - print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg; - print "ok ", $count++; - print "\n"; - - pos $str = 0; - $var = eval $cmd; - $var = "<undef>" unless defined $var; - debug "\t scalar got: [$var]\n"; - debug "\t scalar left: [$str]\n"; - print "not " if ($str =~ '\A;')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; -} - -__DATA__ - -# USING: extract_codeblock($str,'(){}',undef,'()'); -(Foo(')')); - -# USING: extract_codeblock($str); -{ $data[4] =~ /['"]/; }; - -# USING: extract_codeblock($str,'<>'); -< %x = ( try => "this") >; -< %x = () >; -< %x = ( $try->{this}, "too") >; -< %'x = ( $try->{this}, "too") >; -< %'x'y = ( $try->{this}, "too") >; -< %::x::y = ( $try->{this}, "too") >; - -# THIS SHOULD FAIL -< %x = do { $try > 10 } >; - -# USING: extract_codeblock($str); - -{ $a = /\}/; }; -{ sub { $_[0] /= $_[1] } }; # / here -{ 1; }; -{ $a = 1; }; - - -# USING: extract_codeblock($str,undef,'=*'); -========{$a=1}; - -# USING: extract_codeblock($str,'{}<>'); -< %x = do { $try > 10 } >; - -# USING: extract_codeblock($str,'{}',undef,'<>'); -< %x = do { $try > 10 } >; - -# USING: extract_codeblock($str,'{}'); -{ $a = $b; # what's this doing here? \n };' -{ $a = $b; \n $a =~ /$b/; \n @a = map /\s/ @b }; - -# THIS SHOULD FAIL -{ $a = $b; # what's this doing here? };' -{ $a = $b; # what's this doing here? ;' diff --git a/lib/Text/Balanced/t/extdel.t b/lib/Text/Balanced/t/extdel.t deleted file mode 100644 index 6db547f43a..0000000000 --- a/lib/Text/Balanced/t/extdel.t +++ /dev/null @@ -1,97 +0,0 @@ -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = qw(../lib); - } -} - -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' - -######################### We start with some black magic to print on failure. - -# Change 1..1 below to 1..last_test_to_print . -# (It may become useful if the test is moved to ./t subdirectory.) - -BEGIN { $| = 1; print "1..45\n"; } -END {print "not ok 1\n" unless $loaded;} -use Text::Balanced qw ( extract_delimited ); -$loaded = 1; -print "ok 1\n"; -$count=2; -use vars qw( $DEBUG ); -sub debug { print "\t>>>",@_ if $DEBUG } - -######################### End of black magic. - - -$cmd = "print"; -$neg = 0; -while (defined($str = <DATA>)) -{ - chomp $str; - if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; } - elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; } - elsif (!$str || $str =~ /\A#/) { $neg = 0; next } - $str =~ s/\\n/\n/g; - debug "\tUsing: $cmd\n"; - debug "\t on: [$str]\n"; - - $var = eval "() = $cmd"; - debug "\t list got: [$var]\n"; - debug "\t list left: [$str]\n"; - print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; - - pos $str = 0; - $var = eval $cmd; - $var = "<undef>" unless defined $var; - debug "\t scalar got: [$var]\n"; - debug "\t scalar left: [$str]\n"; - print "not " if ($str =~ '\A;')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; -} - -__DATA__ -# USING: extract_delimited($str,'/#$',undef,'/#$'); -/a/; -/a///; -#b#; -#b###; -$c$; -$c$$$; - -# TEST EXTRACTION OF DELIMITED TEXT WITH ESCAPES -# USING: extract_delimited($str,'/#$',undef,'\\'); -/a/; -/a\//; -#b#; -#b\##; -$c$; -$c\$$; - -# TEST EXTRACTION OF DELIMITED TEXT -# USING: extract_delimited($str); -'a'; -"b"; -`c`; -'a\''; -'a\\'; -'\\a'; -"a\\"; -"\\a"; -"b\'\"\'"; -`c '\`abc\`'`; - -# TEST EXTRACTION OF DELIMITED TEXT -# USING: extract_delimited($str,'/#$','-->'); --->/a/; --->#b#; --->$c$; - -# THIS SHOULD FAIL -$c$; diff --git a/lib/Text/Balanced/t/extmul.t b/lib/Text/Balanced/t/extmul.t deleted file mode 100644 index 98b6272476..0000000000 --- a/lib/Text/Balanced/t/extmul.t +++ /dev/null @@ -1,326 +0,0 @@ -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = qw(../lib); - } -} - -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' - -######################### We start with some black magic to print on failure. - -# Change 1..1 below to 1..last_test_to_print . -# (It may become useful if the test is moved to ./t subdirectory.) - -BEGIN { $| = 1; print "1..86\n"; } -END {print "not ok 1\n" unless $loaded;} -use Text::Balanced qw ( :ALL ); -$loaded = 1; -print "ok 1\n"; -$count=2; -use vars qw( $DEBUG ); -sub debug { print "\t>>>",@_ if $DEBUG } - -######################### End of black magic. - -sub expect -{ - local $^W; - my ($l1, $l2) = @_; - - if (@$l1 != @$l2) - { - print "\@l1: ", join(", ", @$l1), "\n"; - print "\@l2: ", join(", ", @$l2), "\n"; - print "not "; - } - else - { - for (my $i = 0; $i < @$l1; $i++) - { - if ($l1->[$i] ne $l2->[$i]) - { - print "field $i: '$l1->[$i]' ne '$l2->[$i]'\n"; - print "not "; - last; - } - } - } - - print "ok $count\n"; - $count++; -} - -sub divide -{ - my ($text, @index) = @_; - my @bits = (); - unshift @index, 0; - push @index, length($text); - for ( my $i= 0; $i < $#index; $i++) - { - push @bits, substr($text, $index[$i], $index[$i+1]-$index[$i]); - } - pop @bits; - return @bits; - -} - - -$stdtext1 = q{$var = do {"val" && $val;};}; - -# TESTS 2-4 -$text = $stdtext1; -expect [ extract_multiple($text,undef,1) ], - [ divide $stdtext1 => 4 ]; - -expect [ pos $text], [ 4 ]; -expect [ $text ], [ $stdtext1 ]; - -# TESTS 5-7 -$text = $stdtext1; -expect [ scalar extract_multiple($text,undef,1) ], - [ divide $stdtext1 => 4 ]; - -expect [ pos $text], [ 0 ]; -expect [ $text ], [ substr($stdtext1,4) ]; - - -# TESTS 8-10 -$text = $stdtext1; -expect [ extract_multiple($text,undef,2) ], - [ divide($stdtext1 => 4, 10) ]; - -expect [ pos $text], [ 10 ]; -expect [ $text ], [ $stdtext1 ]; - -# TESTS 11-13 -$text = $stdtext1; -expect [ eval{local$^W;scalar extract_multiple($text,undef,2)} ], - [ substr($stdtext1,0,4) ]; - -expect [ pos $text], [ 0 ]; -expect [ $text ], [ substr($stdtext1,4) ]; - - -# TESTS 14-16 -$text = $stdtext1; -expect [ extract_multiple($text,undef,3) ], - [ divide($stdtext1 => 4, 10, 26) ]; - -expect [ pos $text], [ 26 ]; -expect [ $text ], [ $stdtext1 ]; - -# TESTS 17-19 -$text = $stdtext1; -expect [ eval{local$^W;scalar extract_multiple($text,undef,3)} ], - [ substr($stdtext1,0,4) ]; - -expect [ pos $text], [ 0 ]; -expect [ $text ], [ substr($stdtext1,4) ]; - - -# TESTS 20-22 -$text = $stdtext1; -expect [ extract_multiple($text,undef,4) ], - [ divide($stdtext1 => 4, 10, 26, 27) ]; - -expect [ pos $text], [ 27 ]; -expect [ $text ], [ $stdtext1 ]; - -# TESTS 23-25 -$text = $stdtext1; -expect [ eval{local$^W;scalar extract_multiple($text,undef,4)} ], - [ substr($stdtext1,0,4) ]; - -expect [ pos $text], [ 0 ]; -expect [ $text ], [ substr($stdtext1,4) ]; - - -# TESTS 26-28 -$text = $stdtext1; -expect [ extract_multiple($text,undef,5) ], - [ divide($stdtext1 => 4, 10, 26, 27) ]; - -expect [ pos $text], [ 27 ]; -expect [ $text ], [ $stdtext1 ]; - - -# TESTS 29-31 -$text = $stdtext1; -expect [ eval{local$^W;scalar extract_multiple($text,undef,5)} ], - [ substr($stdtext1,0,4) ]; - -expect [ pos $text], [ 0 ]; -expect [ $text ], [ substr($stdtext1,4) ]; - - - -# TESTS 32-34 -$stdtext2 = q{$var = "val" && (1,2,3);}; - -$text = $stdtext2; -expect [ extract_multiple($text) ], - [ divide($stdtext2 => 4, 7, 12, 24) ]; - -expect [ pos $text], [ 24 ]; -expect [ $text ], [ $stdtext2 ]; - -# TESTS 35-37 -$text = $stdtext2; -expect [ scalar extract_multiple($text) ], - [ substr($stdtext2,0,4) ]; - -expect [ pos $text], [ 0 ]; -expect [ $text ], [ substr($stdtext2,4) ]; - - -# TESTS 38-40 -$text = $stdtext2; -expect [ extract_multiple($text,[\&extract_bracketed]) ], - [ substr($stdtext2,0,16), substr($stdtext2,16,7), substr($stdtext2,23) ]; - -expect [ pos $text], [ 24 ]; -expect [ $text ], [ $stdtext2 ]; - -# TESTS 41-43 -$text = $stdtext2; -expect [ scalar extract_multiple($text,[\&extract_bracketed]) ], - [ substr($stdtext2,0,16) ]; - -expect [ pos $text], [ 0 ]; -expect [ $text ], [ substr($stdtext2,15) ]; - - -# TESTS 44-46 -$text = $stdtext2; -expect [ extract_multiple($text,[\&extract_variable]) ], - [ substr($stdtext2,0,4), substr($stdtext2,4) ]; - -expect [ pos $text], [ length($text) ]; -expect [ $text ], [ $stdtext2 ]; - -# TESTS 47-49 -$text = $stdtext2; -expect [ scalar extract_multiple($text,[\&extract_variable]) ], - [ substr($stdtext2,0,4) ]; - -expect [ pos $text], [ 0 ]; -expect [ $text ], [ substr($stdtext2,4) ]; - - -# TESTS 50-52 -$text = $stdtext2; -expect [ extract_multiple($text,[\&extract_quotelike]) ], - [ substr($stdtext2,0,7), substr($stdtext2,7,5), substr($stdtext2,12) ]; - -expect [ pos $text], [ length($text) ]; -expect [ $text ], [ $stdtext2 ]; - -# TESTS 53-55 -$text = $stdtext2; -expect [ scalar extract_multiple($text,[\&extract_quotelike]) ], - [ substr($stdtext2,0,7) ]; - -expect [ pos $text], [ 0 ]; -expect [ $text ], [ substr($stdtext2,6) ]; - - -# TESTS 56-58 -$text = $stdtext2; -expect [ extract_multiple($text,[\&extract_quotelike],2,1) ], - [ substr($stdtext2,7,5) ]; - -expect [ pos $text], [ 23 ]; -expect [ $text ], [ $stdtext2 ]; - -# TESTS 59-61 -$text = $stdtext2; -expect [ eval{local$^W;scalar extract_multiple($text,[\&extract_quotelike],2,1)} ], - [ substr($stdtext2,7,5) ]; - -expect [ pos $text], [ 6 ]; -expect [ $text ], [ substr($stdtext2,0,6). substr($stdtext2,12) ]; - - -# TESTS 62-64 -$text = $stdtext2; -expect [ extract_multiple($text,[\&extract_quotelike],1,1) ], - [ substr($stdtext2,7,5) ]; - -expect [ pos $text], [ 12 ]; -expect [ $text ], [ $stdtext2 ]; - -# TESTS 65-67 -$text = $stdtext2; -expect [ scalar extract_multiple($text,[\&extract_quotelike],1,1) ], - [ substr($stdtext2,7,5) ]; - -expect [ pos $text], [ 6 ]; -expect [ $text ], [ substr($stdtext2,0,6). substr($stdtext2,12) ]; - -# TESTS 68-70 -my $stdtext3 = "a,b,c"; - -$_ = $stdtext3; -expect [ extract_multiple(undef, [ sub { /\G[a-z]/gc && $& } ]) ], - [ divide($stdtext3 => 1,2,3,4,5) ]; - -expect [ pos ], [ 5 ]; -expect [ $_ ], [ $stdtext3 ]; - -# TESTS 71-73 - -$_ = $stdtext3; -expect [ scalar extract_multiple(undef, [ sub { /\G[a-z]/gc && $& } ]) ], - [ divide($stdtext3 => 1) ]; - -expect [ pos ], [ 0 ]; -expect [ $_ ], [ substr($stdtext3,1) ]; - - -# TESTS 74-76 - -$_ = $stdtext3; -expect [ extract_multiple(undef, [ qr/\G[a-z]/ ]) ], - [ divide($stdtext3 => 1,2,3,4,5) ]; - -expect [ pos ], [ 5 ]; -expect [ $_ ], [ $stdtext3 ]; - -# TESTS 77-79 - -$_ = $stdtext3; -expect [ scalar extract_multiple(undef, [ qr/\G[a-z]/ ]) ], - [ divide($stdtext3 => 1) ]; - -expect [ pos ], [ 0 ]; -expect [ $_ ], [ substr($stdtext3,1) ]; - - -# TESTS 80-82 - -$_ = $stdtext3; -expect [ extract_multiple(undef, [ q/([a-z]),?/ ]) ], - [ qw(a b c) ]; - -expect [ pos ], [ 5 ]; -expect [ $_ ], [ $stdtext3 ]; - -# TESTS 83-85 - -$_ = $stdtext3; -expect [ scalar extract_multiple(undef, [ q/([a-z]),?/ ]) ], - [ divide($stdtext3 => 1) ]; - -expect [ pos ], [ 0 ]; -expect [ $_ ], [ substr($stdtext3,2) ]; - - -# TEST 86 - -# Fails in Text-Balanced-1.95 with result ['1 ', '""', '1234'] -$_ = q{ ""1234}; -expect [ extract_multiple(undef, [\&extract_quotelike]) ], - [ ' ', '""', '1234' ]; diff --git a/lib/Text/Balanced/t/extqlk.t b/lib/Text/Balanced/t/extqlk.t deleted file mode 100644 index 97dc517c54..0000000000 --- a/lib/Text/Balanced/t/extqlk.t +++ /dev/null @@ -1,142 +0,0 @@ -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = qw(../lib); - } -} - -#! /usr/local/bin/perl -ws -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' - -######################### We start with some black magic to print on failure. - -# Change 1..1 below to 1..last_test_to_print . -# (It may become useful if the test is moved to ./t subdirectory.) - -BEGIN { $| = 1; print "1..95\n"; } -END {print "not ok 1\n" unless $loaded;} -use Text::Balanced qw ( extract_quotelike ); -$loaded = 1; -print "ok 1\n"; -$count=2; -use vars qw( $DEBUG ); -#$DEBUG=1; -sub debug { print "\t>>>",@_ if $ENV{DEBUG} } -sub esc { my $x = shift||'<undef>'; $x =~ s/\n/\\n/gs; $x } - -######################### End of black magic. - - -$cmd = "print"; -$neg = 0; -while (defined($str = <DATA>)) -{ - chomp $str; - if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; } - elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; } - elsif (!$str || $str =~ /\A#/) { $neg = 0; next } - my $setup_cmd = ($str =~ s/\A\{(.*)\}//) ? $1 : ''; - my $tests = 'sl'; - $str =~ s/\\n/\n/g; - my $orig = $str; - - eval $setup_cmd if $setup_cmd ne ''; - if($tests =~ /l/) { - debug "\tUsing: $cmd\n"; - debug "\t on: [" . esc($setup_cmd) . "][" . esc($str) . "]\n"; - my @res; - eval qq{\@res = $cmd; }; - debug "\t got:\n" . join "", map { "\t\t\t$_: [" . esc($res[$_]) . "]\n"} (0..$#res); - debug "\t left: [" . esc($str) . "]\n"; - debug "\t pos: [" . esc(substr($str,pos($str))) . "...]\n"; - print "not " if (substr($str,pos($str),1) eq ';')==$neg; - print "ok ", $count++; - print "\n"; - } - - eval $setup_cmd if $setup_cmd ne ''; - if($tests =~ /s/) { - $str = $orig; - debug "\tUsing: scalar $cmd\n"; - debug "\t on: [" . esc($str) . "]\n"; - $var = eval $cmd; - print " ($@)" if $@ && $DEBUG; - $var = "<undef>" unless defined $var; - debug "\t scalar got: [" . esc($var) . "]\n"; - debug "\t scalar left: [" . esc($str) . "]\n"; - print "not " if ($str =~ '\A;')==$neg; - print "ok ", $count++; - print "\n"; - } -} - -# fails in Text::Balanced 1.95 -$_ = qq(s{}{}); -my @z = extract_quotelike(); -print "not " if $z[0] eq ''; -print "ok ", $count++; -print "\n"; - - -__DATA__ - -# USING: extract_quotelike($str); -''; -""; -"a"; -'b'; -`cc`; - - -<<EOHERE; done();\nline1\nline2\nEOHERE\n; next; - <<EOHERE; done();\nline1\nline2\nEOHERE\n; next; -<<"EOHERE"; done()\nline1\nline2\nEOHERE\n and next -<<`EOHERE`; done()\nline1\nline2\nEOHERE\n and next -<<'EOHERE'; done()\nline1\n'line2'\nEOHERE\n and next -<<'EOHERE;'; done()\nline1\nline2\nEOHERE;\n and next -<<" EOHERE"; done() \nline1\nline2\n EOHERE\nand next -<<""; done()\nline1\nline2\n\n and next -<<; done()\nline1\nline2\n\n and next -# fails in Text::Balanced 1.95 -<<EOHERE;\nEOHERE\n; -# fails in Text::Balanced 1.95 -<<"*";\n\n*\n; - -"this is a nested $var[$x] {"; -/a/gci; -m/a/gci; - -q(d); -qq(e); -qx(f); -qr(g); -qw(h i j); -q{d}; -qq{e}; -qx{f}; -qr{g}; -qq{a nested { and } are okay as are () and <> pairs and escaped \}'s }; -q/slash/; -q # slash #; -qr qw qx; - -s/x/y/; -s/x/y/cgimsox; -s{a}{b}; -s{a}\n {b}; -s(a){b}; -s(a)/b/; -s/'/\\'/g; -tr/x/y/; -y/x/y/; - -# fails on Text-Balanced-1.95 -{ $tests = 'l'; pos($str)=6 }012345<<E;\n\nE\n - -# THESE SHOULD FAIL -s<$self->{pat}>{$self->{sub}}; # CAN'T HANDLE '>' in '->' -s-$self->{pap}-$self->{sub}-; # CAN'T HANDLE '-' in '->' -<<EOHERE; done();\nline1\nline2\nEOHERE;\n; next; # RDEL HAS NO ';' -<<'EOHERE'; done();\nline1\nline2\nEOHERE;\n; next; # RDEF HAS NO ';' - << EOTHERE; done();\nline1\nline2\n EOTHERE\n; next; # RDEL IS "" (!) diff --git a/lib/Text/Balanced/t/exttag.t b/lib/Text/Balanced/t/exttag.t deleted file mode 100644 index d412c23ef3..0000000000 --- a/lib/Text/Balanced/t/exttag.t +++ /dev/null @@ -1,120 +0,0 @@ -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = qw(../lib); - } -} - -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' - -######################### We start with some black magic to print on failure. - -# Change 1..1 below to 1..last_test_to_print . -# (It may become useful if the test is moved to ./t subdirectory.) - -BEGIN { $| = 1; print "1..53\n"; } -END {print "not ok 1\n" unless $loaded;} -use Text::Balanced qw ( extract_tagged gen_extract_tagged ); -$loaded = 1; -print "ok 1\n"; -$count=2; -use vars qw( $DEBUG ); -sub debug { print "\t>>>",@_ if $DEBUG } - -######################### End of black magic. - - -$cmd = "print"; -$neg = 0; -while (defined($str = <DATA>)) -{ - chomp $str; - if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; } - elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; } - elsif (!$str || $str =~ /\A#/) { $neg = 0; next } - $str =~ s/\\n/\n/g; - debug "\tUsing: $cmd\n"; - debug "\t on: [$str]\n"; - - my @res; - $var = eval "\@res = $cmd"; - debug "\t list got: [" . join("|",map {defined $_ ? $_ : '<undef>'} @res) . "]\n"; - debug "\t list left: [$str]\n"; - print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; - - pos $str = 0; - $var = eval $cmd; - $var = "<undef>" unless defined $var; - debug "\t scalar got: [$var]\n"; - debug "\t scalar left: [$str]\n"; - print "not " if ($str =~ '\A;')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; -} - -__DATA__ -# USING: gen_extract_tagged("BEGIN([A-Z]+)",'END$1',"(?s).*?(?=BEGIN)")->($str); - ignore\n this and then BEGINHERE at the ENDHERE; - ignore\n this and then BEGINTHIS at the ENDTHIS; - -# USING: extract_tagged($str,"BEGIN([A-Z]+)",'END$1',"(?s).*?(?=BEGIN)"); - ignore\n this and then BEGINHERE at the ENDHERE; - ignore\n this and then BEGINTHIS at the ENDTHIS; - -# USING: extract_tagged($str,"BEGIN([A-Z]+)",'END$1',"(?s).*?(?=BEGIN)"); - ignore\n this and then BEGINHERE at the ENDHERE; - ignore\n this and then BEGINTHIS at the ENDTHIS; - -# THIS SHOULD FAIL - ignore\n this and then BEGINTHIS at the ENDTHAT; - -# USING: extract_tagged($str,"BEGIN","END","(?s).*?(?=BEGIN)"); - ignore\n this and then BEGIN at the END; - -# USING: extract_tagged($str); - <A-1 HREF="#section2">some text</A-1>; - -# USING: extract_tagged($str,qr/<[A-Z]+>/,undef, undef, {ignore=>["<BR>"]}); - <A>aaa<B>bbb<BR>ccc</B>ddd</A>; - -# USING: extract_tagged($str,"BEGIN","END"); - BEGIN at the BEGIN keyword and END at the END; - BEGIN at the beginning and end at the END; - -# USING: extract_tagged($str,undef,undef,undef,{ignore=>["<[^>]*/>"]}); - <A>aaa<B>bbb<BR/>ccc</B>ddd</A>; - -# USING: extract_tagged($str,";","-",undef,{reject=>[";"],fail=>"MAX"}); - ; at the ;-) keyword - -# USING: extract_tagged($str,"<[A-Z]+>",undef, undef, {ignore=>["<BR>"]}); - <A>aaa<B>bbb<BR>ccc</B>ddd</A>; - -# THESE SHOULD FAIL - BEGIN at the beginning and end at the end; - BEGIN at the BEGIN keyword and END at the end; - -# TEST EXTRACTION OF TAGGED STRINGS -# USING: extract_tagged($str,"BEGIN","END",undef,{reject=>["BEGIN","END"]}); -# THESE SHOULD FAIL - BEGIN at the BEGIN keyword and END at the end; - -# USING: extract_tagged($str,";","-",undef,{reject=>[";"],fail=>"PARA"}); - ; at the ;-) keyword - - -# USING: extract_tagged($str); - <A>some text</A>; - <B>some text<A>other text</A></B>; - <A>some text<A>other text</A></A>; - <A HREF="#section2">some text</A>; - -# THESE SHOULD FAIL - <A>some text - <A>some text<A>other text</A>; - <B>some text<A>other text</B>; diff --git a/lib/Text/Balanced/t/extvar.t b/lib/Text/Balanced/t/extvar.t deleted file mode 100644 index 5f37d8c049..0000000000 --- a/lib/Text/Balanced/t/extvar.t +++ /dev/null @@ -1,160 +0,0 @@ -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = qw(../lib); - } -} - -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' - -######################### We start with some black magic to print on failure. - -# Change 1..1 below to 1..last_test_to_print . -# (It may become useful if the test is moved to ./t subdirectory.) - -BEGIN { $| = 1; print "1..183\n"; } -END {print "not ok 1\n" unless $loaded;} -use Text::Balanced qw ( extract_variable ); -$loaded = 1; -print "ok 1\n"; -$count=2; -use vars qw( $DEBUG ); -sub debug { print "\t>>>",@_ if $DEBUG } - -######################### End of black magic. - - -$cmd = "print"; -$neg = 0; -while (defined($str = <DATA>)) -{ - chomp $str; - if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; } - elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; } - elsif (!$str || $str =~ /\A#/) { $neg = 0; next } - $str =~ s/\\n/\n/g; - debug "\tUsing: $cmd\n"; - debug "\t on: [$str]\n"; - - my @res; - $var = eval "\@res = $cmd"; - debug "\t list got: [" . join("|",map {defined $_ ? $_ : '<undef>'} @res) . "]\n"; - debug "\t list left: [$str]\n"; - print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; - - pos $str = 0; - $var = eval $cmd; - $var = "<undef>" unless defined $var; - debug "\t scalar got: [$var]\n"; - debug "\t scalar left: [$str]\n"; - print "not " if ($str =~ '\A;')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; -} - -__DATA__ - -# USING: extract_variable($str); -# THESE SHOULD FAIL -$a->; -$a (1..3) { print $a }; - -# USING: extract_variable($str); -$::obj; -$obj->nextval; -*var; -*$var; -*{var}; -*{$var}; -*var{cat}; -\&var; -\&mod::var; -\&mod'var; -$a; -$_; -$a[1]; -$_[1]; -$a{cat}; -$_{cat}; -$a->[1]; -$a->{"cat"}[1]; -@$listref; -@{$listref}; -$obj->nextval; -$obj->_nextval; -$obj->next_val_; -@{$obj->nextval}; -@{$obj->nextval($cat,$dog)->{new}}; -@{$obj->nextval($cat?$dog:$fish)->{new}}; -@{$obj->nextval(cat()?$dog:$fish)->{new}}; -$ a {'cat'}; -$a::b::c{d}->{$e->()}; -$a'b'c'd{e}->{$e->()}; -$a'b::c'd{e}->{$e->()}; -$#_; -$#array; -$#{array}; -$var[$#var]; -$1; -$11; -$&; -$`; -$'; -$+; -$*; -$.; -$/; -$|; -$,; -$"; -$;; -$#; -$%; -$=; -$-; -$~; -$^; -$:; -$^L; -$^A; -$?; -$!; -$^E; -$@; -$$; -$<; -$>; -$(; -$); -$[; -$]; -$^C; -$^D; -$^F; -$^H; -$^I; -$^M; -$^O; -$^P; -$^R; -$^S; -$^T; -$^V; -$^W; -${^WARNING_BITS}; -${^WIDE_SYSTEM_CALLS}; -$^X; - -# THESE SHOULD FAIL -$a->; -@{$; -$ a :: b :: c -$ a ' b ' c - -# USING: extract_variable($str,'=*'); -========$a; diff --git a/lib/Text/Balanced/t/gentag.t b/lib/Text/Balanced/t/gentag.t deleted file mode 100644 index f5fd5dcf0b..0000000000 --- a/lib/Text/Balanced/t/gentag.t +++ /dev/null @@ -1,109 +0,0 @@ -BEGIN { - if ($ENV{PERL_CORE}) { - chdir('t') if -d 't'; - @INC = qw(../lib); - } -} - -# Before `make install' is performed this script should be runnable with -# `make test'. After `make install' it should work as `perl test.pl' - -######################### We start with some black magic to print on failure. - -# Change 1..1 below to 1..last_test_to_print . -# (It may become useful if the test is moved to ./t subdirectory.) - -BEGIN { $| = 1; print "1..37\n"; } -END {print "not ok 1\n" unless $loaded;} -use Text::Balanced qw ( gen_extract_tagged ); -$loaded = 1; -print "ok 1\n"; -$count=2; -use vars qw( $DEBUG ); -sub debug { print "\t>>>",@_ if $DEBUG } - -######################### End of black magic. - - -$cmd = "print"; -$neg = 0; -while (defined($str = <DATA>)) -{ - chomp $str; - $str =~ s/\\n/\n/g; - if ($str =~ s/\A# USING://) - { - $neg = 0; - eval{local$^W;*f = eval $str || die}; - next; - } - elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; } - elsif (!$str || $str =~ /\A#/) { $neg = 0; next } - $str =~ s/\\n/\n/g; - debug "\tUsing: $cmd\n"; - debug "\t on: [$str]\n"; - - my @res; - $var = eval { @res = f($str) }; - debug "\t list got: [" . join("|",map {defined $_ ? $_ : '<undef>'} @res) . "]\n"; - debug "\t list left: [$str]\n"; - print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; - - pos $str = 0; - $var = eval { scalar f($str) }; - $var = "<undef>" unless defined $var; - debug "\t scalar got: [$var]\n"; - debug "\t scalar left: [$str]\n"; - print "not " if ($str =~ '\A;')==$neg; - print "ok ", $count++; - print " ($@)" if $@ && $DEBUG; - print "\n"; -} - -__DATA__ - -# USING: gen_extract_tagged('{','}'); - { a test }; - -# USING: gen_extract_tagged(qr/<[A-Z]+>/,undef, undef, {ignore=>["<BR>"]}); - <A>aaa<B>bbb<BR>ccc</B>ddd</A>; - -# USING: gen_extract_tagged("BEGIN","END"); - BEGIN at the BEGIN keyword and END at the END; - BEGIN at the beginning and end at the END; - -# USING: gen_extract_tagged(undef,undef,undef,{ignore=>["<[^>]*/>"]}); - <A>aaa<B>bbb<BR/>ccc</B>ddd</A>; - -# USING: gen_extract_tagged(";","-",undef,{reject=>[";"],fail=>"MAX"}); - ; at the ;-) keyword - -# USING: gen_extract_tagged("<[A-Z]+>",undef, undef, {ignore=>["<BR>"]}); - <A>aaa<B>bbb<BR>ccc</B>ddd</A>; - -# THESE SHOULD FAIL - BEGIN at the beginning and end at the end; - BEGIN at the BEGIN keyword and END at the end; - -# TEST EXTRACTION OF TAGGED STRINGS -# USING: gen_extract_tagged("BEGIN","END",undef,{reject=>["BEGIN","END"]}); -# THESE SHOULD FAIL - BEGIN at the BEGIN keyword and END at the end; - -# USING: gen_extract_tagged(";","-",undef,{reject=>[";"],fail=>"PARA"}); - ; at the ;-) keyword - - -# USING: gen_extract_tagged(); - <A>some text</A>; - <B>some text<A>other text</A></B>; - <A>some text<A>other text</A></A>; - <A HREF="#section2">some text</A>; - -# THESE SHOULD FAIL - <A>some text - <A>some text<A>other text</A>; - <B>some text<A>other text</B>; |