diff options
Diffstat (limited to 'lib/Text/Balanced.pm')
-rw-r--r-- | lib/Text/Balanced.pm | 2306 |
1 files changed, 0 insertions, 2306 deletions
diff --git a/lib/Text/Balanced.pm b/lib/Text/Balanced.pm deleted file mode 100644 index 16a559b6c8..0000000000 --- a/lib/Text/Balanced.pm +++ /dev/null @@ -1,2306 +0,0 @@ -# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS. -# FOR FULL DOCUMENTATION SEE Balanced.pod - -use 5.005; -use strict; - -package Text::Balanced; - -use Exporter; -use SelfLoader; -use vars qw { $VERSION @ISA %EXPORT_TAGS }; - -use version; $VERSION = qv('2.0.0'); -@ISA = qw ( Exporter ); - -%EXPORT_TAGS = ( ALL => [ qw( - &extract_delimited - &extract_bracketed - &extract_quotelike - &extract_codeblock - &extract_variable - &extract_tagged - &extract_multiple - - &gen_delimited_pat - &gen_extract_tagged - - &delimited_pat - ) ] ); - -Exporter::export_ok_tags('ALL'); - -# PROTOTYPES - -sub _match_bracketed($$$$$$); -sub _match_variable($$); -sub _match_codeblock($$$$$$$); -sub _match_quotelike($$$$); - -# HANDLE RETURN VALUES IN VARIOUS CONTEXTS - -sub _failmsg { - my ($message, $pos) = @_; - $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg"; -} - -sub _fail -{ - my ($wantarray, $textref, $message, $pos) = @_; - _failmsg $message, $pos if $message; - return (undef,$$textref,undef) if $wantarray; - return undef; -} - -sub _succeed -{ - $@ = undef; - my ($wantarray,$textref) = splice @_, 0, 2; - my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0); - my ($startlen, $oppos) = @_[5,6]; - my $remainderpos = $_[2]; - if ($wantarray) - { - my @res; - while (my ($from, $len) = splice @_, 0, 2) - { - push @res, substr($$textref,$from,$len); - } - if ($extralen) { # CORRECT FILLET - my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n"); - $res[1] = "$extra$res[1]"; - eval { substr($$textref,$remainderpos,0) = $extra; - substr($$textref,$extrapos,$extralen,"\n")} ; - #REARRANGE HERE DOC AND FILLET IF POSSIBLE - pos($$textref) = $remainderpos-$extralen+1; # RESET \G - } - else { - pos($$textref) = $remainderpos; # RESET \G - } - return @res; - } - else - { - my $match = substr($$textref,$_[0],$_[1]); - substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen; - my $extra = $extralen - ? substr($$textref, $extrapos, $extralen)."\n" : ""; - eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE - pos($$textref) = $_[4]; # RESET \G - return $match; - } -} - -# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING - -sub gen_delimited_pat($;$) # ($delimiters;$escapes) -{ - my ($dels, $escs) = @_; - return "" unless $dels =~ /\S/; - $escs = '\\' unless $escs; - $escs .= substr($escs,-1) x (length($dels)-length($escs)); - my @pat = (); - my $i; - for ($i=0; $i<length $dels; $i++) - { - my $del = quotemeta substr($dels,$i,1); - my $esc = quotemeta substr($escs,$i,1); - if ($del eq $esc) - { - push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del"; - } - else - { - push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del"; - } - } - my $pat = join '|', @pat; - return "(?:$pat)"; -} - -*delimited_pat = \&gen_delimited_pat; - - -# THE EXTRACTION FUNCTIONS - -sub extract_delimited (;$$$$) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - my $wantarray = wantarray; - my $del = defined $_[1] ? $_[1] : qq{\'\"\`}; - my $pre = defined $_[2] ? $_[2] : '\s*'; - my $esc = defined $_[3] ? $_[3] : qq{\\}; - my $pat = gen_delimited_pat($del, $esc); - my $startpos = pos $$textref || 0; - return _fail($wantarray, $textref, "Not a delimited pattern", 0) - unless $$textref =~ m/\G($pre)($pat)/gc; - my $prelen = length($1); - my $matchpos = $startpos+$prelen; - my $endpos = pos $$textref; - return _succeed $wantarray, $textref, - $matchpos, $endpos-$matchpos, # MATCH - $endpos, length($$textref)-$endpos, # REMAINDER - $startpos, $prelen; # PREFIX -} - -sub extract_bracketed (;$$$) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - my $ldel = defined $_[1] ? $_[1] : '{([<'; - my $pre = defined $_[2] ? $_[2] : '\s*'; - my $wantarray = wantarray; - my $qdel = ""; - my $quotelike; - $ldel =~ s/'//g and $qdel .= q{'}; - $ldel =~ s/"//g and $qdel .= q{"}; - $ldel =~ s/`//g and $qdel .= q{`}; - $ldel =~ s/q//g and $quotelike = 1; - $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds; - my $rdel = $ldel; - unless ($rdel =~ tr/[({</])}>/) - { - return _fail $wantarray, $textref, - "Did not find a suitable bracket in delimiter: \"$_[1]\"", - 0; - } - my $posbug = pos; - $ldel = join('|', map { quotemeta $_ } split('', $ldel)); - $rdel = join('|', map { quotemeta $_ } split('', $rdel)); - pos = $posbug; - - my $startpos = pos $$textref || 0; - my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel); - - return _fail ($wantarray, $textref) unless @match; - - return _succeed ( $wantarray, $textref, - $match[2], $match[5]+2, # MATCH - @match[8,9], # REMAINDER - @match[0,1], # PREFIX - ); -} - -sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel -{ - my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_; - my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0); - unless ($$textref =~ m/\G$pre/gc) - { - _failmsg "Did not find prefix: /$pre/", $startpos; - return; - } - - $ldelpos = pos $$textref; - - unless ($$textref =~ m/\G($ldel)/gc) - { - _failmsg "Did not find opening bracket after prefix: \"$pre\"", - pos $$textref; - pos $$textref = $startpos; - return; - } - - my @nesting = ( $1 ); - my $textlen = length $$textref; - while (pos $$textref < $textlen) - { - next if $$textref =~ m/\G\\./gcs; - - if ($$textref =~ m/\G($ldel)/gc) - { - push @nesting, $1; - } - elsif ($$textref =~ m/\G($rdel)/gc) - { - my ($found, $brackettype) = ($1, $1); - if ($#nesting < 0) - { - _failmsg "Unmatched closing bracket: \"$found\"", - pos $$textref; - pos $$textref = $startpos; - return; - } - my $expected = pop(@nesting); - $expected =~ tr/({[</)}]>/; - if ($expected ne $brackettype) - { - _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"}, - pos $$textref; - pos $$textref = $startpos; - return; - } - last if $#nesting < 0; - } - elsif ($qdel && $$textref =~ m/\G([$qdel])/gc) - { - $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next; - _failmsg "Unmatched embedded quote ($1)", - pos $$textref; - pos $$textref = $startpos; - return; - } - elsif ($quotelike && _match_quotelike($textref,"",1,0)) - { - next; - } - - else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs } - } - if ($#nesting>=0) - { - _failmsg "Unmatched opening bracket(s): " - . join("..",@nesting)."..", - pos $$textref; - pos $$textref = $startpos; - return; - } - - $endpos = pos $$textref; - - return ( - $startpos, $ldelpos-$startpos, # PREFIX - $ldelpos, 1, # OPENING BRACKET - $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS - $endpos-1, 1, # CLOSING BRACKET - $endpos, length($$textref)-$endpos, # REMAINDER - ); -} - -sub _revbracket($) -{ - my $brack = reverse $_[0]; - $brack =~ tr/[({</])}>/; - return $brack; -} - -my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*}; - -sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - my $ldel = $_[1]; - my $rdel = $_[2]; - my $pre = defined $_[3] ? $_[3] : '\s*'; - my %options = defined $_[4] ? %{$_[4]} : (); - my $omode = defined $options{fail} ? $options{fail} : ''; - my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) - : defined($options{reject}) ? $options{reject} - : '' - ; - my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) - : defined($options{ignore}) ? $options{ignore} - : '' - ; - - if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } - $@ = undef; - - my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); - - return _fail(wantarray, $textref) unless @match; - return _succeed wantarray, $textref, - $match[2], $match[3]+$match[5]+$match[7], # MATCH - @match[8..9,0..1,2..7]; # REM, PRE, BITS -} - -sub _match_tagged # ($$$$$$$) -{ - my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_; - my $rdelspec; - - my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 ); - - unless ($$textref =~ m/\G($pre)/gc) - { - _failmsg "Did not find prefix: /$pre/", pos $$textref; - goto failed; - } - - $opentagpos = pos($$textref); - - unless ($$textref =~ m/\G$ldel/gc) - { - _failmsg "Did not find opening tag: /$ldel/", pos $$textref; - goto failed; - } - - $textpos = pos($$textref); - - if (!defined $rdel) - { - $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]); - unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes) - { - _failmsg "Unable to construct closing tag to match: $rdel", - pos $$textref; - goto failed; - } - } - else - { - $rdelspec = eval "qq{$rdel}" || do { - my $del; - for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',) - { next if $rdel =~ /\Q$_/; $del = $_; last } - unless ($del) { - use Carp; - croak "Can't interpolate right delimiter $rdel" - } - eval "qq$del$rdel$del"; - }; - } - - while (pos($$textref) < length($$textref)) - { - next if $$textref =~ m/\G\\./gc; - - if ($$textref =~ m/\G(\n[ \t]*\n)/gc ) - { - $parapos = pos($$textref) - length($1) - unless defined $parapos; - } - elsif ($$textref =~ m/\G($rdelspec)/gc ) - { - $closetagpos = pos($$textref)-length($1); - goto matched; - } - elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc) - { - next; - } - elsif ($bad && $$textref =~ m/\G($bad)/gcs) - { - pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS - goto short if ($omode eq 'PARA' || $omode eq 'MAX'); - _failmsg "Found invalid nested tag: $1", pos $$textref; - goto failed; - } - elsif ($$textref =~ m/\G($ldel)/gc) - { - my $tag = $1; - pos($$textref) -= length($tag); # REWIND TO NESTED TAG - unless (_match_tagged(@_)) # MATCH NESTED TAG - { - goto short if $omode eq 'PARA' || $omode eq 'MAX'; - _failmsg "Found unbalanced nested tag: $tag", - pos $$textref; - goto failed; - } - } - else { $$textref =~ m/./gcs } - } - -short: - $closetagpos = pos($$textref); - goto matched if $omode eq 'MAX'; - goto failed unless $omode eq 'PARA'; - - if (defined $parapos) { pos($$textref) = $parapos } - else { $parapos = pos($$textref) } - - return ( - $startpos, $opentagpos-$startpos, # PREFIX - $opentagpos, $textpos-$opentagpos, # OPENING TAG - $textpos, $parapos-$textpos, # TEXT - $parapos, 0, # NO CLOSING TAG - $parapos, length($$textref)-$parapos, # REMAINDER - ); - -matched: - $endpos = pos($$textref); - return ( - $startpos, $opentagpos-$startpos, # PREFIX - $opentagpos, $textpos-$opentagpos, # OPENING TAG - $textpos, $closetagpos-$textpos, # TEXT - $closetagpos, $endpos-$closetagpos, # CLOSING TAG - $endpos, length($$textref)-$endpos, # REMAINDER - ); - -failed: - _failmsg "Did not find closing tag", pos $$textref unless $@; - pos($$textref) = $startpos; - return; -} - -sub extract_variable (;$$) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - return ("","","") unless defined $$textref; - my $pre = defined $_[1] ? $_[1] : '\s*'; - - my @match = _match_variable($textref,$pre); - - return _fail wantarray, $textref unless @match; - - return _succeed wantarray, $textref, - @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX -} - -sub _match_variable($$) -{ -# $# -# $^ -# $$ - my ($textref, $pre) = @_; - my $startpos = pos($$textref) = pos($$textref)||0; - unless ($$textref =~ m/\G($pre)/gc) - { - _failmsg "Did not find prefix: /$pre/", pos $$textref; - return; - } - my $varpos = pos($$textref); - unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci) - { - unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc) - { - _failmsg "Did not find leading dereferencer", pos $$textref; - pos $$textref = $startpos; - return; - } - my $deref = $1; - - unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci - or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0) - or $deref eq '$#' or $deref eq '$$' ) - { - _failmsg "Bad identifier after dereferencer", pos $$textref; - pos $$textref = $startpos; - return; - } - } - - while (1) - { - next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc; - next if _match_codeblock($textref, - qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/, - qr/[({[]/, qr/[)}\]]/, - qr/[({[]/, qr/[)}\]]/, 0); - next if _match_codeblock($textref, - qr/\s*/, qr/[{[]/, qr/[}\]]/, - qr/[{[]/, qr/[}\]]/, 0); - next if _match_variable($textref,'\s*->\s*'); - next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc; - last; - } - - my $endpos = pos($$textref); - return ($startpos, $varpos-$startpos, - $varpos, $endpos-$varpos, - $endpos, length($$textref)-$endpos - ); -} - -sub extract_codeblock (;$$$$$) -{ - my $textref = defined $_[0] ? \$_[0] : \$_; - my $wantarray = wantarray; - my $ldel_inner = defined $_[1] ? $_[1] : '{'; - my $pre = defined $_[2] ? $_[2] : '\s*'; - my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner; - my $rd = $_[4]; - my $rdel_inner = $ldel_inner; - my $rdel_outer = $ldel_outer; - my $posbug = pos; - for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds } - for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds } - for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer) - { - $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')' - } - pos = $posbug; - - my @match = _match_codeblock($textref, $pre, - $ldel_outer, $rdel_outer, - $ldel_inner, $rdel_inner, - $rd); - return _fail($wantarray, $textref) unless @match; - return _succeed($wantarray, $textref, - @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX - ); - -} - -sub _match_codeblock($$$$$$$) -{ - my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_; - my $startpos = pos($$textref) = pos($$textref) || 0; - unless ($$textref =~ m/\G($pre)/gc) - { - _failmsg qq{Did not match prefix /$pre/ at"} . - substr($$textref,pos($$textref),20) . - q{..."}, - pos $$textref; - return; - } - my $codepos = pos($$textref); - unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER - { - _failmsg qq{Did not find expected opening bracket at "} . - substr($$textref,pos($$textref),20) . - q{..."}, - pos $$textref; - pos $$textref = $startpos; - return; - } - my $closing = $1; - $closing =~ tr/([<{/)]>}/; - my $matched; - my $patvalid = 1; - while (pos($$textref) < length($$textref)) - { - $matched = ''; - if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc) - { - $patvalid = 0; - next; - } - - if ($$textref =~ m/\G\s*#.*/gc) - { - next; - } - - if ($$textref =~ m/\G\s*($rdel_outer)/gc) - { - unless ($matched = ($closing && $1 eq $closing) ) - { - next if $1 eq '>'; # MIGHT BE A "LESS THAN" - _failmsg q{Mismatched closing bracket at "} . - substr($$textref,pos($$textref),20) . - qq{...". Expected '$closing'}, - pos $$textref; - } - last; - } - - if (_match_variable($textref,'\s*') || - _match_quotelike($textref,'\s*',$patvalid,$patvalid) ) - { - $patvalid = 0; - next; - } - - - # NEED TO COVER MANY MORE CASES HERE!!! - if ($$textref =~ m#\G\s*(?!$ldel_inner) - ( [-+*x/%^&|.]=? - | [!=]~ - | =(?!>) - | (\*\*|&&|\|\||<<|>>)=? - | split|grep|map|return - | [([] - )#gcx) - { - $patvalid = 1; - next; - } - - if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) ) - { - $patvalid = 1; - next; - } - - if ($$textref =~ m/\G\s*$ldel_outer/gc) - { - _failmsg q{Improperly nested codeblock at "} . - substr($$textref,pos($$textref),20) . - q{..."}, - pos $$textref; - last; - } - - $patvalid = 0; - $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc; - } - continue { $@ = undef } - - unless ($matched) - { - _failmsg 'No match found for opening bracket', pos $$textref - unless $@; - return; - } - - my $endpos = pos($$textref); - return ( $startpos, $codepos-$startpos, - $codepos, $endpos-$codepos, - $endpos, length($$textref)-$endpos, - ); -} - - -my %mods = ( - 'none' => '[cgimsox]*', - 'm' => '[cgimsox]*', - 's' => '[cegimsox]*', - 'tr' => '[cds]*', - 'y' => '[cds]*', - 'qq' => '', - 'qx' => '', - 'qw' => '', - 'qr' => '[imsx]*', - 'q' => '', - ); - -sub extract_quotelike (;$$) -{ - my $textref = $_[0] ? \$_[0] : \$_; - my $wantarray = wantarray; - my $pre = defined $_[1] ? $_[1] : '\s*'; - - my @match = _match_quotelike($textref,$pre,1,0); - return _fail($wantarray, $textref) unless @match; - return _succeed($wantarray, $textref, - $match[2], $match[18]-$match[2], # MATCH - @match[18,19], # REMAINDER - @match[0,1], # PREFIX - @match[2..17], # THE BITS - @match[20,21], # ANY FILLET? - ); -}; - -sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match) -{ - my ($textref, $pre, $rawmatch, $qmark) = @_; - - my ($textlen,$startpos, - $oppos, - $preld1pos,$ld1pos,$str1pos,$rd1pos, - $preld2pos,$ld2pos,$str2pos,$rd2pos, - $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 ); - - unless ($$textref =~ m/\G($pre)/gc) - { - _failmsg qq{Did not find prefix /$pre/ at "} . - substr($$textref, pos($$textref), 20) . - q{..."}, - pos $$textref; - return; - } - $oppos = pos($$textref); - - my $initial = substr($$textref,$oppos,1); - - if ($initial && $initial =~ m|^[\"\'\`]| - || $rawmatch && $initial =~ m|^/| - || $qmark && $initial =~ m|^\?|) - { - unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx) - { - _failmsg qq{Did not find closing delimiter to match '$initial' at "} . - substr($$textref, $oppos, 20) . - q{..."}, - pos $$textref; - pos $$textref = $startpos; - return; - } - $modpos= pos($$textref); - $rd1pos = $modpos-1; - - if ($initial eq '/' || $initial eq '?') - { - $$textref =~ m/\G$mods{none}/gc - } - - my $endpos = pos($$textref); - return ( - $startpos, $oppos-$startpos, # PREFIX - $oppos, 0, # NO OPERATOR - $oppos, 1, # LEFT DEL - $oppos+1, $rd1pos-$oppos-1, # STR/PAT - $rd1pos, 1, # RIGHT DEL - $modpos, 0, # NO 2ND LDEL - $modpos, 0, # NO 2ND STR - $modpos, 0, # NO 2ND RDEL - $modpos, $endpos-$modpos, # MODIFIERS - $endpos, $textlen-$endpos, # REMAINDER - ); - } - - unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc) - { - _failmsg q{No quotelike operator found after prefix at "} . - substr($$textref, pos($$textref), 20) . - q{..."}, - pos $$textref; - pos $$textref = $startpos; - return; - } - - my $op = $1; - $preld1pos = pos($$textref); - if ($op eq '<<') { - $ld1pos = pos($$textref); - my $label; - if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) { - $label = $1; - } - elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) ' - | \G " ([^"\\]* (?:\\.[^"\\]*)*) " - | \G ` ([^`\\]* (?:\\.[^`\\]*)*) ` - }gcsx) { - $label = $+; - } - else { - $label = ""; - } - my $extrapos = pos($$textref); - $$textref =~ m{.*\n}gc; - $str1pos = pos($$textref)--; - unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) { - _failmsg qq{Missing here doc terminator ('$label') after "} . - substr($$textref, $startpos, 20) . - q{..."}, - pos $$textref; - pos $$textref = $startpos; - return; - } - $rd1pos = pos($$textref); - $$textref =~ m{\Q$label\E\n}gc; - $ld2pos = pos($$textref); - return ( - $startpos, $oppos-$startpos, # PREFIX - $oppos, length($op), # OPERATOR - $ld1pos, $extrapos-$ld1pos, # LEFT DEL - $str1pos, $rd1pos-$str1pos, # STR/PAT - $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL - $ld2pos, 0, # NO 2ND LDEL - $ld2pos, 0, # NO 2ND STR - $ld2pos, 0, # NO 2ND RDEL - $ld2pos, 0, # NO MODIFIERS - $ld2pos, $textlen-$ld2pos, # REMAINDER - $extrapos, $str1pos-$extrapos, # FILLETED BIT - ); - } - - $$textref =~ m/\G\s*/gc; - $ld1pos = pos($$textref); - $str1pos = $ld1pos+1; - - unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD - { - _failmsg "No block delimiter found after quotelike $op", - pos $$textref; - pos $$textref = $startpos; - return; - } - pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN - my ($ldel1, $rdel1) = ("\Q$1","\Q$1"); - if ($ldel1 =~ /[[(<{]/) - { - $rdel1 =~ tr/[({</])}>/; - defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1)) - || do { pos $$textref = $startpos; return }; - $ld2pos = pos($$textref); - $rd1pos = $ld2pos-1; - } - else - { - $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs - || do { pos $$textref = $startpos; return }; - $ld2pos = $rd1pos = pos($$textref)-1; - } - - my $second_arg = $op =~ /s|tr|y/ ? 1 : 0; - if ($second_arg) - { - my ($ldel2, $rdel2); - if ($ldel1 =~ /[[(<{]/) - { - unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD - { - _failmsg "Missing second block for quotelike $op", - pos $$textref; - pos $$textref = $startpos; - return; - } - $ldel2 = $rdel2 = "\Q$1"; - $rdel2 =~ tr/[({</])}>/; - } - else - { - $ldel2 = $rdel2 = $ldel1; - } - $str2pos = $ld2pos+1; - - if ($ldel2 =~ /[[(<{]/) - { - pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD - defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2)) - || do { pos $$textref = $startpos; return }; - } - else - { - $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs - || do { pos $$textref = $startpos; return }; - } - $rd2pos = pos($$textref)-1; - } - else - { - $ld2pos = $str2pos = $rd2pos = $rd1pos; - } - - $modpos = pos $$textref; - - $$textref =~ m/\G($mods{$op})/gc; - my $endpos = pos $$textref; - - return ( - $startpos, $oppos-$startpos, # PREFIX - $oppos, length($op), # OPERATOR - $ld1pos, 1, # LEFT DEL - $str1pos, $rd1pos-$str1pos, # STR/PAT - $rd1pos, 1, # RIGHT DEL - $ld2pos, $second_arg, # 2ND LDEL (MAYBE) - $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE) - $rd2pos, $second_arg, # 2ND RDEL (MAYBE) - $modpos, $endpos-$modpos, # MODIFIERS - $endpos, $textlen-$endpos, # REMAINDER - ); -} - -my $def_func = -[ - sub { extract_variable($_[0], '') }, - sub { extract_quotelike($_[0],'') }, - sub { extract_codeblock($_[0],'{}','') }, -]; - -sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown) -{ - my $textref = defined($_[0]) ? \$_[0] : \$_; - my $posbug = pos; - my ($lastpos, $firstpos); - my @fields = (); - - #for ($$textref) - { - my @func = defined $_[1] ? @{$_[1]} : @{$def_func}; - my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000; - my $igunk = $_[3]; - - pos $$textref ||= 0; - - unless (wantarray) - { - use Carp; - carp "extract_multiple reset maximal count to 1 in scalar context" - if $^W && defined($_[2]) && $max > 1; - $max = 1 - } - - my $unkpos; - my $func; - my $class; - - my @class; - foreach $func ( @func ) - { - if (ref($func) eq 'HASH') - { - push @class, (keys %$func)[0]; - $func = (values %$func)[0]; - } - else - { - push @class, undef; - } - } - - FIELD: while (pos($$textref) < length($$textref)) - { - my ($field, $rem); - my @bits; - foreach my $i ( 0..$#func ) - { - my $pref; - $func = $func[$i]; - $class = $class[$i]; - $lastpos = pos $$textref; - if (ref($func) eq 'CODE') - { ($field,$rem,$pref) = @bits = $func->($$textref) } - elsif (ref($func) eq 'Text::Balanced::Extractor') - { @bits = $field = $func->extract($$textref) } - elsif( $$textref =~ m/\G$func/gc ) - { @bits = $field = defined($1) - ? $1 - : substr($$textref, $-[0], $+[0] - $-[0]) - } - $pref ||= ""; - if (defined($field) && length($field)) - { - if (!$igunk) { - $unkpos = $lastpos - if length($pref) && !defined($unkpos); - if (defined $unkpos) - { - push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref; - $firstpos = $unkpos unless defined $firstpos; - undef $unkpos; - last FIELD if @fields == $max; - } - } - push @fields, $class - ? bless (\$field, $class) - : $field; - $firstpos = $lastpos unless defined $firstpos; - $lastpos = pos $$textref; - last FIELD if @fields == $max; - next FIELD; - } - } - if ($$textref =~ /\G(.)/gcs) - { - $unkpos = pos($$textref)-1 - unless $igunk || defined $unkpos; - } - } - - if (defined $unkpos) - { - push @fields, substr($$textref, $unkpos); - $firstpos = $unkpos unless defined $firstpos; - $lastpos = length $$textref; - } - last; - } - - pos $$textref = $lastpos; - return @fields if wantarray; - - $firstpos ||= 0; - eval { substr($$textref,$firstpos,$lastpos-$firstpos)=""; - pos $$textref = $firstpos }; - return $fields[0]; -} - - -sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options) -{ - my $ldel = $_[0]; - my $rdel = $_[1]; - my $pre = defined $_[2] ? $_[2] : '\s*'; - my %options = defined $_[3] ? %{$_[3]} : (); - my $omode = defined $options{fail} ? $options{fail} : ''; - my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}}) - : defined($options{reject}) ? $options{reject} - : '' - ; - my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}}) - : defined($options{ignore}) ? $options{ignore} - : '' - ; - - if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; } - - my $posbug = pos; - for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ } - pos = $posbug; - - my $closure = sub - { - my $textref = defined $_[0] ? \$_[0] : \$_; - my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore); - - return _fail(wantarray, $textref) unless @match; - return _succeed wantarray, $textref, - $match[2], $match[3]+$match[5]+$match[7], # MATCH - @match[8..9,0..1,2..7]; # REM, PRE, BITS - }; - - bless $closure, 'Text::Balanced::Extractor'; -} - -package Text::Balanced::Extractor; - -sub extract($$) # ($self, $text) -{ - &{$_[0]}($_[1]); -} - -package Text::Balanced::ErrorMsg; - -use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" }; - -1; - -__END__ - -=head1 NAME - -Text::Balanced - Extract delimited text sequences from strings. - - -=head1 SYNOPSIS - - use Text::Balanced qw ( - extract_delimited - extract_bracketed - extract_quotelike - extract_codeblock - extract_variable - extract_tagged - extract_multiple - - gen_delimited_pat - gen_extract_tagged - ); - - # Extract the initial substring of $text that is delimited by - # two (unescaped) instances of the first character in $delim. - - ($extracted, $remainder) = extract_delimited($text,$delim); - - - # Extract the initial substring of $text that is bracketed - # with a delimiter(s) specified by $delim (where the string - # in $delim contains one or more of '(){}[]<>'). - - ($extracted, $remainder) = extract_bracketed($text,$delim); - - - # Extract the initial substring of $text that is bounded by - # an XML tag. - - ($extracted, $remainder) = extract_tagged($text); - - - # Extract the initial substring of $text that is bounded by - # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags - - ($extracted, $remainder) = - extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]}); - - - # Extract the initial substring of $text that represents a - # Perl "quote or quote-like operation" - - ($extracted, $remainder) = extract_quotelike($text); - - - # Extract the initial substring of $text that represents a block - # of Perl code, bracketed by any of character(s) specified by $delim - # (where the string $delim contains one or more of '(){}[]<>'). - - ($extracted, $remainder) = extract_codeblock($text,$delim); - - - # Extract the initial substrings of $text that would be extracted by - # one or more sequential applications of the specified functions - # or regular expressions - - @extracted = extract_multiple($text, - [ \&extract_bracketed, - \&extract_quotelike, - \&some_other_extractor_sub, - qr/[xyz]*/, - 'literal', - ]); - -# Create a string representing an optimized pattern (a la Friedl) -# that matches a substring delimited by any of the specified characters -# (in this case: any type of quote or a slash) - - $patstring = gen_delimited_pat(q{'"`/}); - - -# Generate a reference to an anonymous sub that is just like extract_tagged -# but pre-compiled and optimized for a specific pair of tags, and consequently -# much faster (i.e. 3 times faster). It uses qr// for better performance on -# repeated calls, so it only works under Perl 5.005 or later. - - $extract_head = gen_extract_tagged('<HEAD>','</HEAD>'); - - ($extracted, $remainder) = $extract_head->($text); - - -=head1 DESCRIPTION - -The various C<extract_...> subroutines may be used to -extract a delimited substring, possibly after skipping a -specified prefix string. By default, that prefix is -optional whitespace (C</\s*/>), but you can change it to whatever -you wish (see below). - -The substring to be extracted must appear at the -current C<pos> location of the string's variable -(or at index zero, if no C<pos> position is defined). -In other words, the C<extract_...> subroutines I<don't> -extract the first occurrence of a substring anywhere -in a string (like an unanchored regex would). Rather, -they extract an occurrence of the substring appearing -immediately at the current matching position in the -string (like a C<\G>-anchored regex would). - - - -=head2 General behaviour in list contexts - -In a list context, all the subroutines return a list, the first three -elements of which are always: - -=over 4 - -=item [0] - -The extracted string, including the specified delimiters. -If the extraction fails C<undef> is returned. - -=item [1] - -The remainder of the input string (i.e. the characters after the -extracted string). On failure, the entire string is returned. - -=item [2] - -The skipped prefix (i.e. the characters before the extracted string). -On failure, C<undef> is returned. - -=back - -Note that in a list context, the contents of the original input text (the first -argument) are not modified in any way. - -However, if the input text was passed in a variable, that variable's -C<pos> value is updated to point at the first character after the -extracted text. That means that in a list context the various -subroutines can be used much like regular expressions. For example: - - while ( $next = (extract_quotelike($text))[0] ) - { - # process next quote-like (in $next) - } - - -=head2 General behaviour in scalar and void contexts - -In a scalar context, the extracted string is returned, having first been -removed from the input text. Thus, the following code also processes -each quote-like operation, but actually removes them from $text: - - while ( $next = extract_quotelike($text) ) - { - # process next quote-like (in $next) - } - -Note that if the input text is a read-only string (i.e. a literal), -no attempt is made to remove the extracted text. - -In a void context the behaviour of the extraction subroutines is -exactly the same as in a scalar context, except (of course) that the -extracted substring is not returned. - -=head2 A note about prefixes - -Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.) -This can bite you if you're expecting a prefix specification like -'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix -pattern will only succeed if the <H1> tag is on the current line, since -. normally doesn't match newlines. - -To overcome this limitation, you need to turn on /s matching within -the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)' - - -=head2 C<extract_delimited> - -The C<extract_delimited> function formalizes the common idiom -of extracting a single-character-delimited substring from the start of -a string. For example, to extract a single-quote delimited string, the -following code is typically used: - - ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s; - $extracted = $1; - -but with C<extract_delimited> it can be simplified to: - - ($extracted,$remainder) = extract_delimited($text, "'"); - -C<extract_delimited> takes up to four scalars (the input text, the -delimiters, a prefix pattern to be skipped, and any escape characters) -and extracts the initial substring of the text that -is appropriately delimited. If the delimiter string has multiple -characters, the first one encountered in the text is taken to delimit -the substring. -The third argument specifies a prefix pattern that is to be skipped -(but must be present!) before the substring is extracted. -The final argument specifies the escape character to be used for each -delimiter. - -All arguments are optional. If the escape characters are not specified, -every delimiter is escaped with a backslash (C<\>). -If the prefix is not specified, the -pattern C<'\s*'> - optional whitespace - is used. If the delimiter set -is also not specified, the set C</["'`]/> is used. If the text to be processed -is not specified either, C<$_> is used. - -In list context, C<extract_delimited> returns an array of three -elements, the extracted substring (I<including the surrounding -delimiters>), the remainder of the text, and the skipped prefix (if -any). If a suitable delimited substring is not found, the first -element of the array is the empty string, the second is the complete -original text, and the prefix returned in the third element is an -empty string. - -In a scalar context, just the extracted substring is returned. In -a void context, the extracted substring (and any prefix) are simply -removed from the beginning of the first argument. - -Examples: - - # Remove a single-quoted substring from the very beginning of $text: - - $substring = extract_delimited($text, "'", ''); - - # Remove a single-quoted Pascalish substring (i.e. one in which - # doubling the quote character escapes it) from the very - # beginning of $text: - - $substring = extract_delimited($text, "'", '', "'"); - - # Extract a single- or double- quoted substring from the - # beginning of $text, optionally after some whitespace - # (note the list context to protect $text from modification): - - ($substring) = extract_delimited $text, q{"'}; - - - # Delete the substring delimited by the first '/' in $text: - - $text = join '', (extract_delimited($text,'/','[^/]*')[2,1]; - -Note that this last example is I<not> the same as deleting the first -quote-like pattern. For instance, if C<$text> contained the string: - - "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }" - -then after the deletion it would contain: - - "if ('.$UNIXCMD/s) { $cmd = $1; }" - -not: - - "if ('./cmd' =~ ms) { $cmd = $1; }" - - -See L<"extract_quotelike"> for a (partial) solution to this problem. - - -=head2 C<extract_bracketed> - -Like C<"extract_delimited">, the C<extract_bracketed> function takes -up to three optional scalar arguments: a string to extract from, a delimiter -specifier, and a prefix pattern. As before, a missing prefix defaults to -optional whitespace and a missing text defaults to C<$_>. However, a missing -delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below). - -C<extract_bracketed> extracts a balanced-bracket-delimited -substring (using any one (or more) of the user-specified delimiter -brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also -respect quoted unbalanced brackets (see below). - -A "delimiter bracket" is a bracket in list of delimiters passed as -C<extract_bracketed>'s second argument. Delimiter brackets are -specified by giving either the left or right (or both!) versions -of the required bracket(s). Note that the order in which -two or more delimiter brackets are specified is not significant. - -A "balanced-bracket-delimited substring" is a substring bounded by -matched brackets, such that any other (left or right) delimiter -bracket I<within> the substring is also matched by an opposite -(right or left) delimiter bracket I<at the same level of nesting>. Any -type of bracket not in the delimiter list is treated as an ordinary -character. - -In other words, each type of bracket specified as a delimiter must be -balanced and correctly nested within the substring, and any other kind of -("non-delimiter") bracket in the substring is ignored. - -For example, given the string: - - $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }"; - -then a call to C<extract_bracketed> in a list context: - - @result = extract_bracketed( $text, '{}' ); - -would return: - - ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" ) - -since both sets of C<'{..}'> brackets are properly nested and evenly balanced. -(In a scalar context just the first element of the array would be returned. In -a void context, C<$text> would be replaced by an empty string.) - -Likewise the call in: - - @result = extract_bracketed( $text, '{[' ); - -would return the same result, since all sets of both types of specified -delimiter brackets are correctly nested and balanced. - -However, the call in: - - @result = extract_bracketed( $text, '{([<' ); - -would fail, returning: - - ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" ); - -because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and -the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would -return an empty string. In a void context, C<$text> would be unchanged.) - -Note that the embedded single-quotes in the string don't help in this -case, since they have not been specified as acceptable delimiters and are -therefore treated as non-delimiter characters (and ignored). - -However, if a particular species of quote character is included in the -delimiter specification, then that type of quote will be correctly handled. -for example, if C<$text> is: - - $text = '<A HREF=">>>>">link</A>'; - -then - - @result = extract_bracketed( $text, '<">' ); - -returns: - - ( '<A HREF=">>>>">', 'link</A>', "" ) - -as expected. Without the specification of C<"> as an embedded quoter: - - @result = extract_bracketed( $text, '<>' ); - -the result would be: - - ( '<A HREF=">', '>>>">link</A>', "" ) - -In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like -quoting (i.e. q{string}, qq{string}, etc) can be specified by including the -letter 'q' as a delimiter. Hence: - - @result = extract_bracketed( $text, '<q>' ); - -would correctly match something like this: - - $text = '<leftop: conj /and/ conj>'; - -See also: C<"extract_quotelike"> and C<"extract_codeblock">. - - -=head2 C<extract_variable> - -C<extract_variable> extracts any valid Perl variable or -variable-involved expression, including scalars, arrays, hashes, array -accesses, hash look-ups, method calls through objects, subroutine calls -through subroutine references, etc. - -The subroutine takes up to two optional arguments: - -=over 4 - -=item 1. - -A string to be processed (C<$_> if the string is omitted or C<undef>) - -=item 2. - -A string specifying a pattern to be matched as a prefix (which is to be -skipped). If omitted, optional whitespace is skipped. - -=back - -On success in a list context, an array of 3 elements is returned. The -elements are: - -=over 4 - -=item [0] - -the extracted variable, or variablish expression - -=item [1] - -the remainder of the input text, - -=item [2] - -the prefix substring (if any), - -=back - -On failure, all of these values (except the remaining text) are C<undef>. - -In a scalar context, C<extract_variable> returns just the complete -substring that matched a variablish expression. C<undef> is returned on -failure. In addition, the original input text has the returned substring -(and any prefix) removed from it. - -In a void context, the input text just has the matched substring (and -any specified prefix) removed. - - -=head2 C<extract_tagged> - -C<extract_tagged> extracts and segments text between (balanced) -specified tags. - -The subroutine takes up to five optional arguments: - -=over 4 - -=item 1. - -A string to be processed (C<$_> if the string is omitted or C<undef>) - -=item 2. - -A string specifying a pattern to be matched as the opening tag. -If the pattern string is omitted (or C<undef>) then a pattern -that matches any standard XML tag is used. - -=item 3. - -A string specifying a pattern to be matched at the closing tag. -If the pattern string is omitted (or C<undef>) then the closing -tag is constructed by inserting a C</> after any leading bracket -characters in the actual opening tag that was matched (I<not> the pattern -that matched the tag). For example, if the opening tag pattern -is specified as C<'{{\w+}}'> and actually matched the opening tag -C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">. - -=item 4. - -A string specifying a pattern to be matched as a prefix (which is to be -skipped). If omitted, optional whitespace is skipped. - -=item 5. - -A hash reference containing various parsing options (see below) - -=back - -The various options that can be specified are: - -=over 4 - -=item C<reject =E<gt> $listref> - -The list reference contains one or more strings specifying patterns -that must I<not> appear within the tagged text. - -For example, to extract -an HTML link (which should not contain nested links) use: - - extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} ); - -=item C<ignore =E<gt> $listref> - -The list reference contains one or more strings specifying patterns -that are I<not> be be treated as nested tags within the tagged text -(even if they would match the start tag pattern). - -For example, to extract an arbitrary XML tag, but ignore "empty" elements: - - extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} ); - -(also see L<"gen_delimited_pat"> below). - - -=item C<fail =E<gt> $str> - -The C<fail> option indicates the action to be taken if a matching end -tag is not encountered (i.e. before the end of the string or some -C<reject> pattern matches). By default, a failure to match a closing -tag causes C<extract_tagged> to immediately fail. - -However, if the string value associated with <reject> is "MAX", then -C<extract_tagged> returns the complete text up to the point of failure. -If the string is "PARA", C<extract_tagged> returns only the first paragraph -after the tag (up to the first line that is either empty or contains -only whitespace characters). -If the string is "", the the default behaviour (i.e. failure) is reinstated. - -For example, suppose the start tag "/para" introduces a paragraph, which then -continues until the next "/endpara" tag or until another "/para" tag is -encountered: - - $text = "/para line 1\n\nline 3\n/para line 4"; - - extract_tagged($text, '/para', '/endpara', undef, - {reject => '/para', fail => MAX ); - - # EXTRACTED: "/para line 1\n\nline 3\n" - -Suppose instead, that if no matching "/endpara" tag is found, the "/para" -tag refers only to the immediately following paragraph: - - $text = "/para line 1\n\nline 3\n/para line 4"; - - extract_tagged($text, '/para', '/endpara', undef, - {reject => '/para', fail => MAX ); - - # EXTRACTED: "/para line 1\n" - -Note that the specified C<fail> behaviour applies to nested tags as well. - -=back - -On success in a list context, an array of 6 elements is returned. The elements are: - -=over 4 - -=item [0] - -the extracted tagged substring (including the outermost tags), - -=item [1] - -the remainder of the input text, - -=item [2] - -the prefix substring (if any), - -=item [3] - -the opening tag - -=item [4] - -the text between the opening and closing tags - -=item [5] - -the closing tag (or "" if no closing tag was found) - -=back - -On failure, all of these values (except the remaining text) are C<undef>. - -In a scalar context, C<extract_tagged> returns just the complete -substring that matched a tagged text (including the start and end -tags). C<undef> is returned on failure. In addition, the original input -text has the returned substring (and any prefix) removed from it. - -In a void context, the input text just has the matched substring (and -any specified prefix) removed. - - -=head2 C<gen_extract_tagged> - -(Note: This subroutine is only available under Perl5.005) - -C<gen_extract_tagged> generates a new anonymous subroutine which -extracts text between (balanced) specified tags. In other words, -it generates a function identical in function to C<extract_tagged>. - -The difference between C<extract_tagged> and the anonymous -subroutines generated by -C<gen_extract_tagged>, is that those generated subroutines: - -=over 4 - -=item * - -do not have to reparse tag specification or parsing options every time -they are called (whereas C<extract_tagged> has to effectively rebuild -its tag parser on every call); - -=item * - -make use of the new qr// construct to pre-compile the regexes they use -(whereas C<extract_tagged> uses standard string variable interpolation -to create tag-matching patterns). - -=back - -The subroutine takes up to four optional arguments (the same set as -C<extract_tagged> except for the string to be processed). It returns -a reference to a subroutine which in turn takes a single argument (the text to -be extracted from). - -In other words, the implementation of C<extract_tagged> is exactly -equivalent to: - - sub extract_tagged - { - my $text = shift; - $extractor = gen_extract_tagged(@_); - return $extractor->($text); - } - -(although C<extract_tagged> is not currently implemented that way, in order -to preserve pre-5.005 compatibility). - -Using C<gen_extract_tagged> to create extraction functions for specific tags -is a good idea if those functions are going to be called more than once, since -their performance is typically twice as good as the more general-purpose -C<extract_tagged>. - - -=head2 C<extract_quotelike> - -C<extract_quotelike> attempts to recognize, extract, and segment any -one of the various Perl quotes and quotelike operators (see -L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket -delimiters (for the quotelike operators), and trailing modifiers are -all caught. For example, in: - - extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #' - - extract_quotelike ' "You said, \"Use sed\"." ' - - extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; ' - - extract_quotelike ' tr/\\\/\\\\/\\\//ds; ' - -the full Perl quotelike operations are all extracted correctly. - -Note too that, when using the /x modifier on a regex, any comment -containing the current pattern delimiter will cause the regex to be -immediately terminated. In other words: - - 'm / - (?i) # CASE INSENSITIVE - [a-z_] # LEADING ALPHABETIC/UNDERSCORE - [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS - /x' - -will be extracted as if it were: - - 'm / - (?i) # CASE INSENSITIVE - [a-z_] # LEADING ALPHABETIC/' - -This behaviour is identical to that of the actual compiler. - -C<extract_quotelike> takes two arguments: the text to be processed and -a prefix to be matched at the very beginning of the text. If no prefix -is specified, optional whitespace is the default. If no text is given, -C<$_> is used. - -In a list context, an array of 11 elements is returned. The elements are: - -=over 4 - -=item [0] - -the extracted quotelike substring (including trailing modifiers), - -=item [1] - -the remainder of the input text, - -=item [2] - -the prefix substring (if any), - -=item [3] - -the name of the quotelike operator (if any), - -=item [4] - -the left delimiter of the first block of the operation, - -=item [5] - -the text of the first block of the operation -(that is, the contents of -a quote, the regex of a match or substitution or the target list of a -translation), - -=item [6] - -the right delimiter of the first block of the operation, - -=item [7] - -the left delimiter of the second block of the operation -(that is, if it is a C<s>, C<tr>, or C<y>), - -=item [8] - -the text of the second block of the operation -(that is, the replacement of a substitution or the translation list -of a translation), - -=item [9] - -the right delimiter of the second block of the operation (if any), - -=item [10] - -the trailing modifiers on the operation (if any). - -=back - -For each of the fields marked "(if any)" the default value on success is -an empty string. -On failure, all of these values (except the remaining text) are C<undef>. - - -In a scalar context, C<extract_quotelike> returns just the complete substring -that matched a quotelike operation (or C<undef> on failure). In a scalar or -void context, the input text has the same substring (and any specified -prefix) removed. - -Examples: - - # Remove the first quotelike literal that appears in text - - $quotelike = extract_quotelike($text,'.*?'); - - # Replace one or more leading whitespace-separated quotelike - # literals in $_ with "<QLL>" - - do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@; - - - # Isolate the search pattern in a quotelike operation from $text - - ($op,$pat) = (extract_quotelike $text)[3,5]; - if ($op =~ /[ms]/) - { - print "search pattern: $pat\n"; - } - else - { - print "$op is not a pattern matching operation\n"; - } - - -=head2 C<extract_quotelike> and "here documents" - -C<extract_quotelike> can successfully extract "here documents" from an input -string, but with an important caveat in list contexts. - -Unlike other types of quote-like literals, a here document is rarely -a contiguous substring. For example, a typical piece of code using -here document might look like this: - - <<'EOMSG' || die; - This is the message. - EOMSG - exit; - -Given this as an input string in a scalar context, C<extract_quotelike> -would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG", -leaving the string " || die;\nexit;" in the original variable. In other words, -the two separate pieces of the here document are successfully extracted and -concatenated. - -In a list context, C<extract_quotelike> would return the list - -=over 4 - -=item [0] - -"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document, -including fore and aft delimiters), - -=item [1] - -" || die;\nexit;" (i.e. the remainder of the input text, concatenated), - -=item [2] - -"" (i.e. the prefix substring -- trivial in this case), - -=item [3] - -"<<" (i.e. the "name" of the quotelike operator) - -=item [4] - -"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes), - -=item [5] - -"This is the message.\n" (i.e. the text of the here document), - -=item [6] - -"EOMSG" (i.e. the right delimiter of the here document), - -=item [7..10] - -"" (a here document has no second left delimiter, second text, second right -delimiter, or trailing modifiers). - -=back - -However, the matching position of the input variable would be set to -"exit;" (i.e. I<after> the closing delimiter of the here document), -which would cause the earlier " || die;\nexit;" to be skipped in any -sequence of code fragment extractions. - -To avoid this problem, when it encounters a here document whilst -extracting from a modifiable string, C<extract_quotelike> silently -rearranges the string to an equivalent piece of Perl: - - <<'EOMSG' - This is the message. - EOMSG - || die; - exit; - -in which the here document I<is> contiguous. It still leaves the -matching position after the here document, but now the rest of the line -on which the here document starts is not skipped. - -To prevent <extract_quotelike> from mucking about with the input in this way -(this is the only case where a list-context C<extract_quotelike> does so), -you can pass the input variable as an interpolated literal: - - $quotelike = extract_quotelike("$var"); - - -=head2 C<extract_codeblock> - -C<extract_codeblock> attempts to recognize and extract a balanced -bracket delimited substring that may contain unbalanced brackets -inside Perl quotes or quotelike operations. That is, C<extract_codeblock> -is like a combination of C<"extract_bracketed"> and -C<"extract_quotelike">. - -C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>: -a text to process, a set of delimiter brackets to look for, and a prefix to -match first. It also takes an optional fourth parameter, which allows the -outermost delimiter brackets to be specified separately (see below). - -Omitting the first argument (input text) means process C<$_> instead. -Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used. -Omitting the third argument (prefix argument) implies optional whitespace at the start. -Omitting the fourth argument (outermost delimiter brackets) indicates that the -value of the second argument is to be used for the outermost delimiters. - -Once the prefix an dthe outermost opening delimiter bracket have been -recognized, code blocks are extracted by stepping through the input text and -trying the following alternatives in sequence: - -=over 4 - -=item 1. - -Try and match a closing delimiter bracket. If the bracket was the same -species as the last opening bracket, return the substring to that -point. If the bracket was mismatched, return an error. - -=item 2. - -Try to match a quote or quotelike operator. If found, call -C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return -the error it returned. Otherwise go back to step 1. - -=item 3. - -Try to match an opening delimiter bracket. If found, call -C<extract_codeblock> recursively to eat the embedded block. If the -recursive call fails, return an error. Otherwise, go back to step 1. - -=item 4. - -Unconditionally match a bareword or any other single character, and -then go back to step 1. - -=back - - -Examples: - - # Find a while loop in the text - - if ($text =~ s/.*?while\s*\{/{/) - { - $loop = "while " . extract_codeblock($text); - } - - # Remove the first round-bracketed list (which may include - # round- or curly-bracketed code blocks or quotelike operators) - - extract_codeblock $text, "(){}", '[^(]*'; - - -The ability to specify a different outermost delimiter bracket is useful -in some circumstances. For example, in the Parse::RecDescent module, -parser actions which are to be performed only on a successful parse -are specified using a C<E<lt>defer:...E<gt>> directive. For example: - - sentence: subject verb object - <defer: {$::theVerb = $item{verb}} > - -Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code -within the C<E<lt>defer:...E<gt>> directive, but there's a problem. - -A deferred action like this: - - <defer: {if ($count>10) {$count--}} > - -will be incorrectly parsed as: - - <defer: {if ($count> - -because the "less than" operator is interpreted as a closing delimiter. - -But, by extracting the directive using -S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>> -the '>' character is only treated as a delimited at the outermost -level of the code block, so the directive is parsed correctly. - -=head2 C<extract_multiple> - -The C<extract_multiple> subroutine takes a string to be processed and a -list of extractors (subroutines or regular expressions) to apply to that string. - -In an array context C<extract_multiple> returns an array of substrings -of the original string, as extracted by the specified extractors. -In a scalar context, C<extract_multiple> returns the first -substring successfully extracted from the original string. In both -scalar and void contexts the original string has the first successfully -extracted substring removed from it. In all contexts -C<extract_multiple> starts at the current C<pos> of the string, and -sets that C<pos> appropriately after it matches. - -Hence, the aim of of a call to C<extract_multiple> in a list context -is to split the processed string into as many non-overlapping fields as -possible, by repeatedly applying each of the specified extractors -to the remainder of the string. Thus C<extract_multiple> is -a generalized form of Perl's C<split> subroutine. - -The subroutine takes up to four optional arguments: - -=over 4 - -=item 1. - -A string to be processed (C<$_> if the string is omitted or C<undef>) - -=item 2. - -A reference to a list of subroutine references and/or qr// objects and/or -literal strings and/or hash references, specifying the extractors -to be used to split the string. If this argument is omitted (or -C<undef>) the list: - - [ - sub { extract_variable($_[0], '') }, - sub { extract_quotelike($_[0],'') }, - sub { extract_codeblock($_[0],'{}','') }, - ] - -is used. - - -=item 3. - -An number specifying the maximum number of fields to return. If this -argument is omitted (or C<undef>), split continues as long as possible. - -If the third argument is I<N>, then extraction continues until I<N> fields -have been successfully extracted, or until the string has been completely -processed. - -Note that in scalar and void contexts the value of this argument is -automatically reset to 1 (under C<-w>, a warning is issued if the argument -has to be reset). - -=item 4. - -A value indicating whether unmatched substrings (see below) within the -text should be skipped or returned as fields. If the value is true, -such substrings are skipped. Otherwise, they are returned. - -=back - -The extraction process works by applying each extractor in -sequence to the text string. - -If the extractor is a subroutine it is called in a list context and is -expected to return a list of a single element, namely the extracted -text. It may optionally also return two further arguments: a string -representing the text left after extraction (like $' for a pattern -match), and a string representing any prefix skipped before the -extraction (like $` in a pattern match). Note that this is designed -to facilitate the use of other Text::Balanced subroutines with -C<extract_multiple>. Note too that the value returned by an extractor -subroutine need not bear any relationship to the corresponding substring -of the original text (see examples below). - -If the extractor is a precompiled regular expression or a string, -it is matched against the text in a scalar context with a leading -'\G' and the gc modifiers enabled. The extracted value is either -$1 if that variable is defined after the match, or else the -complete match (i.e. $&). - -If the extractor is a hash reference, it must contain exactly one element. -The value of that element is one of the -above extractor types (subroutine reference, regular expression, or string). -The key of that element is the name of a class into which the successful -return value of the extractor will be blessed. - -If an extractor returns a defined value, that value is immediately -treated as the next extracted field and pushed onto the list of fields. -If the extractor was specified in a hash reference, the field is also -blessed into the appropriate class, - -If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is -assumed to have failed to extract. -If none of the extractor subroutines succeeds, then one -character is extracted from the start of the text and the extraction -subroutines reapplied. Characters which are thus removed are accumulated and -eventually become the next field (unless the fourth argument is true, in which -case they are discarded). - -For example, the following extracts substrings that are valid Perl variables: - - @fields = extract_multiple($text, - [ sub { extract_variable($_[0]) } ], - undef, 1); - -This example separates a text into fields which are quote delimited, -curly bracketed, and anything else. The delimited and bracketed -parts are also blessed to identify them (the "anything else" is unblessed): - - @fields = extract_multiple($text, - [ - { Delim => sub { extract_delimited($_[0],q{'"}) } }, - { Brack => sub { extract_bracketed($_[0],'{}') } }, - ]); - -This call extracts the next single substring that is a valid Perl quotelike -operator (and removes it from $text): - - $quotelike = extract_multiple($text, - [ - sub { extract_quotelike($_[0]) }, - ], undef, 1); - -Finally, here is yet another way to do comma-separated value parsing: - - @fields = extract_multiple($csv_text, - [ - sub { extract_delimited($_[0],q{'"}) }, - qr/([^,]+)(.*)/, - ], - undef,1); - -The list in the second argument means: -I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">. -The undef third argument means: -I<"...as many times as possible...">, -and the true value in the fourth argument means -I<"...discarding anything else that appears (i.e. the commas)">. - -If you wanted the commas preserved as separate fields (i.e. like split -does if your split pattern has capturing parentheses), you would -just make the last parameter undefined (or remove it). - - -=head2 C<gen_delimited_pat> - -The C<gen_delimited_pat> subroutine takes a single (string) argument and - > builds a Friedl-style optimized regex that matches a string delimited -by any one of the characters in the single argument. For example: - - gen_delimited_pat(q{'"}) - -returns the regex: - - (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\') - -Note that the specified delimiters are automatically quotemeta'd. - -A typical use of C<gen_delimited_pat> would be to build special purpose tags -for C<extract_tagged>. For example, to properly ignore "empty" XML elements -(which might contain quoted strings): - - my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>'; - - extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} ); - - -C<gen_delimited_pat> may also be called with an optional second argument, -which specifies the "escape" character(s) to be used for each delimiter. -For example to match a Pascal-style string (where ' is the delimiter -and '' is a literal ' within the string): - - gen_delimited_pat(q{'},q{'}); - -Different escape characters can be specified for different delimiters. -For example, to specify that '/' is the escape for single quotes -and '%' is the escape for double quotes: - - gen_delimited_pat(q{'"},q{/%}); - -If more delimiters than escape chars are specified, the last escape char -is used for the remaining delimiters. -If no escape char is specified for a given specified delimiter, '\' is used. - -=head2 C<delimited_pat> - -Note that C<gen_delimited_pat> was previously called C<delimited_pat>. -That name may still be used, but is now deprecated. - - -=head1 DIAGNOSTICS - -In a list context, all the functions return C<(undef,$original_text)> -on failure. In a scalar context, failure is indicated by returning C<undef> -(in this case the input text is not modified in any way). - -In addition, on failure in I<any> context, the C<$@> variable is set. -Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed -below. -Accessing C<$@-E<gt>{pos}> returns the offset into the original string at -which the error was detected (although not necessarily where it occurred!) -Printing C<$@> directly produces the error message, with the offset appended. -On success, the C<$@> variable is guaranteed to be C<undef>. - -The available diagnostics are: - -=over 4 - -=item C<Did not find a suitable bracket: "%s"> - -The delimiter provided to C<extract_bracketed> was not one of -C<'()[]E<lt>E<gt>{}'>. - -=item C<Did not find prefix: /%s/> - -A non-optional prefix was specified but wasn't found at the start of the text. - -=item C<Did not find opening bracket after prefix: "%s"> - -C<extract_bracketed> or C<extract_codeblock> was expecting a -particular kind of bracket at the start of the text, and didn't find it. - -=item C<No quotelike operator found after prefix: "%s"> - -C<extract_quotelike> didn't find one of the quotelike operators C<q>, -C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring -it was extracting. - -=item C<Unmatched closing bracket: "%c"> - -C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered -a closing bracket where none was expected. - -=item C<Unmatched opening bracket(s): "%s"> - -C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran -out of characters in the text before closing one or more levels of nested -brackets. - -=item C<Unmatched embedded quote (%s)> - -C<extract_bracketed> attempted to match an embedded quoted substring, but -failed to find a closing quote to match it. - -=item C<Did not find closing delimiter to match '%s'> - -C<extract_quotelike> was unable to find a closing delimiter to match the -one that opened the quote-like operation. - -=item C<Mismatched closing bracket: expected "%c" but found "%s"> - -C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found -a valid bracket delimiter, but it was the wrong species. This usually -indicates a nesting error, but may indicate incorrect quoting or escaping. - -=item C<No block delimiter found after quotelike "%s"> - -C<extract_quotelike> or C<extract_codeblock> found one of the -quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> -without a suitable block after it. - -=item C<Did not find leading dereferencer> - -C<extract_variable> was expecting one of '$', '@', or '%' at the start of -a variable, but didn't find any of them. - -=item C<Bad identifier after dereferencer> - -C<extract_variable> found a '$', '@', or '%' indicating a variable, but that -character was not followed by a legal Perl identifier. - -=item C<Did not find expected opening bracket at %s> - -C<extract_codeblock> failed to find any of the outermost opening brackets -that were specified. - -=item C<Improperly nested codeblock at %s> - -A nested code block was found that started with a delimiter that was specified -as being only to be used as an outermost bracket. - -=item C<Missing second block for quotelike "%s"> - -C<extract_codeblock> or C<extract_quotelike> found one of the -quotelike operators C<s>, C<tr> or C<y> followed by only one block. - -=item C<No match found for opening bracket> - -C<extract_codeblock> failed to find a closing bracket to match the outermost -opening bracket. - -=item C<Did not find opening tag: /%s/> - -C<extract_tagged> did not find a suitable opening tag (after any specified -prefix was removed). - -=item C<Unable to construct closing tag to match: /%s/> - -C<extract_tagged> matched the specified opening tag and tried to -modify the matched text to produce a matching closing tag (because -none was specified). It failed to generate the closing tag, almost -certainly because the opening tag did not start with a -bracket of some kind. - -=item C<Found invalid nested tag: %s> - -C<extract_tagged> found a nested tag that appeared in the "reject" list -(and the failure mode was not "MAX" or "PARA"). - -=item C<Found unbalanced nested tag: %s> - -C<extract_tagged> found a nested opening tag that was not matched by a -corresponding nested closing tag (and the failure mode was not "MAX" or "PARA"). - -=item C<Did not find closing tag> - -C<extract_tagged> reached the end of the text without finding a closing tag -to match the original opening tag (and the failure mode was not -"MAX" or "PARA"). - - - - -=back - - -=head1 AUTHOR - -Damian Conway (damian@conway.org) - - -=head1 BUGS AND IRRITATIONS - -There are undoubtedly serious bugs lurking somewhere in this code, if -only because parts of it give the impression of understanding a great deal -more about Perl than they really do. - -Bug reports and other feedback are most welcome. - - -=head1 COPYRIGHT - - Copyright (c) 1997-2001, Damian Conway. All Rights Reserved. - This module is free software. It may be used, redistributed - and/or modified under the same terms as Perl itself. |