summaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
authorNicholas Clark <nick@ccl4.org>2009-10-02 16:29:35 +0100
committerNicholas Clark <nick@ccl4.org>2009-10-02 16:29:35 +0100
commite916ef552ee31bfadefeb6b7752fce6b84326b26 (patch)
treec3d5a309459eeee4c5f4d30d987633c1498e7553 /ext
parente0ee75a6976f08f9bc3868227f1cd11ab6507895 (diff)
downloadperl-e916ef552ee31bfadefeb6b7752fce6b84326b26.tar.gz
Move Text::Balanced from ext/ to cpan/
Diffstat (limited to 'ext')
-rw-r--r--ext/Text-Balanced/Changes234
-rw-r--r--ext/Text-Balanced/README1066
-rw-r--r--ext/Text-Balanced/lib/Text/Balanced.pm2281
-rw-r--r--ext/Text-Balanced/t/01_compile.t11
-rw-r--r--ext/Text-Balanced/t/02_extbrk.t76
-rw-r--r--ext/Text-Balanced/t/03_extcbk.t95
-rw-r--r--ext/Text-Balanced/t/04_extdel.t90
-rw-r--r--ext/Text-Balanced/t/05_extmul.t319
-rw-r--r--ext/Text-Balanced/t/06_extqlk.t135
-rw-r--r--ext/Text-Balanced/t/07_exttag.t113
-rw-r--r--ext/Text-Balanced/t/08_extvar.t153
-rw-r--r--ext/Text-Balanced/t/09_gentag.t102
12 files changed, 0 insertions, 4675 deletions
diff --git a/ext/Text-Balanced/Changes b/ext/Text-Balanced/Changes
deleted file mode 100644
index 640686e79b..0000000000
--- a/ext/Text-Balanced/Changes
+++ /dev/null
@@ -1,234 +0,0 @@
-Revision history for Perl extension Text-Balanced.
-
-2.02 Thu 20 Jul 2009
- - Fixed the mixed "Damian Conway <adamk@cpan.org>" AUTHOR setting.
- For the record, I am NOT the author, I'm just the maintainer.
- Unfortunately, Makefile.PL does not have a MAINTAINER setting and
- this way all the emails about this module come to me.
-
-2.01 Tue 28 Jul 2009
- - Taken over by Adam Kennedy to move it to a long-term maintenance mode
- - Removing use warnings to restore 5.005 compatibility
- - Removing Module::Build for higher back-compatibility
- - Removing version.pm for higher back-compatibility
- - use Exporter -> use Exporter () to avoid some pathalogical cases
- - Upgraded the bundled author tests to be friendlier
- - Changes order now a more normal reverse chronological
-
-2.0.0 Wed Dec 20 10:50:24 2006
- - Added patches from bleadperl version (thanks Rafael!)
- - Fixed bug in second bracketed delimiters (thanks David)
-
-1.99.1 Thu Nov 16 09:29:14 2006
- - Included dependency on version.pm (thanks Andy)
-
-1.99.0 Thu Nov 16 07:32:06 2006
- - Removed reliance on expensive $& variable (thanks John)
- - Made Makefile.PL play nice with core versions (thanks Schwern!)
-
-1.98 Fri May 5 14:58:49 2006
- - Reinstated full test suite (thanks Steve!)
-
-1.97 Mon May 1 21:58:04 2006
- - Removed three-part version number and dependency on version.pm
-
-1.96.0 Mon May 1 21:52:37 2006
- - Fixed major bug in extract_multiple handling of unknowns
- - Fixed return value on failure (thanks Eric)
- - Fixed bug differentiating heredocs and left-shift operators
- (thanks Anthony)
-
-1.95 Mon Apr 28 00:22:04 2003
- - Constrainted _match_quote to only match at word boundaries
- (so "exemplum(hic)" doesn't match "m(hic)")
- (thanks Craig)
-
-1.94 Sun Apr 13 02:18:41 2003
- - rereleased in attempt to fix CPAN problems
-
-1.91 Fri Mar 28 23:19:17 2003
- - Fixed error count on t/extract_variable.t
- - Fixed bug in extract_codelike when non-standard delimiters used
-
-1.90 Tue Mar 25 11:14:38 2003
- - Fixed subtle bug in gen_extract_tagged (thanks Martin)
- - Doc fix: removed suggestion that extract_tagged defaults
- to matching HTML tags
- - Doc fix: clarified general matching behaviour
- - Fixed bug in parsing /.../ after a (
- - Doc fix: documented extract_variable
- - Fixed extract_variable handling of $h{qr}, $h{tr}, etc.
- (thanks, Briac)
- - Fixed incorrect handling of $::var (thanks Tim)
-
-1.89 Sun Nov 18 22:49:50 2001
- - Fixed extvar.t tests
-
-1.87 Thu Nov 15 21:25:35 2001
- - Made extract_multiple aware of skipped prefixes returned
- by subroutine extractors (such as extract_quotelike, etc.)
- - Made extract_variable aware of punctuation variables
- - Corified tests
-
-1.86 Mon Sep 3 06:57:08 2001
- - Revised licence for inclusion in core distribution
- - Consolidated POD in .pm file
- - renamed tests to let DOS cope with them
-
-1.85 Sun Jun 3 07:47:18 2001
- - Fixed bug in extract_variable recognizing method calls that
- start with an underscore (thanks Jeff)
-
-1.82 Sun Jan 14 16:56:04 2001
- - Fixed nit in extract_variable.t
- (tested more cases than it promised to)
- - Fixed bug extracting prefix in extract_quotelike (Thanks Michael)
- - Added handling of Perl 4 package qualifier: $Package'var, etc.
- - Added handling of here docs (see documentation for limitations)
- - Added reporting of failure position via $@->{pos} (see documentation)
-
-1.84 Thu Apr 26 11:58:13 2001
- - Fixed bug in certain extractions not matching strings
- with embedded newlines (thanks Robin)
-
-1.83 Mon Jan 15 12:43:12 2001
- - Fixed numerous bugs in here doc extraction (many thanks Tim)
-
-1.81 Wed Sep 13 11:58:49 2000
- - Fixed test count in extract_codeblock.t
- - Fixed improbable bug with trailing ->'s in extract_variable
- - Fixed (HT|X)ML tag extraction in extract_tagged (thanks, Tim)
- - Added explanatory note about prefix matching (thanks again, Tim)
- - Added handling of globs and sub refs to extract_variable
- - Pod tweak (thanks Abigail)
- - Allowed right tags to be run-time evaluated, so
- extract_tagged($text, '/([a-z]+)', '/end$1') works
- as expected.
- - Added optional blessing of matches via extract_multiple
- - Fixed bug in autogeneration of closing tags in extract_tagged
- (Thanks, Coke)
- - Fixed bug in interaction between extract_multiple and
- gen_extract_tagged (Thanks Anthony)
-
-1.77 Mon Nov 22 06:08:23 1999
- - Fixed major bug in extract_codeblock (would not
- terminate if there was trailing whitespace)
- - Improved /.../ pattern parsing within codeblocks
-
-1.76 Fri Nov 19 06:51:54 1999
- - IMPORTANT: Now requires 5.005 or better.
- - IMPORTANT: Made extract methods sensitive to the pos()
- value of the text they are parsing. In other words,
- all extract subroutines now act like patterns of the form
- /\G.../gc. See documentation for details.
- - IMPORTANT: Changed semantics of extract_multiple, in line
- with the above change, and to simplify the semantics to
- something vaguely predictable. See documentation for details.
- - Added ability to use qr/../'s and raw strings as extractors
- in extract_multiple. See documentation.
- - Added fourth argument to extract_codeblock to allow
- outermost brackets to be separately specified. See
- documentation for details.
- - Reimplemented internals of all extraction subroutines
- for significant speed-ups (between 100% and 2000%
- improvement).
- - Fixed nasty bug in extract_variable and extract_codeblock
- (they were returning prefix as well in scalar context)
- - Allowed read-only strings to be used as arguments in
- scalar contexts.
- - Renamed delimited_pat to gen-delimited pat (in line with
- gen_extract_tagged). Old name still works, but is now deprecated.
- - Tweaked all extraction subs so they correctly handle
- zero-length prefix matches after another zero-length match.
-
-1.66 Fri Jul 2 13:29:22 1999
- - Added ability to use quotelike operators in extract_bracketed
- - Fixed bug under 5.003 ('foreach my $func' not understood)
- - Added escape specification as fourth arg to &extract_delimited
- - Fixed handling of &delimited_pat and &extract_delimited
- when delimiter is same as escape
- - Fixed handling of ->, =>, and >> in &extract_code
- when delimiters are "<>"
-
-1.52 Thu Mar 4 12:43:38 1999
- - Added CSV parsing example to documentation of extract_multiple.
- - Fixed a bug with extract_codeblock in "RecDescent" mode
- (it would accept "subrule(s?)" and "subrule(?)", but
- not "subrule(s)"). Thanks, Jan.
-
-1.51 Sat Feb 13 10:31:55 1999
- - Fixed bugs in prefix matching in extract_variable:
- * incorrectly used default if '' specified
- * now handles $#array correctly
- - Fixed bugs in extract_codeblock:
- * Now handles !~ properly
- * Now handles embedded comments better.
- * Now handles "raw" pattern matches better.
- - Added support for single strings or qr's as
- 'reject' and 'ignore' args to extract_tagged()
- - Added gen_extract_tagged() to "precompile"
- a specific tag extractor for repeated use
- (approximately 3 times faster!)
-
-1.50 Thu Aug 27 09:20:19 1998
- - Improved the structure of the regex generated by
- delimited_pat (and used in extract_delimited). It's
- considerably more complex, but also more robust and
- much faster in the worst case.
- - Altered extract_variable to accept whitespace in variables,
- e.g. '$ a -> {'b'} -> [2]'
-
-1.41 Mon Aug 10 14:51:50 1998
- - Reinstated change to extract_codeblock from 1.36 which were
- mysteriously lost in 1.40
-
-1.40 Tue Aug 4 13:54:52 1998
- - Added (optional) handling of embedded quoted text to
- extract_delimited (see revised entry in Balanced.pod)
- - Added extract_tagged which extracts text between arbitrary,
- optionally nested start and end tags (see new entry in
- Balanced.pod).
- - Added delimited_pat which builds a pattern which matches a
- string delimited by any of the delimiters specified (see new
- entry in Balanced.pod).
- - Added test.pl
-
-1.36 Tue Jul 14 12:26:04 1998
- - Reinstated POD file missing from previous distribution
- - Added undocumented fourth parameter to extract_codeblock
- so as to correctly handle (?) and (s?) modifiers in
- RecDescent grammars.
-
-1.35 Wed Jun 24 09:53:31 1998
- - fixed handling of :: quantifiers in extract_variable()
- - numerous trivial lexical changes to make xemacs happy
-
-1.24
- - changed behaviour in scalar contexts. Scalar contexts
- now return the extracted string _and_ remove it from the
- first argument (or $_).
- - changed return values on failure (all contexts return undef
- for invalid return fields)
- - fixed some lurking bugs with trailing modifier handling
- - added :ALL tag to simplify wholesale importing of functions
- - fixed serious bug with embedded division operators ("/")
- This now also allows the ?...? form of pattern matching!
-
-1.23 Fri Oct 17 10:26:38 EST 1997
- - changed behaviour in scalar and void contexts. Scalar contexts
- now return only the extracted string. Void contexts now remove
- the extracted string from the first argument (or $_).
-
-1.21 Sat Oct 4 17:21:54 EST 1997
- - synchronised with Parse::RecDescent distribution (version number
- will now reflect that package)
-
-1.10 Tue Sep 30 17:23:23 EST 1997
- - reworked extract_quotelike to correct handling of some obscure cases
-
-1.01 Mon Sep 8 18:09:18 EST 1997
- - changed "quotemeta" to "quotemeta $_" to work
- around bug in Perl 5.002 and 5.003
-
-1.00 Mon Aug 11 12:42:56 1997
- - original version
diff --git a/ext/Text-Balanced/README b/ext/Text-Balanced/README
deleted file mode 100644
index f5f48edced..0000000000
--- a/ext/Text-Balanced/README
+++ /dev/null
@@ -1,1066 +0,0 @@
-NAME
- Text::Balanced - Extract delimited text sequences from strings.
-
-SYNOPSIS
- use Text::Balanced qw (
- extract_delimited
- extract_bracketed
- extract_quotelike
- extract_codeblock
- extract_variable
- extract_tagged
- extract_multiple
- gen_delimited_pat
- gen_extract_tagged
- );
-
- # Extract the initial substring of $text that is delimited by
- # two (unescaped) instances of the first character in $delim.
-
- ($extracted, $remainder) = extract_delimited($text,$delim);
-
-
- # Extract the initial substring of $text that is bracketed
- # with a delimiter(s) specified by $delim (where the string
- # in $delim contains one or more of '(){}[]<>').
-
- ($extracted, $remainder) = extract_bracketed($text,$delim);
-
-
- # Extract the initial substring of $text that is bounded by
- # an XML tag.
-
- ($extracted, $remainder) = extract_tagged($text);
-
-
- # Extract the initial substring of $text that is bounded by
- # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
-
- ($extracted, $remainder) =
- extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
-
-
- # Extract the initial substring of $text that represents a
- # Perl "quote or quote-like operation"
-
- ($extracted, $remainder) = extract_quotelike($text);
-
-
- # Extract the initial substring of $text that represents a block
- # of Perl code, bracketed by any of character(s) specified by $delim
- # (where the string $delim contains one or more of '(){}[]<>').
-
- ($extracted, $remainder) = extract_codeblock($text,$delim);
-
-
- # Extract the initial substrings of $text that would be extracted by
- # one or more sequential applications of the specified functions
- # or regular expressions
-
- @extracted = extract_multiple($text,
- [ \&extract_bracketed,
- \&extract_quotelike,
- \&some_other_extractor_sub,
- qr/[xyz]*/,
- 'literal',
- ]);
-
- # Create a string representing an optimized pattern (a la Friedl) # that
- matches a substring delimited by any of the specified characters # (in
- this case: any type of quote or a slash)
-
- $patstring = gen_delimited_pat(q{'"`/});
-
- # Generate a reference to an anonymous sub that is just like
- extract_tagged # but pre-compiled and optimized for a specific pair of
- tags, and consequently # much faster (i.e. 3 times faster). It uses qr//
- for better performance on # repeated calls, so it only works under Perl
- 5.005 or later.
-
- $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
-
- ($extracted, $remainder) = $extract_head->($text);
-
-DESCRIPTION
- The various "extract_..." subroutines may be used to extract a delimited
- substring, possibly after skipping a specified prefix string. By
- default, that prefix is optional whitespace ("/\s*/"), but you can
- change it to whatever you wish (see below).
-
- The substring to be extracted must appear at the current "pos" location
- of the string's variable (or at index zero, if no "pos" position is
- defined). In other words, the "extract_..." subroutines *don't* extract
- the first occurrence of a substring anywhere in a string (like an
- unanchored regex would). Rather, they extract an occurrence of the
- substring appearing immediately at the current matching position in the
- string (like a "\G"-anchored regex would).
-
- General behaviour in list contexts
- In a list context, all the subroutines return a list, the first three
- elements of which are always:
-
- [0] The extracted string, including the specified delimiters. If the
- extraction fails "undef" is returned.
-
- [1] The remainder of the input string (i.e. the characters after the
- extracted string). On failure, the entire string is returned.
-
- [2] The skipped prefix (i.e. the characters before the extracted
- string). On failure, "undef" is returned.
-
- Note that in a list context, the contents of the original input text
- (the first argument) are not modified in any way.
-
- However, if the input text was passed in a variable, that variable's
- "pos" value is updated to point at the first character after the
- extracted text. That means that in a list context the various
- subroutines can be used much like regular expressions. For example:
-
- while ( $next = (extract_quotelike($text))[0] )
- {
- # process next quote-like (in $next)
- }
-
- General behaviour in scalar and void contexts
- In a scalar context, the extracted string is returned, having first been
- removed from the input text. Thus, the following code also processes
- each quote-like operation, but actually removes them from $text:
-
- while ( $next = extract_quotelike($text) )
- {
- # process next quote-like (in $next)
- }
-
- Note that if the input text is a read-only string (i.e. a literal), no
- attempt is made to remove the extracted text.
-
- In a void context the behaviour of the extraction subroutines is exactly
- the same as in a scalar context, except (of course) that the extracted
- substring is not returned.
-
- A note about prefixes
- Prefix patterns are matched without any trailing modifiers ("/gimsox"
- etc.) This can bite you if you're expecting a prefix specification like
- '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
- pattern will only succeed if the <H1> tag is on the current line, since
- . normally doesn't match newlines.
-
- To overcome this limitation, you need to turn on /s matching within the
- prefix pattern, using the "(?s)" directive: '(?s).*?(?=<H1>)'
-
- "extract_delimited"
- The "extract_delimited" function formalizes the common idiom of
- extracting a single-character-delimited substring from the start of a
- string. For example, to extract a single-quote delimited string, the
- following code is typically used:
-
- ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
- $extracted = $1;
-
- but with "extract_delimited" it can be simplified to:
-
- ($extracted,$remainder) = extract_delimited($text, "'");
-
- "extract_delimited" takes up to four scalars (the input text, the
- delimiters, a prefix pattern to be skipped, and any escape characters)
- and extracts the initial substring of the text that is appropriately
- delimited. If the delimiter string has multiple characters, the first
- one encountered in the text is taken to delimit the substring. The third
- argument specifies a prefix pattern that is to be skipped (but must be
- present!) before the substring is extracted. The final argument
- specifies the escape character to be used for each delimiter.
-
- All arguments are optional. If the escape characters are not specified,
- every delimiter is escaped with a backslash ("\"). If the prefix is not
- specified, the pattern '\s*' - optional whitespace - is used. If the
- delimiter set is also not specified, the set "/["'`]/" is used. If the
- text to be processed is not specified either, $_ is used.
-
- In list context, "extract_delimited" returns a array of three elements,
- the extracted substring (*including the surrounding delimiters*), the
- remainder of the text, and the skipped prefix (if any). If a suitable
- delimited substring is not found, the first element of the array is the
- empty string, the second is the complete original text, and the prefix
- returned in the third element is an empty string.
-
- In a scalar context, just the extracted substring is returned. In a void
- context, the extracted substring (and any prefix) are simply removed
- from the beginning of the first argument.
-
- Examples:
-
- # Remove a single-quoted substring from the very beginning of $text:
-
- $substring = extract_delimited($text, "'", '');
-
- # Remove a single-quoted Pascalish substring (i.e. one in which
- # doubling the quote character escapes it) from the very
- # beginning of $text:
-
- $substring = extract_delimited($text, "'", '', "'");
-
- # Extract a single- or double- quoted substring from the
- # beginning of $text, optionally after some whitespace
- # (note the list context to protect $text from modification):
-
- ($substring) = extract_delimited $text, q{"'};
-
- # Delete the substring delimited by the first '/' in $text:
-
- $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
-
- Note that this last example is *not* the same as deleting the first
- quote-like pattern. For instance, if $text contained the string:
-
- "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
-
- then after the deletion it would contain:
-
- "if ('.$UNIXCMD/s) { $cmd = $1; }"
-
- not:
-
- "if ('./cmd' =~ ms) { $cmd = $1; }"
-
- See "extract_quotelike" for a (partial) solution to this problem.
-
- "extract_bracketed"
- Like "extract_delimited", the "extract_bracketed" function takes up to
- three optional scalar arguments: a string to extract from, a delimiter
- specifier, and a prefix pattern. As before, a missing prefix defaults to
- optional whitespace and a missing text defaults to $_. However, a
- missing delimiter specifier defaults to '{}()[]<>' (see below).
-
- "extract_bracketed" extracts a balanced-bracket-delimited substring
- (using any one (or more) of the user-specified delimiter brackets:
- '(..)', '{..}', '[..]', or '<..>'). Optionally it will also respect
- quoted unbalanced brackets (see below).
-
- A "delimiter bracket" is a bracket in list of delimiters passed as
- "extract_bracketed"'s second argument. Delimiter brackets are specified
- by giving either the left or right (or both!) versions of the required
- bracket(s). Note that the order in which two or more delimiter brackets
- are specified is not significant.
-
- A "balanced-bracket-delimited substring" is a substring bounded by
- matched brackets, such that any other (left or right) delimiter bracket
- *within* the substring is also matched by an opposite (right or left)
- delimiter bracket *at the same level of nesting*. Any type of bracket
- not in the delimiter list is treated as an ordinary character.
-
- In other words, each type of bracket specified as a delimiter must be
- balanced and correctly nested within the substring, and any other kind
- of ("non-delimiter") bracket in the substring is ignored.
-
- For example, given the string:
-
- $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
-
- then a call to "extract_bracketed" in a list context:
-
- @result = extract_bracketed( $text, '{}' );
-
- would return:
-
- ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
-
- since both sets of '{..}' brackets are properly nested and evenly
- balanced. (In a scalar context just the first element of the array would
- be returned. In a void context, $text would be replaced by an empty
- string.)
-
- Likewise the call in:
-
- @result = extract_bracketed( $text, '{[' );
-
- would return the same result, since all sets of both types of specified
- delimiter brackets are correctly nested and balanced.
-
- However, the call in:
-
- @result = extract_bracketed( $text, '{([<' );
-
- would fail, returning:
-
- ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
-
- because the embedded pairs of '(..)'s and '[..]'s are "cross-nested" and
- the embedded '>' is unbalanced. (In a scalar context, this call would
- return an empty string. In a void context, $text would be unchanged.)
-
- Note that the embedded single-quotes in the string don't help in this
- case, since they have not been specified as acceptable delimiters and
- are therefore treated as non-delimiter characters (and ignored).
-
- However, if a particular species of quote character is included in the
- delimiter specification, then that type of quote will be correctly
- handled. for example, if $text is:
-
- $text = '<A HREF=">>>>">link</A>';
-
- then
-
- @result = extract_bracketed( $text, '<">' );
-
- returns:
-
- ( '<A HREF=">>>>">', 'link</A>', "" )
-
- as expected. Without the specification of """ as an embedded quoter:
-
- @result = extract_bracketed( $text, '<>' );
-
- the result would be:
-
- ( '<A HREF=">', '>>>">link</A>', "" )
-
- In addition to the quote delimiters "'", """, and "`", full Perl
- quote-like quoting (i.e. q{string}, qq{string}, etc) can be specified by
- including the letter 'q' as a delimiter. Hence:
-
- @result = extract_bracketed( $text, '<q>' );
-
- would correctly match something like this:
-
- $text = '<leftop: conj /and/ conj>';
-
- See also: "extract_quotelike" and "extract_codeblock".
-
- "extract_variable"
- "extract_variable" extracts any valid Perl variable or variable-involved
- expression, including scalars, arrays, hashes, array accesses, hash
- look-ups, method calls through objects, subroutine calls through
- subroutine references, etc.
-
- The subroutine takes up to two optional arguments:
-
- 1. A string to be processed ($_ if the string is omitted or "undef")
-
- 2. A string specifying a pattern to be matched as a prefix (which is to
- be skipped). If omitted, optional whitespace is skipped.
-
- On success in a list context, an array of 3 elements is returned. The
- elements are:
-
- [0] the extracted variable, or variablish expression
-
- [1] the remainder of the input text,
-
- [2] the prefix substring (if any),
-
- On failure, all of these values (except the remaining text) are "undef".
-
- In a scalar context, "extract_variable" returns just the complete
- substring that matched a variablish expression. "undef" is returned on
- failure. In addition, the original input text has the returned substring
- (and any prefix) removed from it.
-
- In a void context, the input text just has the matched substring (and
- any specified prefix) removed.
-
- "extract_tagged"
- "extract_tagged" extracts and segments text between (balanced) specified
- tags.
-
- The subroutine takes up to five optional arguments:
-
- 1. A string to be processed ($_ if the string is omitted or "undef")
-
- 2. A string specifying a pattern to be matched as the opening tag. If
- the pattern string is omitted (or "undef") then a pattern that
- matches any standard XML tag is used.
-
- 3. A string specifying a pattern to be matched at the closing tag. If
- the pattern string is omitted (or "undef") then the closing tag is
- constructed by inserting a "/" after any leading bracket characters
- in the actual opening tag that was matched (*not* the pattern that
- matched the tag). For example, if the opening tag pattern is
- specified as '{{\w+}}' and actually matched the opening tag
- "{{DATA}}", then the constructed closing tag would be "{{/DATA}}".
-
- 4. A string specifying a pattern to be matched as a prefix (which is to
- be skipped). If omitted, optional whitespace is skipped.
-
- 5. A hash reference containing various parsing options (see below)
-
- The various options that can be specified are:
-
- "reject => $listref"
- The list reference contains one or more strings specifying patterns
- that must *not* appear within the tagged text.
-
- For example, to extract an HTML link (which should not contain
- nested links) use:
-
- extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
-
- "ignore => $listref"
- The list reference contains one or more strings specifying patterns
- that are *not* be be treated as nested tags within the tagged text
- (even if they would match the start tag pattern).
-
- For example, to extract an arbitrary XML tag, but ignore "empty"
- elements:
-
- extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
-
- (also see "gen_delimited_pat" below).
-
- "fail => $str"
- The "fail" option indicates the action to be taken if a matching end
- tag is not encountered (i.e. before the end of the string or some
- "reject" pattern matches). By default, a failure to match a closing
- tag causes "extract_tagged" to immediately fail.
-
- However, if the string value associated with <reject> is "MAX", then
- "extract_tagged" returns the complete text up to the point of
- failure. If the string is "PARA", "extract_tagged" returns only the
- first paragraph after the tag (up to the first line that is either
- empty or contains only whitespace characters). If the string is "",
- the the default behaviour (i.e. failure) is reinstated.
-
- For example, suppose the start tag "/para" introduces a paragraph,
- which then continues until the next "/endpara" tag or until another
- "/para" tag is encountered:
-
- $text = "/para line 1\n\nline 3\n/para line 4";
-
- extract_tagged($text, '/para', '/endpara', undef,
- {reject => '/para', fail => MAX );
-
- # EXTRACTED: "/para line 1\n\nline 3\n"
-
- Suppose instead, that if no matching "/endpara" tag is found, the
- "/para" tag refers only to the immediately following paragraph:
-
- $text = "/para line 1\n\nline 3\n/para line 4";
-
- extract_tagged($text, '/para', '/endpara', undef,
- {reject => '/para', fail => MAX );
-
- # EXTRACTED: "/para line 1\n"
-
- Note that the specified "fail" behaviour applies to nested tags as
- well.
-
- On success in a list context, an array of 6 elements is returned. The
- elements are:
-
- [0] the extracted tagged substring (including the outermost tags),
-
- [1] the remainder of the input text,
-
- [2] the prefix substring (if any),
-
- [3] the opening tag
-
- [4] the text between the opening and closing tags
-
- [5] the closing tag (or "" if no closing tag was found)
-
- On failure, all of these values (except the remaining text) are "undef".
-
- In a scalar context, "extract_tagged" returns just the complete
- substring that matched a tagged text (including the start and end tags).
- "undef" is returned on failure. In addition, the original input text has
- the returned substring (and any prefix) removed from it.
-
- In a void context, the input text just has the matched substring (and
- any specified prefix) removed.
-
- "gen_extract_tagged"
- (Note: This subroutine is only available under Perl5.005)
-
- "gen_extract_tagged" generates a new anonymous subroutine which extracts
- text between (balanced) specified tags. In other words, it generates a
- function identical in function to "extract_tagged".
-
- The difference between "extract_tagged" and the anonymous subroutines
- generated by "gen_extract_tagged", is that those generated subroutines:
-
- * do not have to reparse tag specification or parsing options every
- time they are called (whereas "extract_tagged" has to effectively
- rebuild its tag parser on every call);
-
- * make use of the new qr// construct to pre-compile the regexes they
- use (whereas "extract_tagged" uses standard string variable
- interpolation to create tag-matching patterns).
-
- The subroutine takes up to four optional arguments (the same set as
- "extract_tagged" except for the string to be processed). It returns a
- reference to a subroutine which in turn takes a single argument (the
- text to be extracted from).
-
- In other words, the implementation of "extract_tagged" is exactly
- equivalent to:
-
- sub extract_tagged
- {
- my $text = shift;
- $extractor = gen_extract_tagged(@_);
- return $extractor->($text);
- }
-
- (although "extract_tagged" is not currently implemented that way, in
- order to preserve pre-5.005 compatibility).
-
- Using "gen_extract_tagged" to create extraction functions for specific
- tags is a good idea if those functions are going to be called more than
- once, since their performance is typically twice as good as the more
- general-purpose "extract_tagged".
-
- "extract_quotelike"
- "extract_quotelike" attempts to recognize, extract, and segment any one
- of the various Perl quotes and quotelike operators (see perlop(3))
- Nested backslashed delimiters, embedded balanced bracket delimiters (for
- the quotelike operators), and trailing modifiers are all caught. For
- example, in:
-
- extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
-
- extract_quotelike ' "You said, \"Use sed\"." '
-
- extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
-
- extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
-
- the full Perl quotelike operations are all extracted correctly.
-
- Note too that, when using the /x modifier on a regex, any comment
- containing the current pattern delimiter will cause the regex to be
- immediately terminated. In other words:
-
- 'm /
- (?i) # CASE INSENSITIVE
- [a-z_] # LEADING ALPHABETIC/UNDERSCORE
- [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
- /x'
-
- will be extracted as if it were:
-
- 'm /
- (?i) # CASE INSENSITIVE
- [a-z_] # LEADING ALPHABETIC/'
-
- This behaviour is identical to that of the actual compiler.
-
- "extract_quotelike" takes two arguments: the text to be processed and a
- prefix to be matched at the very beginning of the text. If no prefix is
- specified, optional whitespace is the default. If no text is given, $_
- is used.
-
- In a list context, an array of 11 elements is returned. The elements
- are:
-
- [0] the extracted quotelike substring (including trailing modifiers),
-
- [1] the remainder of the input text,
-
- [2] the prefix substring (if any),
-
- [3] the name of the quotelike operator (if any),
-
- [4] the left delimiter of the first block of the operation,
-
- [5] the text of the first block of the operation (that is, the contents
- of a quote, the regex of a match or substitution or the target list
- of a translation),
-
- [6] the right delimiter of the first block of the operation,
-
- [7] the left delimiter of the second block of the operation (that is, if
- it is a "s", "tr", or "y"),
-
- [8] the text of the second block of the operation (that is, the
- replacement of a substitution or the translation list of a
- translation),
-
- [9] the right delimiter of the second block of the operation (if any),
-
- [10]
- the trailing modifiers on the operation (if any).
-
- For each of the fields marked "(if any)" the default value on success is
- an empty string. On failure, all of these values (except the remaining
- text) are "undef".
-
- In a scalar context, "extract_quotelike" returns just the complete
- substring that matched a quotelike operation (or "undef" on failure). In
- a scalar or void context, the input text has the same substring (and any
- specified prefix) removed.
-
- Examples:
-
- # Remove the first quotelike literal that appears in text
-
- $quotelike = extract_quotelike($text,'.*?');
-
- # Replace one or more leading whitespace-separated quotelike
- # literals in $_ with "<QLL>"
-
- do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
-
-
- # Isolate the search pattern in a quotelike operation from $text
-
- ($op,$pat) = (extract_quotelike $text)[3,5];
- if ($op =~ /[ms]/)
- {
- print "search pattern: $pat\n";
- }
- else
- {
- print "$op is not a pattern matching operation\n";
- }
-
- "extract_quotelike" and "here documents"
- "extract_quotelike" can successfully extract "here documents" from an
- input string, but with an important caveat in list contexts.
-
- Unlike other types of quote-like literals, a here document is rarely a
- contiguous substring. For example, a typical piece of code using here
- document might look like this:
-
- <<'EOMSG' || die;
- This is the message.
- EOMSG
- exit;
-
- Given this as an input string in a scalar context, "extract_quotelike"
- would correctly return the string "<<'EOMSG'\nThis is the
- message.\nEOMSG", leaving the string " || die;\nexit;" in the original
- variable. In other words, the two separate pieces of the here document
- are successfully extracted and concatenated.
-
- In a list context, "extract_quotelike" would return the list
-
- [0] "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted
- here document, including fore and aft delimiters),
-
- [1] " || die;\nexit;" (i.e. the remainder of the input text,
- concatenated),
-
- [2] "" (i.e. the prefix substring -- trivial in this case),
-
- [3] "<<" (i.e. the "name" of the quotelike operator)
-
- [4] "'EOMSG'" (i.e. the left delimiter of the here document, including
- any quotes),
-
- [5] "This is the message.\n" (i.e. the text of the here document),
-
- [6] "EOMSG" (i.e. the right delimiter of the here document),
-
- [7..10]
- "" (a here document has no second left delimiter, second text,
- second right delimiter, or trailing modifiers).
-
- However, the matching position of the input variable would be set to
- "exit;" (i.e. *after* the closing delimiter of the here document), which
- would cause the earlier " || die;\nexit;" to be skipped in any sequence
- of code fragment extractions.
-
- To avoid this problem, when it encounters a here document whilst
- extracting from a modifiable string, "extract_quotelike" silently
- rearranges the string to an equivalent piece of Perl:
-
- <<'EOMSG'
- This is the message.
- EOMSG
- || die;
- exit;
-
- in which the here document *is* contiguous. It still leaves the matching
- position after the here document, but now the rest of the line on which
- the here document starts is not skipped.
-
- To prevent <extract_quotelike> from mucking about with the input in this
- way (this is the only case where a list-context "extract_quotelike" does
- so), you can pass the input variable as an interpolated literal:
-
- $quotelike = extract_quotelike("$var");
-
- "extract_codeblock"
- "extract_codeblock" attempts to recognize and extract a balanced bracket
- delimited substring that may contain unbalanced brackets inside Perl
- quotes or quotelike operations. That is, "extract_codeblock" is like a
- combination of "extract_bracketed" and "extract_quotelike".
-
- "extract_codeblock" takes the same initial three parameters as
- "extract_bracketed": a text to process, a set of delimiter brackets to
- look for, and a prefix to match first. It also takes an optional fourth
- parameter, which allows the outermost delimiter brackets to be specified
- separately (see below).
-
- Omitting the first argument (input text) means process $_ instead.
- Omitting the second argument (delimiter brackets) indicates that only
- '{' is to be used. Omitting the third argument (prefix argument) implies
- optional whitespace at the start. Omitting the fourth argument
- (outermost delimiter brackets) indicates that the value of the second
- argument is to be used for the outermost delimiters.
-
- Once the prefix an dthe outermost opening delimiter bracket have been
- recognized, code blocks are extracted by stepping through the input text
- and trying the following alternatives in sequence:
-
- 1. Try and match a closing delimiter bracket. If the bracket was the
- same species as the last opening bracket, return the substring to
- that point. If the bracket was mismatched, return an error.
-
- 2. Try to match a quote or quotelike operator. If found, call
- "extract_quotelike" to eat it. If "extract_quotelike" fails, return
- the error it returned. Otherwise go back to step 1.
-
- 3. Try to match an opening delimiter bracket. If found, call
- "extract_codeblock" recursively to eat the embedded block. If the
- recursive call fails, return an error. Otherwise, go back to step 1.
-
- 4. Unconditionally match a bareword or any other single character, and
- then go back to step 1.
-
- Examples:
-
- # Find a while loop in the text
-
- if ($text =~ s/.*?while\s*\{/{/)
- {
- $loop = "while " . extract_codeblock($text);
- }
-
- # Remove the first round-bracketed list (which may include
- # round- or curly-bracketed code blocks or quotelike operators)
-
- extract_codeblock $text, "(){}", '[^(]*';
-
- The ability to specify a different outermost delimiter bracket is useful
- in some circumstances. For example, in the Parse::RecDescent module,
- parser actions which are to be performed only on a successful parse are
- specified using a "<defer:...>" directive. For example:
-
- sentence: subject verb object
- <defer: {$::theVerb = $item{verb}} >
-
- Parse::RecDescent uses "extract_codeblock($text, '{}<>')" to extract the
- code within the "<defer:...>" directive, but there's a problem.
-
- A deferred action like this:
-
- <defer: {if ($count>10) {$count--}} >
-
- will be incorrectly parsed as:
-
- <defer: {if ($count>
-
- because the "less than" operator is interpreted as a closing delimiter.
-
- But, by extracting the directive using
- "extract_codeblock($text, '{}', undef, '<>')" the '>' character is only
- treated as a delimited at the outermost level of the code block, so the
- directive is parsed correctly.
-
- "extract_multiple"
- The "extract_multiple" subroutine takes a string to be processed and a
- list of extractors (subroutines or regular expressions) to apply to that
- string.
-
- In an array context "extract_multiple" returns an array of substrings of
- the original string, as extracted by the specified extractors. In a
- scalar context, "extract_multiple" returns the first substring
- successfully extracted from the original string. In both scalar and void
- contexts the original string has the first successfully extracted
- substring removed from it. In all contexts "extract_multiple" starts at
- the current "pos" of the string, and sets that "pos" appropriately after
- it matches.
-
- Hence, the aim of of a call to "extract_multiple" in a list context is
- to split the processed string into as many non-overlapping fields as
- possible, by repeatedly applying each of the specified extractors to the
- remainder of the string. Thus "extract_multiple" is a generalized form
- of Perl's "split" subroutine.
-
- The subroutine takes up to four optional arguments:
-
- 1. A string to be processed ($_ if the string is omitted or "undef")
-
- 2. A reference to a list of subroutine references and/or qr// objects
- and/or literal strings and/or hash references, specifying the
- extractors to be used to split the string. If this argument is
- omitted (or "undef") the list:
-
- [
- sub { extract_variable($_[0], '') },
- sub { extract_quotelike($_[0],'') },
- sub { extract_codeblock($_[0],'{}','') },
- ]
-
- is used.
-
- 3. An number specifying the maximum number of fields to return. If this
- argument is omitted (or "undef"), split continues as long as
- possible.
-
- If the third argument is *N*, then extraction continues until *N*
- fields have been successfully extracted, or until the string has
- been completely processed.
-
- Note that in scalar and void contexts the value of this argument is
- automatically reset to 1 (under "-w", a warning is issued if the
- argument has to be reset).
-
- 4. A value indicating whether unmatched substrings (see below) within
- the text should be skipped or returned as fields. If the value is
- true, such substrings are skipped. Otherwise, they are returned.
-
- The extraction process works by applying each extractor in sequence to
- the text string.
-
- If the extractor is a subroutine it is called in a list context and is
- expected to return a list of a single element, namely the extracted
- text. It may optionally also return two further arguments: a string
- representing the text left after extraction (like $' for a pattern
- match), and a string representing any prefix skipped before the
- extraction (like $` in a pattern match). Note that this is designed to
- facilitate the use of other Text::Balanced subroutines with
- "extract_multiple". Note too that the value returned by an extractor
- subroutine need not bear any relationship to the corresponding substring
- of the original text (see examples below).
-
- If the extractor is a precompiled regular expression or a string, it is
- matched against the text in a scalar context with a leading '\G' and the
- gc modifiers enabled. The extracted value is either $1 if that variable
- is defined after the match, or else the complete match (i.e. $&).
-
- If the extractor is a hash reference, it must contain exactly one
- element. The value of that element is one of the above extractor types
- (subroutine reference, regular expression, or string). The key of that
- element is the name of a class into which the successful return value of
- the extractor will be blessed.
-
- If an extractor returns a defined value, that value is immediately
- treated as the next extracted field and pushed onto the list of fields.
- If the extractor was specified in a hash reference, the field is also
- blessed into the appropriate class,
-
- If the extractor fails to match (in the case of a regex extractor), or
- returns an empty list or an undefined value (in the case of a subroutine
- extractor), it is assumed to have failed to extract. If none of the
- extractor subroutines succeeds, then one character is extracted from the
- start of the text and the extraction subroutines reapplied. Characters
- which are thus removed are accumulated and eventually become the next
- field (unless the fourth argument is true, in which case they are
- discarded).
-
- For example, the following extracts substrings that are valid Perl
- variables:
-
- @fields = extract_multiple($text,
- [ sub { extract_variable($_[0]) } ],
- undef, 1);
-
- This example separates a text into fields which are quote delimited,
- curly bracketed, and anything else. The delimited and bracketed parts
- are also blessed to identify them (the "anything else" is unblessed):
-
- @fields = extract_multiple($text,
- [
- { Delim => sub { extract_delimited($_[0],q{'"}) } },
- { Brack => sub { extract_bracketed($_[0],'{}') } },
- ]);
-
- This call extracts the next single substring that is a valid Perl
- quotelike operator (and removes it from $text):
-
- $quotelike = extract_multiple($text,
- [
- sub { extract_quotelike($_[0]) },
- ], undef, 1);
-
- Finally, here is yet another way to do comma-separated value parsing:
-
- @fields = extract_multiple($csv_text,
- [
- sub { extract_delimited($_[0],q{'"}) },
- qr/([^,]+)(.*)/,
- ],
- undef,1);
-
- The list in the second argument means: *"Try and extract a ' or "
- delimited string, otherwise extract anything up to a comma..."*. The
- undef third argument means: *"...as many times as possible..."*, and the
- true value in the fourth argument means *"...discarding anything else
- that appears (i.e. the commas)"*.
-
- If you wanted the commas preserved as separate fields (i.e. like split
- does if your split pattern has capturing parentheses), you would just
- make the last parameter undefined (or remove it).
-
- "gen_delimited_pat"
- The "gen_delimited_pat" subroutine takes a single (string) argument and
- > builds a Friedl-style optimized regex that matches a string delimited
- by any one of the characters in the single argument. For example:
-
- gen_delimited_pat(q{'"})
-
- returns the regex:
-
- (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
-
- Note that the specified delimiters are automatically quotemeta'd.
-
- A typical use of "gen_delimited_pat" would be to build special purpose
- tags for "extract_tagged". For example, to properly ignore "empty" XML
- elements (which might contain quoted strings):
-
- my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
-
- extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
-
- "gen_delimited_pat" may also be called with an optional second argument,
- which specifies the "escape" character(s) to be used for each delimiter.
- For example to match a Pascal-style string (where ' is the delimiter and
- '' is a literal ' within the string):
-
- gen_delimited_pat(q{'},q{'});
-
- Different escape characters can be specified for different delimiters.
- For example, to specify that '/' is the escape for single quotes and '%'
- is the escape for double quotes:
-
- gen_delimited_pat(q{'"},q{/%});
-
- If more delimiters than escape chars are specified, the last escape char
- is used for the remaining delimiters. If no escape char is specified for
- a given specified delimiter, '\' is used.
-
- "delimited_pat"
- Note that "gen_delimited_pat" was previously called "delimited_pat".
- That name may still be used, but is now deprecated.
-
-DIAGNOSTICS
- In a list context, all the functions return "(undef,$original_text)" on
- failure. In a scalar context, failure is indicated by returning "undef"
- (in this case the input text is not modified in any way).
-
- In addition, on failure in *any* context, the $@ variable is set.
- Accessing "$@->{error}" returns one of the error diagnostics listed
- below. Accessing "$@->{pos}" returns the offset into the original string
- at which the error was detected (although not necessarily where it
- occurred!) Printing $@ directly produces the error message, with the
- offset appended. On success, the $@ variable is guaranteed to be
- "undef".
-
- The available diagnostics are:
-
- "Did not find a suitable bracket: "%s""
- The delimiter provided to "extract_bracketed" was not one of
- '()[]<>{}'.
-
- "Did not find prefix: /%s/"
- A non-optional prefix was specified but wasn't found at the start of
- the text.
-
- "Did not find opening bracket after prefix: "%s""
- "extract_bracketed" or "extract_codeblock" was expecting a
- particular kind of bracket at the start of the text, and didn't find
- it.
-
- "No quotelike operator found after prefix: "%s""
- "extract_quotelike" didn't find one of the quotelike operators "q",
- "qq", "qw", "qx", "s", "tr" or "y" at the start of the substring it
- was extracting.
-
- "Unmatched closing bracket: "%c""
- "extract_bracketed", "extract_quotelike" or "extract_codeblock"
- encountered a closing bracket where none was expected.
-
- "Unmatched opening bracket(s): "%s""
- "extract_bracketed", "extract_quotelike" or "extract_codeblock" ran
- out of characters in the text before closing one or more levels of
- nested brackets.
-
- "Unmatched embedded quote (%s)"
- "extract_bracketed" attempted to match an embedded quoted substring,
- but failed to find a closing quote to match it.
-
- "Did not find closing delimiter to match '%s'"
- "extract_quotelike" was unable to find a closing delimiter to match
- the one that opened the quote-like operation.
-
- "Mismatched closing bracket: expected "%c" but found "%s""
- "extract_bracketed", "extract_quotelike" or "extract_codeblock"
- found a valid bracket delimiter, but it was the wrong species. This
- usually indicates a nesting error, but may indicate incorrect
- quoting or escaping.
-
- "No block delimiter found after quotelike "%s""
- "extract_quotelike" or "extract_codeblock" found one of the
- quotelike operators "q", "qq", "qw", "qx", "s", "tr" or "y" without
- a suitable block after it.
-
- "Did not find leading dereferencer"
- "extract_variable" was expecting one of '$', '@', or '%' at the
- start of a variable, but didn't find any of them.
-
- "Bad identifier after dereferencer"
- "extract_variable" found a '$', '@', or '%' indicating a variable,
- but that character was not followed by a legal Perl identifier.
-
- "Did not find expected opening bracket at %s"
- "extract_codeblock" failed to find any of the outermost opening
- brackets that were specified.
-
- "Improperly nested codeblock at %s"
- A nested code block was found that started with a delimiter that was
- specified as being only to be used as an outermost bracket.
-
- "Missing second block for quotelike "%s""
- "extract_codeblock" or "extract_quotelike" found one of the
- quotelike operators "s", "tr" or "y" followed by only one block.
-
- "No match found for opening bracket"
- "extract_codeblock" failed to find a closing bracket to match the
- outermost opening bracket.
-
- "Did not find opening tag: /%s/"
- "extract_tagged" did not find a suitable opening tag (after any
- specified prefix was removed).
-
- "Unable to construct closing tag to match: /%s/"
- "extract_tagged" matched the specified opening tag and tried to
- modify the matched text to produce a matching closing tag (because
- none was specified). It failed to generate the closing tag, almost
- certainly because the opening tag did not start with a bracket of
- some kind.
-
- "Found invalid nested tag: %s"
- "extract_tagged" found a nested tag that appeared in the "reject"
- list (and the failure mode was not "MAX" or "PARA").
-
- "Found unbalanced nested tag: %s"
- "extract_tagged" found a nested opening tag that was not matched by
- a corresponding nested closing tag (and the failure mode was not
- "MAX" or "PARA").
-
- "Did not find closing tag"
- "extract_tagged" reached the end of the text without finding a
- closing tag to match the original opening tag (and the failure mode
- was not "MAX" or "PARA").
-
-AUTHOR
- Damian Conway (damian@conway.org)
-
-BUGS AND IRRITATIONS
- There are undoubtedly serious bugs lurking somewhere in this code, if
- only because parts of it give the impression of understanding a great
- deal more about Perl than they really do.
-
- Bug reports and other feedback are most welcome.
-
-COPYRIGHT
- Copyright 1997 - 2001 Damian Conway. All Rights Reserved.
-
- Some (minor) parts copyright 2009 Adam Kennedy.
-
- This module is free software. It may be used, redistributed and/or
- modified under the same terms as Perl itself.
-
diff --git a/ext/Text-Balanced/lib/Text/Balanced.pm b/ext/Text-Balanced/lib/Text/Balanced.pm
deleted file mode 100644
index 07d956735c..0000000000
--- a/ext/Text-Balanced/lib/Text/Balanced.pm
+++ /dev/null
@@ -1,2281 +0,0 @@
-package Text::Balanced;
-
-# EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
-# FOR FULL DOCUMENTATION SEE Balanced.pod
-
-use 5.005;
-use strict;
-use Exporter ();
-use SelfLoader;
-
-use vars qw { $VERSION @ISA %EXPORT_TAGS };
-BEGIN {
- $VERSION = '2.02';
- @ISA = 'Exporter';
- %EXPORT_TAGS = (
- ALL => [ qw{
- &extract_delimited
- &extract_bracketed
- &extract_quotelike
- &extract_codeblock
- &extract_variable
- &extract_tagged
- &extract_multiple
- &gen_delimited_pat
- &gen_extract_tagged
- &delimited_pat
- } ],
- );
-}
-
-Exporter::export_ok_tags('ALL');
-
-# PROTOTYPES
-
-sub _match_bracketed($$$$$$);
-sub _match_variable($$);
-sub _match_codeblock($$$$$$$);
-sub _match_quotelike($$$$);
-
-# HANDLE RETURN VALUES IN VARIOUS CONTEXTS
-
-sub _failmsg {
- my ($message, $pos) = @_;
- $@ = bless {
- error => $message,
- pos => $pos,
- }, 'Text::Balanced::ErrorMsg';
-}
-
-sub _fail {
- my ($wantarray, $textref, $message, $pos) = @_;
- _failmsg $message, $pos if $message;
- return (undef, $$textref, undef) if $wantarray;
- return undef;
-}
-
-sub _succeed {
- $@ = undef;
- my ($wantarray,$textref) = splice @_, 0, 2;
- my ($extrapos, $extralen) = @_ > 18
- ? splice(@_, -2, 2)
- : (0, 0);
- my ($startlen, $oppos) = @_[5,6];
- my $remainderpos = $_[2];
- if ( $wantarray ) {
- my @res;
- while (my ($from, $len) = splice @_, 0, 2) {
- push @res, substr($$textref, $from, $len);
- }
- if ( $extralen ) { # CORRECT FILLET
- my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
- $res[1] = "$extra$res[1]";
- eval { substr($$textref,$remainderpos,0) = $extra;
- substr($$textref,$extrapos,$extralen,"\n")} ;
- #REARRANGE HERE DOC AND FILLET IF POSSIBLE
- pos($$textref) = $remainderpos-$extralen+1; # RESET \G
- } else {
- pos($$textref) = $remainderpos; # RESET \G
- }
- return @res;
- } else {
- my $match = substr($$textref,$_[0],$_[1]);
- substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
- my $extra = $extralen
- ? substr($$textref, $extrapos, $extralen)."\n" : "";
- eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ; #CHOP OUT PREFIX & MATCH, IF POSSIBLE
- pos($$textref) = $_[4]; # RESET \G
- return $match;
- }
-}
-
-# BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
-
-sub gen_delimited_pat($;$) # ($delimiters;$escapes)
-{
- my ($dels, $escs) = @_;
- return "" unless $dels =~ /\S/;
- $escs = '\\' unless $escs;
- $escs .= substr($escs,-1) x (length($dels)-length($escs));
- my @pat = ();
- my $i;
- for ($i=0; $i<length $dels; $i++)
- {
- my $del = quotemeta substr($dels,$i,1);
- my $esc = quotemeta substr($escs,$i,1);
- if ($del eq $esc)
- {
- push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
- }
- else
- {
- push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
- }
- }
- my $pat = join '|', @pat;
- return "(?:$pat)";
-}
-
-*delimited_pat = \&gen_delimited_pat;
-
-# THE EXTRACTION FUNCTIONS
-
-sub extract_delimited (;$$$$)
-{
- my $textref = defined $_[0] ? \$_[0] : \$_;
- my $wantarray = wantarray;
- my $del = defined $_[1] ? $_[1] : qq{\'\"\`};
- my $pre = defined $_[2] ? $_[2] : '\s*';
- my $esc = defined $_[3] ? $_[3] : qq{\\};
- my $pat = gen_delimited_pat($del, $esc);
- my $startpos = pos $$textref || 0;
- return _fail($wantarray, $textref, "Not a delimited pattern", 0)
- unless $$textref =~ m/\G($pre)($pat)/gc;
- my $prelen = length($1);
- my $matchpos = $startpos+$prelen;
- my $endpos = pos $$textref;
- return _succeed $wantarray, $textref,
- $matchpos, $endpos-$matchpos, # MATCH
- $endpos, length($$textref)-$endpos, # REMAINDER
- $startpos, $prelen; # PREFIX
-}
-
-sub extract_bracketed (;$$$)
-{
- my $textref = defined $_[0] ? \$_[0] : \$_;
- my $ldel = defined $_[1] ? $_[1] : '{([<';
- my $pre = defined $_[2] ? $_[2] : '\s*';
- my $wantarray = wantarray;
- my $qdel = "";
- my $quotelike;
- $ldel =~ s/'//g and $qdel .= q{'};
- $ldel =~ s/"//g and $qdel .= q{"};
- $ldel =~ s/`//g and $qdel .= q{`};
- $ldel =~ s/q//g and $quotelike = 1;
- $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
- my $rdel = $ldel;
- unless ($rdel =~ tr/[({</])}>/)
- {
- return _fail $wantarray, $textref,
- "Did not find a suitable bracket in delimiter: \"$_[1]\"",
- 0;
- }
- my $posbug = pos;
- $ldel = join('|', map { quotemeta $_ } split('', $ldel));
- $rdel = join('|', map { quotemeta $_ } split('', $rdel));
- pos = $posbug;
-
- my $startpos = pos $$textref || 0;
- my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
-
- return _fail ($wantarray, $textref) unless @match;
-
- return _succeed ( $wantarray, $textref,
- $match[2], $match[5]+2, # MATCH
- @match[8,9], # REMAINDER
- @match[0,1], # PREFIX
- );
-}
-
-sub _match_bracketed($$$$$$) # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
-{
- my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
- my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
- unless ($$textref =~ m/\G$pre/gc)
- {
- _failmsg "Did not find prefix: /$pre/", $startpos;
- return;
- }
-
- $ldelpos = pos $$textref;
-
- unless ($$textref =~ m/\G($ldel)/gc)
- {
- _failmsg "Did not find opening bracket after prefix: \"$pre\"",
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
-
- my @nesting = ( $1 );
- my $textlen = length $$textref;
- while (pos $$textref < $textlen)
- {
- next if $$textref =~ m/\G\\./gcs;
-
- if ($$textref =~ m/\G($ldel)/gc)
- {
- push @nesting, $1;
- }
- elsif ($$textref =~ m/\G($rdel)/gc)
- {
- my ($found, $brackettype) = ($1, $1);
- if ($#nesting < 0)
- {
- _failmsg "Unmatched closing bracket: \"$found\"",
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- my $expected = pop(@nesting);
- $expected =~ tr/({[</)}]>/;
- if ($expected ne $brackettype)
- {
- _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- last if $#nesting < 0;
- }
- elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
- {
- $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
- _failmsg "Unmatched embedded quote ($1)",
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- elsif ($quotelike && _match_quotelike($textref,"",1,0))
- {
- next;
- }
-
- else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
- }
- if ($#nesting>=0)
- {
- _failmsg "Unmatched opening bracket(s): "
- . join("..",@nesting)."..",
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
-
- $endpos = pos $$textref;
-
- return (
- $startpos, $ldelpos-$startpos, # PREFIX
- $ldelpos, 1, # OPENING BRACKET
- $ldelpos+1, $endpos-$ldelpos-2, # CONTENTS
- $endpos-1, 1, # CLOSING BRACKET
- $endpos, length($$textref)-$endpos, # REMAINDER
- );
-}
-
-sub _revbracket($)
-{
- my $brack = reverse $_[0];
- $brack =~ tr/[({</])}>/;
- return $brack;
-}
-
-my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
-
-sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
-{
- my $textref = defined $_[0] ? \$_[0] : \$_;
- my $ldel = $_[1];
- my $rdel = $_[2];
- my $pre = defined $_[3] ? $_[3] : '\s*';
- my %options = defined $_[4] ? %{$_[4]} : ();
- my $omode = defined $options{fail} ? $options{fail} : '';
- my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
- : defined($options{reject}) ? $options{reject}
- : ''
- ;
- my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
- : defined($options{ignore}) ? $options{ignore}
- : ''
- ;
-
- if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
- $@ = undef;
-
- my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
-
- return _fail(wantarray, $textref) unless @match;
- return _succeed wantarray, $textref,
- $match[2], $match[3]+$match[5]+$match[7], # MATCH
- @match[8..9,0..1,2..7]; # REM, PRE, BITS
-}
-
-sub _match_tagged # ($$$$$$$)
-{
- my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
- my $rdelspec;
-
- my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
-
- unless ($$textref =~ m/\G($pre)/gc)
- {
- _failmsg "Did not find prefix: /$pre/", pos $$textref;
- goto failed;
- }
-
- $opentagpos = pos($$textref);
-
- unless ($$textref =~ m/\G$ldel/gc)
- {
- _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
- goto failed;
- }
-
- $textpos = pos($$textref);
-
- if (!defined $rdel)
- {
- $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
- unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
- {
- _failmsg "Unable to construct closing tag to match: $rdel",
- pos $$textref;
- goto failed;
- }
- }
- else
- {
- $rdelspec = eval "qq{$rdel}" || do {
- my $del;
- for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
- { next if $rdel =~ /\Q$_/; $del = $_; last }
- unless ($del) {
- use Carp;
- croak "Can't interpolate right delimiter $rdel"
- }
- eval "qq$del$rdel$del";
- };
- }
-
- while (pos($$textref) < length($$textref))
- {
- next if $$textref =~ m/\G\\./gc;
-
- if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
- {
- $parapos = pos($$textref) - length($1)
- unless defined $parapos;
- }
- elsif ($$textref =~ m/\G($rdelspec)/gc )
- {
- $closetagpos = pos($$textref)-length($1);
- goto matched;
- }
- elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
- {
- next;
- }
- elsif ($bad && $$textref =~ m/\G($bad)/gcs)
- {
- pos($$textref) -= length($1); # CUT OFF WHATEVER CAUSED THE SHORTNESS
- goto short if ($omode eq 'PARA' || $omode eq 'MAX');
- _failmsg "Found invalid nested tag: $1", pos $$textref;
- goto failed;
- }
- elsif ($$textref =~ m/\G($ldel)/gc)
- {
- my $tag = $1;
- pos($$textref) -= length($tag); # REWIND TO NESTED TAG
- unless (_match_tagged(@_)) # MATCH NESTED TAG
- {
- goto short if $omode eq 'PARA' || $omode eq 'MAX';
- _failmsg "Found unbalanced nested tag: $tag",
- pos $$textref;
- goto failed;
- }
- }
- else { $$textref =~ m/./gcs }
- }
-
-short:
- $closetagpos = pos($$textref);
- goto matched if $omode eq 'MAX';
- goto failed unless $omode eq 'PARA';
-
- if (defined $parapos) { pos($$textref) = $parapos }
- else { $parapos = pos($$textref) }
-
- return (
- $startpos, $opentagpos-$startpos, # PREFIX
- $opentagpos, $textpos-$opentagpos, # OPENING TAG
- $textpos, $parapos-$textpos, # TEXT
- $parapos, 0, # NO CLOSING TAG
- $parapos, length($$textref)-$parapos, # REMAINDER
- );
-
-matched:
- $endpos = pos($$textref);
- return (
- $startpos, $opentagpos-$startpos, # PREFIX
- $opentagpos, $textpos-$opentagpos, # OPENING TAG
- $textpos, $closetagpos-$textpos, # TEXT
- $closetagpos, $endpos-$closetagpos, # CLOSING TAG
- $endpos, length($$textref)-$endpos, # REMAINDER
- );
-
-failed:
- _failmsg "Did not find closing tag", pos $$textref unless $@;
- pos($$textref) = $startpos;
- return;
-}
-
-sub extract_variable (;$$)
-{
- my $textref = defined $_[0] ? \$_[0] : \$_;
- return ("","","") unless defined $$textref;
- my $pre = defined $_[1] ? $_[1] : '\s*';
-
- my @match = _match_variable($textref,$pre);
-
- return _fail wantarray, $textref unless @match;
-
- return _succeed wantarray, $textref,
- @match[2..3,4..5,0..1]; # MATCH, REMAINDER, PREFIX
-}
-
-sub _match_variable($$)
-{
-# $#
-# $^
-# $$
- my ($textref, $pre) = @_;
- my $startpos = pos($$textref) = pos($$textref)||0;
- unless ($$textref =~ m/\G($pre)/gc)
- {
- _failmsg "Did not find prefix: /$pre/", pos $$textref;
- return;
- }
- my $varpos = pos($$textref);
- unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
- {
- unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
- {
- _failmsg "Did not find leading dereferencer", pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- my $deref = $1;
-
- unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
- or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
- or $deref eq '$#' or $deref eq '$$' )
- {
- _failmsg "Bad identifier after dereferencer", pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- }
-
- while (1)
- {
- next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
- next if _match_codeblock($textref,
- qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
- qr/[({[]/, qr/[)}\]]/,
- qr/[({[]/, qr/[)}\]]/, 0);
- next if _match_codeblock($textref,
- qr/\s*/, qr/[{[]/, qr/[}\]]/,
- qr/[{[]/, qr/[}\]]/, 0);
- next if _match_variable($textref,'\s*->\s*');
- next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
- last;
- }
-
- my $endpos = pos($$textref);
- return ($startpos, $varpos-$startpos,
- $varpos, $endpos-$varpos,
- $endpos, length($$textref)-$endpos
- );
-}
-
-sub extract_codeblock (;$$$$$)
-{
- my $textref = defined $_[0] ? \$_[0] : \$_;
- my $wantarray = wantarray;
- my $ldel_inner = defined $_[1] ? $_[1] : '{';
- my $pre = defined $_[2] ? $_[2] : '\s*';
- my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
- my $rd = $_[4];
- my $rdel_inner = $ldel_inner;
- my $rdel_outer = $ldel_outer;
- my $posbug = pos;
- for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
- for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
- for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
- {
- $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
- }
- pos = $posbug;
-
- my @match = _match_codeblock($textref, $pre,
- $ldel_outer, $rdel_outer,
- $ldel_inner, $rdel_inner,
- $rd);
- return _fail($wantarray, $textref) unless @match;
- return _succeed($wantarray, $textref,
- @match[2..3,4..5,0..1] # MATCH, REMAINDER, PREFIX
- );
-
-}
-
-sub _match_codeblock($$$$$$$)
-{
- my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
- my $startpos = pos($$textref) = pos($$textref) || 0;
- unless ($$textref =~ m/\G($pre)/gc)
- {
- _failmsg qq{Did not match prefix /$pre/ at"} .
- substr($$textref,pos($$textref),20) .
- q{..."},
- pos $$textref;
- return;
- }
- my $codepos = pos($$textref);
- unless ($$textref =~ m/\G($ldel_outer)/gc) # OUTERMOST DELIMITER
- {
- _failmsg qq{Did not find expected opening bracket at "} .
- substr($$textref,pos($$textref),20) .
- q{..."},
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- my $closing = $1;
- $closing =~ tr/([<{/)]>}/;
- my $matched;
- my $patvalid = 1;
- while (pos($$textref) < length($$textref))
- {
- $matched = '';
- if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
- {
- $patvalid = 0;
- next;
- }
-
- if ($$textref =~ m/\G\s*#.*/gc)
- {
- next;
- }
-
- if ($$textref =~ m/\G\s*($rdel_outer)/gc)
- {
- unless ($matched = ($closing && $1 eq $closing) )
- {
- next if $1 eq '>'; # MIGHT BE A "LESS THAN"
- _failmsg q{Mismatched closing bracket at "} .
- substr($$textref,pos($$textref),20) .
- qq{...". Expected '$closing'},
- pos $$textref;
- }
- last;
- }
-
- if (_match_variable($textref,'\s*') ||
- _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
- {
- $patvalid = 0;
- next;
- }
-
-
- # NEED TO COVER MANY MORE CASES HERE!!!
- if ($$textref =~ m#\G\s*(?!$ldel_inner)
- ( [-+*x/%^&|.]=?
- | [!=]~
- | =(?!>)
- | (\*\*|&&|\|\||<<|>>)=?
- | split|grep|map|return
- | [([]
- )#gcx)
- {
- $patvalid = 1;
- next;
- }
-
- if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
- {
- $patvalid = 1;
- next;
- }
-
- if ($$textref =~ m/\G\s*$ldel_outer/gc)
- {
- _failmsg q{Improperly nested codeblock at "} .
- substr($$textref,pos($$textref),20) .
- q{..."},
- pos $$textref;
- last;
- }
-
- $patvalid = 0;
- $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
- }
- continue { $@ = undef }
-
- unless ($matched)
- {
- _failmsg 'No match found for opening bracket', pos $$textref
- unless $@;
- return;
- }
-
- my $endpos = pos($$textref);
- return ( $startpos, $codepos-$startpos,
- $codepos, $endpos-$codepos,
- $endpos, length($$textref)-$endpos,
- );
-}
-
-
-my %mods = (
- 'none' => '[cgimsox]*',
- 'm' => '[cgimsox]*',
- 's' => '[cegimsox]*',
- 'tr' => '[cds]*',
- 'y' => '[cds]*',
- 'qq' => '',
- 'qx' => '',
- 'qw' => '',
- 'qr' => '[imsx]*',
- 'q' => '',
- );
-
-sub extract_quotelike (;$$)
-{
- my $textref = $_[0] ? \$_[0] : \$_;
- my $wantarray = wantarray;
- my $pre = defined $_[1] ? $_[1] : '\s*';
-
- my @match = _match_quotelike($textref,$pre,1,0);
- return _fail($wantarray, $textref) unless @match;
- return _succeed($wantarray, $textref,
- $match[2], $match[18]-$match[2], # MATCH
- @match[18,19], # REMAINDER
- @match[0,1], # PREFIX
- @match[2..17], # THE BITS
- @match[20,21], # ANY FILLET?
- );
-};
-
-sub _match_quotelike($$$$) # ($textref, $prepat, $allow_raw_match)
-{
- my ($textref, $pre, $rawmatch, $qmark) = @_;
-
- my ($textlen,$startpos,
- $oppos,
- $preld1pos,$ld1pos,$str1pos,$rd1pos,
- $preld2pos,$ld2pos,$str2pos,$rd2pos,
- $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
-
- unless ($$textref =~ m/\G($pre)/gc)
- {
- _failmsg qq{Did not find prefix /$pre/ at "} .
- substr($$textref, pos($$textref), 20) .
- q{..."},
- pos $$textref;
- return;
- }
- $oppos = pos($$textref);
-
- my $initial = substr($$textref,$oppos,1);
-
- if ($initial && $initial =~ m|^[\"\'\`]|
- || $rawmatch && $initial =~ m|^/|
- || $qmark && $initial =~ m|^\?|)
- {
- unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
- {
- _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
- substr($$textref, $oppos, 20) .
- q{..."},
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- $modpos= pos($$textref);
- $rd1pos = $modpos-1;
-
- if ($initial eq '/' || $initial eq '?')
- {
- $$textref =~ m/\G$mods{none}/gc
- }
-
- my $endpos = pos($$textref);
- return (
- $startpos, $oppos-$startpos, # PREFIX
- $oppos, 0, # NO OPERATOR
- $oppos, 1, # LEFT DEL
- $oppos+1, $rd1pos-$oppos-1, # STR/PAT
- $rd1pos, 1, # RIGHT DEL
- $modpos, 0, # NO 2ND LDEL
- $modpos, 0, # NO 2ND STR
- $modpos, 0, # NO 2ND RDEL
- $modpos, $endpos-$modpos, # MODIFIERS
- $endpos, $textlen-$endpos, # REMAINDER
- );
- }
-
- unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
- {
- _failmsg q{No quotelike operator found after prefix at "} .
- substr($$textref, pos($$textref), 20) .
- q{..."},
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
-
- my $op = $1;
- $preld1pos = pos($$textref);
- if ($op eq '<<') {
- $ld1pos = pos($$textref);
- my $label;
- if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
- $label = $1;
- }
- elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
- | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
- | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
- }gcsx) {
- $label = $+;
- }
- else {
- $label = "";
- }
- my $extrapos = pos($$textref);
- $$textref =~ m{.*\n}gc;
- $str1pos = pos($$textref)--;
- unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
- _failmsg qq{Missing here doc terminator ('$label') after "} .
- substr($$textref, $startpos, 20) .
- q{..."},
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- $rd1pos = pos($$textref);
- $$textref =~ m{\Q$label\E\n}gc;
- $ld2pos = pos($$textref);
- return (
- $startpos, $oppos-$startpos, # PREFIX
- $oppos, length($op), # OPERATOR
- $ld1pos, $extrapos-$ld1pos, # LEFT DEL
- $str1pos, $rd1pos-$str1pos, # STR/PAT
- $rd1pos, $ld2pos-$rd1pos, # RIGHT DEL
- $ld2pos, 0, # NO 2ND LDEL
- $ld2pos, 0, # NO 2ND STR
- $ld2pos, 0, # NO 2ND RDEL
- $ld2pos, 0, # NO MODIFIERS
- $ld2pos, $textlen-$ld2pos, # REMAINDER
- $extrapos, $str1pos-$extrapos, # FILLETED BIT
- );
- }
-
- $$textref =~ m/\G\s*/gc;
- $ld1pos = pos($$textref);
- $str1pos = $ld1pos+1;
-
- unless ($$textref =~ m/\G(\S)/gc) # SHOULD USE LOOKAHEAD
- {
- _failmsg "No block delimiter found after quotelike $op",
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- pos($$textref) = $ld1pos; # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
- my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
- if ($ldel1 =~ /[[(<{]/)
- {
- $rdel1 =~ tr/[({</])}>/;
- defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
- || do { pos $$textref = $startpos; return };
- $ld2pos = pos($$textref);
- $rd1pos = $ld2pos-1;
- }
- else
- {
- $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
- || do { pos $$textref = $startpos; return };
- $ld2pos = $rd1pos = pos($$textref)-1;
- }
-
- my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
- if ($second_arg)
- {
- my ($ldel2, $rdel2);
- if ($ldel1 =~ /[[(<{]/)
- {
- unless ($$textref =~ /\G\s*(\S)/gc) # SHOULD USE LOOKAHEAD
- {
- _failmsg "Missing second block for quotelike $op",
- pos $$textref;
- pos $$textref = $startpos;
- return;
- }
- $ldel2 = $rdel2 = "\Q$1";
- $rdel2 =~ tr/[({</])}>/;
- }
- else
- {
- $ldel2 = $rdel2 = $ldel1;
- }
- $str2pos = $ld2pos+1;
-
- if ($ldel2 =~ /[[(<{]/)
- {
- pos($$textref)--; # OVERCOME BROKEN LOOKAHEAD
- defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
- || do { pos $$textref = $startpos; return };
- }
- else
- {
- $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
- || do { pos $$textref = $startpos; return };
- }
- $rd2pos = pos($$textref)-1;
- }
- else
- {
- $ld2pos = $str2pos = $rd2pos = $rd1pos;
- }
-
- $modpos = pos $$textref;
-
- $$textref =~ m/\G($mods{$op})/gc;
- my $endpos = pos $$textref;
-
- return (
- $startpos, $oppos-$startpos, # PREFIX
- $oppos, length($op), # OPERATOR
- $ld1pos, 1, # LEFT DEL
- $str1pos, $rd1pos-$str1pos, # STR/PAT
- $rd1pos, 1, # RIGHT DEL
- $ld2pos, $second_arg, # 2ND LDEL (MAYBE)
- $str2pos, $rd2pos-$str2pos, # 2ND STR (MAYBE)
- $rd2pos, $second_arg, # 2ND RDEL (MAYBE)
- $modpos, $endpos-$modpos, # MODIFIERS
- $endpos, $textlen-$endpos, # REMAINDER
- );
-}
-
-my $def_func = [
- sub { extract_variable($_[0], '') },
- sub { extract_quotelike($_[0],'') },
- sub { extract_codeblock($_[0],'{}','') },
-];
-
-sub extract_multiple (;$$$$) # ($text, $functions_ref, $max_fields, $ignoreunknown)
-{
- my $textref = defined($_[0]) ? \$_[0] : \$_;
- my $posbug = pos;
- my ($lastpos, $firstpos);
- my @fields = ();
-
- #for ($$textref)
- {
- my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
- my $max = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
- my $igunk = $_[3];
-
- pos $$textref ||= 0;
-
- unless (wantarray)
- {
- use Carp;
- carp "extract_multiple reset maximal count to 1 in scalar context"
- if $^W && defined($_[2]) && $max > 1;
- $max = 1
- }
-
- my $unkpos;
- my $func;
- my $class;
-
- my @class;
- foreach $func ( @func )
- {
- if (ref($func) eq 'HASH')
- {
- push @class, (keys %$func)[0];
- $func = (values %$func)[0];
- }
- else
- {
- push @class, undef;
- }
- }
-
- FIELD: while (pos($$textref) < length($$textref))
- {
- my ($field, $rem);
- my @bits;
- foreach my $i ( 0..$#func )
- {
- my $pref;
- $func = $func[$i];
- $class = $class[$i];
- $lastpos = pos $$textref;
- if (ref($func) eq 'CODE')
- { ($field,$rem,$pref) = @bits = $func->($$textref) }
- elsif (ref($func) eq 'Text::Balanced::Extractor')
- { @bits = $field = $func->extract($$textref) }
- elsif( $$textref =~ m/\G$func/gc )
- { @bits = $field = defined($1)
- ? $1
- : substr($$textref, $-[0], $+[0] - $-[0])
- }
- $pref ||= "";
- if (defined($field) && length($field))
- {
- if (!$igunk) {
- $unkpos = $lastpos
- if length($pref) && !defined($unkpos);
- if (defined $unkpos)
- {
- push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
- $firstpos = $unkpos unless defined $firstpos;
- undef $unkpos;
- last FIELD if @fields == $max;
- }
- }
- push @fields, $class
- ? bless (\$field, $class)
- : $field;
- $firstpos = $lastpos unless defined $firstpos;
- $lastpos = pos $$textref;
- last FIELD if @fields == $max;
- next FIELD;
- }
- }
- if ($$textref =~ /\G(.)/gcs)
- {
- $unkpos = pos($$textref)-1
- unless $igunk || defined $unkpos;
- }
- }
-
- if (defined $unkpos)
- {
- push @fields, substr($$textref, $unkpos);
- $firstpos = $unkpos unless defined $firstpos;
- $lastpos = length $$textref;
- }
- last;
- }
-
- pos $$textref = $lastpos;
- return @fields if wantarray;
-
- $firstpos ||= 0;
- eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
- pos $$textref = $firstpos };
- return $fields[0];
-}
-
-sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
-{
- my $ldel = $_[0];
- my $rdel = $_[1];
- my $pre = defined $_[2] ? $_[2] : '\s*';
- my %options = defined $_[3] ? %{$_[3]} : ();
- my $omode = defined $options{fail} ? $options{fail} : '';
- my $bad = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
- : defined($options{reject}) ? $options{reject}
- : ''
- ;
- my $ignore = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
- : defined($options{ignore}) ? $options{ignore}
- : ''
- ;
-
- if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
-
- my $posbug = pos;
- for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
- pos = $posbug;
-
- my $closure = sub
- {
- my $textref = defined $_[0] ? \$_[0] : \$_;
- my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
-
- return _fail(wantarray, $textref) unless @match;
- return _succeed wantarray, $textref,
- $match[2], $match[3]+$match[5]+$match[7], # MATCH
- @match[8..9,0..1,2..7]; # REM, PRE, BITS
- };
-
- bless $closure, 'Text::Balanced::Extractor';
-}
-
-package Text::Balanced::Extractor;
-
-sub extract($$) # ($self, $text)
-{
- &{$_[0]}($_[1]);
-}
-
-package Text::Balanced::ErrorMsg;
-
-use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
-
-1;
-
-__END__
-
-=pod
-
-=head1 NAME
-
-Text::Balanced - Extract delimited text sequences from strings.
-
-=head1 SYNOPSIS
-
- use Text::Balanced qw (
- extract_delimited
- extract_bracketed
- extract_quotelike
- extract_codeblock
- extract_variable
- extract_tagged
- extract_multiple
- gen_delimited_pat
- gen_extract_tagged
- );
-
- # Extract the initial substring of $text that is delimited by
- # two (unescaped) instances of the first character in $delim.
-
- ($extracted, $remainder) = extract_delimited($text,$delim);
-
-
- # Extract the initial substring of $text that is bracketed
- # with a delimiter(s) specified by $delim (where the string
- # in $delim contains one or more of '(){}[]<>').
-
- ($extracted, $remainder) = extract_bracketed($text,$delim);
-
-
- # Extract the initial substring of $text that is bounded by
- # an XML tag.
-
- ($extracted, $remainder) = extract_tagged($text);
-
-
- # Extract the initial substring of $text that is bounded by
- # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
-
- ($extracted, $remainder) =
- extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
-
-
- # Extract the initial substring of $text that represents a
- # Perl "quote or quote-like operation"
-
- ($extracted, $remainder) = extract_quotelike($text);
-
-
- # Extract the initial substring of $text that represents a block
- # of Perl code, bracketed by any of character(s) specified by $delim
- # (where the string $delim contains one or more of '(){}[]<>').
-
- ($extracted, $remainder) = extract_codeblock($text,$delim);
-
-
- # Extract the initial substrings of $text that would be extracted by
- # one or more sequential applications of the specified functions
- # or regular expressions
-
- @extracted = extract_multiple($text,
- [ \&extract_bracketed,
- \&extract_quotelike,
- \&some_other_extractor_sub,
- qr/[xyz]*/,
- 'literal',
- ]);
-
-# Create a string representing an optimized pattern (a la Friedl)
-# that matches a substring delimited by any of the specified characters
-# (in this case: any type of quote or a slash)
-
- $patstring = gen_delimited_pat(q{'"`/});
-
-# Generate a reference to an anonymous sub that is just like extract_tagged
-# but pre-compiled and optimized for a specific pair of tags, and consequently
-# much faster (i.e. 3 times faster). It uses qr// for better performance on
-# repeated calls, so it only works under Perl 5.005 or later.
-
- $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
-
- ($extracted, $remainder) = $extract_head->($text);
-
-=head1 DESCRIPTION
-
-The various C<extract_...> subroutines may be used to
-extract a delimited substring, possibly after skipping a
-specified prefix string. By default, that prefix is
-optional whitespace (C</\s*/>), but you can change it to whatever
-you wish (see below).
-
-The substring to be extracted must appear at the
-current C<pos> location of the string's variable
-(or at index zero, if no C<pos> position is defined).
-In other words, the C<extract_...> subroutines I<don't>
-extract the first occurrence of a substring anywhere
-in a string (like an unanchored regex would). Rather,
-they extract an occurrence of the substring appearing
-immediately at the current matching position in the
-string (like a C<\G>-anchored regex would).
-
-=head2 General behaviour in list contexts
-
-In a list context, all the subroutines return a list, the first three
-elements of which are always:
-
-=over 4
-
-=item [0]
-
-The extracted string, including the specified delimiters.
-If the extraction fails C<undef> is returned.
-
-=item [1]
-
-The remainder of the input string (i.e. the characters after the
-extracted string). On failure, the entire string is returned.
-
-=item [2]
-
-The skipped prefix (i.e. the characters before the extracted string).
-On failure, C<undef> is returned.
-
-=back
-
-Note that in a list context, the contents of the original input text (the first
-argument) are not modified in any way.
-
-However, if the input text was passed in a variable, that variable's
-C<pos> value is updated to point at the first character after the
-extracted text. That means that in a list context the various
-subroutines can be used much like regular expressions. For example:
-
- while ( $next = (extract_quotelike($text))[0] )
- {
- # process next quote-like (in $next)
- }
-
-=head2 General behaviour in scalar and void contexts
-
-In a scalar context, the extracted string is returned, having first been
-removed from the input text. Thus, the following code also processes
-each quote-like operation, but actually removes them from $text:
-
- while ( $next = extract_quotelike($text) )
- {
- # process next quote-like (in $next)
- }
-
-Note that if the input text is a read-only string (i.e. a literal),
-no attempt is made to remove the extracted text.
-
-In a void context the behaviour of the extraction subroutines is
-exactly the same as in a scalar context, except (of course) that the
-extracted substring is not returned.
-
-=head2 A note about prefixes
-
-Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
-This can bite you if you're expecting a prefix specification like
-'.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
-pattern will only succeed if the <H1> tag is on the current line, since
-. normally doesn't match newlines.
-
-To overcome this limitation, you need to turn on /s matching within
-the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
-
-=head2 C<extract_delimited>
-
-The C<extract_delimited> function formalizes the common idiom
-of extracting a single-character-delimited substring from the start of
-a string. For example, to extract a single-quote delimited string, the
-following code is typically used:
-
- ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
- $extracted = $1;
-
-but with C<extract_delimited> it can be simplified to:
-
- ($extracted,$remainder) = extract_delimited($text, "'");
-
-C<extract_delimited> takes up to four scalars (the input text, the
-delimiters, a prefix pattern to be skipped, and any escape characters)
-and extracts the initial substring of the text that
-is appropriately delimited. If the delimiter string has multiple
-characters, the first one encountered in the text is taken to delimit
-the substring.
-The third argument specifies a prefix pattern that is to be skipped
-(but must be present!) before the substring is extracted.
-The final argument specifies the escape character to be used for each
-delimiter.
-
-All arguments are optional. If the escape characters are not specified,
-every delimiter is escaped with a backslash (C<\>).
-If the prefix is not specified, the
-pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
-is also not specified, the set C</["'`]/> is used. If the text to be processed
-is not specified either, C<$_> is used.
-
-In list context, C<extract_delimited> returns a array of three
-elements, the extracted substring (I<including the surrounding
-delimiters>), the remainder of the text, and the skipped prefix (if
-any). If a suitable delimited substring is not found, the first
-element of the array is the empty string, the second is the complete
-original text, and the prefix returned in the third element is an
-empty string.
-
-In a scalar context, just the extracted substring is returned. In
-a void context, the extracted substring (and any prefix) are simply
-removed from the beginning of the first argument.
-
-Examples:
-
- # Remove a single-quoted substring from the very beginning of $text:
-
- $substring = extract_delimited($text, "'", '');
-
- # Remove a single-quoted Pascalish substring (i.e. one in which
- # doubling the quote character escapes it) from the very
- # beginning of $text:
-
- $substring = extract_delimited($text, "'", '', "'");
-
- # Extract a single- or double- quoted substring from the
- # beginning of $text, optionally after some whitespace
- # (note the list context to protect $text from modification):
-
- ($substring) = extract_delimited $text, q{"'};
-
- # Delete the substring delimited by the first '/' in $text:
-
- $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
-
-Note that this last example is I<not> the same as deleting the first
-quote-like pattern. For instance, if C<$text> contained the string:
-
- "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
-
-then after the deletion it would contain:
-
- "if ('.$UNIXCMD/s) { $cmd = $1; }"
-
-not:
-
- "if ('./cmd' =~ ms) { $cmd = $1; }"
-
-See L<"extract_quotelike"> for a (partial) solution to this problem.
-
-=head2 C<extract_bracketed>
-
-Like C<"extract_delimited">, the C<extract_bracketed> function takes
-up to three optional scalar arguments: a string to extract from, a delimiter
-specifier, and a prefix pattern. As before, a missing prefix defaults to
-optional whitespace and a missing text defaults to C<$_>. However, a missing
-delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
-
-C<extract_bracketed> extracts a balanced-bracket-delimited
-substring (using any one (or more) of the user-specified delimiter
-brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
-respect quoted unbalanced brackets (see below).
-
-A "delimiter bracket" is a bracket in list of delimiters passed as
-C<extract_bracketed>'s second argument. Delimiter brackets are
-specified by giving either the left or right (or both!) versions
-of the required bracket(s). Note that the order in which
-two or more delimiter brackets are specified is not significant.
-
-A "balanced-bracket-delimited substring" is a substring bounded by
-matched brackets, such that any other (left or right) delimiter
-bracket I<within> the substring is also matched by an opposite
-(right or left) delimiter bracket I<at the same level of nesting>. Any
-type of bracket not in the delimiter list is treated as an ordinary
-character.
-
-In other words, each type of bracket specified as a delimiter must be
-balanced and correctly nested within the substring, and any other kind of
-("non-delimiter") bracket in the substring is ignored.
-
-For example, given the string:
-
- $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
-
-then a call to C<extract_bracketed> in a list context:
-
- @result = extract_bracketed( $text, '{}' );
-
-would return:
-
- ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
-
-since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
-(In a scalar context just the first element of the array would be returned. In
-a void context, C<$text> would be replaced by an empty string.)
-
-Likewise the call in:
-
- @result = extract_bracketed( $text, '{[' );
-
-would return the same result, since all sets of both types of specified
-delimiter brackets are correctly nested and balanced.
-
-However, the call in:
-
- @result = extract_bracketed( $text, '{([<' );
-
-would fail, returning:
-
- ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }" );
-
-because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
-the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
-return an empty string. In a void context, C<$text> would be unchanged.)
-
-Note that the embedded single-quotes in the string don't help in this
-case, since they have not been specified as acceptable delimiters and are
-therefore treated as non-delimiter characters (and ignored).
-
-However, if a particular species of quote character is included in the
-delimiter specification, then that type of quote will be correctly handled.
-for example, if C<$text> is:
-
- $text = '<A HREF=">>>>">link</A>';
-
-then
-
- @result = extract_bracketed( $text, '<">' );
-
-returns:
-
- ( '<A HREF=">>>>">', 'link</A>', "" )
-
-as expected. Without the specification of C<"> as an embedded quoter:
-
- @result = extract_bracketed( $text, '<>' );
-
-the result would be:
-
- ( '<A HREF=">', '>>>">link</A>', "" )
-
-In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
-quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
-letter 'q' as a delimiter. Hence:
-
- @result = extract_bracketed( $text, '<q>' );
-
-would correctly match something like this:
-
- $text = '<leftop: conj /and/ conj>';
-
-See also: C<"extract_quotelike"> and C<"extract_codeblock">.
-
-=head2 C<extract_variable>
-
-C<extract_variable> extracts any valid Perl variable or
-variable-involved expression, including scalars, arrays, hashes, array
-accesses, hash look-ups, method calls through objects, subroutine calls
-through subroutine references, etc.
-
-The subroutine takes up to two optional arguments:
-
-=over 4
-
-=item 1.
-
-A string to be processed (C<$_> if the string is omitted or C<undef>)
-
-=item 2.
-
-A string specifying a pattern to be matched as a prefix (which is to be
-skipped). If omitted, optional whitespace is skipped.
-
-=back
-
-On success in a list context, an array of 3 elements is returned. The
-elements are:
-
-=over 4
-
-=item [0]
-
-the extracted variable, or variablish expression
-
-=item [1]
-
-the remainder of the input text,
-
-=item [2]
-
-the prefix substring (if any),
-
-=back
-
-On failure, all of these values (except the remaining text) are C<undef>.
-
-In a scalar context, C<extract_variable> returns just the complete
-substring that matched a variablish expression. C<undef> is returned on
-failure. In addition, the original input text has the returned substring
-(and any prefix) removed from it.
-
-In a void context, the input text just has the matched substring (and
-any specified prefix) removed.
-
-
-=head2 C<extract_tagged>
-
-C<extract_tagged> extracts and segments text between (balanced)
-specified tags.
-
-The subroutine takes up to five optional arguments:
-
-=over 4
-
-=item 1.
-
-A string to be processed (C<$_> if the string is omitted or C<undef>)
-
-=item 2.
-
-A string specifying a pattern to be matched as the opening tag.
-If the pattern string is omitted (or C<undef>) then a pattern
-that matches any standard XML tag is used.
-
-=item 3.
-
-A string specifying a pattern to be matched at the closing tag.
-If the pattern string is omitted (or C<undef>) then the closing
-tag is constructed by inserting a C</> after any leading bracket
-characters in the actual opening tag that was matched (I<not> the pattern
-that matched the tag). For example, if the opening tag pattern
-is specified as C<'{{\w+}}'> and actually matched the opening tag
-C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
-
-=item 4.
-
-A string specifying a pattern to be matched as a prefix (which is to be
-skipped). If omitted, optional whitespace is skipped.
-
-=item 5.
-
-A hash reference containing various parsing options (see below)
-
-=back
-
-The various options that can be specified are:
-
-=over 4
-
-=item C<reject =E<gt> $listref>
-
-The list reference contains one or more strings specifying patterns
-that must I<not> appear within the tagged text.
-
-For example, to extract
-an HTML link (which should not contain nested links) use:
-
- extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
-
-=item C<ignore =E<gt> $listref>
-
-The list reference contains one or more strings specifying patterns
-that are I<not> be be treated as nested tags within the tagged text
-(even if they would match the start tag pattern).
-
-For example, to extract an arbitrary XML tag, but ignore "empty" elements:
-
- extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
-
-(also see L<"gen_delimited_pat"> below).
-
-=item C<fail =E<gt> $str>
-
-The C<fail> option indicates the action to be taken if a matching end
-tag is not encountered (i.e. before the end of the string or some
-C<reject> pattern matches). By default, a failure to match a closing
-tag causes C<extract_tagged> to immediately fail.
-
-However, if the string value associated with <reject> is "MAX", then
-C<extract_tagged> returns the complete text up to the point of failure.
-If the string is "PARA", C<extract_tagged> returns only the first paragraph
-after the tag (up to the first line that is either empty or contains
-only whitespace characters).
-If the string is "", the the default behaviour (i.e. failure) is reinstated.
-
-For example, suppose the start tag "/para" introduces a paragraph, which then
-continues until the next "/endpara" tag or until another "/para" tag is
-encountered:
-
- $text = "/para line 1\n\nline 3\n/para line 4";
-
- extract_tagged($text, '/para', '/endpara', undef,
- {reject => '/para', fail => MAX );
-
- # EXTRACTED: "/para line 1\n\nline 3\n"
-
-Suppose instead, that if no matching "/endpara" tag is found, the "/para"
-tag refers only to the immediately following paragraph:
-
- $text = "/para line 1\n\nline 3\n/para line 4";
-
- extract_tagged($text, '/para', '/endpara', undef,
- {reject => '/para', fail => MAX );
-
- # EXTRACTED: "/para line 1\n"
-
-Note that the specified C<fail> behaviour applies to nested tags as well.
-
-=back
-
-On success in a list context, an array of 6 elements is returned. The elements are:
-
-=over 4
-
-=item [0]
-
-the extracted tagged substring (including the outermost tags),
-
-=item [1]
-
-the remainder of the input text,
-
-=item [2]
-
-the prefix substring (if any),
-
-=item [3]
-
-the opening tag
-
-=item [4]
-
-the text between the opening and closing tags
-
-=item [5]
-
-the closing tag (or "" if no closing tag was found)
-
-=back
-
-On failure, all of these values (except the remaining text) are C<undef>.
-
-In a scalar context, C<extract_tagged> returns just the complete
-substring that matched a tagged text (including the start and end
-tags). C<undef> is returned on failure. In addition, the original input
-text has the returned substring (and any prefix) removed from it.
-
-In a void context, the input text just has the matched substring (and
-any specified prefix) removed.
-
-=head2 C<gen_extract_tagged>
-
-(Note: This subroutine is only available under Perl5.005)
-
-C<gen_extract_tagged> generates a new anonymous subroutine which
-extracts text between (balanced) specified tags. In other words,
-it generates a function identical in function to C<extract_tagged>.
-
-The difference between C<extract_tagged> and the anonymous
-subroutines generated by
-C<gen_extract_tagged>, is that those generated subroutines:
-
-=over 4
-
-=item *
-
-do not have to reparse tag specification or parsing options every time
-they are called (whereas C<extract_tagged> has to effectively rebuild
-its tag parser on every call);
-
-=item *
-
-make use of the new qr// construct to pre-compile the regexes they use
-(whereas C<extract_tagged> uses standard string variable interpolation
-to create tag-matching patterns).
-
-=back
-
-The subroutine takes up to four optional arguments (the same set as
-C<extract_tagged> except for the string to be processed). It returns
-a reference to a subroutine which in turn takes a single argument (the text to
-be extracted from).
-
-In other words, the implementation of C<extract_tagged> is exactly
-equivalent to:
-
- sub extract_tagged
- {
- my $text = shift;
- $extractor = gen_extract_tagged(@_);
- return $extractor->($text);
- }
-
-(although C<extract_tagged> is not currently implemented that way, in order
-to preserve pre-5.005 compatibility).
-
-Using C<gen_extract_tagged> to create extraction functions for specific tags
-is a good idea if those functions are going to be called more than once, since
-their performance is typically twice as good as the more general-purpose
-C<extract_tagged>.
-
-
-=head2 C<extract_quotelike>
-
-C<extract_quotelike> attempts to recognize, extract, and segment any
-one of the various Perl quotes and quotelike operators (see
-L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
-delimiters (for the quotelike operators), and trailing modifiers are
-all caught. For example, in:
-
- extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
-
- extract_quotelike ' "You said, \"Use sed\"." '
-
- extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
-
- extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
-
-the full Perl quotelike operations are all extracted correctly.
-
-Note too that, when using the /x modifier on a regex, any comment
-containing the current pattern delimiter will cause the regex to be
-immediately terminated. In other words:
-
- 'm /
- (?i) # CASE INSENSITIVE
- [a-z_] # LEADING ALPHABETIC/UNDERSCORE
- [a-z0-9]* # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
- /x'
-
-will be extracted as if it were:
-
- 'm /
- (?i) # CASE INSENSITIVE
- [a-z_] # LEADING ALPHABETIC/'
-
-This behaviour is identical to that of the actual compiler.
-
-C<extract_quotelike> takes two arguments: the text to be processed and
-a prefix to be matched at the very beginning of the text. If no prefix
-is specified, optional whitespace is the default. If no text is given,
-C<$_> is used.
-
-In a list context, an array of 11 elements is returned. The elements are:
-
-=over 4
-
-=item [0]
-
-the extracted quotelike substring (including trailing modifiers),
-
-=item [1]
-
-the remainder of the input text,
-
-=item [2]
-
-the prefix substring (if any),
-
-=item [3]
-
-the name of the quotelike operator (if any),
-
-=item [4]
-
-the left delimiter of the first block of the operation,
-
-=item [5]
-
-the text of the first block of the operation
-(that is, the contents of
-a quote, the regex of a match or substitution or the target list of a
-translation),
-
-=item [6]
-
-the right delimiter of the first block of the operation,
-
-=item [7]
-
-the left delimiter of the second block of the operation
-(that is, if it is a C<s>, C<tr>, or C<y>),
-
-=item [8]
-
-the text of the second block of the operation
-(that is, the replacement of a substitution or the translation list
-of a translation),
-
-=item [9]
-
-the right delimiter of the second block of the operation (if any),
-
-=item [10]
-
-the trailing modifiers on the operation (if any).
-
-=back
-
-For each of the fields marked "(if any)" the default value on success is
-an empty string.
-On failure, all of these values (except the remaining text) are C<undef>.
-
-In a scalar context, C<extract_quotelike> returns just the complete substring
-that matched a quotelike operation (or C<undef> on failure). In a scalar or
-void context, the input text has the same substring (and any specified
-prefix) removed.
-
-Examples:
-
- # Remove the first quotelike literal that appears in text
-
- $quotelike = extract_quotelike($text,'.*?');
-
- # Replace one or more leading whitespace-separated quotelike
- # literals in $_ with "<QLL>"
-
- do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
-
-
- # Isolate the search pattern in a quotelike operation from $text
-
- ($op,$pat) = (extract_quotelike $text)[3,5];
- if ($op =~ /[ms]/)
- {
- print "search pattern: $pat\n";
- }
- else
- {
- print "$op is not a pattern matching operation\n";
- }
-
-=head2 C<extract_quotelike> and "here documents"
-
-C<extract_quotelike> can successfully extract "here documents" from an input
-string, but with an important caveat in list contexts.
-
-Unlike other types of quote-like literals, a here document is rarely
-a contiguous substring. For example, a typical piece of code using
-here document might look like this:
-
- <<'EOMSG' || die;
- This is the message.
- EOMSG
- exit;
-
-Given this as an input string in a scalar context, C<extract_quotelike>
-would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
-leaving the string " || die;\nexit;" in the original variable. In other words,
-the two separate pieces of the here document are successfully extracted and
-concatenated.
-
-In a list context, C<extract_quotelike> would return the list
-
-=over 4
-
-=item [0]
-
-"<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
-including fore and aft delimiters),
-
-=item [1]
-
-" || die;\nexit;" (i.e. the remainder of the input text, concatenated),
-
-=item [2]
-
-"" (i.e. the prefix substring -- trivial in this case),
-
-=item [3]
-
-"<<" (i.e. the "name" of the quotelike operator)
-
-=item [4]
-
-"'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
-
-=item [5]
-
-"This is the message.\n" (i.e. the text of the here document),
-
-=item [6]
-
-"EOMSG" (i.e. the right delimiter of the here document),
-
-=item [7..10]
-
-"" (a here document has no second left delimiter, second text, second right
-delimiter, or trailing modifiers).
-
-=back
-
-However, the matching position of the input variable would be set to
-"exit;" (i.e. I<after> the closing delimiter of the here document),
-which would cause the earlier " || die;\nexit;" to be skipped in any
-sequence of code fragment extractions.
-
-To avoid this problem, when it encounters a here document whilst
-extracting from a modifiable string, C<extract_quotelike> silently
-rearranges the string to an equivalent piece of Perl:
-
- <<'EOMSG'
- This is the message.
- EOMSG
- || die;
- exit;
-
-in which the here document I<is> contiguous. It still leaves the
-matching position after the here document, but now the rest of the line
-on which the here document starts is not skipped.
-
-To prevent <extract_quotelike> from mucking about with the input in this way
-(this is the only case where a list-context C<extract_quotelike> does so),
-you can pass the input variable as an interpolated literal:
-
- $quotelike = extract_quotelike("$var");
-
-=head2 C<extract_codeblock>
-
-C<extract_codeblock> attempts to recognize and extract a balanced
-bracket delimited substring that may contain unbalanced brackets
-inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
-is like a combination of C<"extract_bracketed"> and
-C<"extract_quotelike">.
-
-C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
-a text to process, a set of delimiter brackets to look for, and a prefix to
-match first. It also takes an optional fourth parameter, which allows the
-outermost delimiter brackets to be specified separately (see below).
-
-Omitting the first argument (input text) means process C<$_> instead.
-Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
-Omitting the third argument (prefix argument) implies optional whitespace at the start.
-Omitting the fourth argument (outermost delimiter brackets) indicates that the
-value of the second argument is to be used for the outermost delimiters.
-
-Once the prefix an dthe outermost opening delimiter bracket have been
-recognized, code blocks are extracted by stepping through the input text and
-trying the following alternatives in sequence:
-
-=over 4
-
-=item 1.
-
-Try and match a closing delimiter bracket. If the bracket was the same
-species as the last opening bracket, return the substring to that
-point. If the bracket was mismatched, return an error.
-
-=item 2.
-
-Try to match a quote or quotelike operator. If found, call
-C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
-the error it returned. Otherwise go back to step 1.
-
-=item 3.
-
-Try to match an opening delimiter bracket. If found, call
-C<extract_codeblock> recursively to eat the embedded block. If the
-recursive call fails, return an error. Otherwise, go back to step 1.
-
-=item 4.
-
-Unconditionally match a bareword or any other single character, and
-then go back to step 1.
-
-=back
-
-Examples:
-
- # Find a while loop in the text
-
- if ($text =~ s/.*?while\s*\{/{/)
- {
- $loop = "while " . extract_codeblock($text);
- }
-
- # Remove the first round-bracketed list (which may include
- # round- or curly-bracketed code blocks or quotelike operators)
-
- extract_codeblock $text, "(){}", '[^(]*';
-
-
-The ability to specify a different outermost delimiter bracket is useful
-in some circumstances. For example, in the Parse::RecDescent module,
-parser actions which are to be performed only on a successful parse
-are specified using a C<E<lt>defer:...E<gt>> directive. For example:
-
- sentence: subject verb object
- <defer: {$::theVerb = $item{verb}} >
-
-Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
-within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
-
-A deferred action like this:
-
- <defer: {if ($count>10) {$count--}} >
-
-will be incorrectly parsed as:
-
- <defer: {if ($count>
-
-because the "less than" operator is interpreted as a closing delimiter.
-
-But, by extracting the directive using
-S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
-the '>' character is only treated as a delimited at the outermost
-level of the code block, so the directive is parsed correctly.
-
-=head2 C<extract_multiple>
-
-The C<extract_multiple> subroutine takes a string to be processed and a
-list of extractors (subroutines or regular expressions) to apply to that string.
-
-In an array context C<extract_multiple> returns an array of substrings
-of the original string, as extracted by the specified extractors.
-In a scalar context, C<extract_multiple> returns the first
-substring successfully extracted from the original string. In both
-scalar and void contexts the original string has the first successfully
-extracted substring removed from it. In all contexts
-C<extract_multiple> starts at the current C<pos> of the string, and
-sets that C<pos> appropriately after it matches.
-
-Hence, the aim of of a call to C<extract_multiple> in a list context
-is to split the processed string into as many non-overlapping fields as
-possible, by repeatedly applying each of the specified extractors
-to the remainder of the string. Thus C<extract_multiple> is
-a generalized form of Perl's C<split> subroutine.
-
-The subroutine takes up to four optional arguments:
-
-=over 4
-
-=item 1.
-
-A string to be processed (C<$_> if the string is omitted or C<undef>)
-
-=item 2.
-
-A reference to a list of subroutine references and/or qr// objects and/or
-literal strings and/or hash references, specifying the extractors
-to be used to split the string. If this argument is omitted (or
-C<undef>) the list:
-
- [
- sub { extract_variable($_[0], '') },
- sub { extract_quotelike($_[0],'') },
- sub { extract_codeblock($_[0],'{}','') },
- ]
-
-is used.
-
-=item 3.
-
-An number specifying the maximum number of fields to return. If this
-argument is omitted (or C<undef>), split continues as long as possible.
-
-If the third argument is I<N>, then extraction continues until I<N> fields
-have been successfully extracted, or until the string has been completely
-processed.
-
-Note that in scalar and void contexts the value of this argument is
-automatically reset to 1 (under C<-w>, a warning is issued if the argument
-has to be reset).
-
-=item 4.
-
-A value indicating whether unmatched substrings (see below) within the
-text should be skipped or returned as fields. If the value is true,
-such substrings are skipped. Otherwise, they are returned.
-
-=back
-
-The extraction process works by applying each extractor in
-sequence to the text string.
-
-If the extractor is a subroutine it is called in a list context and is
-expected to return a list of a single element, namely the extracted
-text. It may optionally also return two further arguments: a string
-representing the text left after extraction (like $' for a pattern
-match), and a string representing any prefix skipped before the
-extraction (like $` in a pattern match). Note that this is designed
-to facilitate the use of other Text::Balanced subroutines with
-C<extract_multiple>. Note too that the value returned by an extractor
-subroutine need not bear any relationship to the corresponding substring
-of the original text (see examples below).
-
-If the extractor is a precompiled regular expression or a string,
-it is matched against the text in a scalar context with a leading
-'\G' and the gc modifiers enabled. The extracted value is either
-$1 if that variable is defined after the match, or else the
-complete match (i.e. $&).
-
-If the extractor is a hash reference, it must contain exactly one element.
-The value of that element is one of the
-above extractor types (subroutine reference, regular expression, or string).
-The key of that element is the name of a class into which the successful
-return value of the extractor will be blessed.
-
-If an extractor returns a defined value, that value is immediately
-treated as the next extracted field and pushed onto the list of fields.
-If the extractor was specified in a hash reference, the field is also
-blessed into the appropriate class,
-
-If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
-assumed to have failed to extract.
-If none of the extractor subroutines succeeds, then one
-character is extracted from the start of the text and the extraction
-subroutines reapplied. Characters which are thus removed are accumulated and
-eventually become the next field (unless the fourth argument is true, in which
-case they are discarded).
-
-For example, the following extracts substrings that are valid Perl variables:
-
- @fields = extract_multiple($text,
- [ sub { extract_variable($_[0]) } ],
- undef, 1);
-
-This example separates a text into fields which are quote delimited,
-curly bracketed, and anything else. The delimited and bracketed
-parts are also blessed to identify them (the "anything else" is unblessed):
-
- @fields = extract_multiple($text,
- [
- { Delim => sub { extract_delimited($_[0],q{'"}) } },
- { Brack => sub { extract_bracketed($_[0],'{}') } },
- ]);
-
-This call extracts the next single substring that is a valid Perl quotelike
-operator (and removes it from $text):
-
- $quotelike = extract_multiple($text,
- [
- sub { extract_quotelike($_[0]) },
- ], undef, 1);
-
-Finally, here is yet another way to do comma-separated value parsing:
-
- @fields = extract_multiple($csv_text,
- [
- sub { extract_delimited($_[0],q{'"}) },
- qr/([^,]+)(.*)/,
- ],
- undef,1);
-
-The list in the second argument means:
-I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
-The undef third argument means:
-I<"...as many times as possible...">,
-and the true value in the fourth argument means
-I<"...discarding anything else that appears (i.e. the commas)">.
-
-If you wanted the commas preserved as separate fields (i.e. like split
-does if your split pattern has capturing parentheses), you would
-just make the last parameter undefined (or remove it).
-
-=head2 C<gen_delimited_pat>
-
-The C<gen_delimited_pat> subroutine takes a single (string) argument and
- > builds a Friedl-style optimized regex that matches a string delimited
-by any one of the characters in the single argument. For example:
-
- gen_delimited_pat(q{'"})
-
-returns the regex:
-
- (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
-
-Note that the specified delimiters are automatically quotemeta'd.
-
-A typical use of C<gen_delimited_pat> would be to build special purpose tags
-for C<extract_tagged>. For example, to properly ignore "empty" XML elements
-(which might contain quoted strings):
-
- my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
-
- extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
-
-C<gen_delimited_pat> may also be called with an optional second argument,
-which specifies the "escape" character(s) to be used for each delimiter.
-For example to match a Pascal-style string (where ' is the delimiter
-and '' is a literal ' within the string):
-
- gen_delimited_pat(q{'},q{'});
-
-Different escape characters can be specified for different delimiters.
-For example, to specify that '/' is the escape for single quotes
-and '%' is the escape for double quotes:
-
- gen_delimited_pat(q{'"},q{/%});
-
-If more delimiters than escape chars are specified, the last escape char
-is used for the remaining delimiters.
-If no escape char is specified for a given specified delimiter, '\' is used.
-
-=head2 C<delimited_pat>
-
-Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
-That name may still be used, but is now deprecated.
-
-
-=head1 DIAGNOSTICS
-
-In a list context, all the functions return C<(undef,$original_text)>
-on failure. In a scalar context, failure is indicated by returning C<undef>
-(in this case the input text is not modified in any way).
-
-In addition, on failure in I<any> context, the C<$@> variable is set.
-Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
-below.
-Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
-which the error was detected (although not necessarily where it occurred!)
-Printing C<$@> directly produces the error message, with the offset appended.
-On success, the C<$@> variable is guaranteed to be C<undef>.
-
-The available diagnostics are:
-
-=over 4
-
-=item C<Did not find a suitable bracket: "%s">
-
-The delimiter provided to C<extract_bracketed> was not one of
-C<'()[]E<lt>E<gt>{}'>.
-
-=item C<Did not find prefix: /%s/>
-
-A non-optional prefix was specified but wasn't found at the start of the text.
-
-=item C<Did not find opening bracket after prefix: "%s">
-
-C<extract_bracketed> or C<extract_codeblock> was expecting a
-particular kind of bracket at the start of the text, and didn't find it.
-
-=item C<No quotelike operator found after prefix: "%s">
-
-C<extract_quotelike> didn't find one of the quotelike operators C<q>,
-C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
-it was extracting.
-
-=item C<Unmatched closing bracket: "%c">
-
-C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
-a closing bracket where none was expected.
-
-=item C<Unmatched opening bracket(s): "%s">
-
-C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran
-out of characters in the text before closing one or more levels of nested
-brackets.
-
-=item C<Unmatched embedded quote (%s)>
-
-C<extract_bracketed> attempted to match an embedded quoted substring, but
-failed to find a closing quote to match it.
-
-=item C<Did not find closing delimiter to match '%s'>
-
-C<extract_quotelike> was unable to find a closing delimiter to match the
-one that opened the quote-like operation.
-
-=item C<Mismatched closing bracket: expected "%c" but found "%s">
-
-C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
-a valid bracket delimiter, but it was the wrong species. This usually
-indicates a nesting error, but may indicate incorrect quoting or escaping.
-
-=item C<No block delimiter found after quotelike "%s">
-
-C<extract_quotelike> or C<extract_codeblock> found one of the
-quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
-without a suitable block after it.
-
-=item C<Did not find leading dereferencer>
-
-C<extract_variable> was expecting one of '$', '@', or '%' at the start of
-a variable, but didn't find any of them.
-
-=item C<Bad identifier after dereferencer>
-
-C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
-character was not followed by a legal Perl identifier.
-
-=item C<Did not find expected opening bracket at %s>
-
-C<extract_codeblock> failed to find any of the outermost opening brackets
-that were specified.
-
-=item C<Improperly nested codeblock at %s>
-
-A nested code block was found that started with a delimiter that was specified
-as being only to be used as an outermost bracket.
-
-=item C<Missing second block for quotelike "%s">
-
-C<extract_codeblock> or C<extract_quotelike> found one of the
-quotelike operators C<s>, C<tr> or C<y> followed by only one block.
-
-=item C<No match found for opening bracket>
-
-C<extract_codeblock> failed to find a closing bracket to match the outermost
-opening bracket.
-
-=item C<Did not find opening tag: /%s/>
-
-C<extract_tagged> did not find a suitable opening tag (after any specified
-prefix was removed).
-
-=item C<Unable to construct closing tag to match: /%s/>
-
-C<extract_tagged> matched the specified opening tag and tried to
-modify the matched text to produce a matching closing tag (because
-none was specified). It failed to generate the closing tag, almost
-certainly because the opening tag did not start with a
-bracket of some kind.
-
-=item C<Found invalid nested tag: %s>
-
-C<extract_tagged> found a nested tag that appeared in the "reject" list
-(and the failure mode was not "MAX" or "PARA").
-
-=item C<Found unbalanced nested tag: %s>
-
-C<extract_tagged> found a nested opening tag that was not matched by a
-corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
-
-=item C<Did not find closing tag>
-
-C<extract_tagged> reached the end of the text without finding a closing tag
-to match the original opening tag (and the failure mode was not
-"MAX" or "PARA").
-
-=back
-
-=head1 AUTHOR
-
-Damian Conway (damian@conway.org)
-
-=head1 BUGS AND IRRITATIONS
-
-There are undoubtedly serious bugs lurking somewhere in this code, if
-only because parts of it give the impression of understanding a great deal
-more about Perl than they really do.
-
-Bug reports and other feedback are most welcome.
-
-=head1 COPYRIGHT
-
-Copyright 1997 - 2001 Damian Conway. All Rights Reserved.
-
-Some (minor) parts copyright 2009 Adam Kennedy.
-
-This module is free software. It may be used, redistributed
-and/or modified under the same terms as Perl itself.
-
-=cut
diff --git a/ext/Text-Balanced/t/01_compile.t b/ext/Text-Balanced/t/01_compile.t
deleted file mode 100644
index 77c1099995..0000000000
--- a/ext/Text-Balanced/t/01_compile.t
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-BEGIN {
- $| = 1;
- $^W = 1;
-}
-
-use Test::More tests => 1;
-
-use_ok( 'Text::Balanced' );
diff --git a/ext/Text-Balanced/t/02_extbrk.t b/ext/Text-Balanced/t/02_extbrk.t
deleted file mode 100644
index a36025ddb0..0000000000
--- a/ext/Text-Balanced/t/02_extbrk.t
+++ /dev/null
@@ -1,76 +0,0 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-######################### We start with some black magic to print on failure.
-
-# Change 1..1 below to 1..last_test_to_print .
-# (It may become useful if the test is moved to ./t subdirectory.)
-
-BEGIN { $| = 1; print "1..19\n"; }
-END {print "not ok 1\n" unless $loaded;}
-use Text::Balanced qw ( extract_bracketed );
-$loaded = 1;
-print "ok 1\n";
-$count=2;
-use vars qw( $DEBUG );
-sub debug { print "\t>>>",@_ if $DEBUG }
-
-######################### End of black magic.
-
-
-$cmd = "print";
-$neg = 0;
-while (defined($str = <DATA>))
-{
- chomp $str;
- if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; }
- elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; }
- elsif (!$str || $str =~ /\A#/) { $neg = 0; next }
- $str =~ s/\\n/\n/g;
- debug "\tUsing: $cmd\n";
- debug "\t on: [$str]\n";
-
- $var = eval "() = $cmd";
- debug "\t list got: [$var]\n";
- debug "\t list left: [$str]\n";
- print "not " if (substr($str,pos($str),1) eq ';')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-
- pos $str = 0;
- $var = eval $cmd;
- $var = "<undef>" unless defined $var;
- debug "\t scalar got: [$var]\n";
- debug "\t scalar left: [$str]\n";
- print "not " if ($str =~ '\A;')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-}
-
-__DATA__
-
-# USING: extract_bracketed($str);
-{a nested { and } are okay as are () and <> pairs and escaped \}'s };
-{a nested\n{ and } are okay as are\n() and <> pairs and escaped \}'s };
-
-# USING: extract_bracketed($str,'{}');
-{a nested { and } are okay as are unbalanced ( and < pairs and escaped \}'s };
-
-# THESE SHOULD FAIL
-{an unmatched nested { isn't okay, nor are ( and < };
-{an unbalanced nested [ even with } and ] to match them;
-
-
-# USING: extract_bracketed($str,'<"`q>');
-<a q{uoted} ">" unbalanced right bracket of /(q>)/ either sort (`>>>""">>>>`) is okay >;
-
-# USING: extract_bracketed($str,'<">');
-<a quoted ">" unbalanced right bracket is okay >;
-
-# USING: extract_bracketed($str,'<"`>');
-<a quoted ">" unbalanced right bracket of either sort (`>>>""">>>>`) is okay >;
-
-# THIS SHOULD FAIL
-<a misquoted '>' unbalanced right bracket is bad >;
diff --git a/ext/Text-Balanced/t/03_extcbk.t b/ext/Text-Balanced/t/03_extcbk.t
deleted file mode 100644
index 83081ae28d..0000000000
--- a/ext/Text-Balanced/t/03_extcbk.t
+++ /dev/null
@@ -1,95 +0,0 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-######################### We start with some black magic to print on failure.
-
-# Change 1..1 below to 1..last_test_to_print .
-# (It may become useful if the test is moved to ./t subdirectory.)
-
-BEGIN { $| = 1; print "1..41\n"; }
-END {print "not ok 1\n" unless $loaded;}
-use Text::Balanced qw ( extract_codeblock );
-$loaded = 1;
-print "ok 1\n";
-$count=2;
-use vars qw( $DEBUG );
-sub debug { print "\t>>>",@_ if $DEBUG }
-
-######################### End of black magic.
-
-
-$cmd = "print";
-$neg = 0;
-while (defined($str = <DATA>))
-{
- chomp $str;
- if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; }
- elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; }
- elsif (!$str || $str =~ /\A#/) { $neg = 0; next }
- $str =~ s/\\n/\n/g;
- debug "\tUsing: $cmd\n";
- debug "\t on: [$str]\n";
-
- my @res;
- $var = eval "\@res = $cmd";
- debug "\t Failed: $@ at " . $@+0 .")" if $@;
- debug "\t list got: [" . join("|", map {defined $_ ? $_ : '<undef>'} @res) . "]\n";
- debug "\t list left: [$str]\n";
- print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg;
- print "ok ", $count++;
- print "\n";
-
- pos $str = 0;
- $var = eval $cmd;
- $var = "<undef>" unless defined $var;
- debug "\t scalar got: [$var]\n";
- debug "\t scalar left: [$str]\n";
- print "not " if ($str =~ '\A;')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-}
-
-__DATA__
-
-# USING: extract_codeblock($str,'(){}',undef,'()');
-(Foo(')'));
-
-# USING: extract_codeblock($str);
-{ $data[4] =~ /['"]/; };
-
-# USING: extract_codeblock($str,'<>');
-< %x = ( try => "this") >;
-< %x = () >;
-< %x = ( $try->{this}, "too") >;
-< %'x = ( $try->{this}, "too") >;
-< %'x'y = ( $try->{this}, "too") >;
-< %::x::y = ( $try->{this}, "too") >;
-
-# THIS SHOULD FAIL
-< %x = do { $try > 10 } >;
-
-# USING: extract_codeblock($str);
-
-{ $a = /\}/; };
-{ sub { $_[0] /= $_[1] } }; # / here
-{ 1; };
-{ $a = 1; };
-
-
-# USING: extract_codeblock($str,undef,'=*');
-========{$a=1};
-
-# USING: extract_codeblock($str,'{}<>');
-< %x = do { $try > 10 } >;
-
-# USING: extract_codeblock($str,'{}',undef,'<>');
-< %x = do { $try > 10 } >;
-
-# USING: extract_codeblock($str,'{}');
-{ $a = $b; # what's this doing here? \n };'
-{ $a = $b; \n $a =~ /$b/; \n @a = map /\s/ @b };
-
-# THIS SHOULD FAIL
-{ $a = $b; # what's this doing here? };'
-{ $a = $b; # what's this doing here? ;'
diff --git a/ext/Text-Balanced/t/04_extdel.t b/ext/Text-Balanced/t/04_extdel.t
deleted file mode 100644
index c5ca88eebf..0000000000
--- a/ext/Text-Balanced/t/04_extdel.t
+++ /dev/null
@@ -1,90 +0,0 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-######################### We start with some black magic to print on failure.
-
-# Change 1..1 below to 1..last_test_to_print .
-# (It may become useful if the test is moved to ./t subdirectory.)
-
-BEGIN { $| = 1; print "1..45\n"; }
-END {print "not ok 1\n" unless $loaded;}
-use Text::Balanced qw ( extract_delimited );
-$loaded = 1;
-print "ok 1\n";
-$count=2;
-use vars qw( $DEBUG );
-sub debug { print "\t>>>",@_ if $DEBUG }
-
-######################### End of black magic.
-
-
-$cmd = "print";
-$neg = 0;
-while (defined($str = <DATA>))
-{
- chomp $str;
- if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; }
- elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; }
- elsif (!$str || $str =~ /\A#/) { $neg = 0; next }
- $str =~ s/\\n/\n/g;
- debug "\tUsing: $cmd\n";
- debug "\t on: [$str]\n";
-
- $var = eval "() = $cmd";
- debug "\t list got: [$var]\n";
- debug "\t list left: [$str]\n";
- print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-
- pos $str = 0;
- $var = eval $cmd;
- $var = "<undef>" unless defined $var;
- debug "\t scalar got: [$var]\n";
- debug "\t scalar left: [$str]\n";
- print "not " if ($str =~ '\A;')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-}
-
-__DATA__
-# USING: extract_delimited($str,'/#$',undef,'/#$');
-/a/;
-/a///;
-#b#;
-#b###;
-$c$;
-$c$$$;
-
-# TEST EXTRACTION OF DELIMITED TEXT WITH ESCAPES
-# USING: extract_delimited($str,'/#$',undef,'\\');
-/a/;
-/a\//;
-#b#;
-#b\##;
-$c$;
-$c\$$;
-
-# TEST EXTRACTION OF DELIMITED TEXT
-# USING: extract_delimited($str);
-'a';
-"b";
-`c`;
-'a\'';
-'a\\';
-'\\a';
-"a\\";
-"\\a";
-"b\'\"\'";
-`c '\`abc\`'`;
-
-# TEST EXTRACTION OF DELIMITED TEXT
-# USING: extract_delimited($str,'/#$','-->');
--->/a/;
--->#b#;
--->$c$;
-
-# THIS SHOULD FAIL
-$c$;
diff --git a/ext/Text-Balanced/t/05_extmul.t b/ext/Text-Balanced/t/05_extmul.t
deleted file mode 100644
index 2ac1b19ffd..0000000000
--- a/ext/Text-Balanced/t/05_extmul.t
+++ /dev/null
@@ -1,319 +0,0 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-######################### We start with some black magic to print on failure.
-
-# Change 1..1 below to 1..last_test_to_print .
-# (It may become useful if the test is moved to ./t subdirectory.)
-
-BEGIN { $| = 1; print "1..86\n"; }
-END {print "not ok 1\n" unless $loaded;}
-use Text::Balanced qw ( :ALL );
-$loaded = 1;
-print "ok 1\n";
-$count=2;
-use vars qw( $DEBUG );
-sub debug { print "\t>>>",@_ if $DEBUG }
-
-######################### End of black magic.
-
-sub expect
-{
- local $^W;
- my ($l1, $l2) = @_;
-
- if (@$l1 != @$l2)
- {
- print "\@l1: ", join(", ", @$l1), "\n";
- print "\@l2: ", join(", ", @$l2), "\n";
- print "not ";
- }
- else
- {
- for (my $i = 0; $i < @$l1; $i++)
- {
- if ($l1->[$i] ne $l2->[$i])
- {
- print "field $i: '$l1->[$i]' ne '$l2->[$i]'\n";
- print "not ";
- last;
- }
- }
- }
-
- print "ok $count\n";
- $count++;
-}
-
-sub divide
-{
- my ($text, @index) = @_;
- my @bits = ();
- unshift @index, 0;
- push @index, length($text);
- for ( my $i= 0; $i < $#index; $i++)
- {
- push @bits, substr($text, $index[$i], $index[$i+1]-$index[$i]);
- }
- pop @bits;
- return @bits;
-
-}
-
-
-$stdtext1 = q{$var = do {"val" && $val;};};
-
-# TESTS 2-4
-$text = $stdtext1;
-expect [ extract_multiple($text,undef,1) ],
- [ divide $stdtext1 => 4 ];
-
-expect [ pos $text], [ 4 ];
-expect [ $text ], [ $stdtext1 ];
-
-# TESTS 5-7
-$text = $stdtext1;
-expect [ scalar extract_multiple($text,undef,1) ],
- [ divide $stdtext1 => 4 ];
-
-expect [ pos $text], [ 0 ];
-expect [ $text ], [ substr($stdtext1,4) ];
-
-
-# TESTS 8-10
-$text = $stdtext1;
-expect [ extract_multiple($text,undef,2) ],
- [ divide($stdtext1 => 4, 10) ];
-
-expect [ pos $text], [ 10 ];
-expect [ $text ], [ $stdtext1 ];
-
-# TESTS 11-13
-$text = $stdtext1;
-expect [ eval{local$^W;scalar extract_multiple($text,undef,2)} ],
- [ substr($stdtext1,0,4) ];
-
-expect [ pos $text], [ 0 ];
-expect [ $text ], [ substr($stdtext1,4) ];
-
-
-# TESTS 14-16
-$text = $stdtext1;
-expect [ extract_multiple($text,undef,3) ],
- [ divide($stdtext1 => 4, 10, 26) ];
-
-expect [ pos $text], [ 26 ];
-expect [ $text ], [ $stdtext1 ];
-
-# TESTS 17-19
-$text = $stdtext1;
-expect [ eval{local$^W;scalar extract_multiple($text,undef,3)} ],
- [ substr($stdtext1,0,4) ];
-
-expect [ pos $text], [ 0 ];
-expect [ $text ], [ substr($stdtext1,4) ];
-
-
-# TESTS 20-22
-$text = $stdtext1;
-expect [ extract_multiple($text,undef,4) ],
- [ divide($stdtext1 => 4, 10, 26, 27) ];
-
-expect [ pos $text], [ 27 ];
-expect [ $text ], [ $stdtext1 ];
-
-# TESTS 23-25
-$text = $stdtext1;
-expect [ eval{local$^W;scalar extract_multiple($text,undef,4)} ],
- [ substr($stdtext1,0,4) ];
-
-expect [ pos $text], [ 0 ];
-expect [ $text ], [ substr($stdtext1,4) ];
-
-
-# TESTS 26-28
-$text = $stdtext1;
-expect [ extract_multiple($text,undef,5) ],
- [ divide($stdtext1 => 4, 10, 26, 27) ];
-
-expect [ pos $text], [ 27 ];
-expect [ $text ], [ $stdtext1 ];
-
-
-# TESTS 29-31
-$text = $stdtext1;
-expect [ eval{local$^W;scalar extract_multiple($text,undef,5)} ],
- [ substr($stdtext1,0,4) ];
-
-expect [ pos $text], [ 0 ];
-expect [ $text ], [ substr($stdtext1,4) ];
-
-
-
-# TESTS 32-34
-$stdtext2 = q{$var = "val" && (1,2,3);};
-
-$text = $stdtext2;
-expect [ extract_multiple($text) ],
- [ divide($stdtext2 => 4, 7, 12, 24) ];
-
-expect [ pos $text], [ 24 ];
-expect [ $text ], [ $stdtext2 ];
-
-# TESTS 35-37
-$text = $stdtext2;
-expect [ scalar extract_multiple($text) ],
- [ substr($stdtext2,0,4) ];
-
-expect [ pos $text], [ 0 ];
-expect [ $text ], [ substr($stdtext2,4) ];
-
-
-# TESTS 38-40
-$text = $stdtext2;
-expect [ extract_multiple($text,[\&extract_bracketed]) ],
- [ substr($stdtext2,0,16), substr($stdtext2,16,7), substr($stdtext2,23) ];
-
-expect [ pos $text], [ 24 ];
-expect [ $text ], [ $stdtext2 ];
-
-# TESTS 41-43
-$text = $stdtext2;
-expect [ scalar extract_multiple($text,[\&extract_bracketed]) ],
- [ substr($stdtext2,0,16) ];
-
-expect [ pos $text], [ 0 ];
-expect [ $text ], [ substr($stdtext2,15) ];
-
-
-# TESTS 44-46
-$text = $stdtext2;
-expect [ extract_multiple($text,[\&extract_variable]) ],
- [ substr($stdtext2,0,4), substr($stdtext2,4) ];
-
-expect [ pos $text], [ length($text) ];
-expect [ $text ], [ $stdtext2 ];
-
-# TESTS 47-49
-$text = $stdtext2;
-expect [ scalar extract_multiple($text,[\&extract_variable]) ],
- [ substr($stdtext2,0,4) ];
-
-expect [ pos $text], [ 0 ];
-expect [ $text ], [ substr($stdtext2,4) ];
-
-
-# TESTS 50-52
-$text = $stdtext2;
-expect [ extract_multiple($text,[\&extract_quotelike]) ],
- [ substr($stdtext2,0,7), substr($stdtext2,7,5), substr($stdtext2,12) ];
-
-expect [ pos $text], [ length($text) ];
-expect [ $text ], [ $stdtext2 ];
-
-# TESTS 53-55
-$text = $stdtext2;
-expect [ scalar extract_multiple($text,[\&extract_quotelike]) ],
- [ substr($stdtext2,0,7) ];
-
-expect [ pos $text], [ 0 ];
-expect [ $text ], [ substr($stdtext2,6) ];
-
-
-# TESTS 56-58
-$text = $stdtext2;
-expect [ extract_multiple($text,[\&extract_quotelike],2,1) ],
- [ substr($stdtext2,7,5) ];
-
-expect [ pos $text], [ 23 ];
-expect [ $text ], [ $stdtext2 ];
-
-# TESTS 59-61
-$text = $stdtext2;
-expect [ eval{local$^W;scalar extract_multiple($text,[\&extract_quotelike],2,1)} ],
- [ substr($stdtext2,7,5) ];
-
-expect [ pos $text], [ 6 ];
-expect [ $text ], [ substr($stdtext2,0,6). substr($stdtext2,12) ];
-
-
-# TESTS 62-64
-$text = $stdtext2;
-expect [ extract_multiple($text,[\&extract_quotelike],1,1) ],
- [ substr($stdtext2,7,5) ];
-
-expect [ pos $text], [ 12 ];
-expect [ $text ], [ $stdtext2 ];
-
-# TESTS 65-67
-$text = $stdtext2;
-expect [ scalar extract_multiple($text,[\&extract_quotelike],1,1) ],
- [ substr($stdtext2,7,5) ];
-
-expect [ pos $text], [ 6 ];
-expect [ $text ], [ substr($stdtext2,0,6). substr($stdtext2,12) ];
-
-# TESTS 68-70
-my $stdtext3 = "a,b,c";
-
-$_ = $stdtext3;
-expect [ extract_multiple(undef, [ sub { /\G[a-z]/gc && $& } ]) ],
- [ divide($stdtext3 => 1,2,3,4,5) ];
-
-expect [ pos ], [ 5 ];
-expect [ $_ ], [ $stdtext3 ];
-
-# TESTS 71-73
-
-$_ = $stdtext3;
-expect [ scalar extract_multiple(undef, [ sub { /\G[a-z]/gc && $& } ]) ],
- [ divide($stdtext3 => 1) ];
-
-expect [ pos ], [ 0 ];
-expect [ $_ ], [ substr($stdtext3,1) ];
-
-
-# TESTS 74-76
-
-$_ = $stdtext3;
-expect [ extract_multiple(undef, [ qr/\G[a-z]/ ]) ],
- [ divide($stdtext3 => 1,2,3,4,5) ];
-
-expect [ pos ], [ 5 ];
-expect [ $_ ], [ $stdtext3 ];
-
-# TESTS 77-79
-
-$_ = $stdtext3;
-expect [ scalar extract_multiple(undef, [ qr/\G[a-z]/ ]) ],
- [ divide($stdtext3 => 1) ];
-
-expect [ pos ], [ 0 ];
-expect [ $_ ], [ substr($stdtext3,1) ];
-
-
-# TESTS 80-82
-
-$_ = $stdtext3;
-expect [ extract_multiple(undef, [ q/([a-z]),?/ ]) ],
- [ qw(a b c) ];
-
-expect [ pos ], [ 5 ];
-expect [ $_ ], [ $stdtext3 ];
-
-# TESTS 83-85
-
-$_ = $stdtext3;
-expect [ scalar extract_multiple(undef, [ q/([a-z]),?/ ]) ],
- [ divide($stdtext3 => 1) ];
-
-expect [ pos ], [ 0 ];
-expect [ $_ ], [ substr($stdtext3,2) ];
-
-
-# TEST 86
-
-# Fails in Text-Balanced-1.95 with result ['1 ', '""', '1234']
-$_ = q{ ""1234};
-expect [ extract_multiple(undef, [\&extract_quotelike]) ],
- [ ' ', '""', '1234' ];
diff --git a/ext/Text-Balanced/t/06_extqlk.t b/ext/Text-Balanced/t/06_extqlk.t
deleted file mode 100644
index 6badc0ee18..0000000000
--- a/ext/Text-Balanced/t/06_extqlk.t
+++ /dev/null
@@ -1,135 +0,0 @@
-#! /usr/local/bin/perl -ws
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-######################### We start with some black magic to print on failure.
-
-# Change 1..1 below to 1..last_test_to_print .
-# (It may become useful if the test is moved to ./t subdirectory.)
-
-BEGIN { $| = 1; print "1..95\n"; }
-END {print "not ok 1\n" unless $loaded;}
-use Text::Balanced qw ( extract_quotelike );
-$loaded = 1;
-print "ok 1\n";
-$count=2;
-use vars qw( $DEBUG );
-#$DEBUG=1;
-sub debug { print "\t>>>",@_ if $ENV{DEBUG} }
-sub esc { my $x = shift||'<undef>'; $x =~ s/\n/\\n/gs; $x }
-
-######################### End of black magic.
-
-
-$cmd = "print";
-$neg = 0;
-while (defined($str = <DATA>))
-{
- chomp $str;
- if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; }
- elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; }
- elsif (!$str || $str =~ /\A#/) { $neg = 0; next }
- my $setup_cmd = ($str =~ s/\A\{(.*)\}//) ? $1 : '';
- my $tests = 'sl';
- $str =~ s/\\n/\n/g;
- my $orig = $str;
-
- eval $setup_cmd if $setup_cmd ne '';
- if($tests =~ /l/) {
- debug "\tUsing: $cmd\n";
- debug "\t on: [" . esc($setup_cmd) . "][" . esc($str) . "]\n";
- my @res;
- eval qq{\@res = $cmd; };
- debug "\t got:\n" . join "", map { "\t\t\t$_: [" . esc($res[$_]) . "]\n"} (0..$#res);
- debug "\t left: [" . esc($str) . "]\n";
- debug "\t pos: [" . esc(substr($str,pos($str))) . "...]\n";
- print "not " if (substr($str,pos($str),1) eq ';')==$neg;
- print "ok ", $count++;
- print "\n";
- }
-
- eval $setup_cmd if $setup_cmd ne '';
- if($tests =~ /s/) {
- $str = $orig;
- debug "\tUsing: scalar $cmd\n";
- debug "\t on: [" . esc($str) . "]\n";
- $var = eval $cmd;
- print " ($@)" if $@ && $DEBUG;
- $var = "<undef>" unless defined $var;
- debug "\t scalar got: [" . esc($var) . "]\n";
- debug "\t scalar left: [" . esc($str) . "]\n";
- print "not " if ($str =~ '\A;')==$neg;
- print "ok ", $count++;
- print "\n";
- }
-}
-
-# fails in Text::Balanced 1.95
-$_ = qq(s{}{});
-my @z = extract_quotelike();
-print "not " if $z[0] eq '';
-print "ok ", $count++;
-print "\n";
-
-
-__DATA__
-
-# USING: extract_quotelike($str);
-'';
-"";
-"a";
-'b';
-`cc`;
-
-
-<<EOHERE; done();\nline1\nline2\nEOHERE\n; next;
- <<EOHERE; done();\nline1\nline2\nEOHERE\n; next;
-<<"EOHERE"; done()\nline1\nline2\nEOHERE\n and next
-<<`EOHERE`; done()\nline1\nline2\nEOHERE\n and next
-<<'EOHERE'; done()\nline1\n'line2'\nEOHERE\n and next
-<<'EOHERE;'; done()\nline1\nline2\nEOHERE;\n and next
-<<" EOHERE"; done() \nline1\nline2\n EOHERE\nand next
-<<""; done()\nline1\nline2\n\n and next
-<<; done()\nline1\nline2\n\n and next
-# fails in Text::Balanced 1.95
-<<EOHERE;\nEOHERE\n;
-# fails in Text::Balanced 1.95
-<<"*";\n\n*\n;
-
-"this is a nested $var[$x] {";
-/a/gci;
-m/a/gci;
-
-q(d);
-qq(e);
-qx(f);
-qr(g);
-qw(h i j);
-q{d};
-qq{e};
-qx{f};
-qr{g};
-qq{a nested { and } are okay as are () and <> pairs and escaped \}'s };
-q/slash/;
-q # slash #;
-qr qw qx;
-
-s/x/y/;
-s/x/y/cgimsox;
-s{a}{b};
-s{a}\n {b};
-s(a){b};
-s(a)/b/;
-s/'/\\'/g;
-tr/x/y/;
-y/x/y/;
-
-# fails on Text-Balanced-1.95
-{ $tests = 'l'; pos($str)=6 }012345<<E;\n\nE\n
-
-# THESE SHOULD FAIL
-s<$self->{pat}>{$self->{sub}}; # CAN'T HANDLE '>' in '->'
-s-$self->{pap}-$self->{sub}-; # CAN'T HANDLE '-' in '->'
-<<EOHERE; done();\nline1\nline2\nEOHERE;\n; next; # RDEL HAS NO ';'
-<<'EOHERE'; done();\nline1\nline2\nEOHERE;\n; next; # RDEF HAS NO ';'
- << EOTHERE; done();\nline1\nline2\n EOTHERE\n; next; # RDEL IS "" (!)
diff --git a/ext/Text-Balanced/t/07_exttag.t b/ext/Text-Balanced/t/07_exttag.t
deleted file mode 100644
index 16a48b2ae3..0000000000
--- a/ext/Text-Balanced/t/07_exttag.t
+++ /dev/null
@@ -1,113 +0,0 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-######################### We start with some black magic to print on failure.
-
-# Change 1..1 below to 1..last_test_to_print .
-# (It may become useful if the test is moved to ./t subdirectory.)
-
-BEGIN { $| = 1; print "1..53\n"; }
-END {print "not ok 1\n" unless $loaded;}
-use Text::Balanced qw ( extract_tagged gen_extract_tagged );
-$loaded = 1;
-print "ok 1\n";
-$count=2;
-use vars qw( $DEBUG );
-sub debug { print "\t>>>",@_ if $DEBUG }
-
-######################### End of black magic.
-
-
-$cmd = "print";
-$neg = 0;
-while (defined($str = <DATA>))
-{
- chomp $str;
- if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; }
- elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; }
- elsif (!$str || $str =~ /\A#/) { $neg = 0; next }
- $str =~ s/\\n/\n/g;
- debug "\tUsing: $cmd\n";
- debug "\t on: [$str]\n";
-
- my @res;
- $var = eval "\@res = $cmd";
- debug "\t list got: [" . join("|",map {defined $_ ? $_ : '<undef>'} @res) . "]\n";
- debug "\t list left: [$str]\n";
- print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-
- pos $str = 0;
- $var = eval $cmd;
- $var = "<undef>" unless defined $var;
- debug "\t scalar got: [$var]\n";
- debug "\t scalar left: [$str]\n";
- print "not " if ($str =~ '\A;')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-}
-
-__DATA__
-# USING: gen_extract_tagged("BEGIN([A-Z]+)",'END$1',"(?s).*?(?=BEGIN)")->($str);
- ignore\n this and then BEGINHERE at the ENDHERE;
- ignore\n this and then BEGINTHIS at the ENDTHIS;
-
-# USING: extract_tagged($str,"BEGIN([A-Z]+)",'END$1',"(?s).*?(?=BEGIN)");
- ignore\n this and then BEGINHERE at the ENDHERE;
- ignore\n this and then BEGINTHIS at the ENDTHIS;
-
-# USING: extract_tagged($str,"BEGIN([A-Z]+)",'END$1',"(?s).*?(?=BEGIN)");
- ignore\n this and then BEGINHERE at the ENDHERE;
- ignore\n this and then BEGINTHIS at the ENDTHIS;
-
-# THIS SHOULD FAIL
- ignore\n this and then BEGINTHIS at the ENDTHAT;
-
-# USING: extract_tagged($str,"BEGIN","END","(?s).*?(?=BEGIN)");
- ignore\n this and then BEGIN at the END;
-
-# USING: extract_tagged($str);
- <A-1 HREF="#section2">some text</A-1>;
-
-# USING: extract_tagged($str,qr/<[A-Z]+>/,undef, undef, {ignore=>["<BR>"]});
- <A>aaa<B>bbb<BR>ccc</B>ddd</A>;
-
-# USING: extract_tagged($str,"BEGIN","END");
- BEGIN at the BEGIN keyword and END at the END;
- BEGIN at the beginning and end at the END;
-
-# USING: extract_tagged($str,undef,undef,undef,{ignore=>["<[^>]*/>"]});
- <A>aaa<B>bbb<BR/>ccc</B>ddd</A>;
-
-# USING: extract_tagged($str,";","-",undef,{reject=>[";"],fail=>"MAX"});
- ; at the ;-) keyword
-
-# USING: extract_tagged($str,"<[A-Z]+>",undef, undef, {ignore=>["<BR>"]});
- <A>aaa<B>bbb<BR>ccc</B>ddd</A>;
-
-# THESE SHOULD FAIL
- BEGIN at the beginning and end at the end;
- BEGIN at the BEGIN keyword and END at the end;
-
-# TEST EXTRACTION OF TAGGED STRINGS
-# USING: extract_tagged($str,"BEGIN","END",undef,{reject=>["BEGIN","END"]});
-# THESE SHOULD FAIL
- BEGIN at the BEGIN keyword and END at the end;
-
-# USING: extract_tagged($str,";","-",undef,{reject=>[";"],fail=>"PARA"});
- ; at the ;-) keyword
-
-
-# USING: extract_tagged($str);
- <A>some text</A>;
- <B>some text<A>other text</A></B>;
- <A>some text<A>other text</A></A>;
- <A HREF="#section2">some text</A>;
-
-# THESE SHOULD FAIL
- <A>some text
- <A>some text<A>other text</A>;
- <B>some text<A>other text</B>;
diff --git a/ext/Text-Balanced/t/08_extvar.t b/ext/Text-Balanced/t/08_extvar.t
deleted file mode 100644
index a33ac919ec..0000000000
--- a/ext/Text-Balanced/t/08_extvar.t
+++ /dev/null
@@ -1,153 +0,0 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-######################### We start with some black magic to print on failure.
-
-# Change 1..1 below to 1..last_test_to_print .
-# (It may become useful if the test is moved to ./t subdirectory.)
-
-BEGIN { $| = 1; print "1..183\n"; }
-END {print "not ok 1\n" unless $loaded;}
-use Text::Balanced qw ( extract_variable );
-$loaded = 1;
-print "ok 1\n";
-$count=2;
-use vars qw( $DEBUG );
-sub debug { print "\t>>>",@_ if $DEBUG }
-
-######################### End of black magic.
-
-
-$cmd = "print";
-$neg = 0;
-while (defined($str = <DATA>))
-{
- chomp $str;
- if ($str =~ s/\A# USING://) { $neg = 0; $cmd = $str; next; }
- elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; }
- elsif (!$str || $str =~ /\A#/) { $neg = 0; next }
- $str =~ s/\\n/\n/g;
- debug "\tUsing: $cmd\n";
- debug "\t on: [$str]\n";
-
- my @res;
- $var = eval "\@res = $cmd";
- debug "\t list got: [" . join("|",map {defined $_ ? $_ : '<undef>'} @res) . "]\n";
- debug "\t list left: [$str]\n";
- print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-
- pos $str = 0;
- $var = eval $cmd;
- $var = "<undef>" unless defined $var;
- debug "\t scalar got: [$var]\n";
- debug "\t scalar left: [$str]\n";
- print "not " if ($str =~ '\A;')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-}
-
-__DATA__
-
-# USING: extract_variable($str);
-# THESE SHOULD FAIL
-$a->;
-$a (1..3) { print $a };
-
-# USING: extract_variable($str);
-$::obj;
-$obj->nextval;
-*var;
-*$var;
-*{var};
-*{$var};
-*var{cat};
-\&var;
-\&mod::var;
-\&mod'var;
-$a;
-$_;
-$a[1];
-$_[1];
-$a{cat};
-$_{cat};
-$a->[1];
-$a->{"cat"}[1];
-@$listref;
-@{$listref};
-$obj->nextval;
-$obj->_nextval;
-$obj->next_val_;
-@{$obj->nextval};
-@{$obj->nextval($cat,$dog)->{new}};
-@{$obj->nextval($cat?$dog:$fish)->{new}};
-@{$obj->nextval(cat()?$dog:$fish)->{new}};
-$ a {'cat'};
-$a::b::c{d}->{$e->()};
-$a'b'c'd{e}->{$e->()};
-$a'b::c'd{e}->{$e->()};
-$#_;
-$#array;
-$#{array};
-$var[$#var];
-$1;
-$11;
-$&;
-$`;
-$';
-$+;
-$*;
-$.;
-$/;
-$|;
-$,;
-$";
-$;;
-$#;
-$%;
-$=;
-$-;
-$~;
-$^;
-$:;
-$^L;
-$^A;
-$?;
-$!;
-$^E;
-$@;
-$$;
-$<;
-$>;
-$(;
-$);
-$[;
-$];
-$^C;
-$^D;
-$^F;
-$^H;
-$^I;
-$^M;
-$^O;
-$^P;
-$^R;
-$^S;
-$^T;
-$^V;
-$^W;
-${^WARNING_BITS};
-${^WIDE_SYSTEM_CALLS};
-$^X;
-
-# THESE SHOULD FAIL
-$a->;
-@{$;
-$ a :: b :: c
-$ a ' b ' c
-
-# USING: extract_variable($str,'=*');
-========$a;
diff --git a/ext/Text-Balanced/t/09_gentag.t b/ext/Text-Balanced/t/09_gentag.t
deleted file mode 100644
index 0dd55a5f3f..0000000000
--- a/ext/Text-Balanced/t/09_gentag.t
+++ /dev/null
@@ -1,102 +0,0 @@
-# Before `make install' is performed this script should be runnable with
-# `make test'. After `make install' it should work as `perl test.pl'
-
-######################### We start with some black magic to print on failure.
-
-# Change 1..1 below to 1..last_test_to_print .
-# (It may become useful if the test is moved to ./t subdirectory.)
-
-BEGIN { $| = 1; print "1..37\n"; }
-END {print "not ok 1\n" unless $loaded;}
-use Text::Balanced qw ( gen_extract_tagged );
-$loaded = 1;
-print "ok 1\n";
-$count=2;
-use vars qw( $DEBUG );
-sub debug { print "\t>>>",@_ if $DEBUG }
-
-######################### End of black magic.
-
-
-$cmd = "print";
-$neg = 0;
-while (defined($str = <DATA>))
-{
- chomp $str;
- $str =~ s/\\n/\n/g;
- if ($str =~ s/\A# USING://)
- {
- $neg = 0;
- eval{local$^W;*f = eval $str || die};
- next;
- }
- elsif ($str =~ /\A# TH[EI]SE? SHOULD FAIL/) { $neg = 1; next; }
- elsif (!$str || $str =~ /\A#/) { $neg = 0; next }
- $str =~ s/\\n/\n/g;
- debug "\tUsing: $cmd\n";
- debug "\t on: [$str]\n";
-
- my @res;
- $var = eval { @res = f($str) };
- debug "\t list got: [" . join("|",map {defined $_ ? $_ : '<undef>'} @res) . "]\n";
- debug "\t list left: [$str]\n";
- print "not " if (substr($str,pos($str)||0,1) eq ';')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-
- pos $str = 0;
- $var = eval { scalar f($str) };
- $var = "<undef>" unless defined $var;
- debug "\t scalar got: [$var]\n";
- debug "\t scalar left: [$str]\n";
- print "not " if ($str =~ '\A;')==$neg;
- print "ok ", $count++;
- print " ($@)" if $@ && $DEBUG;
- print "\n";
-}
-
-__DATA__
-
-# USING: gen_extract_tagged('{','}');
- { a test };
-
-# USING: gen_extract_tagged(qr/<[A-Z]+>/,undef, undef, {ignore=>["<BR>"]});
- <A>aaa<B>bbb<BR>ccc</B>ddd</A>;
-
-# USING: gen_extract_tagged("BEGIN","END");
- BEGIN at the BEGIN keyword and END at the END;
- BEGIN at the beginning and end at the END;
-
-# USING: gen_extract_tagged(undef,undef,undef,{ignore=>["<[^>]*/>"]});
- <A>aaa<B>bbb<BR/>ccc</B>ddd</A>;
-
-# USING: gen_extract_tagged(";","-",undef,{reject=>[";"],fail=>"MAX"});
- ; at the ;-) keyword
-
-# USING: gen_extract_tagged("<[A-Z]+>",undef, undef, {ignore=>["<BR>"]});
- <A>aaa<B>bbb<BR>ccc</B>ddd</A>;
-
-# THESE SHOULD FAIL
- BEGIN at the beginning and end at the end;
- BEGIN at the BEGIN keyword and END at the end;
-
-# TEST EXTRACTION OF TAGGED STRINGS
-# USING: gen_extract_tagged("BEGIN","END",undef,{reject=>["BEGIN","END"]});
-# THESE SHOULD FAIL
- BEGIN at the BEGIN keyword and END at the end;
-
-# USING: gen_extract_tagged(";","-",undef,{reject=>[";"],fail=>"PARA"});
- ; at the ;-) keyword
-
-
-# USING: gen_extract_tagged();
- <A>some text</A>;
- <B>some text<A>other text</A></B>;
- <A>some text<A>other text</A></A>;
- <A HREF="#section2">some text</A>;
-
-# THESE SHOULD FAIL
- <A>some text
- <A>some text<A>other text</A>;
- <B>some text<A>other text</B>;