summaryrefslogtreecommitdiff
path: root/lib/HTML/PullParser.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/HTML/PullParser.pm')
-rw-r--r--lib/HTML/PullParser.pm209
1 files changed, 209 insertions, 0 deletions
diff --git a/lib/HTML/PullParser.pm b/lib/HTML/PullParser.pm
new file mode 100644
index 0000000..3083379
--- /dev/null
+++ b/lib/HTML/PullParser.pm
@@ -0,0 +1,209 @@
+package HTML::PullParser;
+
+require HTML::Parser;
+@ISA=qw(HTML::Parser);
+$VERSION = "3.57";
+
+use strict;
+use Carp ();
+
+sub new
+{
+ my($class, %cnf) = @_;
+
+ # Construct argspecs for the various events
+ my %argspec;
+ for (qw(start end text declaration comment process default)) {
+ my $tmp = delete $cnf{$_};
+ next unless defined $tmp;
+ $argspec{$_} = $tmp;
+ }
+ Carp::croak("Info not collected for any events")
+ unless %argspec;
+
+ my $file = delete $cnf{file};
+ my $doc = delete $cnf{doc};
+ Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
+ if defined($file) && defined($doc);
+ Carp::croak("No 'doc' or 'file' given to parse from")
+ unless defined($file) || defined($doc);
+
+ # Create object
+ $cnf{api_version} = 3;
+ my $self = $class->SUPER::new(%cnf);
+
+ my $accum = $self->{pullparser_accum} = [];
+ while (my($event, $argspec) = each %argspec) {
+ $self->SUPER::handler($event => $accum, $argspec);
+ }
+
+ if (defined $doc) {
+ $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
+ $self->{pullparser_str_pos} = 0;
+ }
+ else {
+ if (!ref($file) && ref(\$file) ne "GLOB") {
+ require IO::File;
+ $file = IO::File->new($file, "r") || return;
+ }
+
+ $self->{pullparser_file} = $file;
+ }
+ $self;
+}
+
+
+sub handler
+{
+ Carp::croak("Can't set handlers for HTML::PullParser");
+}
+
+
+sub get_token
+{
+ my $self = shift;
+ while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
+ if (my $f = $self->{pullparser_file}) {
+ # must try to parse more from the file
+ my $buf;
+ if (read($f, $buf, 512)) {
+ $self->parse($buf);
+ } else {
+ $self->eof;
+ $self->{pullparser_eof}++;
+ delete $self->{pullparser_file};
+ }
+ }
+ elsif (my $sref = $self->{pullparser_str_ref}) {
+ # must try to parse more from the scalar
+ my $pos = $self->{pullparser_str_pos};
+ my $chunk = substr($$sref, $pos, 512);
+ $self->parse($chunk);
+ $pos += length($chunk);
+ if ($pos < length($$sref)) {
+ $self->{pullparser_str_pos} = $pos;
+ }
+ else {
+ $self->eof;
+ $self->{pullparser_eof}++;
+ delete $self->{pullparser_str_ref};
+ delete $self->{pullparser_str_pos};
+ }
+ }
+ else {
+ die;
+ }
+ }
+ shift @{$self->{pullparser_accum}};
+}
+
+
+sub unget_token
+{
+ my $self = shift;
+ unshift @{$self->{pullparser_accum}}, @_;
+ $self;
+}
+
+1;
+
+
+__END__
+
+=head1 NAME
+
+HTML::PullParser - Alternative HTML::Parser interface
+
+=head1 SYNOPSIS
+
+ use HTML::PullParser;
+
+ $p = HTML::PullParser->new(file => "index.html",
+ start => 'event, tagname, @attr',
+ end => 'event, tagname',
+ ignore_elements => [qw(script style)],
+ ) || die "Can't open: $!";
+ while (my $token = $p->get_token) {
+ #...do something with $token
+ }
+
+=head1 DESCRIPTION
+
+The HTML::PullParser is an alternative interface to the HTML::Parser class.
+It basically turns the HTML::Parser inside out. You associate a file
+(or any IO::Handle object or string) with the parser at construction time and
+then repeatedly call $parser->get_token to obtain the tags and text
+found in the parsed document.
+
+The following methods are provided:
+
+=over 4
+
+=item $p = HTML::PullParser->new( file => $file, %options )
+
+=item $p = HTML::PullParser->new( doc => \$doc, %options )
+
+A C<HTML::PullParser> can be made to parse from either a file or a
+literal document based on whether the C<file> or C<doc> option is
+passed to the parser's constructor.
+
+The C<file> passed in can either be a file name or a file handle
+object. If a file name is passed, and it can't be opened for reading,
+then the constructor will return an undefined value and $! will tell
+you why it failed. Otherwise the argument is taken to be some object
+that the C<HTML::PullParser> can read() from when it needs more data.
+The stream will be read() until EOF, but not closed.
+
+A C<doc> can be passed plain or as a reference
+to a scalar. If a reference is passed then the value of this scalar
+should not be changed before all tokens have been extracted.
+
+Next the information to be returned for the different token types must
+be set up. This is done by simply associating an argspec (as defined
+in L<HTML::Parser>) with the events you have an interest in. For
+instance, if you want C<start> tokens to be reported as the string
+C<'S'> followed by the tagname and the attributes you might pass an
+C<start>-option like this:
+
+ $p = HTML::PullParser->new(
+ doc => $document_to_parse,
+ start => '"S", tagname, @attr',
+ end => '"E", tagname',
+ );
+
+At last other C<HTML::Parser> options, like C<ignore_tags>, and
+C<unbroken_text>, can be passed in. Note that you should not use the
+I<event>_h options to set up parser handlers. That would confuse the
+inner logic of C<HTML::PullParser>.
+
+=item $token = $p->get_token
+
+This method will return the next I<token> found in the HTML document,
+or C<undef> at the end of the document. The token is returned as an
+array reference. The content of this array match the argspec set up
+during C<HTML::PullParser> construction.
+
+=item $p->unget_token( @tokens )
+
+If you find out you have read too many tokens you can push them back,
+so that they are returned again the next time $p->get_token is called.
+
+=back
+
+=head1 EXAMPLES
+
+The 'eg/hform' script shows how we might parse the form section of
+HTML::Documents using HTML::PullParser.
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTML::TokeParser>
+
+=head1 COPYRIGHT
+
+Copyright 1998-2001 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut