1 files changed, 209 insertions, 0 deletions
diff --git a/lib/HTML/PullParser.pm b/lib/HTML/PullParser.pm
new file mode 100644
index 0000000..3083379
--- /dev/null
+++ b/lib/HTML/PullParser.pm
@@ -0,0 +1,209 @@
+package HTML::PullParser;
+# Pull-style interface layered on HTML::Parser, which it inherits from.
+require HTML::Parser;
+@ISA=qw(HTML::Parser);
+$VERSION = "3.57";
+# NOTE: @ISA and $VERSION are undeclared package globals; keep them above "use strict".
+use strict;
+use Carp ();
+
+sub new
+{
+    # Constructor.  %cnf holds the per-event argspecs, exactly one input source
+    # ('file' or 'doc') and any options to hand on to the parent constructor.
+    # Croaks on bad arguments; returns nothing if a file name can't be opened.
+    my($class, %cnf) = @_;
+
+    # Move the argspec of every requested event out of the option hash.
+    my %argspec;
+    for my $event (qw(start end text declaration comment process default)) {
+        my $spec = delete $cnf{$event};
+        $argspec{$event} = $spec if defined $spec;
+    }
+    Carp::croak("Info not collected for any events") if !%argspec;
+
+    # Exactly one of 'file' and 'doc' must have been given.
+    my($file, $doc) = (delete $cnf{file}, delete $cnf{doc});
+    if (defined($file) && defined($doc)) {
+        Carp::croak("Can't parse from both 'doc' and 'file' at the same time");
+    }
+    if (!defined($file) && !defined($doc)) {
+        Carp::croak("No 'doc' or 'file' given to parse from");
+    }
+
+    # Build the parser; one shared array is the handler of each selected event.
+    $cnf{api_version} = 3;
+    my $self  = $class->SUPER::new(%cnf);
+    my $queue = $self->{pullparser_accum} = [];
+    $self->SUPER::handler($_ => $queue, $argspec{$_}) for keys %argspec;
+
+    if (defined $doc) {
+        # Keep a reference to the document plus the offset parsed so far.
+        $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
+        $self->{pullparser_str_pos} = 0;
+        return $self;
+    }
+    # Anything that is neither a reference nor a bare glob is a file name.
+    unless (ref($file) || ref(\$file) eq "GLOB") {
+        require IO::File;
+        $file = IO::File->new($file, "r") or return;
+    }
+    $self->{pullparser_file} = $file;
+    return $self;
+}
+
+
+# Always croaks: the handlers of a pull parser cannot be changed by callers.
+sub handler {
+    Carp::croak("Can't set handlers for HTML::PullParser");
+}
+
+
+sub get_token
+{
+    # Returns the next queued token, feeding the parser chunks of up to 512
+    # units while the queue is empty; undef once input and queue are used up.
+    my $self = shift;
+    while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
+        if (my $f = $self->{pullparser_file}) {
+            # must try to parse more from the file (a read error ends input like EOF)
+            my $buf;
+            if (read($f, $buf, 512)) {
+                $self->parse($buf);
+            } else {
+                $self->eof;
+                $self->{pullparser_eof}++;
+                delete $self->{pullparser_file};
+            }
+        } elsif (my $sref = $self->{pullparser_str_ref}) {
+            # must try to parse more from the scalar
+            my $pos = $self->{pullparser_str_pos};
+            my $chunk = substr($$sref, $pos, 512);
+            $self->parse($chunk);
+            $pos += length($chunk);
+            if ($pos < length($$sref)) {
+                $self->{pullparser_str_pos} = $pos;
+            } else {
+                $self->eof;
+                $self->{pullparser_eof}++;
+                delete $self->{pullparser_str_ref};
+                delete $self->{pullparser_str_pos};
+            }
+        } else {
+            # no source although EOF was never flagged: report it by name
+            Carp::croak("HTML::PullParser: no input source left to parse from");
+        }
+    }
+    return shift @{$self->{pullparser_accum}};
+}
+
+
+sub unget_token {
+    # Put @tokens back at the head of the queue and return the parser itself.
+    my ($self, @tokens) = @_;
+    unshift @{ $self->{pullparser_accum} }, @tokens;
+    return $self;
+}
+
+1;
+
+
+__END__
+
+=head1 NAME
+
+HTML::PullParser - Alternative HTML::Parser interface
+
+=head1 SYNOPSIS
+
+ use HTML::PullParser;
+
+ $p = HTML::PullParser->new(file => "index.html",
+                            start => 'event, tagname, @attr',
+                            end   => 'event, tagname',
+                            ignore_elements => [qw(script style)],
+                           ) || die "Can't open: $!";
+ while (my $token = $p->get_token) {
+     #...do something with $token
+ }
+
+=head1 DESCRIPTION
+
+The HTML::PullParser is an alternative interface to the HTML::Parser class.
+It basically turns the HTML::Parser inside out.  You associate a file
+(or any IO::Handle object or string) with the parser at construction time and
+then repeatedly call $parser->get_token to obtain the tags and text
+found in the parsed document.
+
+The following methods are provided:
+
+=over 4
+
+=item $p = HTML::PullParser->new( file => $file, %options )
+
+=item $p = HTML::PullParser->new( doc => \$doc, %options )
+
+A C<HTML::PullParser> can be made to parse from either a file or a
+literal document based on whether the C<file> or C<doc> option is
+passed to the parser's constructor.
+
+The C<file> passed in can either be a file name or a file handle
+object.  If a file name is passed, and it can't be opened for reading,
+then the constructor will return an undefined value and $!  will tell
+you why it failed.  Otherwise the argument is taken to be some object
+that the C<HTML::PullParser> can read() from when it needs more data.
+The stream will be read() until EOF, but not closed.
+
+A C<doc> can be passed plain or as a reference
+to a scalar.  If a reference is passed then the value of this scalar
+should not be changed before all tokens have been extracted.
+
+Next, the information to be returned for the different token types must
+be set up.  This is done by simply associating an argspec (as defined
+in L<HTML::Parser>) with the events you have an interest in.  For
+instance, if you want C<start> tokens to be reported as the string
+C<'S'> followed by the tagname and the attributes, you might pass a
+C<start> option like this:
+
+   $p = HTML::PullParser->new(
+          doc   => $document_to_parse,
+          start => '"S", tagname, @attr',
+          end   => '"E", tagname',
+        );
+
+Finally, other C<HTML::Parser> options, like C<ignore_tags> and
+C<unbroken_text>, can be passed in.  Note that you should not use the
+I<event>_h options to set up parser handlers.  That would confuse the
+inner logic of C<HTML::PullParser>.
+
+=item $token = $p->get_token
+
+This method will return the next I<token> found in the HTML document,
+or C<undef> at the end of the document.  The token is returned as an
+array reference.  The contents of this array match the argspec set up
+during C<HTML::PullParser> construction.
+
+=item $p->unget_token( @tokens )
+
+If you find out you have read too many tokens you can push them back,
+so that they are returned again the next time $p->get_token is called.
+
+=back
+
+=head1 EXAMPLES
+
+The 'eg/hform' script shows how we might parse the form section of
+HTML documents using HTML::PullParser.
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTML::TokeParser>
+
+=head1 COPYRIGHT
+
+Copyright 1998-2001 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut