summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2013-05-08 22:21:52 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2013-05-08 22:21:52 +0000
commit2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree4734ccd522c71dd455879162006742002f8c1565 /lib
downloadHTML-Parser-tarball-master.tar.gz
Diffstat (limited to 'lib')
-rw-r--r--lib/HTML/Entities.pm483
-rw-r--r--lib/HTML/Filter.pm112
-rw-r--r--lib/HTML/HeadParser.pm315
-rw-r--r--lib/HTML/LinkExtor.pm185
-rw-r--r--lib/HTML/PullParser.pm209
-rw-r--r--lib/HTML/TokeParser.pm371
6 files changed, 1675 insertions, 0 deletions
diff --git a/lib/HTML/Entities.pm b/lib/HTML/Entities.pm
new file mode 100644
index 0000000..ecd8e0d
--- /dev/null
+++ b/lib/HTML/Entities.pm
@@ -0,0 +1,483 @@
+package HTML::Entities;
+
+=encoding utf8
+
+=head1 NAME
+
+HTML::Entities - Encode or decode strings with HTML entities
+
+=head1 SYNOPSIS
+
+ use HTML::Entities;
+
+ $a = "V&aring;re norske tegn b&oslash;r &#230res";
+ decode_entities($a);
+ encode_entities($a, "\200-\377");
+
+For example, this:
+
+ $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
+ print encode_entities($input), "\n"
+
+Prints this out:
+
+ vis-&agrave;-vis Beyonc&eacute;'s na&iuml;ve
+ papier-m&acirc;ch&eacute; r&eacute;sum&eacute;
+
+=head1 DESCRIPTION
+
+This module deals with encoding and decoding of strings with HTML
+character entities. The module provides the following functions:
+
+=over 4
+
+=item decode_entities( $string, ... )
+
+This routine replaces HTML entities found in the $string with the
+corresponding Unicode character. Unrecognized entities are left alone.
+
+If multiple strings are provided as argument they are each decoded
+separately and the same number of strings are returned.
+
+If called in void context the arguments are decoded in-place.
+
+This routine is exported by default.
+
+=item _decode_entities( $string, \%entity2char )
+
+=item _decode_entities( $string, \%entity2char, $expand_prefix )
+
+This will in-place replace HTML entities in $string. The %entity2char
+hash must be provided. Named entities not found in the %entity2char
+hash are left alone. Numeric entities are expanded unless their value
+overflow.
+
+The keys in %entity2char are the entity names to be expanded and their
+values are what they should expand into. The values do not have to be
+single character strings. If a key has ";" as suffix,
+then occurrences in $string are only expanded if properly terminated
+with ";". Entities without ";" will be expanded regardless of how
+they are terminated for compatibility with how common browsers treat
+entities in the Latin-1 range.
+
+If $expand_prefix is TRUE then entities without trailing ";" in
+%entity2char will even be expanded as a prefix of a longer
+unrecognized name. The longest matching name in %entity2char will be
+used. This is mainly present for compatibility with an MSIE
+misfeature.
+
+ $string = "foo&nbspbar";
+ _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
+ print $string; # will print "foo bar"
+
+This routine is exported by default.
+
+=item encode_entities( $string )
+
+=item encode_entities( $string, $unsafe_chars )
+
+This routine replaces unsafe characters in $string with their entity
+representation. A second argument can be given to specify which characters to
+consider unsafe. The unsafe characters is specified using the regular
+expression character class syntax (what you find within brackets in regular
+expressions).
+
+The default set of characters to encode are control chars, high-bit chars, and
+the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this,
+for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
+>> characters:
+
+ $encoded = encode_entities($input, '<>&"');
+
+and this would only encode non-plain ascii:
+
+ $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
+
+This routine is exported by default.
+
+=item encode_entities_numeric( $string )
+
+=item encode_entities_numeric( $string, $unsafe_chars )
+
+This routine works just like encode_entities, except that the replacement
+entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For
+example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
+C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
+
+This routine is I<not> exported by default. But you can always
+export it with C<use HTML::Entities qw(encode_entities_numeric);>
+or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
+
+=back
+
+All these routines modify the string passed as the first argument, if
+called in a void context. In scalar and array contexts, the encoded or
+decoded string is returned (without changing the input string).
+
+If you prefer not to import these routines into your namespace, you can
+call them as:
+
+ use HTML::Entities ();
+ $decoded = HTML::Entities::decode($a);
+ $encoded = HTML::Entities::encode($a);
+ $encoded = HTML::Entities::encode_numeric($a);
+
+The module can also export the %char2entity and the %entity2char
+hashes, which contain the mapping from all characters to the
+corresponding entities (and vice versa, respectively).
+
+=head1 COPYRIGHT
+
+Copyright 1995-2006 Gisle Aas. All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+use strict;
+use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
+use vars qw(%entity2char %char2entity);
+
+require 5.004;
+require Exporter;
+@ISA = qw(Exporter);
+
+@EXPORT = qw(encode_entities decode_entities _decode_entities);
+@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
+
+$VERSION = "3.69";
+sub Version { $VERSION; }
+
+require HTML::Parser; # for fast XS implemented decode_entities
+
+
+%entity2char = (
+ # Some normal chars that have special meaning in SGML context
+ amp => '&', # ampersand
+'gt' => '>', # greater than
+'lt' => '<', # less than
+ quot => '"', # double quote
+ apos => "'", # single quote
+
+ # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
+ AElig => chr(198), # capital AE diphthong (ligature)
+ Aacute => chr(193), # capital A, acute accent
+ Acirc => chr(194), # capital A, circumflex accent
+ Agrave => chr(192), # capital A, grave accent
+ Aring => chr(197), # capital A, ring
+ Atilde => chr(195), # capital A, tilde
+ Auml => chr(196), # capital A, dieresis or umlaut mark
+ Ccedil => chr(199), # capital C, cedilla
+ ETH => chr(208), # capital Eth, Icelandic
+ Eacute => chr(201), # capital E, acute accent
+ Ecirc => chr(202), # capital E, circumflex accent
+ Egrave => chr(200), # capital E, grave accent
+ Euml => chr(203), # capital E, dieresis or umlaut mark
+ Iacute => chr(205), # capital I, acute accent
+ Icirc => chr(206), # capital I, circumflex accent
+ Igrave => chr(204), # capital I, grave accent
+ Iuml => chr(207), # capital I, dieresis or umlaut mark
+ Ntilde => chr(209), # capital N, tilde
+ Oacute => chr(211), # capital O, acute accent
+ Ocirc => chr(212), # capital O, circumflex accent
+ Ograve => chr(210), # capital O, grave accent
+ Oslash => chr(216), # capital O, slash
+ Otilde => chr(213), # capital O, tilde
+ Ouml => chr(214), # capital O, dieresis or umlaut mark
+ THORN => chr(222), # capital THORN, Icelandic
+ Uacute => chr(218), # capital U, acute accent
+ Ucirc => chr(219), # capital U, circumflex accent
+ Ugrave => chr(217), # capital U, grave accent
+ Uuml => chr(220), # capital U, dieresis or umlaut mark
+ Yacute => chr(221), # capital Y, acute accent
+ aacute => chr(225), # small a, acute accent
+ acirc => chr(226), # small a, circumflex accent
+ aelig => chr(230), # small ae diphthong (ligature)
+ agrave => chr(224), # small a, grave accent
+ aring => chr(229), # small a, ring
+ atilde => chr(227), # small a, tilde
+ auml => chr(228), # small a, dieresis or umlaut mark
+ ccedil => chr(231), # small c, cedilla
+ eacute => chr(233), # small e, acute accent
+ ecirc => chr(234), # small e, circumflex accent
+ egrave => chr(232), # small e, grave accent
+ eth => chr(240), # small eth, Icelandic
+ euml => chr(235), # small e, dieresis or umlaut mark
+ iacute => chr(237), # small i, acute accent
+ icirc => chr(238), # small i, circumflex accent
+ igrave => chr(236), # small i, grave accent
+ iuml => chr(239), # small i, dieresis or umlaut mark
+ ntilde => chr(241), # small n, tilde
+ oacute => chr(243), # small o, acute accent
+ ocirc => chr(244), # small o, circumflex accent
+ ograve => chr(242), # small o, grave accent
+ oslash => chr(248), # small o, slash
+ otilde => chr(245), # small o, tilde
+ ouml => chr(246), # small o, dieresis or umlaut mark
+ szlig => chr(223), # small sharp s, German (sz ligature)
+ thorn => chr(254), # small thorn, Icelandic
+ uacute => chr(250), # small u, acute accent
+ ucirc => chr(251), # small u, circumflex accent
+ ugrave => chr(249), # small u, grave accent
+ uuml => chr(252), # small u, dieresis or umlaut mark
+ yacute => chr(253), # small y, acute accent
+ yuml => chr(255), # small y, dieresis or umlaut mark
+
+ # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
+ copy => chr(169), # copyright sign
+ reg => chr(174), # registered sign
+ nbsp => chr(160), # non breaking space
+
+ # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
+ iexcl => chr(161),
+ cent => chr(162),
+ pound => chr(163),
+ curren => chr(164),
+ yen => chr(165),
+ brvbar => chr(166),
+ sect => chr(167),
+ uml => chr(168),
+ ordf => chr(170),
+ laquo => chr(171),
+'not' => chr(172), # not is a keyword in perl
+ shy => chr(173),
+ macr => chr(175),
+ deg => chr(176),
+ plusmn => chr(177),
+ sup1 => chr(185),
+ sup2 => chr(178),
+ sup3 => chr(179),
+ acute => chr(180),
+ micro => chr(181),
+ para => chr(182),
+ middot => chr(183),
+ cedil => chr(184),
+ ordm => chr(186),
+ raquo => chr(187),
+ frac14 => chr(188),
+ frac12 => chr(189),
+ frac34 => chr(190),
+ iquest => chr(191),
+'times' => chr(215), # times is a keyword in perl
+ divide => chr(247),
+
+ ( $] > 5.007 ? (
+ 'OElig;' => chr(338),
+ 'oelig;' => chr(339),
+ 'Scaron;' => chr(352),
+ 'scaron;' => chr(353),
+ 'Yuml;' => chr(376),
+ 'fnof;' => chr(402),
+ 'circ;' => chr(710),
+ 'tilde;' => chr(732),
+ 'Alpha;' => chr(913),
+ 'Beta;' => chr(914),
+ 'Gamma;' => chr(915),
+ 'Delta;' => chr(916),
+ 'Epsilon;' => chr(917),
+ 'Zeta;' => chr(918),
+ 'Eta;' => chr(919),
+ 'Theta;' => chr(920),
+ 'Iota;' => chr(921),
+ 'Kappa;' => chr(922),
+ 'Lambda;' => chr(923),
+ 'Mu;' => chr(924),
+ 'Nu;' => chr(925),
+ 'Xi;' => chr(926),
+ 'Omicron;' => chr(927),
+ 'Pi;' => chr(928),
+ 'Rho;' => chr(929),
+ 'Sigma;' => chr(931),
+ 'Tau;' => chr(932),
+ 'Upsilon;' => chr(933),
+ 'Phi;' => chr(934),
+ 'Chi;' => chr(935),
+ 'Psi;' => chr(936),
+ 'Omega;' => chr(937),
+ 'alpha;' => chr(945),
+ 'beta;' => chr(946),
+ 'gamma;' => chr(947),
+ 'delta;' => chr(948),
+ 'epsilon;' => chr(949),
+ 'zeta;' => chr(950),
+ 'eta;' => chr(951),
+ 'theta;' => chr(952),
+ 'iota;' => chr(953),
+ 'kappa;' => chr(954),
+ 'lambda;' => chr(955),
+ 'mu;' => chr(956),
+ 'nu;' => chr(957),
+ 'xi;' => chr(958),
+ 'omicron;' => chr(959),
+ 'pi;' => chr(960),
+ 'rho;' => chr(961),
+ 'sigmaf;' => chr(962),
+ 'sigma;' => chr(963),
+ 'tau;' => chr(964),
+ 'upsilon;' => chr(965),
+ 'phi;' => chr(966),
+ 'chi;' => chr(967),
+ 'psi;' => chr(968),
+ 'omega;' => chr(969),
+ 'thetasym;' => chr(977),
+ 'upsih;' => chr(978),
+ 'piv;' => chr(982),
+ 'ensp;' => chr(8194),
+ 'emsp;' => chr(8195),
+ 'thinsp;' => chr(8201),
+ 'zwnj;' => chr(8204),
+ 'zwj;' => chr(8205),
+ 'lrm;' => chr(8206),
+ 'rlm;' => chr(8207),
+ 'ndash;' => chr(8211),
+ 'mdash;' => chr(8212),
+ 'lsquo;' => chr(8216),
+ 'rsquo;' => chr(8217),
+ 'sbquo;' => chr(8218),
+ 'ldquo;' => chr(8220),
+ 'rdquo;' => chr(8221),
+ 'bdquo;' => chr(8222),
+ 'dagger;' => chr(8224),
+ 'Dagger;' => chr(8225),
+ 'bull;' => chr(8226),
+ 'hellip;' => chr(8230),
+ 'permil;' => chr(8240),
+ 'prime;' => chr(8242),
+ 'Prime;' => chr(8243),
+ 'lsaquo;' => chr(8249),
+ 'rsaquo;' => chr(8250),
+ 'oline;' => chr(8254),
+ 'frasl;' => chr(8260),
+ 'euro;' => chr(8364),
+ 'image;' => chr(8465),
+ 'weierp;' => chr(8472),
+ 'real;' => chr(8476),
+ 'trade;' => chr(8482),
+ 'alefsym;' => chr(8501),
+ 'larr;' => chr(8592),
+ 'uarr;' => chr(8593),
+ 'rarr;' => chr(8594),
+ 'darr;' => chr(8595),
+ 'harr;' => chr(8596),
+ 'crarr;' => chr(8629),
+ 'lArr;' => chr(8656),
+ 'uArr;' => chr(8657),
+ 'rArr;' => chr(8658),
+ 'dArr;' => chr(8659),
+ 'hArr;' => chr(8660),
+ 'forall;' => chr(8704),
+ 'part;' => chr(8706),
+ 'exist;' => chr(8707),
+ 'empty;' => chr(8709),
+ 'nabla;' => chr(8711),
+ 'isin;' => chr(8712),
+ 'notin;' => chr(8713),
+ 'ni;' => chr(8715),
+ 'prod;' => chr(8719),
+ 'sum;' => chr(8721),
+ 'minus;' => chr(8722),
+ 'lowast;' => chr(8727),
+ 'radic;' => chr(8730),
+ 'prop;' => chr(8733),
+ 'infin;' => chr(8734),
+ 'ang;' => chr(8736),
+ 'and;' => chr(8743),
+ 'or;' => chr(8744),
+ 'cap;' => chr(8745),
+ 'cup;' => chr(8746),
+ 'int;' => chr(8747),
+ 'there4;' => chr(8756),
+ 'sim;' => chr(8764),
+ 'cong;' => chr(8773),
+ 'asymp;' => chr(8776),
+ 'ne;' => chr(8800),
+ 'equiv;' => chr(8801),
+ 'le;' => chr(8804),
+ 'ge;' => chr(8805),
+ 'sub;' => chr(8834),
+ 'sup;' => chr(8835),
+ 'nsub;' => chr(8836),
+ 'sube;' => chr(8838),
+ 'supe;' => chr(8839),
+ 'oplus;' => chr(8853),
+ 'otimes;' => chr(8855),
+ 'perp;' => chr(8869),
+ 'sdot;' => chr(8901),
+ 'lceil;' => chr(8968),
+ 'rceil;' => chr(8969),
+ 'lfloor;' => chr(8970),
+ 'rfloor;' => chr(8971),
+ 'lang;' => chr(9001),
+ 'rang;' => chr(9002),
+ 'loz;' => chr(9674),
+ 'spades;' => chr(9824),
+ 'clubs;' => chr(9827),
+ 'hearts;' => chr(9829),
+ 'diams;' => chr(9830),
+ ) : ())
+);
+
+
+# Make the opposite mapping
+while (my($entity, $char) = each(%entity2char)) {
+ $entity =~ s/;\z//;
+ $char2entity{$char} = "&$entity;";
+}
+delete $char2entity{"'"}; # only one-way decoding
+
+# Fill in missing entities
+for (0 .. 255) {
+ next if exists $char2entity{chr($_)};
+ $char2entity{chr($_)} = "&#$_;";
+}
+
+my %subst; # compiled encoding regexps
+
+sub encode_entities
+{
+ return undef unless defined $_[0];
+ my $ref;
+ if (defined wantarray) {
+ my $x = $_[0];
+ $ref = \$x; # copy
+ } else {
+ $ref = \$_[0]; # modify in-place
+ }
+ if (defined $_[1] and length $_[1]) {
+ unless (exists $subst{$_[1]}) {
+ # Because we can't compile regex we fake it with a cached sub
+ my $chars = $_[1];
+ $chars =~ s,(?<!\\)([]/]),\\$1,g;
+ $chars =~ s,(?<!\\)\\\z,\\\\,;
+ my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
+ $subst{$_[1]} = eval $code;
+ die( $@ . " while trying to turn range: \"$_[1]\"\n "
+ . "into code: $code\n "
+ ) if $@;
+ }
+ &{$subst{$_[1]}}($$ref);
+ } else {
+ # Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
+ $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
+ }
+ $$ref;
+}
+
+sub encode_entities_numeric {
+ local %char2entity;
+ return &encode_entities; # a goto &encode_entities wouldn't work
+}
+
+
+sub num_entity {
+ sprintf "&#x%X;", ord($_[0]);
+}
+
+# Set up aliases
+*encode = \&encode_entities;
+*encode_numeric = \&encode_entities_numeric;
+*encode_numerically = \&encode_entities_numeric;
+*decode = \&decode_entities;
+
+1;
diff --git a/lib/HTML/Filter.pm b/lib/HTML/Filter.pm
new file mode 100644
index 0000000..c5aa16e
--- /dev/null
+++ b/lib/HTML/Filter.pm
@@ -0,0 +1,112 @@
+package HTML::Filter;
+
+use strict;
+use vars qw(@ISA $VERSION);
+
+require HTML::Parser;
+@ISA=qw(HTML::Parser);
+
+$VERSION = "3.57";
+
+sub declaration { $_[0]->output("<!$_[1]>") }
+sub process { $_[0]->output($_[2]) }
+sub comment { $_[0]->output("<!--$_[1]-->") }
+sub start { $_[0]->output($_[4]) }
+sub end { $_[0]->output($_[2]) }
+sub text { $_[0]->output($_[1]) }
+
+sub output { print $_[1] }
+
+1;
+
+__END__
+
+=head1 NAME
+
+HTML::Filter - Filter HTML text through the parser
+
+=head1 NOTE
+
+B<This module is deprecated.> The C<HTML::Parser> now provides the
+functionally of C<HTML::Filter> much more efficiently with the the
+C<default> handler.
+
+=head1 SYNOPSIS
+
+ require HTML::Filter;
+ $p = HTML::Filter->new->parse_file("index.html");
+
+=head1 DESCRIPTION
+
+C<HTML::Filter> is an HTML parser that by default prints the
+original text of each HTML element (a slow version of cat(1) basically).
+The callback methods may be overridden to modify the filtering for some
+HTML elements and you can override output() method which is called to
+print the HTML text.
+
+C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that
+the document should be given to the parser by calling the $p->parse()
+or $p->parse_file() methods.
+
+=head1 EXAMPLES
+
+The first example is a filter that will remove all comments from an
+HTML file. This is achieved by simply overriding the comment method
+to do nothing.
+
+ package CommentStripper;
+ require HTML::Filter;
+ @ISA=qw(HTML::Filter);
+ sub comment { } # ignore comments
+
+The second example shows a filter that will remove any E<lt>TABLE>s
+found in the HTML file. We specialize the start() and end() methods
+to count table tags and then make output not happen when inside a
+table.
+
+ package TableStripper;
+ require HTML::Filter;
+ @ISA=qw(HTML::Filter);
+ sub start
+ {
+ my $self = shift;
+ $self->{table_seen}++ if $_[0] eq "table";
+ $self->SUPER::start(@_);
+ }
+
+ sub end
+ {
+ my $self = shift;
+ $self->SUPER::end(@_);
+ $self->{table_seen}-- if $_[0] eq "table";
+ }
+
+ sub output
+ {
+ my $self = shift;
+ unless ($self->{table_seen}) {
+ $self->SUPER::output(@_);
+ }
+ }
+
+If you want to collect the parsed text internally you might want to do
+something like this:
+
+ package FilterIntoString;
+ require HTML::Filter;
+ @ISA=qw(HTML::Filter);
+ sub output { push(@{$_[0]->{fhtml}}, $_[1]) }
+ sub filtered_html { join("", @{$_[0]->{fhtml}}) }
+
+=head1 SEE ALSO
+
+L<HTML::Parser>
+
+=head1 COPYRIGHT
+
+Copyright 1997-1999 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
diff --git a/lib/HTML/HeadParser.pm b/lib/HTML/HeadParser.pm
new file mode 100644
index 0000000..28e9cac
--- /dev/null
+++ b/lib/HTML/HeadParser.pm
@@ -0,0 +1,315 @@
+package HTML::HeadParser;
+
+=head1 NAME
+
+HTML::HeadParser - Parse <HEAD> section of a HTML document
+
+=head1 SYNOPSIS
+
+ require HTML::HeadParser;
+ $p = HTML::HeadParser->new;
+ $p->parse($text) and print "not finished";
+
+ $p->header('Title') # to access <title>....</title>
+ $p->header('Content-Base') # to access <base href="http://...">
+ $p->header('Foo') # to access <meta http-equiv="Foo" content="...">
+ $p->header('X-Meta-Author') # to access <meta name="author" content="...">
+ $p->header('X-Meta-Charset') # to access <meta charset="...">
+
+=head1 DESCRIPTION
+
+The C<HTML::HeadParser> is a specialized (and lightweight)
+C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
+section of an HTML document. The parse() method
+will return a FALSE value as soon as some E<lt>BODY> element or body
+text are found, and should not be called again after this.
+
+Note that the C<HTML::HeadParser> might get confused if raw undecoded
+UTF-8 is passed to the parse() method. Make sure the strings are
+properly decoded before passing them on.
+
+The C<HTML::HeadParser> keeps a reference to a header object, and the
+parser will update this header object as the various elements of the
+E<lt>HEAD> section of the HTML document are recognized. The following
+header fields are affected:
+
+=over 4
+
+=item Content-Base:
+
+The I<Content-Base> header is initialized from the E<lt>base
+href="..."> element.
+
+=item Title:
+
+The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
+element.
+
+=item Isindex:
+
+The I<Isindex> header will be added if there is a E<lt>isindex>
+element in the E<lt>head>. The header value is initialized from the
+I<prompt> attribute if it is present. If no I<prompt> attribute is
+given it will have '?' as the value.
+
+=item X-Meta-Foo:
+
+All E<lt>meta> elements containing a C<name> attribute will result in
+headers using the prefix C<X-Meta-> appended with the value of the
+C<name> attribute as the name of the header, and the value of the
+C<content> attribute as the pushed header value.
+
+E<lt>meta> elements containing a C<http-equiv> attribute will result
+in headers as in above, but without the C<X-Meta-> prefix in the
+header name.
+
+E<lt>meta> elements containing a C<charset> attribute will result in
+an C<X-Meta-Charset> header, using the value of the C<charset>
+attribute as the pushed header value.
+
+The ':' character can't be represented in header field names, so
+if the meta element contains this char it's substituted with '-'
+before forming the field name.
+
+=back
+
+=head1 METHODS
+
+The following methods (in addition to those provided by the
+superclass) are available:
+
+=over 4
+
+=cut
+
+
+require HTML::Parser;
+@ISA = qw(HTML::Parser);
+
+use HTML::Entities ();
+
+use strict;
+use vars qw($VERSION $DEBUG);
+#$DEBUG = 1;
+$VERSION = "3.71";
+
+=item $hp = HTML::HeadParser->new
+
+=item $hp = HTML::HeadParser->new( $header )
+
+The object constructor. The optional $header argument should be a
+reference to an object that implement the header() and push_header()
+methods as defined by the C<HTTP::Headers> class. Normally it will be
+of some class that is a or delegates to the C<HTTP::Headers> class.
+
+If no $header is given C<HTML::HeadParser> will create an
+C<HTTP::Headers> object by itself (initially empty).
+
+=cut
+
+sub new
+{
+ my($class, $header) = @_;
+ unless ($header) {
+ require HTTP::Headers;
+ $header = HTTP::Headers->new;
+ }
+
+ my $self = $class->SUPER::new(api_version => 3,
+ start_h => ["start", "self,tagname,attr"],
+ end_h => ["end", "self,tagname"],
+ text_h => ["text", "self,text"],
+ ignore_elements => [qw(script style)],
+ );
+ $self->{'header'} = $header;
+ $self->{'tag'} = ''; # name of active element that takes textual content
+ $self->{'text'} = ''; # the accumulated text associated with the element
+ $self;
+}
+
+=item $hp->header;
+
+Returns a reference to the header object.
+
+=item $hp->header( $key )
+
+Returns a header value. It is just a shorter way to write
+C<$hp-E<gt>header-E<gt>header($key)>.
+
+=cut
+
+sub header
+{
+ my $self = shift;
+ return $self->{'header'} unless @_;
+ $self->{'header'}->header(@_);
+}
+
+sub as_string # legacy
+{
+ my $self = shift;
+ $self->{'header'}->as_string;
+}
+
+sub flush_text # internal
+{
+ my $self = shift;
+ my $tag = $self->{'tag'};
+ my $text = $self->{'text'};
+ $text =~ s/^\s+//;
+ $text =~ s/\s+$//;
+ $text =~ s/\s+/ /g;
+ print "FLUSH $tag => '$text'\n" if $DEBUG;
+ if ($tag eq 'title') {
+ my $decoded;
+ $decoded = utf8::decode($text) if $self->utf8_mode && defined &utf8::decode;
+ HTML::Entities::decode($text);
+ utf8::encode($text) if $decoded;
+ $self->{'header'}->push_header(Title => $text);
+ }
+ $self->{'tag'} = $self->{'text'} = '';
+}
+
+# This is an quote from the HTML3.2 DTD which shows which elements
+# that might be present in a <HEAD>...</HEAD>. Also note that the
+# <HEAD> tags themselves might be missing:
+#
+# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
+# SCRIPT* & META* & LINK*">
+#
+# <!ELEMENT HEAD O O (%head.content)>
+#
+# From HTML 4.01:
+#
+# <!ENTITY % head.misc "SCRIPT|STYLE|META|LINK|OBJECT">
+# <!ENTITY % head.content "TITLE & BASE?">
+# <!ELEMENT HEAD O O (%head.content;) +(%head.misc;)>
+#
+# From HTML 5 as of WD-html5-20090825:
+#
+# One or more elements of metadata content, [...]
+# => base, command, link, meta, noscript, script, style, title
+
+sub start
+{
+ my($self, $tag, $attr) = @_; # $attr is reference to a HASH
+ print "START[$tag]\n" if $DEBUG;
+ $self->flush_text if $self->{'tag'};
+ if ($tag eq 'meta') {
+ my $key = $attr->{'http-equiv'};
+ if (!defined($key) || !length($key)) {
+ if ($attr->{name}) {
+ $key = "X-Meta-\u$attr->{name}";
+ } elsif ($attr->{charset}) { # HTML 5 <meta charset="...">
+ $key = "X-Meta-Charset";
+ $self->{header}->push_header($key => $attr->{charset});
+ return;
+ } else {
+ return;
+ }
+ }
+ $key =~ s/:/-/g;
+ $self->{'header'}->push_header($key => $attr->{content});
+ } elsif ($tag eq 'base') {
+ return unless exists $attr->{href};
+ (my $base = $attr->{href}) =~ s/^\s+//; $base =~ s/\s+$//; # HTML5
+ $self->{'header'}->push_header('Content-Base' => $base);
+ } elsif ($tag eq 'isindex') {
+ # This is a non-standard header. Perhaps we should just ignore
+ # this element
+ $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
+ } elsif ($tag =~ /^(?:title|noscript|object|command)$/) {
+ # Just remember tag. Initialize header when we see the end tag.
+ $self->{'tag'} = $tag;
+ } elsif ($tag eq 'link') {
+ return unless exists $attr->{href};
+ # <link href="http:..." rel="xxx" rev="xxx" title="xxx">
+ my $href = delete($attr->{href});
+ $href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5
+ my $h_val = "<$href>";
+ for (sort keys %{$attr}) {
+ next if $_ eq "/"; # XHTML junk
+ $h_val .= qq(; $_="$attr->{$_}");
+ }
+ $self->{'header'}->push_header(Link => $h_val);
+ } elsif ($tag eq 'head' || $tag eq 'html') {
+ # ignore
+ } else {
+ # stop parsing
+ $self->eof;
+ }
+}
+
+sub end
+{
+ my($self, $tag) = @_;
+ print "END[$tag]\n" if $DEBUG;
+ $self->flush_text if $self->{'tag'};
+ $self->eof if $tag eq 'head';
+}
+
+sub text
+{
+ my($self, $text) = @_;
+ print "TEXT[$text]\n" if $DEBUG;
+ unless ($self->{first_chunk}) {
+ # drop Unicode BOM if found
+ if ($self->utf8_mode) {
+ $text =~ s/^\xEF\xBB\xBF//;
+ }
+ else {
+ $text =~ s/^\x{FEFF}//;
+ }
+ $self->{first_chunk}++;
+ }
+ my $tag = $self->{tag};
+ if (!$tag && $text =~ /\S/) {
+ # Normal text means start of body
+ $self->eof;
+ return;
+ }
+ return if $tag ne 'title';
+ $self->{'text'} .= $text;
+}
+
+BEGIN {
+ *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;
+}
+
+1;
+
+__END__
+
+=back
+
+=head1 EXAMPLE
+
+ $h = HTTP::Headers->new;
+ $p = HTML::HeadParser->new($h);
+ $p->parse(<<EOT);
+ <title>Stupid example</title>
+ <base href="http://www.linpro.no/lwp/">
+ Normal text starts here.
+ EOT
+ undef $p;
+ print $h->title; # should print "Stupid example"
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTTP::Headers>
+
+The C<HTTP::Headers> class is distributed as part of the
+I<libwww-perl> package. If you don't have that distribution installed
+you need to provide the $header argument to the C<HTML::HeadParser>
+constructor with your own object that implements the documented
+protocol.
+
+=head1 COPYRIGHT
+
+Copyright 1996-2001 Gisle Aas. All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
diff --git a/lib/HTML/LinkExtor.pm b/lib/HTML/LinkExtor.pm
new file mode 100644
index 0000000..c2f08c6
--- /dev/null
+++ b/lib/HTML/LinkExtor.pm
@@ -0,0 +1,185 @@
+package HTML::LinkExtor;
+
+require HTML::Parser;
+@ISA = qw(HTML::Parser);
+$VERSION = "3.69";
+
+=head1 NAME
+
+HTML::LinkExtor - Extract links from an HTML document
+
+=head1 SYNOPSIS
+
+ require HTML::LinkExtor;
+ $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
+ sub cb {
+ my($tag, %links) = @_;
+ print "$tag @{[%links]}\n";
+ }
+ $p->parse_file("index.html");
+
+=head1 DESCRIPTION
+
+I<HTML::LinkExtor> is an HTML parser that extracts links from an
+HTML document. The I<HTML::LinkExtor> is a subclass of
+I<HTML::Parser>. This means that the document should be given to the
+parser by calling the $p->parse() or $p->parse_file() methods.
+
+=cut
+
+use strict;
+use HTML::Tagset ();
+
+# legacy (some applications grabs this hash directly)
+use vars qw(%LINK_ELEMENT);
+*LINK_ELEMENT = \%HTML::Tagset::linkElements;
+
+=over 4
+
+=item $p = HTML::LinkExtor->new
+
+=item $p = HTML::LinkExtor->new( $callback )
+
+=item $p = HTML::LinkExtor->new( $callback, $base )
+
+The constructor takes two optional arguments. The first is a reference
+to a callback routine. It will be called as links are found. If a
+callback is not provided, then links are just accumulated internally
+and can be retrieved by calling the $p->links() method.
+
+The $base argument is an optional base URL used to absolutize all URLs found.
+You need to have the I<URI> module installed if you provide $base.
+
+The callback is called with the lowercase tag name as first argument,
+and then all link attributes as separate key/value pairs. All
+non-link attributes are removed.
+
+=cut
+
+sub new
+{
+ my($class, $cb, $base) = @_;
+ my $self = $class->SUPER::new(
+ start_h => ["_start_tag", "self,tagname,attr"],
+ report_tags => [keys %HTML::Tagset::linkElements],
+ );
+ $self->{extractlink_cb} = $cb;
+ if ($base) {
+ require URI;
+ $self->{extractlink_base} = URI->new($base);
+ }
+ $self;
+}
+
+sub _start_tag
+{
+ my($self, $tag, $attr) = @_;
+
+ my $base = $self->{extractlink_base};
+ my $links = $HTML::Tagset::linkElements{$tag};
+ $links = [$links] unless ref $links;
+
+ my @links;
+ my $a;
+ for $a (@$links) {
+ next unless exists $attr->{$a};
+ (my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5
+ push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link);
+ }
+ return unless @links;
+ $self->_found_link($tag, @links);
+}
+
+sub _found_link
+{
+ my $self = shift;
+ my $cb = $self->{extractlink_cb};
+ if ($cb) {
+ &$cb(@_);
+ } else {
+ push(@{$self->{'links'}}, [@_]);
+ }
+}
+
+=item $p->links
+
+Returns a list of all links found in the document. The returned
+values will be anonymous arrays with the following elements:
+
+ [$tag, $attr => $url1, $attr2 => $url2,...]
+
+The $p->links method will also truncate the internal link list. This
+means that if the method is called twice without any parsing
+between them the second call will return an empty list.
+
+Also note that $p->links will always be empty if a callback routine
+was provided when the I<HTML::LinkExtor> was created.
+
+=cut
+
+sub links
+{
+ my $self = shift;
+ exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
+}
+
+# We override the parse_file() method so that we can clear the links
+# before we start a new file.
+sub parse_file
+{
+ my $self = shift;
+ delete $self->{'links'};
+ $self->SUPER::parse_file(@_);
+}
+
+=back
+
+=head1 EXAMPLE
+
+This is an example showing how you can extract links from a document
+received using LWP:
+
+ use LWP::UserAgent;
+ use HTML::LinkExtor;
+ use URI::URL;
+
+ $url = "http://www.perl.org/"; # for instance
+ $ua = LWP::UserAgent->new;
+
+ # Set up a callback that collect image links
+ my @imgs = ();
+ sub callback {
+ my($tag, %attr) = @_;
+ return if $tag ne 'img'; # we only look closer at <img ...>
+ push(@imgs, values %attr);
+ }
+
+ # Make the parser. Unfortunately, we don't know the base yet
+ # (it might be different from $url)
+ $p = HTML::LinkExtor->new(\&callback);
+
+ # Request document and parse it as it arrives
+ $res = $ua->request(HTTP::Request->new(GET => $url),
+ sub {$p->parse($_[0])});
+
+ # Expand all image URLs to absolute ones
+ my $base = $res->base;
+ @imgs = map { $_ = url($_, $base)->abs; } @imgs;
+
+ # Print them out
+ print join("\n", @imgs), "\n";
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
+
+=head1 COPYRIGHT
+
+Copyright 1996-2001 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+1;
diff --git a/lib/HTML/PullParser.pm b/lib/HTML/PullParser.pm
new file mode 100644
index 0000000..3083379
--- /dev/null
+++ b/lib/HTML/PullParser.pm
@@ -0,0 +1,209 @@
+package HTML::PullParser;
+
+require HTML::Parser;
+@ISA=qw(HTML::Parser);
+$VERSION = "3.57";
+
+use strict;
+use Carp ();
+
+sub new
+{
+ my($class, %cnf) = @_;
+
+ # Construct argspecs for the various events
+ my %argspec;
+ for (qw(start end text declaration comment process default)) {
+ my $tmp = delete $cnf{$_};
+ next unless defined $tmp;
+ $argspec{$_} = $tmp;
+ }
+ Carp::croak("Info not collected for any events")
+ unless %argspec;
+
+ my $file = delete $cnf{file};
+ my $doc = delete $cnf{doc};
+ Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
+ if defined($file) && defined($doc);
+ Carp::croak("No 'doc' or 'file' given to parse from")
+ unless defined($file) || defined($doc);
+
+ # Create object
+ $cnf{api_version} = 3;
+ my $self = $class->SUPER::new(%cnf);
+
+ my $accum = $self->{pullparser_accum} = [];
+ while (my($event, $argspec) = each %argspec) {
+ $self->SUPER::handler($event => $accum, $argspec);
+ }
+
+ if (defined $doc) {
+ $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
+ $self->{pullparser_str_pos} = 0;
+ }
+ else {
+ if (!ref($file) && ref(\$file) ne "GLOB") {
+ require IO::File;
+ $file = IO::File->new($file, "r") || return;
+ }
+
+ $self->{pullparser_file} = $file;
+ }
+ $self;
+}
+
+
+sub handler
+{
+ Carp::croak("Can't set handlers for HTML::PullParser");
+}
+
+
+sub get_token
+{
+ my $self = shift;
+ while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
+ if (my $f = $self->{pullparser_file}) {
+ # must try to parse more from the file
+ my $buf;
+ if (read($f, $buf, 512)) {
+ $self->parse($buf);
+ } else {
+ $self->eof;
+ $self->{pullparser_eof}++;
+ delete $self->{pullparser_file};
+ }
+ }
+ elsif (my $sref = $self->{pullparser_str_ref}) {
+ # must try to parse more from the scalar
+ my $pos = $self->{pullparser_str_pos};
+ my $chunk = substr($$sref, $pos, 512);
+ $self->parse($chunk);
+ $pos += length($chunk);
+ if ($pos < length($$sref)) {
+ $self->{pullparser_str_pos} = $pos;
+ }
+ else {
+ $self->eof;
+ $self->{pullparser_eof}++;
+ delete $self->{pullparser_str_ref};
+ delete $self->{pullparser_str_pos};
+ }
+ }
+ else {
+ die;
+ }
+ }
+ shift @{$self->{pullparser_accum}};
+}
+
+
+sub unget_token
+{
+ my $self = shift;
+ unshift @{$self->{pullparser_accum}}, @_;
+ $self;
+}
+
+1;
+
+
+__END__
+
+=head1 NAME
+
+HTML::PullParser - Alternative HTML::Parser interface
+
+=head1 SYNOPSIS
+
+ use HTML::PullParser;
+
+ $p = HTML::PullParser->new(file => "index.html",
+ start => 'event, tagname, @attr',
+ end => 'event, tagname',
+ ignore_elements => [qw(script style)],
+ ) || die "Can't open: $!";
+ while (my $token = $p->get_token) {
+ #...do something with $token
+ }
+
+=head1 DESCRIPTION
+
+The HTML::PullParser is an alternative interface to the HTML::Parser class.
+It basically turns the HTML::Parser inside out. You associate a file
+(or any IO::Handle object or string) with the parser at construction time and
+then repeatedly call $parser->get_token to obtain the tags and text
+found in the parsed document.
+
+The following methods are provided:
+
+=over 4
+
+=item $p = HTML::PullParser->new( file => $file, %options )
+
+=item $p = HTML::PullParser->new( doc => \$doc, %options )
+
+A C<HTML::PullParser> can be made to parse from either a file or a
+literal document based on whether the C<file> or C<doc> option is
+passed to the parser's constructor.
+
+The C<file> passed in can either be a file name or a file handle
+object. If a file name is passed, and it can't be opened for reading,
+then the constructor will return an undefined value and $! will tell
+you why it failed. Otherwise the argument is taken to be some object
+that the C<HTML::PullParser> can read() from when it needs more data.
+The stream will be read() until EOF, but not closed.
+
+A C<doc> can be passed plain or as a reference
+to a scalar. If a reference is passed then the value of this scalar
+should not be changed before all tokens have been extracted.
+
+Next the information to be returned for the different token types must
+be set up. This is done by simply associating an argspec (as defined
+in L<HTML::Parser>) with the events you have an interest in. For
+instance, if you want C<start> tokens to be reported as the string
+C<'S'> followed by the tagname and the attributes you might pass an
+C<start>-option like this:
+
+ $p = HTML::PullParser->new(
+ doc => $document_to_parse,
+ start => '"S", tagname, @attr',
+ end => '"E", tagname',
+ );
+
+At last other C<HTML::Parser> options, like C<ignore_tags>, and
+C<unbroken_text>, can be passed in. Note that you should not use the
+I<event>_h options to set up parser handlers. That would confuse the
+inner logic of C<HTML::PullParser>.
+
+=item $token = $p->get_token
+
+This method will return the next I<token> found in the HTML document,
+or C<undef> at the end of the document. The token is returned as an
+array reference. The content of this array match the argspec set up
+during C<HTML::PullParser> construction.
+
+=item $p->unget_token( @tokens )
+
+If you find out you have read too many tokens you can push them back,
+so that they are returned again the next time $p->get_token is called.
+
+=back
+
+=head1 EXAMPLES
+
+The 'eg/hform' script shows how we might parse the form section of
+HTML::Documents using HTML::PullParser.
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTML::TokeParser>
+
+=head1 COPYRIGHT
+
+Copyright 1998-2001 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
diff --git a/lib/HTML/TokeParser.pm b/lib/HTML/TokeParser.pm
new file mode 100644
index 0000000..959b96f
--- /dev/null
+++ b/lib/HTML/TokeParser.pm
@@ -0,0 +1,371 @@
+package HTML::TokeParser;
+
+require HTML::PullParser;
+@ISA=qw(HTML::PullParser);
+$VERSION = "3.69";
+
+use strict;
+use Carp ();
+use HTML::Entities qw(decode_entities);
+use HTML::Tagset ();
+
+my %ARGS =
+(
+ start => "'S',tagname,attr,attrseq,text",
+ end => "'E',tagname,text",
+ text => "'T',text,is_cdata",
+ process => "'PI',token0,text",
+ comment => "'C',text",
+ declaration => "'D',text",
+
+ # options that default on
+ unbroken_text => 1,
+);
+
+
+sub new
+{
+ my $class = shift;
+ my %cnf;
+
+ if (@_ == 1) {
+ my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
+ %cnf = ($type => $_[0]);
+ }
+ else {
+ unshift @_, (ref($_[0]) eq "SCALAR") ? "doc" : "file" if(scalar(@_) % 2 == 1);
+ %cnf = @_;
+ }
+
+ my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
+
+ my $self = $class->SUPER::new(%ARGS, %cnf) || return undef;
+
+ $self->{textify} = $textify;
+ $self;
+}
+
+
+sub get_tag
+{
+ my $self = shift;
+ my $token;
+ while (1) {
+ $token = $self->get_token || return undef;
+ my $type = shift @$token;
+ next unless $type eq "S" || $type eq "E";
+ substr($token->[0], 0, 0) = "/" if $type eq "E";
+ return $token unless @_;
+ for (@_) {
+ return $token if $token->[0] eq $_;
+ }
+ }
+}
+
+
+sub _textify {
+ my($self, $token) = @_;
+ my $tag = $token->[1];
+ return undef unless exists $self->{textify}{$tag};
+
+ my $alt = $self->{textify}{$tag};
+ my $text;
+ if (ref($alt)) {
+ $text = &$alt(@$token);
+ } else {
+ $text = $token->[2]{$alt || "alt"};
+ $text = "[\U$tag]" unless defined $text;
+ }
+ return $text;
+}
+
+
+sub get_text
+{
+ my $self = shift;
+ my @text;
+ while (my $token = $self->get_token) {
+ my $type = $token->[0];
+ if ($type eq "T") {
+ my $text = $token->[1];
+ decode_entities($text) unless $token->[2];
+ push(@text, $text);
+ } elsif ($type =~ /^[SE]$/) {
+ my $tag = $token->[1];
+ if ($type eq "S") {
+ if (defined(my $text = _textify($self, $token))) {
+ push(@text, $text);
+ next;
+ }
+ } else {
+ $tag = "/$tag";
+ }
+ if (!@_ || grep $_ eq $tag, @_) {
+ $self->unget_token($token);
+ last;
+ }
+ push(@text, " ")
+ if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
+ }
+ }
+ join("", @text);
+}
+
+
+sub get_trimmed_text
+{
+ my $self = shift;
+ my $text = $self->get_text(@_);
+ $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
+ $text;
+}
+
+sub get_phrase {
+ my $self = shift;
+ my @text;
+ while (my $token = $self->get_token) {
+ my $type = $token->[0];
+ if ($type eq "T") {
+ my $text = $token->[1];
+ decode_entities($text) unless $token->[2];
+ push(@text, $text);
+ } elsif ($type =~ /^[SE]$/) {
+ my $tag = $token->[1];
+ if ($type eq "S") {
+ if (defined(my $text = _textify($self, $token))) {
+ push(@text, $text);
+ next;
+ }
+ }
+ if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
+ $self->unget_token($token);
+ last;
+ }
+ push(@text, " ") if $tag eq "br";
+ }
+ }
+ my $text = join("", @text);
+ $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
+ $text;
+}
+
+1;
+
+
+__END__
+
+=head1 NAME
+
+HTML::TokeParser - Alternative HTML::Parser interface
+
+=head1 SYNOPSIS
+
+ require HTML::TokeParser;
+ $p = HTML::TokeParser->new("index.html") ||
+ die "Can't open: $!";
+ $p->empty_element_tags(1); # configure its behaviour
+
+ while (my $token = $p->get_token) {
+ #...
+ }
+
+=head1 DESCRIPTION
+
+The C<HTML::TokeParser> is an alternative interface to the
+C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a
+predeclared set of token types. If you wish the tokens to be reported
+differently you probably want to use the C<HTML::PullParser> directly.
+
+The following methods are available:
+
+=over 4
+
+=item $p = HTML::TokeParser->new( $filename, %opt );
+
+=item $p = HTML::TokeParser->new( $filehandle, %opt );
+
+=item $p = HTML::TokeParser->new( \$document, %opt );
+
+The object constructor argument is either a file name, a file handle
+object, or the complete document to be parsed. Extra options can be
+provided as key/value pairs and are processed as documented by the base
+classes.
+
+If the argument is a plain scalar, then it is taken as the name of a
+file to be opened and parsed. If the file can't be opened for
+reading, then the constructor will return C<undef> and $! will tell
+you why it failed.
+
+If the argument is a reference to a plain scalar, then this scalar is
+taken to be the literal document to parse. The value of this
+scalar should not be changed before all tokens have been extracted.
+
+Otherwise the argument is taken to be some object that the
+C<HTML::TokeParser> can read() from when it needs more data. Typically
+it will be a filehandle of some kind. The stream will be read() until
+EOF, but not closed.
+
+A newly constructed C<HTML::TokeParser> differ from its base classes
+by having the C<unbroken_text> attribute enabled by default. See
+L<HTML::Parser> for a description of this and other attributes that
+influence how the document is parsed. It is often a good idea to enable
+C<empty_element_tags> behaviour.
+
+Note that the parsing result will likely not be valid if raw undecoded
+UTF-8 is used as a source. When parsing UTF-8 encoded files turn
+on UTF-8 decoding:
+
+ open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
+ my $p = HTML::TokeParser->new( $fh );
+ # ...
+
+If a $filename is passed to the constructor the file will be opened in
+raw mode and the parsing result will only be valid if its content is
+Latin-1 or pure ASCII.
+
+If parsing from an UTF-8 encoded string buffer decode it first:
+
+ utf8::decode($document);
+ my $p = HTML::TokeParser->new( \$document );
+ # ...
+
+=item $p->get_token
+
+This method will return the next I<token> found in the HTML document,
+or C<undef> at the end of the document. The token is returned as an
+array reference. The first element of the array will be a string
+denoting the type of this token: "S" for start tag, "E" for end tag,
+"T" for text, "C" for comment, "D" for declaration, and "PI" for
+process instructions. The rest of the token array depend on the type
+like this:
+
+ ["S", $tag, $attr, $attrseq, $text]
+ ["E", $tag, $text]
+ ["T", $text, $is_data]
+ ["C", $text]
+ ["D", $text]
+ ["PI", $token0, $text]
+
+where $attr is a hash reference, $attrseq is an array reference and
+the rest are plain scalars. The L<HTML::Parser/Argspec> explains the
+details.
+
+=item $p->unget_token( @tokens )
+
+If you find you have read too many tokens you can push them back,
+so that they are returned the next time $p->get_token is called.
+
+=item $p->get_tag
+
+=item $p->get_tag( @tags )
+
+This method returns the next start or end tag (skipping any other
+tokens), or C<undef> if there are no more tags in the document. If
+one or more arguments are given, then we skip tokens until one of the
+specified tag types is found. For example:
+
+ $p->get_tag("font", "/font");
+
+will find the next start or end tag for a font-element.
+
+The tag information is returned as an array reference in the same form
+as for $p->get_token above, but the type code (first element) is
+missing. A start tag will be returned like this:
+
+ [$tag, $attr, $attrseq, $text]
+
+The tagname of end tags are prefixed with "/", i.e. end tag is
+returned like this:
+
+ ["/$tag", $text]
+
+=item $p->get_text
+
+=item $p->get_text( @endtags )
+
+This method returns all text found at the current position. It will
+return a zero length string if the next token is not text. Any
+entities will be converted to their corresponding character.
+
+If one or more arguments are given, then we return all text occurring
+before the first of the specified tags found. For example:
+
+ $p->get_text("p", "br");
+
+will return the text up to either a paragraph of linebreak element.
+
+The text might span tags that should be I<textified>. This is
+controlled by the $p->{textify} attribute, which is a hash that
+defines how certain tags can be treated as text. If the name of a
+start tag matches a key in this hash then this tag is converted to
+text. The hash value is used to specify which tag attribute to obtain
+the text from. If this tag attribute is missing, then the upper case
+name of the tag enclosed in brackets is returned, e.g. "[IMG]". The
+hash value can also be a subroutine reference. In this case the
+routine is called with the start tag token content as its argument and
+the return value is treated as the text.
+
+The default $p->{textify} value is:
+
+ {img => "alt", applet => "alt"}
+
+This means that <IMG> and <APPLET> tags are treated as text, and that
+the text to substitute can be found in the ALT attribute.
+
+=item $p->get_trimmed_text
+
+=item $p->get_trimmed_text( @endtags )
+
+Same as $p->get_text above, but will collapse any sequences of white
+space to a single space character. Leading and trailing white space is
+removed.
+
+=item $p->get_phrase
+
+This will return all text found at the current position ignoring any
+phrasal-level tags. Text is extracted until the first non
+phrasal-level tag. Textification of tags is the same as for
+get_text(). This method will collapse white space in the same way as
+get_trimmed_text() does.
+
+The definition of <i>phrasal-level tags</i> is obtained from the
+HTML::Tagset module.
+
+=back
+
+=head1 EXAMPLES
+
+This example extracts all links from a document. It will print one
+line for each link, containing the URL and the textual description
+between the <A>...</A> tags:
+
+ use HTML::TokeParser;
+ $p = HTML::TokeParser->new(shift||"index.html");
+
+ while (my $token = $p->get_tag("a")) {
+ my $url = $token->[1]{href} || "-";
+ my $text = $p->get_trimmed_text("/a");
+ print "$url\t$text\n";
+ }
+
+This example extract the <TITLE> from the document:
+
+ use HTML::TokeParser;
+ $p = HTML::TokeParser->new(shift||"index.html");
+ if ($p->get_tag("title")) {
+ my $title = $p->get_trimmed_text;
+ print "Title: $title\n";
+ }
+
+=head1 SEE ALSO
+
+L<HTML::PullParser>, L<HTML::Parser>
+
+=head1 COPYRIGHT
+
+Copyright 1998-2005 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut