HTML-Parser-3.71HEAD HTML-Parser-3.71 master

author: Lorry Tar Creator <lorry-tar-importer@lorry> 2013-05-08 22:21:52 +0000
committer: Lorry Tar Creator <lorry-tar-importer@lorry> 2013-05-08 22:21:52 +0000
commit: 2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree: 4734ccd522c71dd455879162006742002f8c1565 /lib
download: HTML-Parser-tarball-master.tar.gz
6 files changed, 1675 insertions, 0 deletions
diff --git a/lib/HTML/Entities.pm b/lib/HTML/Entities.pm
new file mode 100644
index 0000000..ecd8e0d
--- /dev/null
+++ b/lib/HTML/Entities.pm
@@ -0,0 +1,483 @@
+package HTML::Entities;
+
+=encoding utf8
+
+=head1 NAME
+
+HTML::Entities - Encode or decode strings with HTML entities
+
+=head1 SYNOPSIS
+
+ use HTML::Entities;
+
+ $a = "V&aring;re norske tegn b&oslash;r &#230res";
+ decode_entities($a);
+ encode_entities($a, "\200-\377");
+
+For example, this:
+
+ $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé";
+ print encode_entities($input), "\n"
+
+Prints this out:
+
+ vis-&agrave;-vis Beyonc&eacute;'s na&iuml;ve
+ papier-m&acirc;ch&eacute; r&eacute;sum&eacute;
+
+=head1 DESCRIPTION
+
+This module deals with encoding and decoding of strings with HTML
+character entities.  The module provides the following functions:
+
+=over 4
+
+=item decode_entities( $string, ... )
+
+This routine replaces HTML entities found in the $string with the
+corresponding Unicode character.  Unrecognized entities are left alone.
+
+If multiple strings are provided as argument they are each decoded
+separately and the same number of strings are returned.
+
+If called in void context the arguments are decoded in-place.
+
+This routine is exported by default.
+
+=item _decode_entities( $string, \%entity2char )
+
+=item _decode_entities( $string, \%entity2char, $expand_prefix )
+
+This will in-place replace HTML entities in $string.  The %entity2char
+hash must be provided.  Named entities not found in the %entity2char
+hash are left alone.  Numeric entities are expanded unless their value
+overflow.
+
+The keys in %entity2char are the entity names to be expanded and their
+values are what they should expand into.  The values do not have to be
+single character strings.  If a key has ";" as suffix,
+then occurrences in $string are only expanded if properly terminated
+with ";".  Entities without ";" will be expanded regardless of how
+they are terminated for compatibility with how common browsers treat
+entities in the Latin-1 range.
+
+If $expand_prefix is TRUE then entities without trailing ";" in
+%entity2char will even be expanded as a prefix of a longer
+unrecognized name.  The longest matching name in %entity2char will be
+used. This is mainly present for compatibility with an MSIE
+misfeature.
+
+   $string = "foo&nbspbar";
+   _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1);
+   print $string;  # will print "foo bar"
+
+This routine is exported by default.
+
+=item encode_entities( $string )
+
+=item encode_entities( $string, $unsafe_chars )
+
+This routine replaces unsafe characters in $string with their entity
+representation. A second argument can be given to specify which characters to
+consider unsafe.  The unsafe characters is specified using the regular
+expression character class syntax (what you find within brackets in regular
+expressions).
+
+The default set of characters to encode are control chars, high-bit chars, and
+the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters.  But this,
+for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< "
+>> characters:
+
+  $encoded = encode_entities($input, '<>&"');
+
+and this would only encode non-plain ascii:
+
+  $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e');
+
+This routine is exported by default.
+
+=item encode_entities_numeric( $string )
+
+=item encode_entities_numeric( $string, $unsafe_chars )
+
+This routine works just like encode_entities, except that the replacement
+entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>.  For
+example, C<encode_entities("r\xF4le")> returns "r&ocirc;le", but
+C<encode_entities_numeric("r\xF4le")> returns "r&#xF4;le".
+
+This routine is I<not> exported by default.  But you can always
+export it with C<use HTML::Entities qw(encode_entities_numeric);>
+or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);>
+
+=back
+
+All these routines modify the string passed as the first argument, if
+called in a void context.  In scalar and array contexts, the encoded or
+decoded string is returned (without changing the input string).
+
+If you prefer not to import these routines into your namespace, you can
+call them as:
+
+  use HTML::Entities ();
+  $decoded = HTML::Entities::decode($a);
+  $encoded = HTML::Entities::encode($a);
+  $encoded = HTML::Entities::encode_numeric($a);
+
+The module can also export the %char2entity and the %entity2char
+hashes, which contain the mapping from all characters to the
+corresponding entities (and vice versa, respectively).
+
+=head1 COPYRIGHT
+
+Copyright 1995-2006 Gisle Aas. All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+use strict;
+use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION);
+use vars qw(%entity2char %char2entity);
+
+require 5.004;
+require Exporter;
+@ISA = qw(Exporter);
+
+@EXPORT = qw(encode_entities decode_entities _decode_entities);
+@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric);
+
+$VERSION = "3.69";
+sub Version { $VERSION; }
+
+require HTML::Parser;  # for fast XS implemented decode_entities
+
+
+%entity2char = (
+ # Some normal chars that have special meaning in SGML context
+ amp    => '&',  # ampersand 
+'gt'    => '>',  # greater than
+'lt'    => '<',  # less than
+ quot   => '"',  # double quote
+ apos   => "'",  # single quote
+
+ # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML
+ AElig	=> chr(198),  # capital AE diphthong (ligature)
+ Aacute	=> chr(193),  # capital A, acute accent
+ Acirc	=> chr(194),  # capital A, circumflex accent
+ Agrave	=> chr(192),  # capital A, grave accent
+ Aring	=> chr(197),  # capital A, ring
+ Atilde	=> chr(195),  # capital A, tilde
+ Auml	=> chr(196),  # capital A, dieresis or umlaut mark
+ Ccedil	=> chr(199),  # capital C, cedilla
+ ETH	=> chr(208),  # capital Eth, Icelandic
+ Eacute	=> chr(201),  # capital E, acute accent
+ Ecirc	=> chr(202),  # capital E, circumflex accent
+ Egrave	=> chr(200),  # capital E, grave accent
+ Euml	=> chr(203),  # capital E, dieresis or umlaut mark
+ Iacute	=> chr(205),  # capital I, acute accent
+ Icirc	=> chr(206),  # capital I, circumflex accent
+ Igrave	=> chr(204),  # capital I, grave accent
+ Iuml	=> chr(207),  # capital I, dieresis or umlaut mark
+ Ntilde	=> chr(209),  # capital N, tilde
+ Oacute	=> chr(211),  # capital O, acute accent
+ Ocirc	=> chr(212),  # capital O, circumflex accent
+ Ograve	=> chr(210),  # capital O, grave accent
+ Oslash	=> chr(216),  # capital O, slash
+ Otilde	=> chr(213),  # capital O, tilde
+ Ouml	=> chr(214),  # capital O, dieresis or umlaut mark
+ THORN	=> chr(222),  # capital THORN, Icelandic
+ Uacute	=> chr(218),  # capital U, acute accent
+ Ucirc	=> chr(219),  # capital U, circumflex accent
+ Ugrave	=> chr(217),  # capital U, grave accent
+ Uuml	=> chr(220),  # capital U, dieresis or umlaut mark
+ Yacute	=> chr(221),  # capital Y, acute accent
+ aacute	=> chr(225),  # small a, acute accent
+ acirc	=> chr(226),  # small a, circumflex accent
+ aelig	=> chr(230),  # small ae diphthong (ligature)
+ agrave	=> chr(224),  # small a, grave accent
+ aring	=> chr(229),  # small a, ring
+ atilde	=> chr(227),  # small a, tilde
+ auml	=> chr(228),  # small a, dieresis or umlaut mark
+ ccedil	=> chr(231),  # small c, cedilla
+ eacute	=> chr(233),  # small e, acute accent
+ ecirc	=> chr(234),  # small e, circumflex accent
+ egrave	=> chr(232),  # small e, grave accent
+ eth	=> chr(240),  # small eth, Icelandic
+ euml	=> chr(235),  # small e, dieresis or umlaut mark
+ iacute	=> chr(237),  # small i, acute accent
+ icirc	=> chr(238),  # small i, circumflex accent
+ igrave	=> chr(236),  # small i, grave accent
+ iuml	=> chr(239),  # small i, dieresis or umlaut mark
+ ntilde	=> chr(241),  # small n, tilde
+ oacute	=> chr(243),  # small o, acute accent
+ ocirc	=> chr(244),  # small o, circumflex accent
+ ograve	=> chr(242),  # small o, grave accent
+ oslash	=> chr(248),  # small o, slash
+ otilde	=> chr(245),  # small o, tilde
+ ouml	=> chr(246),  # small o, dieresis or umlaut mark
+ szlig	=> chr(223),  # small sharp s, German (sz ligature)
+ thorn	=> chr(254),  # small thorn, Icelandic
+ uacute	=> chr(250),  # small u, acute accent
+ ucirc	=> chr(251),  # small u, circumflex accent
+ ugrave	=> chr(249),  # small u, grave accent
+ uuml	=> chr(252),  # small u, dieresis or umlaut mark
+ yacute	=> chr(253),  # small y, acute accent
+ yuml	=> chr(255),  # small y, dieresis or umlaut mark
+
+ # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96)
+ copy   => chr(169),  # copyright sign
+ reg    => chr(174),  # registered sign
+ nbsp   => chr(160),  # non breaking space
+
+ # Additional ISO-8859/1 entities listed in rfc1866 (section 14)
+ iexcl  => chr(161),
+ cent   => chr(162),
+ pound  => chr(163),
+ curren => chr(164),
+ yen    => chr(165),
+ brvbar => chr(166),
+ sect   => chr(167),
+ uml    => chr(168),
+ ordf   => chr(170),
+ laquo  => chr(171),
+'not'   => chr(172),    # not is a keyword in perl
+ shy    => chr(173),
+ macr   => chr(175),
+ deg    => chr(176),
+ plusmn => chr(177),
+ sup1   => chr(185),
+ sup2   => chr(178),
+ sup3   => chr(179),
+ acute  => chr(180),
+ micro  => chr(181),
+ para   => chr(182),
+ middot => chr(183),
+ cedil  => chr(184),
+ ordm   => chr(186),
+ raquo  => chr(187),
+ frac14 => chr(188),
+ frac12 => chr(189),
+ frac34 => chr(190),
+ iquest => chr(191),
+'times' => chr(215),    # times is a keyword in perl
+ divide => chr(247),
+
+ ( $] > 5.007 ? (
+  'OElig;'    => chr(338),
+  'oelig;'    => chr(339),
+  'Scaron;'   => chr(352),
+  'scaron;'   => chr(353),
+  'Yuml;'     => chr(376),
+  'fnof;'     => chr(402),
+  'circ;'     => chr(710),
+  'tilde;'    => chr(732),
+  'Alpha;'    => chr(913),
+  'Beta;'     => chr(914),
+  'Gamma;'    => chr(915),
+  'Delta;'    => chr(916),
+  'Epsilon;'  => chr(917),
+  'Zeta;'     => chr(918),
+  'Eta;'      => chr(919),
+  'Theta;'    => chr(920),
+  'Iota;'     => chr(921),
+  'Kappa;'    => chr(922),
+  'Lambda;'   => chr(923),
+  'Mu;'       => chr(924),
+  'Nu;'       => chr(925),
+  'Xi;'       => chr(926),
+  'Omicron;'  => chr(927),
+  'Pi;'       => chr(928),
+  'Rho;'      => chr(929),
+  'Sigma;'    => chr(931),
+  'Tau;'      => chr(932),
+  'Upsilon;'  => chr(933),
+  'Phi;'      => chr(934),
+  'Chi;'      => chr(935),
+  'Psi;'      => chr(936),
+  'Omega;'    => chr(937),
+  'alpha;'    => chr(945),
+  'beta;'     => chr(946),
+  'gamma;'    => chr(947),
+  'delta;'    => chr(948),
+  'epsilon;'  => chr(949),
+  'zeta;'     => chr(950),
+  'eta;'      => chr(951),
+  'theta;'    => chr(952),
+  'iota;'     => chr(953),
+  'kappa;'    => chr(954),
+  'lambda;'   => chr(955),
+  'mu;'       => chr(956),
+  'nu;'       => chr(957),
+  'xi;'       => chr(958),
+  'omicron;'  => chr(959),
+  'pi;'       => chr(960),
+  'rho;'      => chr(961),
+  'sigmaf;'   => chr(962),
+  'sigma;'    => chr(963),
+  'tau;'      => chr(964),
+  'upsilon;'  => chr(965),
+  'phi;'      => chr(966),
+  'chi;'      => chr(967),
+  'psi;'      => chr(968),
+  'omega;'    => chr(969),
+  'thetasym;' => chr(977),
+  'upsih;'    => chr(978),
+  'piv;'      => chr(982),
+  'ensp;'     => chr(8194),
+  'emsp;'     => chr(8195),
+  'thinsp;'   => chr(8201),
+  'zwnj;'     => chr(8204),
+  'zwj;'      => chr(8205),
+  'lrm;'      => chr(8206),
+  'rlm;'      => chr(8207),
+  'ndash;'    => chr(8211),
+  'mdash;'    => chr(8212),
+  'lsquo;'    => chr(8216),
+  'rsquo;'    => chr(8217),
+  'sbquo;'    => chr(8218),
+  'ldquo;'    => chr(8220),
+  'rdquo;'    => chr(8221),
+  'bdquo;'    => chr(8222),
+  'dagger;'   => chr(8224),
+  'Dagger;'   => chr(8225),
+  'bull;'     => chr(8226),
+  'hellip;'   => chr(8230),
+  'permil;'   => chr(8240),
+  'prime;'    => chr(8242),
+  'Prime;'    => chr(8243),
+  'lsaquo;'   => chr(8249),
+  'rsaquo;'   => chr(8250),
+  'oline;'    => chr(8254),
+  'frasl;'    => chr(8260),
+  'euro;'     => chr(8364),
+  'image;'    => chr(8465),
+  'weierp;'   => chr(8472),
+  'real;'     => chr(8476),
+  'trade;'    => chr(8482),
+  'alefsym;'  => chr(8501),
+  'larr;'     => chr(8592),
+  'uarr;'     => chr(8593),
+  'rarr;'     => chr(8594),
+  'darr;'     => chr(8595),
+  'harr;'     => chr(8596),
+  'crarr;'    => chr(8629),
+  'lArr;'     => chr(8656),
+  'uArr;'     => chr(8657),
+  'rArr;'     => chr(8658),
+  'dArr;'     => chr(8659),
+  'hArr;'     => chr(8660),
+  'forall;'   => chr(8704),
+  'part;'     => chr(8706),
+  'exist;'    => chr(8707),
+  'empty;'    => chr(8709),
+  'nabla;'    => chr(8711),
+  'isin;'     => chr(8712),
+  'notin;'    => chr(8713),
+  'ni;'       => chr(8715),
+  'prod;'     => chr(8719),
+  'sum;'      => chr(8721),
+  'minus;'    => chr(8722),
+  'lowast;'   => chr(8727),
+  'radic;'    => chr(8730),
+  'prop;'     => chr(8733),
+  'infin;'    => chr(8734),
+  'ang;'      => chr(8736),
+  'and;'      => chr(8743),
+  'or;'       => chr(8744),
+  'cap;'      => chr(8745),
+  'cup;'      => chr(8746),
+  'int;'      => chr(8747),
+  'there4;'   => chr(8756),
+  'sim;'      => chr(8764),
+  'cong;'     => chr(8773),
+  'asymp;'    => chr(8776),
+  'ne;'       => chr(8800),
+  'equiv;'    => chr(8801),
+  'le;'       => chr(8804),
+  'ge;'       => chr(8805),
+  'sub;'      => chr(8834),
+  'sup;'      => chr(8835),
+  'nsub;'     => chr(8836),
+  'sube;'     => chr(8838),
+  'supe;'     => chr(8839),
+  'oplus;'    => chr(8853),
+  'otimes;'   => chr(8855),
+  'perp;'     => chr(8869),
+  'sdot;'     => chr(8901),
+  'lceil;'    => chr(8968),
+  'rceil;'    => chr(8969),
+  'lfloor;'   => chr(8970),
+  'rfloor;'   => chr(8971),
+  'lang;'     => chr(9001),
+  'rang;'     => chr(9002),
+  'loz;'      => chr(9674),
+  'spades;'   => chr(9824),
+  'clubs;'    => chr(9827),
+  'hearts;'   => chr(9829),
+  'diams;'    => chr(9830),
+ ) : ())
+);
+
+
+# Make the opposite mapping
+while (my($entity, $char) = each(%entity2char)) {
+    $entity =~ s/;\z//;
+    $char2entity{$char} = "&$entity;";
+}
+delete $char2entity{"'"};  # only one-way decoding
+
+# Fill in missing entities
+for (0 .. 255) {
+    next if exists $char2entity{chr($_)};
+    $char2entity{chr($_)} = "&#$_;";
+}
+
+my %subst;  # compiled encoding regexps
+
+sub encode_entities
+{
+    return undef unless defined $_[0];
+    my $ref;
+    if (defined wantarray) {
+	my $x = $_[0];
+	$ref = \$x;     # copy
+    } else {
+	$ref = \$_[0];  # modify in-place
+    }
+    if (defined $_[1] and length $_[1]) {
+	unless (exists $subst{$_[1]}) {
+	    # Because we can't compile regex we fake it with a cached sub
+	    my $chars = $_[1];
+	    $chars =~ s,(?<!\\)([]/]),\\$1,g;
+	    $chars =~ s,(?<!\\)\\\z,\\\\,;
+	    my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }";
+	    $subst{$_[1]} = eval $code;
+	    die( $@ . " while trying to turn range: \"$_[1]\"\n "
+	      . "into code: $code\n "
+	    ) if $@;
+	}
+	&{$subst{$_[1]}}($$ref);
+    } else {
+	# Encode control chars, high bit chars and '<', '&', '>', ''' and '"'
+	$$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge;
+    }
+    $$ref;
+}
+
+sub encode_entities_numeric {
+    local %char2entity;
+    return &encode_entities;   # a goto &encode_entities wouldn't work
+}
+
+
+sub num_entity {
+    sprintf "&#x%X;", ord($_[0]);
+}
+
+# Set up aliases
+*encode = \&encode_entities;
+*encode_numeric = \&encode_entities_numeric;
+*encode_numerically = \&encode_entities_numeric;
+*decode = \&decode_entities;
+
+1;
diff --git a/lib/HTML/Filter.pm b/lib/HTML/Filter.pm
new file mode 100644
index 0000000..c5aa16e
--- /dev/null
+++ b/lib/HTML/Filter.pm
@@ -0,0 +1,112 @@
+package HTML::Filter;
+
+use strict;
+use vars qw(@ISA $VERSION);
+
+require HTML::Parser;
+@ISA=qw(HTML::Parser);
+
+$VERSION = "3.57";
+
+sub declaration { $_[0]->output("<!$_[1]>")     }
+sub process     { $_[0]->output($_[2])          }
+sub comment     { $_[0]->output("<!--$_[1]-->") }
+sub start       { $_[0]->output($_[4])          }
+sub end         { $_[0]->output($_[2])          }
+sub text        { $_[0]->output($_[1])          }
+
+sub output      { print $_[1] }
+
+1;
+
+__END__
+
+=head1 NAME
+
+HTML::Filter - Filter HTML text through the parser
+
+=head1 NOTE
+
+B<This module is deprecated.> The C<HTML::Parser> now provides the
+functionally of C<HTML::Filter> much more efficiently with the the
+C<default> handler.
+
+=head1 SYNOPSIS
+
+ require HTML::Filter;
+ $p = HTML::Filter->new->parse_file("index.html");
+
+=head1 DESCRIPTION
+
+C<HTML::Filter> is an HTML parser that by default prints the
+original text of each HTML element (a slow version of cat(1) basically).
+The callback methods may be overridden to modify the filtering for some
+HTML elements and you can override output() method which is called to
+print the HTML text.
+
+C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that
+the document should be given to the parser by calling the $p->parse()
+or $p->parse_file() methods.
+
+=head1 EXAMPLES
+
+The first example is a filter that will remove all comments from an
+HTML file.  This is achieved by simply overriding the comment method
+to do nothing.
+
+  package CommentStripper;
+  require HTML::Filter;
+  @ISA=qw(HTML::Filter);
+  sub comment { }  # ignore comments
+
+The second example shows a filter that will remove any E<lt>TABLE>s
+found in the HTML file.  We specialize the start() and end() methods
+to count table tags and then make output not happen when inside a
+table.
+
+  package TableStripper;
+  require HTML::Filter;
+  @ISA=qw(HTML::Filter);
+  sub start
+  {
+     my $self = shift;
+     $self->{table_seen}++ if $_[0] eq "table";
+     $self->SUPER::start(@_);
+  }
+
+  sub end
+  {
+     my $self = shift;
+     $self->SUPER::end(@_);
+     $self->{table_seen}-- if $_[0] eq "table";
+  }
+
+  sub output
+  {
+      my $self = shift;
+      unless ($self->{table_seen}) {
+	  $self->SUPER::output(@_);
+      }
+  }
+
+If you want to collect the parsed text internally you might want to do
+something like this:
+
+  package FilterIntoString;
+  require HTML::Filter;
+  @ISA=qw(HTML::Filter);
+  sub output { push(@{$_[0]->{fhtml}}, $_[1]) }
+  sub filtered_html { join("", @{$_[0]->{fhtml}}) }
+
+=head1 SEE ALSO
+
+L<HTML::Parser>
+
+=head1 COPYRIGHT
+
+Copyright 1997-1999 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
diff --git a/lib/HTML/HeadParser.pm b/lib/HTML/HeadParser.pm
new file mode 100644
index 0000000..28e9cac
--- /dev/null
+++ b/lib/HTML/HeadParser.pm
@@ -0,0 +1,315 @@
+package HTML::HeadParser;
+
+=head1 NAME
+
+HTML::HeadParser - Parse <HEAD> section of a HTML document
+
+=head1 SYNOPSIS
+
+ require HTML::HeadParser;
+ $p = HTML::HeadParser->new;
+ $p->parse($text) and  print "not finished";
+
+ $p->header('Title')          # to access <title>....</title>
+ $p->header('Content-Base')   # to access <base href="http://...">
+ $p->header('Foo')            # to access <meta http-equiv="Foo" content="...">
+ $p->header('X-Meta-Author')  # to access <meta name="author" content="...">
+ $p->header('X-Meta-Charset') # to access <meta charset="...">
+
+=head1 DESCRIPTION
+
+The C<HTML::HeadParser> is a specialized (and lightweight)
+C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD>
+section of an HTML document.  The parse() method
+will return a FALSE value as soon as some E<lt>BODY> element or body
+text are found, and should not be called again after this.
+
+Note that the C<HTML::HeadParser> might get confused if raw undecoded
+UTF-8 is passed to the parse() method.  Make sure the strings are
+properly decoded before passing them on.
+
+The C<HTML::HeadParser> keeps a reference to a header object, and the
+parser will update this header object as the various elements of the
+E<lt>HEAD> section of the HTML document are recognized.  The following
+header fields are affected:
+
+=over 4
+
+=item Content-Base:
+
+The I<Content-Base> header is initialized from the E<lt>base
+href="..."> element.
+
+=item Title:
+
+The I<Title> header is initialized from the E<lt>title>...E<lt>/title>
+element.
+
+=item Isindex:
+
+The I<Isindex> header will be added if there is a E<lt>isindex>
+element in the E<lt>head>.  The header value is initialized from the
+I<prompt> attribute if it is present.  If no I<prompt> attribute is
+given it will have '?' as the value.
+
+=item X-Meta-Foo:
+
+All E<lt>meta> elements containing a C<name> attribute will result in
+headers using the prefix C<X-Meta-> appended with the value of the
+C<name> attribute as the name of the header, and the value of the
+C<content> attribute as the pushed header value.
+
+E<lt>meta> elements containing a C<http-equiv> attribute will result
+in headers as in above, but without the C<X-Meta-> prefix in the
+header name.
+
+E<lt>meta> elements containing a C<charset> attribute will result in
+an C<X-Meta-Charset> header, using the value of the C<charset>
+attribute as the pushed header value.
+
+The ':' character can't be represented in header field names, so
+if the meta element contains this char it's substituted with '-'
+before forming the field name.
+
+=back
+
+=head1 METHODS
+
+The following methods (in addition to those provided by the
+superclass) are available:
+
+=over 4
+
+=cut
+
+
+require HTML::Parser;
+@ISA = qw(HTML::Parser);
+
+use HTML::Entities ();
+
+use strict;
+use vars qw($VERSION $DEBUG);
+#$DEBUG = 1;
+$VERSION = "3.71";
+
+=item $hp = HTML::HeadParser->new
+
+=item $hp = HTML::HeadParser->new( $header )
+
+The object constructor.  The optional $header argument should be a
+reference to an object that implement the header() and push_header()
+methods as defined by the C<HTTP::Headers> class.  Normally it will be
+of some class that is a or delegates to the C<HTTP::Headers> class.
+
+If no $header is given C<HTML::HeadParser> will create an
+C<HTTP::Headers> object by itself (initially empty).
+
+=cut
+
+sub new
+{
+    my($class, $header) = @_;
+    unless ($header) {
+	require HTTP::Headers;
+	$header = HTTP::Headers->new;
+    }
+
+    my $self = $class->SUPER::new(api_version => 3,
+				  start_h => ["start", "self,tagname,attr"],
+				  end_h   => ["end",   "self,tagname"],
+				  text_h  => ["text",  "self,text"],
+				  ignore_elements => [qw(script style)],
+				 );
+    $self->{'header'} = $header;
+    $self->{'tag'} = '';   # name of active element that takes textual content
+    $self->{'text'} = '';  # the accumulated text associated with the element
+    $self;
+}
+
+=item $hp->header;
+
+Returns a reference to the header object.
+
+=item $hp->header( $key )
+
+Returns a header value.  It is just a shorter way to write
+C<$hp-E<gt>header-E<gt>header($key)>.
+
+=cut
+
+sub header
+{
+    my $self = shift;
+    return $self->{'header'} unless @_;
+    $self->{'header'}->header(@_);
+}
+
+sub as_string    # legacy
+{
+    my $self = shift;
+    $self->{'header'}->as_string;
+}
+
+sub flush_text   # internal
+{
+    my $self = shift;
+    my $tag  = $self->{'tag'};
+    my $text = $self->{'text'};
+    $text =~ s/^\s+//;
+    $text =~ s/\s+$//;
+    $text =~ s/\s+/ /g;
+    print "FLUSH $tag => '$text'\n"  if $DEBUG;
+    if ($tag eq 'title') {
+	my $decoded;
+	$decoded = utf8::decode($text) if $self->utf8_mode && defined &utf8::decode;
+	HTML::Entities::decode($text);
+	utf8::encode($text) if $decoded;
+	$self->{'header'}->push_header(Title => $text);
+    }
+    $self->{'tag'} = $self->{'text'} = '';
+}
+
+# This is an quote from the HTML3.2 DTD which shows which elements
+# that might be present in a <HEAD>...</HEAD>.  Also note that the
+# <HEAD> tags themselves might be missing:
+#
+# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? &
+#                            SCRIPT* & META* & LINK*">
+#
+# <!ELEMENT HEAD O O  (%head.content)>
+#
+# From HTML 4.01:
+#
+# <!ENTITY % head.misc "SCRIPT|STYLE|META|LINK|OBJECT">
+# <!ENTITY % head.content "TITLE & BASE?">
+# <!ELEMENT HEAD O O (%head.content;) +(%head.misc;)>
+#
+# From HTML 5 as of WD-html5-20090825:
+#
+# One or more elements of metadata content, [...]
+# => base, command, link, meta, noscript, script, style, title
+
+sub start
+{
+    my($self, $tag, $attr) = @_;  # $attr is reference to a HASH
+    print "START[$tag]\n" if $DEBUG;
+    $self->flush_text if $self->{'tag'};
+    if ($tag eq 'meta') {
+	my $key = $attr->{'http-equiv'};
+	if (!defined($key) || !length($key)) {
+	    if ($attr->{name}) {
+		$key = "X-Meta-\u$attr->{name}";
+	    } elsif ($attr->{charset}) { # HTML 5 <meta charset="...">
+		$key = "X-Meta-Charset";
+		$self->{header}->push_header($key => $attr->{charset});
+		return;
+	    } else {
+		return;
+	    }
+	}
+	$key =~ s/:/-/g;
+	$self->{'header'}->push_header($key => $attr->{content});
+    } elsif ($tag eq 'base') {
+	return unless exists $attr->{href};
+	(my $base = $attr->{href}) =~ s/^\s+//; $base =~ s/\s+$//; # HTML5
+	$self->{'header'}->push_header('Content-Base' => $base);
+    } elsif ($tag eq 'isindex') {
+	# This is a non-standard header.  Perhaps we should just ignore
+	# this element
+	$self->{'header'}->push_header(Isindex => $attr->{prompt} || '?');
+    } elsif ($tag =~ /^(?:title|noscript|object|command)$/) {
+	# Just remember tag.  Initialize header when we see the end tag.
+	$self->{'tag'} = $tag;
+    } elsif ($tag eq 'link') {
+	return unless exists $attr->{href};
+	# <link href="http:..." rel="xxx" rev="xxx" title="xxx">
+	my $href = delete($attr->{href});
+	$href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5
+	my $h_val = "<$href>";
+	for (sort keys %{$attr}) {
+	    next if $_ eq "/";  # XHTML junk
+	    $h_val .= qq(; $_="$attr->{$_}");
+	}
+	$self->{'header'}->push_header(Link => $h_val);
+    } elsif ($tag eq 'head' || $tag eq 'html') {
+	# ignore
+    } else {
+	 # stop parsing
+	$self->eof;
+    }
+}
+
+sub end
+{
+    my($self, $tag) = @_;
+    print "END[$tag]\n" if $DEBUG;
+    $self->flush_text if $self->{'tag'};
+    $self->eof if $tag eq 'head';
+}
+
+sub text
+{
+    my($self, $text) = @_;
+    print "TEXT[$text]\n" if $DEBUG;
+    unless ($self->{first_chunk}) {
+	# drop Unicode BOM if found
+	if ($self->utf8_mode) {
+	    $text =~ s/^\xEF\xBB\xBF//;
+	}
+	else {
+	    $text =~ s/^\x{FEFF}//;
+	}
+	$self->{first_chunk}++;
+    }
+    my $tag = $self->{tag};
+    if (!$tag && $text =~ /\S/) {
+	# Normal text means start of body
+        $self->eof;
+	return;
+    }
+    return if $tag ne 'title';
+    $self->{'text'} .= $text;
+}
+
+BEGIN {
+    *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT;
+}
+
+1;
+
+__END__
+
+=back
+
+=head1 EXAMPLE
+
+ $h = HTTP::Headers->new;
+ $p = HTML::HeadParser->new($h);
+ $p->parse(<<EOT);
+ <title>Stupid example</title>
+ <base href="http://www.linpro.no/lwp/">
+ Normal text starts here.
+ EOT
+ undef $p;
+ print $h->title;   # should print "Stupid example"
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTTP::Headers>
+
+The C<HTTP::Headers> class is distributed as part of the
+I<libwww-perl> package.  If you don't have that distribution installed
+you need to provide the $header argument to the C<HTML::HeadParser>
+constructor with your own object that implements the documented
+protocol.
+
+=head1 COPYRIGHT
+
+Copyright 1996-2001 Gisle Aas. All rights reserved.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
diff --git a/lib/HTML/LinkExtor.pm b/lib/HTML/LinkExtor.pm
new file mode 100644
index 0000000..c2f08c6
--- /dev/null
+++ b/lib/HTML/LinkExtor.pm
@@ -0,0 +1,185 @@
+package HTML::LinkExtor;
+
+require HTML::Parser;
+@ISA = qw(HTML::Parser);
+$VERSION = "3.69";
+
+=head1 NAME
+
+HTML::LinkExtor - Extract links from an HTML document
+
+=head1 SYNOPSIS
+
+ require HTML::LinkExtor;
+ $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/");
+ sub cb {
+     my($tag, %links) = @_;
+     print "$tag @{[%links]}\n";
+ }
+ $p->parse_file("index.html");
+
+=head1 DESCRIPTION
+
+I<HTML::LinkExtor> is an HTML parser that extracts links from an
+HTML document.  The I<HTML::LinkExtor> is a subclass of
+I<HTML::Parser>. This means that the document should be given to the
+parser by calling the $p->parse() or $p->parse_file() methods.
+
+=cut
+
+use strict;
+use HTML::Tagset ();
+
+# legacy (some applications grabs this hash directly)
+use vars qw(%LINK_ELEMENT);
+*LINK_ELEMENT = \%HTML::Tagset::linkElements;
+
+=over 4
+
+=item $p = HTML::LinkExtor->new
+
+=item $p = HTML::LinkExtor->new( $callback )
+
+=item $p = HTML::LinkExtor->new( $callback, $base )
+
+The constructor takes two optional arguments. The first is a reference
+to a callback routine. It will be called as links are found. If a
+callback is not provided, then links are just accumulated internally
+and can be retrieved by calling the $p->links() method.
+
+The $base argument is an optional base URL used to absolutize all URLs found.
+You need to have the I<URI> module installed if you provide $base.
+
+The callback is called with the lowercase tag name as first argument,
+and then all link attributes as separate key/value pairs.  All
+non-link attributes are removed.
+
+=cut
+
+sub new
+{
+    my($class, $cb, $base) = @_;
+    my $self = $class->SUPER::new(
+                    start_h => ["_start_tag", "self,tagname,attr"],
+		    report_tags => [keys %HTML::Tagset::linkElements],
+	       );
+    $self->{extractlink_cb} = $cb;
+    if ($base) {
+	require URI;
+	$self->{extractlink_base} = URI->new($base);
+    }
+    $self;
+}
+
+sub _start_tag
+{
+    my($self, $tag, $attr) = @_;
+
+    my $base = $self->{extractlink_base};
+    my $links = $HTML::Tagset::linkElements{$tag};
+    $links = [$links] unless ref $links;
+
+    my @links;
+    my $a;
+    for $a (@$links) {
+	next unless exists $attr->{$a};
+	(my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5
+	push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link);
+    }
+    return unless @links;
+    $self->_found_link($tag, @links);
+}
+
+sub _found_link
+{
+    my $self = shift;
+    my $cb = $self->{extractlink_cb};
+    if ($cb) {
+	&$cb(@_);
+    } else {
+	push(@{$self->{'links'}}, [@_]);
+    }
+}
+
+=item $p->links
+
+Returns a list of all links found in the document.  The returned
+values will be anonymous arrays with the following elements:
+
+  [$tag, $attr => $url1, $attr2 => $url2,...]
+
+The $p->links method will also truncate the internal link list.  This
+means that if the method is called twice without any parsing
+between them the second call will return an empty list.
+
+Also note that $p->links will always be empty if a callback routine
+was provided when the I<HTML::LinkExtor> was created.
+
+=cut
+
+sub links
+{
+    my $self = shift;
+    exists($self->{'links'}) ? @{delete $self->{'links'}} : ();
+}
+
+# We override the parse_file() method so that we can clear the links
+# before we start a new file.
+sub parse_file
+{
+    my $self = shift;
+    delete $self->{'links'};
+    $self->SUPER::parse_file(@_);
+}
+
+=back
+
+=head1 EXAMPLE
+
+This is an example showing how you can extract links from a document
+received using LWP:
+
+  use LWP::UserAgent;
+  use HTML::LinkExtor;
+  use URI::URL;
+
+  $url = "http://www.perl.org/";  # for instance
+  $ua = LWP::UserAgent->new;
+
+  # Set up a callback that collect image links
+  my @imgs = ();
+  sub callback {
+     my($tag, %attr) = @_;
+     return if $tag ne 'img';  # we only look closer at <img ...>
+     push(@imgs, values %attr);
+  }
+
+  # Make the parser.  Unfortunately, we don't know the base yet
+  # (it might be different from $url)
+  $p = HTML::LinkExtor->new(\&callback);
+
+  # Request document and parse it as it arrives
+  $res = $ua->request(HTTP::Request->new(GET => $url),
+                      sub {$p->parse($_[0])});
+
+  # Expand all image URLs to absolute ones
+  my $base = $res->base;
+  @imgs = map { $_ = url($_, $base)->abs; } @imgs;
+
+  # Print them out
+  print join("\n", @imgs), "\n";
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL>
+
+=head1 COPYRIGHT
+
+Copyright 1996-2001 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+1;
diff --git a/lib/HTML/PullParser.pm b/lib/HTML/PullParser.pm
new file mode 100644
index 0000000..3083379
--- /dev/null
+++ b/lib/HTML/PullParser.pm
@@ -0,0 +1,209 @@
+package HTML::PullParser;
+
+require HTML::Parser;
+@ISA=qw(HTML::Parser);
+$VERSION = "3.57";
+
+use strict;
+use Carp ();
+
+sub new
+{
+    my($class, %cnf) = @_;
+
+    # Construct argspecs for the various events
+    my %argspec;
+    for (qw(start end text declaration comment process default)) {
+	my $tmp = delete $cnf{$_};
+	next unless defined $tmp;
+	$argspec{$_} = $tmp;
+    }
+    Carp::croak("Info not collected for any events")
+	  unless %argspec;
+
+    my $file = delete $cnf{file};
+    my $doc  = delete $cnf{doc};
+    Carp::croak("Can't parse from both 'doc' and 'file' at the same time")
+	  if defined($file) && defined($doc);
+    Carp::croak("No 'doc' or 'file' given to parse from")
+	  unless defined($file) || defined($doc);
+
+    # Create object
+    $cnf{api_version} = 3;
+    my $self = $class->SUPER::new(%cnf);
+
+    my $accum = $self->{pullparser_accum} = [];
+    while (my($event, $argspec) = each %argspec) {
+	$self->SUPER::handler($event => $accum, $argspec);
+    }
+
+    if (defined $doc) {
+	$self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc;
+	$self->{pullparser_str_pos} = 0;
+    }
+    else {
+	if (!ref($file) && ref(\$file) ne "GLOB") {
+	    require IO::File;
+	    $file = IO::File->new($file, "r") || return;
+	}
+
+	$self->{pullparser_file} = $file;
+    }
+    $self;
+}
+
+
+sub handler
+{
+    Carp::croak("Can't set handlers for HTML::PullParser");
+}
+
+
+sub get_token
+{
+    my $self = shift;
+    while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) {
+	if (my $f = $self->{pullparser_file}) {
+	    # must try to parse more from the file
+	    my $buf;
+	    if (read($f, $buf, 512)) {
+		$self->parse($buf);
+	    } else {
+		$self->eof;
+		$self->{pullparser_eof}++;
+		delete $self->{pullparser_file};
+	    }
+	}
+	elsif (my $sref = $self->{pullparser_str_ref}) {
+	    # must try to parse more from the scalar
+	    my $pos = $self->{pullparser_str_pos};
+	    my $chunk = substr($$sref, $pos, 512);
+	    $self->parse($chunk);
+	    $pos += length($chunk);
+	    if ($pos < length($$sref)) {
+		$self->{pullparser_str_pos} = $pos;
+	    }
+	    else {
+		$self->eof;
+		$self->{pullparser_eof}++;
+		delete $self->{pullparser_str_ref};
+		delete $self->{pullparser_str_pos};
+	    }
+	}
+	else {
+	    die;
+	}
+    }
+    shift @{$self->{pullparser_accum}};
+}
+
+
+sub unget_token
+{
+    my $self = shift;
+    unshift @{$self->{pullparser_accum}}, @_;
+    $self;
+}
+
+1;
+
+
+__END__
+
+=head1 NAME
+
+HTML::PullParser - Alternative HTML::Parser interface
+
+=head1 SYNOPSIS
+
+ use HTML::PullParser;
+
+ $p = HTML::PullParser->new(file => "index.html",
+                            start => 'event, tagname, @attr',
+                            end   => 'event, tagname',
+                            ignore_elements => [qw(script style)],
+                           ) || die "Can't open: $!";
+ while (my $token = $p->get_token) {
+     #...do something with $token
+ }
+
+=head1 DESCRIPTION
+
+The HTML::PullParser is an alternative interface to the HTML::Parser class.
+It basically turns the HTML::Parser inside out.  You associate a file
+(or any IO::Handle object or string) with the parser at construction time and
+then repeatedly call $parser->get_token to obtain the tags and text
+found in the parsed document.
+
+The following methods are provided:
+
+=over 4
+
+=item $p = HTML::PullParser->new( file => $file, %options )
+
+=item $p = HTML::PullParser->new( doc => \$doc, %options )
+
+A C<HTML::PullParser> can be made to parse from either a file or a
+literal document based on whether the C<file> or C<doc> option is
+passed to the parser's constructor.
+
+The C<file> passed in can either be a file name or a file handle
+object.  If a file name is passed, and it can't be opened for reading,
+then the constructor will return an undefined value and $!  will tell
+you why it failed.  Otherwise the argument is taken to be some object
+that the C<HTML::PullParser> can read() from when it needs more data.
+The stream will be read() until EOF, but not closed.
+
+A C<doc> can be passed plain or as a reference
+to a scalar.  If a reference is passed then the value of this scalar
+should not be changed before all tokens have been extracted.
+
+Next the information to be returned for the different token types must
+be set up.  This is done by simply associating an argspec (as defined
+in L<HTML::Parser>) with the events you have an interest in.  For
+instance, if you want C<start> tokens to be reported as the string
+C<'S'> followed by the tagname and the attributes you might pass an
+C<start>-option like this:
+
+   $p = HTML::PullParser->new(
+          doc   => $document_to_parse,
+          start => '"S", tagname, @attr',
+          end   => '"E", tagname',
+        );
+
+At last other C<HTML::Parser> options, like C<ignore_tags>, and
+C<unbroken_text>, can be passed in.  Note that you should not use the
+I<event>_h options to set up parser handlers.  That would confuse the
+inner logic of C<HTML::PullParser>.
+
+=item $token = $p->get_token
+
+This method will return the next I<token> found in the HTML document,
+or C<undef> at the end of the document.  The token is returned as an
+array reference.  The content of this array match the argspec set up
+during C<HTML::PullParser> construction.
+
+=item $p->unget_token( @tokens )
+
+If you find out you have read too many tokens you can push them back,
+so that they are returned again the next time $p->get_token is called.
+
+=back
+
+=head1 EXAMPLES
+
+The 'eg/hform' script shows how we might parse the form section of
+HTML::Documents using HTML::PullParser.
+
+=head1 SEE ALSO
+
+L<HTML::Parser>, L<HTML::TokeParser>
+
+=head1 COPYRIGHT
+
+Copyright 1998-2001 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
diff --git a/lib/HTML/TokeParser.pm b/lib/HTML/TokeParser.pm
new file mode 100644
index 0000000..959b96f
--- /dev/null
+++ b/lib/HTML/TokeParser.pm
@@ -0,0 +1,371 @@
+package HTML::TokeParser;
+
+require HTML::PullParser;
+@ISA=qw(HTML::PullParser);
+$VERSION = "3.69";
+
+use strict;
+use Carp ();
+use HTML::Entities qw(decode_entities);
+use HTML::Tagset ();
+
+my %ARGS =
+(
+ start       => "'S',tagname,attr,attrseq,text",
+ end         => "'E',tagname,text",
+ text        => "'T',text,is_cdata",
+ process     => "'PI',token0,text",
+ comment     => "'C',text",
+ declaration => "'D',text",
+
+ # options that default on
+ unbroken_text => 1,
+);
+
+
+sub new
+{
+    my $class = shift;
+    my %cnf;
+
+    if (@_ == 1) {
+	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
+	%cnf = ($type => $_[0]);
+    }
+    else {
+	unshift @_, (ref($_[0]) eq "SCALAR") ? "doc" : "file" if(scalar(@_) % 2 == 1);
+	%cnf = @_;
+    }
+
+    my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
+
+    my $self = $class->SUPER::new(%ARGS, %cnf) || return undef;
+
+    $self->{textify} = $textify;
+    $self;
+}
+
+
+sub get_tag
+{
+    my $self = shift;
+    my $token;
+    while (1) {
+	$token = $self->get_token || return undef;
+	my $type = shift @$token;
+	next unless $type eq "S" || $type eq "E";
+	substr($token->[0], 0, 0) = "/" if $type eq "E";
+	return $token unless @_;
+	for (@_) {
+	    return $token if $token->[0] eq $_;
+	}
+    }
+}
+
+
+sub _textify {
+    my($self, $token) = @_;
+    my $tag = $token->[1];
+    return undef unless exists $self->{textify}{$tag};
+
+    my $alt = $self->{textify}{$tag};
+    my $text;
+    if (ref($alt)) {
+	$text = &$alt(@$token);
+    } else {
+	$text = $token->[2]{$alt || "alt"};
+	$text = "[\U$tag]" unless defined $text;
+    }
+    return $text;
+}
+
+
+sub get_text
+{
+    my $self = shift;
+    my @text;
+    while (my $token = $self->get_token) {
+	my $type = $token->[0];
+	if ($type eq "T") {
+	    my $text = $token->[1];
+	    decode_entities($text) unless $token->[2];
+	    push(@text, $text);
+	} elsif ($type =~ /^[SE]$/) {
+	    my $tag = $token->[1];
+	    if ($type eq "S") {
+		if (defined(my $text = _textify($self, $token))) {
+		    push(@text, $text);
+		    next;
+		}
+	    } else {
+		$tag = "/$tag";
+	    }
+	    if (!@_ || grep $_ eq $tag, @_) {
+		 $self->unget_token($token);
+		 last;
+	    }
+	    push(@text, " ")
+		if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]};
+	}
+    }
+    join("", @text);
+}
+
+
+sub get_trimmed_text
+{
+    my $self = shift;
+    my $text = $self->get_text(@_);
+    $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
+    $text;
+}
+
+sub get_phrase {
+    my $self = shift;
+    my @text;
+    while (my $token = $self->get_token) {
+	my $type = $token->[0];
+	if ($type eq "T") {
+	    my $text = $token->[1];
+	    decode_entities($text) unless $token->[2];
+	    push(@text, $text);
+	} elsif ($type =~ /^[SE]$/) {
+	    my $tag = $token->[1];
+	    if ($type eq "S") {
+		if (defined(my $text = _textify($self, $token))) {
+		    push(@text, $text);
+		    next;
+		}
+	    }
+	    if (!$HTML::Tagset::isPhraseMarkup{$tag}) {
+		$self->unget_token($token);
+		last;
+	    }
+	    push(@text, " ") if $tag eq "br";
+	}
+    }
+    my $text = join("", @text);
+    $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
+    $text;
+}
+
+1;
+
+
+__END__
+
+=head1 NAME
+
+HTML::TokeParser - Alternative HTML::Parser interface
+
+=head1 SYNOPSIS
+
+ require HTML::TokeParser;
+ $p = HTML::TokeParser->new("index.html") ||
+      die "Can't open: $!";
+ $p->empty_element_tags(1);  # configure its behaviour
+
+ while (my $token = $p->get_token) {
+     #...
+ }
+
+=head1 DESCRIPTION
+
+The C<HTML::TokeParser> is an alternative interface to the
+C<HTML::Parser> class.  It is an C<HTML::PullParser> subclass with a
+predeclared set of token types.  If you wish the tokens to be reported
+differently you probably want to use the C<HTML::PullParser> directly.
+
+The following methods are available:
+
+=over 4
+
+=item $p = HTML::TokeParser->new( $filename, %opt );
+
+=item $p = HTML::TokeParser->new( $filehandle, %opt );
+
+=item $p = HTML::TokeParser->new( \$document, %opt );
+
+The object constructor argument is either a file name, a file handle
+object, or the complete document to be parsed.  Extra options can be
+provided as key/value pairs and are processed as documented by the base
+classes.
+
+If the argument is a plain scalar, then it is taken as the name of a
+file to be opened and parsed.  If the file can't be opened for
+reading, then the constructor will return C<undef> and $! will tell
+you why it failed.
+
+If the argument is a reference to a plain scalar, then this scalar is
+taken to be the literal document to parse.  The value of this
+scalar should not be changed before all tokens have been extracted.
+
+Otherwise the argument is taken to be some object that the
+C<HTML::TokeParser> can read() from when it needs more data.  Typically
+it will be a filehandle of some kind.  The stream will be read() until
+EOF, but not closed.
+
+A newly constructed C<HTML::TokeParser> differ from its base classes
+by having the C<unbroken_text> attribute enabled by default. See
+L<HTML::Parser> for a description of this and other attributes that
+influence how the document is parsed. It is often a good idea to enable
+C<empty_element_tags> behaviour.
+
+Note that the parsing result will likely not be valid if raw undecoded
+UTF-8 is used as a source.  When parsing UTF-8 encoded files turn
+on UTF-8 decoding:
+
+   open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!";
+   my $p = HTML::TokeParser->new( $fh );
+   # ...
+
+If a $filename is passed to the constructor the file will be opened in
+raw mode and the parsing result will only be valid if its content is
+Latin-1 or pure ASCII.
+
+If parsing from an UTF-8 encoded string buffer decode it first:
+
+   utf8::decode($document);
+   my $p = HTML::TokeParser->new( \$document );
+   # ...
+
+=item $p->get_token
+
+This method will return the next I<token> found in the HTML document,
+or C<undef> at the end of the document.  The token is returned as an
+array reference.  The first element of the array will be a string
+denoting the type of this token: "S" for start tag, "E" for end tag,
+"T" for text, "C" for comment, "D" for declaration, and "PI" for
+process instructions.  The rest of the token array depend on the type
+like this:
+
+  ["S",  $tag, $attr, $attrseq, $text]
+  ["E",  $tag, $text]
+  ["T",  $text, $is_data]
+  ["C",  $text]
+  ["D",  $text]
+  ["PI", $token0, $text]
+
+where $attr is a hash reference, $attrseq is an array reference and
+the rest are plain scalars.  The L<HTML::Parser/Argspec> explains the
+details.
+
+=item $p->unget_token( @tokens )
+
+If you find you have read too many tokens you can push them back,
+so that they are returned the next time $p->get_token is called.
+
+=item $p->get_tag
+
+=item $p->get_tag( @tags )
+
+This method returns the next start or end tag (skipping any other
+tokens), or C<undef> if there are no more tags in the document.  If
+one or more arguments are given, then we skip tokens until one of the
+specified tag types is found.  For example:
+
+   $p->get_tag("font", "/font");
+
+will find the next start or end tag for a font-element.
+
+The tag information is returned as an array reference in the same form
+as for $p->get_token above, but the type code (first element) is
+missing. A start tag will be returned like this:
+
+  [$tag, $attr, $attrseq, $text]
+
+The tagname of end tags are prefixed with "/", i.e. end tag is
+returned like this:
+
+  ["/$tag", $text]
+
+=item $p->get_text
+
+=item $p->get_text( @endtags )
+
+This method returns all text found at the current position. It will
+return a zero length string if the next token is not text. Any
+entities will be converted to their corresponding character.
+
+If one or more arguments are given, then we return all text occurring
+before the first of the specified tags found. For example:
+
+   $p->get_text("p", "br");
+
+will return the text up to either a paragraph of linebreak element.
+
+The text might span tags that should be I<textified>.  This is
+controlled by the $p->{textify} attribute, which is a hash that
+defines how certain tags can be treated as text.  If the name of a
+start tag matches a key in this hash then this tag is converted to
+text.  The hash value is used to specify which tag attribute to obtain
+the text from.  If this tag attribute is missing, then the upper case
+name of the tag enclosed in brackets is returned, e.g. "[IMG]".  The
+hash value can also be a subroutine reference.  In this case the
+routine is called with the start tag token content as its argument and
+the return value is treated as the text.
+
+The default $p->{textify} value is:
+
+  {img => "alt", applet => "alt"}
+
+This means that <IMG> and <APPLET> tags are treated as text, and that
+the text to substitute can be found in the ALT attribute.
+
+=item $p->get_trimmed_text
+
+=item $p->get_trimmed_text( @endtags )
+
+Same as $p->get_text above, but will collapse any sequences of white
+space to a single space character.  Leading and trailing white space is
+removed.
+
+=item $p->get_phrase
+
+This will return all text found at the current position ignoring any
+phrasal-level tags.  Text is extracted until the first non
+phrasal-level tag.  Textification of tags is the same as for
+get_text().  This method will collapse white space in the same way as
+get_trimmed_text() does.
+
+The definition of <i>phrasal-level tags</i> is obtained from the
+HTML::Tagset module.
+
+=back
+
+=head1 EXAMPLES
+
+This example extracts all links from a document.  It will print one
+line for each link, containing the URL and the textual description
+between the <A>...</A> tags:
+
+  use HTML::TokeParser;
+  $p = HTML::TokeParser->new(shift||"index.html");
+
+  while (my $token = $p->get_tag("a")) {
+      my $url = $token->[1]{href} || "-";
+      my $text = $p->get_trimmed_text("/a");
+      print "$url\t$text\n";
+  }
+
+This example extract the <TITLE> from the document:
+
+  use HTML::TokeParser;
+  $p = HTML::TokeParser->new(shift||"index.html");
+  if ($p->get_tag("title")) {
+      my $title = $p->get_trimmed_text;
+      print "Title: $title\n";
+  }
+
+=head1 SEE ALSO
+
+L<HTML::PullParser>, L<HTML::Parser>
+
+=head1 COPYRIGHT
+
+Copyright 1998-2005 Gisle Aas.
+
+This library is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
author	Lorry Tar Creator <lorry-tar-importer@lorry>	2013-05-08 22:21:52 +0000
committer	Lorry Tar Creator <lorry-tar-importer@lorry>	2013-05-08 22:21:52 +0000
commit	2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree	4734ccd522c71dd455879162006742002f8c1565 /lib
download	HTML-Parser-tarball-master.tar.gz