diff options
Diffstat (limited to 'lib')
-rw-r--r-- | lib/HTML/Entities.pm | 483 | ||||
-rw-r--r-- | lib/HTML/Filter.pm | 112 | ||||
-rw-r--r-- | lib/HTML/HeadParser.pm | 315 | ||||
-rw-r--r-- | lib/HTML/LinkExtor.pm | 185 | ||||
-rw-r--r-- | lib/HTML/PullParser.pm | 209 | ||||
-rw-r--r-- | lib/HTML/TokeParser.pm | 371 |
6 files changed, 1675 insertions, 0 deletions
diff --git a/lib/HTML/Entities.pm b/lib/HTML/Entities.pm new file mode 100644 index 0000000..ecd8e0d --- /dev/null +++ b/lib/HTML/Entities.pm @@ -0,0 +1,483 @@ +package HTML::Entities; + +=encoding utf8 + +=head1 NAME + +HTML::Entities - Encode or decode strings with HTML entities + +=head1 SYNOPSIS + + use HTML::Entities; + + $a = "Våre norske tegn bør æres"; + decode_entities($a); + encode_entities($a, "\200-\377"); + +For example, this: + + $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé"; + print encode_entities($input), "\n" + +Prints this out: + + vis-à-vis Beyoncé's naïve + papier-mâché résumé + +=head1 DESCRIPTION + +This module deals with encoding and decoding of strings with HTML +character entities. The module provides the following functions: + +=over 4 + +=item decode_entities( $string, ... ) + +This routine replaces HTML entities found in the $string with the +corresponding Unicode character. Unrecognized entities are left alone. + +If multiple strings are provided as argument they are each decoded +separately and the same number of strings are returned. + +If called in void context the arguments are decoded in-place. + +This routine is exported by default. + +=item _decode_entities( $string, \%entity2char ) + +=item _decode_entities( $string, \%entity2char, $expand_prefix ) + +This will in-place replace HTML entities in $string. The %entity2char +hash must be provided. Named entities not found in the %entity2char +hash are left alone. Numeric entities are expanded unless their value +overflow. + +The keys in %entity2char are the entity names to be expanded and their +values are what they should expand into. The values do not have to be +single character strings. If a key has ";" as suffix, +then occurrences in $string are only expanded if properly terminated +with ";". Entities without ";" will be expanded regardless of how +they are terminated for compatibility with how common browsers treat +entities in the Latin-1 range. + +If $expand_prefix is TRUE then entities without trailing ";" in +%entity2char will even be expanded as a prefix of a longer +unrecognized name. The longest matching name in %entity2char will be +used. This is mainly present for compatibility with an MSIE +misfeature. + + $string = "foo bar"; + _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1); + print $string; # will print "foo bar" + +This routine is exported by default. + +=item encode_entities( $string ) + +=item encode_entities( $string, $unsafe_chars ) + +This routine replaces unsafe characters in $string with their entity +representation. A second argument can be given to specify which characters to +consider unsafe. The unsafe characters is specified using the regular +expression character class syntax (what you find within brackets in regular +expressions). + +The default set of characters to encode are control chars, high-bit chars, and +the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this, +for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< " +>> characters: + + $encoded = encode_entities($input, '<>&"'); + +and this would only encode non-plain ascii: + + $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e'); + +This routine is exported by default. + +=item encode_entities_numeric( $string ) + +=item encode_entities_numeric( $string, $unsafe_chars ) + +This routine works just like encode_entities, except that the replacement +entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For +example, C<encode_entities("r\xF4le")> returns "rôle", but +C<encode_entities_numeric("r\xF4le")> returns "rôle". + +This routine is I<not> exported by default. But you can always +export it with C<use HTML::Entities qw(encode_entities_numeric);> +or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);> + +=back + +All these routines modify the string passed as the first argument, if +called in a void context. In scalar and array contexts, the encoded or +decoded string is returned (without changing the input string). + +If you prefer not to import these routines into your namespace, you can +call them as: + + use HTML::Entities (); + $decoded = HTML::Entities::decode($a); + $encoded = HTML::Entities::encode($a); + $encoded = HTML::Entities::encode_numeric($a); + +The module can also export the %char2entity and the %entity2char +hashes, which contain the mapping from all characters to the +corresponding entities (and vice versa, respectively). + +=head1 COPYRIGHT + +Copyright 1995-2006 Gisle Aas. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + +use strict; +use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); +use vars qw(%entity2char %char2entity); + +require 5.004; +require Exporter; +@ISA = qw(Exporter); + +@EXPORT = qw(encode_entities decode_entities _decode_entities); +@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); + +$VERSION = "3.69"; +sub Version { $VERSION; } + +require HTML::Parser; # for fast XS implemented decode_entities + + +%entity2char = ( + # Some normal chars that have special meaning in SGML context + amp => '&', # ampersand +'gt' => '>', # greater than +'lt' => '<', # less than + quot => '"', # double quote + apos => "'", # single quote + + # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML + AElig => chr(198), # capital AE diphthong (ligature) + Aacute => chr(193), # capital A, acute accent + Acirc => chr(194), # capital A, circumflex accent + Agrave => chr(192), # capital A, grave accent + Aring => chr(197), # capital A, ring + Atilde => chr(195), # capital A, tilde + Auml => chr(196), # capital A, dieresis or umlaut mark + Ccedil => chr(199), # capital C, cedilla + ETH => chr(208), # capital Eth, Icelandic + Eacute => chr(201), # capital E, acute accent + Ecirc => chr(202), # capital E, circumflex accent + Egrave => chr(200), # capital E, grave accent + Euml => chr(203), # capital E, dieresis or umlaut mark + Iacute => chr(205), # capital I, acute accent + Icirc => chr(206), # capital I, circumflex accent + Igrave => chr(204), # capital I, grave accent + Iuml => chr(207), # capital I, dieresis or umlaut mark + Ntilde => chr(209), # capital N, tilde + Oacute => chr(211), # capital O, acute accent + Ocirc => chr(212), # capital O, circumflex accent + Ograve => chr(210), # capital O, grave accent + Oslash => chr(216), # capital O, slash + Otilde => chr(213), # capital O, tilde + Ouml => chr(214), # capital O, dieresis or umlaut mark + THORN => chr(222), # capital THORN, Icelandic + Uacute => chr(218), # capital U, acute accent + Ucirc => chr(219), # capital U, circumflex accent + Ugrave => chr(217), # capital U, grave accent + Uuml => chr(220), # capital U, dieresis or umlaut mark + Yacute => chr(221), # capital Y, acute accent + aacute => chr(225), # small a, acute accent + acirc => chr(226), # small a, circumflex accent + aelig => chr(230), # small ae diphthong (ligature) + agrave => chr(224), # small a, grave accent + aring => chr(229), # small a, ring + atilde => chr(227), # small a, tilde + auml => chr(228), # small a, dieresis or umlaut mark + ccedil => chr(231), # small c, cedilla + eacute => chr(233), # small e, acute accent + ecirc => chr(234), # small e, circumflex accent + egrave => chr(232), # small e, grave accent + eth => chr(240), # small eth, Icelandic + euml => chr(235), # small e, dieresis or umlaut mark + iacute => chr(237), # small i, acute accent + icirc => chr(238), # small i, circumflex accent + igrave => chr(236), # small i, grave accent + iuml => chr(239), # small i, dieresis or umlaut mark + ntilde => chr(241), # small n, tilde + oacute => chr(243), # small o, acute accent + ocirc => chr(244), # small o, circumflex accent + ograve => chr(242), # small o, grave accent + oslash => chr(248), # small o, slash + otilde => chr(245), # small o, tilde + ouml => chr(246), # small o, dieresis or umlaut mark + szlig => chr(223), # small sharp s, German (sz ligature) + thorn => chr(254), # small thorn, Icelandic + uacute => chr(250), # small u, acute accent + ucirc => chr(251), # small u, circumflex accent + ugrave => chr(249), # small u, grave accent + uuml => chr(252), # small u, dieresis or umlaut mark + yacute => chr(253), # small y, acute accent + yuml => chr(255), # small y, dieresis or umlaut mark + + # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) + copy => chr(169), # copyright sign + reg => chr(174), # registered sign + nbsp => chr(160), # non breaking space + + # Additional ISO-8859/1 entities listed in rfc1866 (section 14) + iexcl => chr(161), + cent => chr(162), + pound => chr(163), + curren => chr(164), + yen => chr(165), + brvbar => chr(166), + sect => chr(167), + uml => chr(168), + ordf => chr(170), + laquo => chr(171), +'not' => chr(172), # not is a keyword in perl + shy => chr(173), + macr => chr(175), + deg => chr(176), + plusmn => chr(177), + sup1 => chr(185), + sup2 => chr(178), + sup3 => chr(179), + acute => chr(180), + micro => chr(181), + para => chr(182), + middot => chr(183), + cedil => chr(184), + ordm => chr(186), + raquo => chr(187), + frac14 => chr(188), + frac12 => chr(189), + frac34 => chr(190), + iquest => chr(191), +'times' => chr(215), # times is a keyword in perl + divide => chr(247), + + ( $] > 5.007 ? ( + 'OElig;' => chr(338), + 'oelig;' => chr(339), + 'Scaron;' => chr(352), + 'scaron;' => chr(353), + 'Yuml;' => chr(376), + 'fnof;' => chr(402), + 'circ;' => chr(710), + 'tilde;' => chr(732), + 'Alpha;' => chr(913), + 'Beta;' => chr(914), + 'Gamma;' => chr(915), + 'Delta;' => chr(916), + 'Epsilon;' => chr(917), + 'Zeta;' => chr(918), + 'Eta;' => chr(919), + 'Theta;' => chr(920), + 'Iota;' => chr(921), + 'Kappa;' => chr(922), + 'Lambda;' => chr(923), + 'Mu;' => chr(924), + 'Nu;' => chr(925), + 'Xi;' => chr(926), + 'Omicron;' => chr(927), + 'Pi;' => chr(928), + 'Rho;' => chr(929), + 'Sigma;' => chr(931), + 'Tau;' => chr(932), + 'Upsilon;' => chr(933), + 'Phi;' => chr(934), + 'Chi;' => chr(935), + 'Psi;' => chr(936), + 'Omega;' => chr(937), + 'alpha;' => chr(945), + 'beta;' => chr(946), + 'gamma;' => chr(947), + 'delta;' => chr(948), + 'epsilon;' => chr(949), + 'zeta;' => chr(950), + 'eta;' => chr(951), + 'theta;' => chr(952), + 'iota;' => chr(953), + 'kappa;' => chr(954), + 'lambda;' => chr(955), + 'mu;' => chr(956), + 'nu;' => chr(957), + 'xi;' => chr(958), + 'omicron;' => chr(959), + 'pi;' => chr(960), + 'rho;' => chr(961), + 'sigmaf;' => chr(962), + 'sigma;' => chr(963), + 'tau;' => chr(964), + 'upsilon;' => chr(965), + 'phi;' => chr(966), + 'chi;' => chr(967), + 'psi;' => chr(968), + 'omega;' => chr(969), + 'thetasym;' => chr(977), + 'upsih;' => chr(978), + 'piv;' => chr(982), + 'ensp;' => chr(8194), + 'emsp;' => chr(8195), + 'thinsp;' => chr(8201), + 'zwnj;' => chr(8204), + 'zwj;' => chr(8205), + 'lrm;' => chr(8206), + 'rlm;' => chr(8207), + 'ndash;' => chr(8211), + 'mdash;' => chr(8212), + 'lsquo;' => chr(8216), + 'rsquo;' => chr(8217), + 'sbquo;' => chr(8218), + 'ldquo;' => chr(8220), + 'rdquo;' => chr(8221), + 'bdquo;' => chr(8222), + 'dagger;' => chr(8224), + 'Dagger;' => chr(8225), + 'bull;' => chr(8226), + 'hellip;' => chr(8230), + 'permil;' => chr(8240), + 'prime;' => chr(8242), + 'Prime;' => chr(8243), + 'lsaquo;' => chr(8249), + 'rsaquo;' => chr(8250), + 'oline;' => chr(8254), + 'frasl;' => chr(8260), + 'euro;' => chr(8364), + 'image;' => chr(8465), + 'weierp;' => chr(8472), + 'real;' => chr(8476), + 'trade;' => chr(8482), + 'alefsym;' => chr(8501), + 'larr;' => chr(8592), + 'uarr;' => chr(8593), + 'rarr;' => chr(8594), + 'darr;' => chr(8595), + 'harr;' => chr(8596), + 'crarr;' => chr(8629), + 'lArr;' => chr(8656), + 'uArr;' => chr(8657), + 'rArr;' => chr(8658), + 'dArr;' => chr(8659), + 'hArr;' => chr(8660), + 'forall;' => chr(8704), + 'part;' => chr(8706), + 'exist;' => chr(8707), + 'empty;' => chr(8709), + 'nabla;' => chr(8711), + 'isin;' => chr(8712), + 'notin;' => chr(8713), + 'ni;' => chr(8715), + 'prod;' => chr(8719), + 'sum;' => chr(8721), + 'minus;' => chr(8722), + 'lowast;' => chr(8727), + 'radic;' => chr(8730), + 'prop;' => chr(8733), + 'infin;' => chr(8734), + 'ang;' => chr(8736), + 'and;' => chr(8743), + 'or;' => chr(8744), + 'cap;' => chr(8745), + 'cup;' => chr(8746), + 'int;' => chr(8747), + 'there4;' => chr(8756), + 'sim;' => chr(8764), + 'cong;' => chr(8773), + 'asymp;' => chr(8776), + 'ne;' => chr(8800), + 'equiv;' => chr(8801), + 'le;' => chr(8804), + 'ge;' => chr(8805), + 'sub;' => chr(8834), + 'sup;' => chr(8835), + 'nsub;' => chr(8836), + 'sube;' => chr(8838), + 'supe;' => chr(8839), + 'oplus;' => chr(8853), + 'otimes;' => chr(8855), + 'perp;' => chr(8869), + 'sdot;' => chr(8901), + 'lceil;' => chr(8968), + 'rceil;' => chr(8969), + 'lfloor;' => chr(8970), + 'rfloor;' => chr(8971), + 'lang;' => chr(9001), + 'rang;' => chr(9002), + 'loz;' => chr(9674), + 'spades;' => chr(9824), + 'clubs;' => chr(9827), + 'hearts;' => chr(9829), + 'diams;' => chr(9830), + ) : ()) +); + + +# Make the opposite mapping +while (my($entity, $char) = each(%entity2char)) { + $entity =~ s/;\z//; + $char2entity{$char} = "&$entity;"; +} +delete $char2entity{"'"}; # only one-way decoding + +# Fill in missing entities +for (0 .. 255) { + next if exists $char2entity{chr($_)}; + $char2entity{chr($_)} = "&#$_;"; +} + +my %subst; # compiled encoding regexps + +sub encode_entities +{ + return undef unless defined $_[0]; + my $ref; + if (defined wantarray) { + my $x = $_[0]; + $ref = \$x; # copy + } else { + $ref = \$_[0]; # modify in-place + } + if (defined $_[1] and length $_[1]) { + unless (exists $subst{$_[1]}) { + # Because we can't compile regex we fake it with a cached sub + my $chars = $_[1]; + $chars =~ s,(?<!\\)([]/]),\\$1,g; + $chars =~ s,(?<!\\)\\\z,\\\\,; + my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }"; + $subst{$_[1]} = eval $code; + die( $@ . " while trying to turn range: \"$_[1]\"\n " + . "into code: $code\n " + ) if $@; + } + &{$subst{$_[1]}}($$ref); + } else { + # Encode control chars, high bit chars and '<', '&', '>', ''' and '"' + $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge; + } + $$ref; +} + +sub encode_entities_numeric { + local %char2entity; + return &encode_entities; # a goto &encode_entities wouldn't work +} + + +sub num_entity { + sprintf "&#x%X;", ord($_[0]); +} + +# Set up aliases +*encode = \&encode_entities; +*encode_numeric = \&encode_entities_numeric; +*encode_numerically = \&encode_entities_numeric; +*decode = \&decode_entities; + +1; diff --git a/lib/HTML/Filter.pm b/lib/HTML/Filter.pm new file mode 100644 index 0000000..c5aa16e --- /dev/null +++ b/lib/HTML/Filter.pm @@ -0,0 +1,112 @@ +package HTML::Filter; + +use strict; +use vars qw(@ISA $VERSION); + +require HTML::Parser; +@ISA=qw(HTML::Parser); + +$VERSION = "3.57"; + +sub declaration { $_[0]->output("<!$_[1]>") } +sub process { $_[0]->output($_[2]) } +sub comment { $_[0]->output("<!--$_[1]-->") } +sub start { $_[0]->output($_[4]) } +sub end { $_[0]->output($_[2]) } +sub text { $_[0]->output($_[1]) } + +sub output { print $_[1] } + +1; + +__END__ + +=head1 NAME + +HTML::Filter - Filter HTML text through the parser + +=head1 NOTE + +B<This module is deprecated.> The C<HTML::Parser> now provides the +functionally of C<HTML::Filter> much more efficiently with the the +C<default> handler. + +=head1 SYNOPSIS + + require HTML::Filter; + $p = HTML::Filter->new->parse_file("index.html"); + +=head1 DESCRIPTION + +C<HTML::Filter> is an HTML parser that by default prints the +original text of each HTML element (a slow version of cat(1) basically). +The callback methods may be overridden to modify the filtering for some +HTML elements and you can override output() method which is called to +print the HTML text. + +C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that +the document should be given to the parser by calling the $p->parse() +or $p->parse_file() methods. + +=head1 EXAMPLES + +The first example is a filter that will remove all comments from an +HTML file. This is achieved by simply overriding the comment method +to do nothing. + + package CommentStripper; + require HTML::Filter; + @ISA=qw(HTML::Filter); + sub comment { } # ignore comments + +The second example shows a filter that will remove any E<lt>TABLE>s +found in the HTML file. We specialize the start() and end() methods +to count table tags and then make output not happen when inside a +table. + + package TableStripper; + require HTML::Filter; + @ISA=qw(HTML::Filter); + sub start + { + my $self = shift; + $self->{table_seen}++ if $_[0] eq "table"; + $self->SUPER::start(@_); + } + + sub end + { + my $self = shift; + $self->SUPER::end(@_); + $self->{table_seen}-- if $_[0] eq "table"; + } + + sub output + { + my $self = shift; + unless ($self->{table_seen}) { + $self->SUPER::output(@_); + } + } + +If you want to collect the parsed text internally you might want to do +something like this: + + package FilterIntoString; + require HTML::Filter; + @ISA=qw(HTML::Filter); + sub output { push(@{$_[0]->{fhtml}}, $_[1]) } + sub filtered_html { join("", @{$_[0]->{fhtml}}) } + +=head1 SEE ALSO + +L<HTML::Parser> + +=head1 COPYRIGHT + +Copyright 1997-1999 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut diff --git a/lib/HTML/HeadParser.pm b/lib/HTML/HeadParser.pm new file mode 100644 index 0000000..28e9cac --- /dev/null +++ b/lib/HTML/HeadParser.pm @@ -0,0 +1,315 @@ +package HTML::HeadParser; + +=head1 NAME + +HTML::HeadParser - Parse <HEAD> section of a HTML document + +=head1 SYNOPSIS + + require HTML::HeadParser; + $p = HTML::HeadParser->new; + $p->parse($text) and print "not finished"; + + $p->header('Title') # to access <title>....</title> + $p->header('Content-Base') # to access <base href="http://..."> + $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> + $p->header('X-Meta-Author') # to access <meta name="author" content="..."> + $p->header('X-Meta-Charset') # to access <meta charset="..."> + +=head1 DESCRIPTION + +The C<HTML::HeadParser> is a specialized (and lightweight) +C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> +section of an HTML document. The parse() method +will return a FALSE value as soon as some E<lt>BODY> element or body +text are found, and should not be called again after this. + +Note that the C<HTML::HeadParser> might get confused if raw undecoded +UTF-8 is passed to the parse() method. Make sure the strings are +properly decoded before passing them on. + +The C<HTML::HeadParser> keeps a reference to a header object, and the +parser will update this header object as the various elements of the +E<lt>HEAD> section of the HTML document are recognized. The following +header fields are affected: + +=over 4 + +=item Content-Base: + +The I<Content-Base> header is initialized from the E<lt>base +href="..."> element. + +=item Title: + +The I<Title> header is initialized from the E<lt>title>...E<lt>/title> +element. + +=item Isindex: + +The I<Isindex> header will be added if there is a E<lt>isindex> +element in the E<lt>head>. The header value is initialized from the +I<prompt> attribute if it is present. If no I<prompt> attribute is +given it will have '?' as the value. + +=item X-Meta-Foo: + +All E<lt>meta> elements containing a C<name> attribute will result in +headers using the prefix C<X-Meta-> appended with the value of the +C<name> attribute as the name of the header, and the value of the +C<content> attribute as the pushed header value. + +E<lt>meta> elements containing a C<http-equiv> attribute will result +in headers as in above, but without the C<X-Meta-> prefix in the +header name. + +E<lt>meta> elements containing a C<charset> attribute will result in +an C<X-Meta-Charset> header, using the value of the C<charset> +attribute as the pushed header value. + +The ':' character can't be represented in header field names, so +if the meta element contains this char it's substituted with '-' +before forming the field name. + +=back + +=head1 METHODS + +The following methods (in addition to those provided by the +superclass) are available: + +=over 4 + +=cut + + +require HTML::Parser; +@ISA = qw(HTML::Parser); + +use HTML::Entities (); + +use strict; +use vars qw($VERSION $DEBUG); +#$DEBUG = 1; +$VERSION = "3.71"; + +=item $hp = HTML::HeadParser->new + +=item $hp = HTML::HeadParser->new( $header ) + +The object constructor. The optional $header argument should be a +reference to an object that implement the header() and push_header() +methods as defined by the C<HTTP::Headers> class. Normally it will be +of some class that is a or delegates to the C<HTTP::Headers> class. + +If no $header is given C<HTML::HeadParser> will create an +C<HTTP::Headers> object by itself (initially empty). + +=cut + +sub new +{ + my($class, $header) = @_; + unless ($header) { + require HTTP::Headers; + $header = HTTP::Headers->new; + } + + my $self = $class->SUPER::new(api_version => 3, + start_h => ["start", "self,tagname,attr"], + end_h => ["end", "self,tagname"], + text_h => ["text", "self,text"], + ignore_elements => [qw(script style)], + ); + $self->{'header'} = $header; + $self->{'tag'} = ''; # name of active element that takes textual content + $self->{'text'} = ''; # the accumulated text associated with the element + $self; +} + +=item $hp->header; + +Returns a reference to the header object. + +=item $hp->header( $key ) + +Returns a header value. It is just a shorter way to write +C<$hp-E<gt>header-E<gt>header($key)>. + +=cut + +sub header +{ + my $self = shift; + return $self->{'header'} unless @_; + $self->{'header'}->header(@_); +} + +sub as_string # legacy +{ + my $self = shift; + $self->{'header'}->as_string; +} + +sub flush_text # internal +{ + my $self = shift; + my $tag = $self->{'tag'}; + my $text = $self->{'text'}; + $text =~ s/^\s+//; + $text =~ s/\s+$//; + $text =~ s/\s+/ /g; + print "FLUSH $tag => '$text'\n" if $DEBUG; + if ($tag eq 'title') { + my $decoded; + $decoded = utf8::decode($text) if $self->utf8_mode && defined &utf8::decode; + HTML::Entities::decode($text); + utf8::encode($text) if $decoded; + $self->{'header'}->push_header(Title => $text); + } + $self->{'tag'} = $self->{'text'} = ''; +} + +# This is an quote from the HTML3.2 DTD which shows which elements +# that might be present in a <HEAD>...</HEAD>. Also note that the +# <HEAD> tags themselves might be missing: +# +# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? & +# SCRIPT* & META* & LINK*"> +# +# <!ELEMENT HEAD O O (%head.content)> +# +# From HTML 4.01: +# +# <!ENTITY % head.misc "SCRIPT|STYLE|META|LINK|OBJECT"> +# <!ENTITY % head.content "TITLE & BASE?"> +# <!ELEMENT HEAD O O (%head.content;) +(%head.misc;)> +# +# From HTML 5 as of WD-html5-20090825: +# +# One or more elements of metadata content, [...] +# => base, command, link, meta, noscript, script, style, title + +sub start +{ + my($self, $tag, $attr) = @_; # $attr is reference to a HASH + print "START[$tag]\n" if $DEBUG; + $self->flush_text if $self->{'tag'}; + if ($tag eq 'meta') { + my $key = $attr->{'http-equiv'}; + if (!defined($key) || !length($key)) { + if ($attr->{name}) { + $key = "X-Meta-\u$attr->{name}"; + } elsif ($attr->{charset}) { # HTML 5 <meta charset="..."> + $key = "X-Meta-Charset"; + $self->{header}->push_header($key => $attr->{charset}); + return; + } else { + return; + } + } + $key =~ s/:/-/g; + $self->{'header'}->push_header($key => $attr->{content}); + } elsif ($tag eq 'base') { + return unless exists $attr->{href}; + (my $base = $attr->{href}) =~ s/^\s+//; $base =~ s/\s+$//; # HTML5 + $self->{'header'}->push_header('Content-Base' => $base); + } elsif ($tag eq 'isindex') { + # This is a non-standard header. Perhaps we should just ignore + # this element + $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?'); + } elsif ($tag =~ /^(?:title|noscript|object|command)$/) { + # Just remember tag. Initialize header when we see the end tag. + $self->{'tag'} = $tag; + } elsif ($tag eq 'link') { + return unless exists $attr->{href}; + # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> + my $href = delete($attr->{href}); + $href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5 + my $h_val = "<$href>"; + for (sort keys %{$attr}) { + next if $_ eq "/"; # XHTML junk + $h_val .= qq(; $_="$attr->{$_}"); + } + $self->{'header'}->push_header(Link => $h_val); + } elsif ($tag eq 'head' || $tag eq 'html') { + # ignore + } else { + # stop parsing + $self->eof; + } +} + +sub end +{ + my($self, $tag) = @_; + print "END[$tag]\n" if $DEBUG; + $self->flush_text if $self->{'tag'}; + $self->eof if $tag eq 'head'; +} + +sub text +{ + my($self, $text) = @_; + print "TEXT[$text]\n" if $DEBUG; + unless ($self->{first_chunk}) { + # drop Unicode BOM if found + if ($self->utf8_mode) { + $text =~ s/^\xEF\xBB\xBF//; + } + else { + $text =~ s/^\x{FEFF}//; + } + $self->{first_chunk}++; + } + my $tag = $self->{tag}; + if (!$tag && $text =~ /\S/) { + # Normal text means start of body + $self->eof; + return; + } + return if $tag ne 'title'; + $self->{'text'} .= $text; +} + +BEGIN { + *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT; +} + +1; + +__END__ + +=back + +=head1 EXAMPLE + + $h = HTTP::Headers->new; + $p = HTML::HeadParser->new($h); + $p->parse(<<EOT); + <title>Stupid example</title> + <base href="http://www.linpro.no/lwp/"> + Normal text starts here. + EOT + undef $p; + print $h->title; # should print "Stupid example" + +=head1 SEE ALSO + +L<HTML::Parser>, L<HTTP::Headers> + +The C<HTTP::Headers> class is distributed as part of the +I<libwww-perl> package. If you don't have that distribution installed +you need to provide the $header argument to the C<HTML::HeadParser> +constructor with your own object that implements the documented +protocol. + +=head1 COPYRIGHT + +Copyright 1996-2001 Gisle Aas. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + diff --git a/lib/HTML/LinkExtor.pm b/lib/HTML/LinkExtor.pm new file mode 100644 index 0000000..c2f08c6 --- /dev/null +++ b/lib/HTML/LinkExtor.pm @@ -0,0 +1,185 @@ +package HTML::LinkExtor; + +require HTML::Parser; +@ISA = qw(HTML::Parser); +$VERSION = "3.69"; + +=head1 NAME + +HTML::LinkExtor - Extract links from an HTML document + +=head1 SYNOPSIS + + require HTML::LinkExtor; + $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/"); + sub cb { + my($tag, %links) = @_; + print "$tag @{[%links]}\n"; + } + $p->parse_file("index.html"); + +=head1 DESCRIPTION + +I<HTML::LinkExtor> is an HTML parser that extracts links from an +HTML document. The I<HTML::LinkExtor> is a subclass of +I<HTML::Parser>. This means that the document should be given to the +parser by calling the $p->parse() or $p->parse_file() methods. + +=cut + +use strict; +use HTML::Tagset (); + +# legacy (some applications grabs this hash directly) +use vars qw(%LINK_ELEMENT); +*LINK_ELEMENT = \%HTML::Tagset::linkElements; + +=over 4 + +=item $p = HTML::LinkExtor->new + +=item $p = HTML::LinkExtor->new( $callback ) + +=item $p = HTML::LinkExtor->new( $callback, $base ) + +The constructor takes two optional arguments. The first is a reference +to a callback routine. It will be called as links are found. If a +callback is not provided, then links are just accumulated internally +and can be retrieved by calling the $p->links() method. + +The $base argument is an optional base URL used to absolutize all URLs found. +You need to have the I<URI> module installed if you provide $base. + +The callback is called with the lowercase tag name as first argument, +and then all link attributes as separate key/value pairs. All +non-link attributes are removed. + +=cut + +sub new +{ + my($class, $cb, $base) = @_; + my $self = $class->SUPER::new( + start_h => ["_start_tag", "self,tagname,attr"], + report_tags => [keys %HTML::Tagset::linkElements], + ); + $self->{extractlink_cb} = $cb; + if ($base) { + require URI; + $self->{extractlink_base} = URI->new($base); + } + $self; +} + +sub _start_tag +{ + my($self, $tag, $attr) = @_; + + my $base = $self->{extractlink_base}; + my $links = $HTML::Tagset::linkElements{$tag}; + $links = [$links] unless ref $links; + + my @links; + my $a; + for $a (@$links) { + next unless exists $attr->{$a}; + (my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5 + push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link); + } + return unless @links; + $self->_found_link($tag, @links); +} + +sub _found_link +{ + my $self = shift; + my $cb = $self->{extractlink_cb}; + if ($cb) { + &$cb(@_); + } else { + push(@{$self->{'links'}}, [@_]); + } +} + +=item $p->links + +Returns a list of all links found in the document. The returned +values will be anonymous arrays with the following elements: + + [$tag, $attr => $url1, $attr2 => $url2,...] + +The $p->links method will also truncate the internal link list. This +means that if the method is called twice without any parsing +between them the second call will return an empty list. + +Also note that $p->links will always be empty if a callback routine +was provided when the I<HTML::LinkExtor> was created. + +=cut + +sub links +{ + my $self = shift; + exists($self->{'links'}) ? @{delete $self->{'links'}} : (); +} + +# We override the parse_file() method so that we can clear the links +# before we start a new file. +sub parse_file +{ + my $self = shift; + delete $self->{'links'}; + $self->SUPER::parse_file(@_); +} + +=back + +=head1 EXAMPLE + +This is an example showing how you can extract links from a document +received using LWP: + + use LWP::UserAgent; + use HTML::LinkExtor; + use URI::URL; + + $url = "http://www.perl.org/"; # for instance + $ua = LWP::UserAgent->new; + + # Set up a callback that collect image links + my @imgs = (); + sub callback { + my($tag, %attr) = @_; + return if $tag ne 'img'; # we only look closer at <img ...> + push(@imgs, values %attr); + } + + # Make the parser. Unfortunately, we don't know the base yet + # (it might be different from $url) + $p = HTML::LinkExtor->new(\&callback); + + # Request document and parse it as it arrives + $res = $ua->request(HTTP::Request->new(GET => $url), + sub {$p->parse($_[0])}); + + # Expand all image URLs to absolute ones + my $base = $res->base; + @imgs = map { $_ = url($_, $base)->abs; } @imgs; + + # Print them out + print join("\n", @imgs), "\n"; + +=head1 SEE ALSO + +L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL> + +=head1 COPYRIGHT + +Copyright 1996-2001 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + +1; diff --git a/lib/HTML/PullParser.pm b/lib/HTML/PullParser.pm new file mode 100644 index 0000000..3083379 --- /dev/null +++ b/lib/HTML/PullParser.pm @@ -0,0 +1,209 @@ +package HTML::PullParser; + +require HTML::Parser; +@ISA=qw(HTML::Parser); +$VERSION = "3.57"; + +use strict; +use Carp (); + +sub new +{ + my($class, %cnf) = @_; + + # Construct argspecs for the various events + my %argspec; + for (qw(start end text declaration comment process default)) { + my $tmp = delete $cnf{$_}; + next unless defined $tmp; + $argspec{$_} = $tmp; + } + Carp::croak("Info not collected for any events") + unless %argspec; + + my $file = delete $cnf{file}; + my $doc = delete $cnf{doc}; + Carp::croak("Can't parse from both 'doc' and 'file' at the same time") + if defined($file) && defined($doc); + Carp::croak("No 'doc' or 'file' given to parse from") + unless defined($file) || defined($doc); + + # Create object + $cnf{api_version} = 3; + my $self = $class->SUPER::new(%cnf); + + my $accum = $self->{pullparser_accum} = []; + while (my($event, $argspec) = each %argspec) { + $self->SUPER::handler($event => $accum, $argspec); + } + + if (defined $doc) { + $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc; + $self->{pullparser_str_pos} = 0; + } + else { + if (!ref($file) && ref(\$file) ne "GLOB") { + require IO::File; + $file = IO::File->new($file, "r") || return; + } + + $self->{pullparser_file} = $file; + } + $self; +} + + +sub handler +{ + Carp::croak("Can't set handlers for HTML::PullParser"); +} + + +sub get_token +{ + my $self = shift; + while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) { + if (my $f = $self->{pullparser_file}) { + # must try to parse more from the file + my $buf; + if (read($f, $buf, 512)) { + $self->parse($buf); + } else { + $self->eof; + $self->{pullparser_eof}++; + delete $self->{pullparser_file}; + } + } + elsif (my $sref = $self->{pullparser_str_ref}) { + # must try to parse more from the scalar + my $pos = $self->{pullparser_str_pos}; + my $chunk = substr($$sref, $pos, 512); + $self->parse($chunk); + $pos += length($chunk); + if ($pos < length($$sref)) { + $self->{pullparser_str_pos} = $pos; + } + else { + $self->eof; + $self->{pullparser_eof}++; + delete $self->{pullparser_str_ref}; + delete $self->{pullparser_str_pos}; + } + } + else { + die; + } + } + shift @{$self->{pullparser_accum}}; +} + + +sub unget_token +{ + my $self = shift; + unshift @{$self->{pullparser_accum}}, @_; + $self; +} + +1; + + +__END__ + +=head1 NAME + +HTML::PullParser - Alternative HTML::Parser interface + +=head1 SYNOPSIS + + use HTML::PullParser; + + $p = HTML::PullParser->new(file => "index.html", + start => 'event, tagname, @attr', + end => 'event, tagname', + ignore_elements => [qw(script style)], + ) || die "Can't open: $!"; + while (my $token = $p->get_token) { + #...do something with $token + } + +=head1 DESCRIPTION + +The HTML::PullParser is an alternative interface to the HTML::Parser class. +It basically turns the HTML::Parser inside out. You associate a file +(or any IO::Handle object or string) with the parser at construction time and +then repeatedly call $parser->get_token to obtain the tags and text +found in the parsed document. + +The following methods are provided: + +=over 4 + +=item $p = HTML::PullParser->new( file => $file, %options ) + +=item $p = HTML::PullParser->new( doc => \$doc, %options ) + +A C<HTML::PullParser> can be made to parse from either a file or a +literal document based on whether the C<file> or C<doc> option is +passed to the parser's constructor. + +The C<file> passed in can either be a file name or a file handle +object. If a file name is passed, and it can't be opened for reading, +then the constructor will return an undefined value and $! will tell +you why it failed. Otherwise the argument is taken to be some object +that the C<HTML::PullParser> can read() from when it needs more data. +The stream will be read() until EOF, but not closed. + +A C<doc> can be passed plain or as a reference +to a scalar. If a reference is passed then the value of this scalar +should not be changed before all tokens have been extracted. + +Next the information to be returned for the different token types must +be set up. This is done by simply associating an argspec (as defined +in L<HTML::Parser>) with the events you have an interest in. For +instance, if you want C<start> tokens to be reported as the string +C<'S'> followed by the tagname and the attributes you might pass an +C<start>-option like this: + + $p = HTML::PullParser->new( + doc => $document_to_parse, + start => '"S", tagname, @attr', + end => '"E", tagname', + ); + +At last other C<HTML::Parser> options, like C<ignore_tags>, and +C<unbroken_text>, can be passed in. Note that you should not use the +I<event>_h options to set up parser handlers. That would confuse the +inner logic of C<HTML::PullParser>. + +=item $token = $p->get_token + +This method will return the next I<token> found in the HTML document, +or C<undef> at the end of the document. The token is returned as an +array reference. The content of this array match the argspec set up +during C<HTML::PullParser> construction. + +=item $p->unget_token( @tokens ) + +If you find out you have read too many tokens you can push them back, +so that they are returned again the next time $p->get_token is called. + +=back + +=head1 EXAMPLES + +The 'eg/hform' script shows how we might parse the form section of +HTML::Documents using HTML::PullParser. + +=head1 SEE ALSO + +L<HTML::Parser>, L<HTML::TokeParser> + +=head1 COPYRIGHT + +Copyright 1998-2001 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut diff --git a/lib/HTML/TokeParser.pm b/lib/HTML/TokeParser.pm new file mode 100644 index 0000000..959b96f --- /dev/null +++ b/lib/HTML/TokeParser.pm @@ -0,0 +1,371 @@ +package HTML::TokeParser; + +require HTML::PullParser; +@ISA=qw(HTML::PullParser); +$VERSION = "3.69"; + +use strict; +use Carp (); +use HTML::Entities qw(decode_entities); +use HTML::Tagset (); + +my %ARGS = +( + start => "'S',tagname,attr,attrseq,text", + end => "'E',tagname,text", + text => "'T',text,is_cdata", + process => "'PI',token0,text", + comment => "'C',text", + declaration => "'D',text", + + # options that default on + unbroken_text => 1, +); + + +sub new +{ + my $class = shift; + my %cnf; + + if (@_ == 1) { + my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; + %cnf = ($type => $_[0]); + } + else { + unshift @_, (ref($_[0]) eq "SCALAR") ? "doc" : "file" if(scalar(@_) % 2 == 1); + %cnf = @_; + } + + my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; + + my $self = $class->SUPER::new(%ARGS, %cnf) || return undef; + + $self->{textify} = $textify; + $self; +} + + +sub get_tag +{ + my $self = shift; + my $token; + while (1) { + $token = $self->get_token || return undef; + my $type = shift @$token; + next unless $type eq "S" || $type eq "E"; + substr($token->[0], 0, 0) = "/" if $type eq "E"; + return $token unless @_; + for (@_) { + return $token if $token->[0] eq $_; + } + } +} + + +sub _textify { + my($self, $token) = @_; + my $tag = $token->[1]; + return undef unless exists $self->{textify}{$tag}; + + my $alt = $self->{textify}{$tag}; + my $text; + if (ref($alt)) { + $text = &$alt(@$token); + } else { + $text = $token->[2]{$alt || "alt"}; + $text = "[\U$tag]" unless defined $text; + } + return $text; +} + + +sub get_text +{ + my $self = shift; + my @text; + while (my $token = $self->get_token) { + my $type = $token->[0]; + if ($type eq "T") { + my $text = $token->[1]; + decode_entities($text) unless $token->[2]; + push(@text, $text); + } elsif ($type =~ /^[SE]$/) { + my $tag = $token->[1]; + if ($type eq "S") { + if (defined(my $text = _textify($self, $token))) { + push(@text, $text); + next; + } + } else { + $tag = "/$tag"; + } + if (!@_ || grep $_ eq $tag, @_) { + $self->unget_token($token); + last; + } + push(@text, " ") + if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]}; + } + } + join("", @text); +} + + +sub get_trimmed_text +{ + my $self = shift; + my $text = $self->get_text(@_); + $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; + $text; +} + +sub get_phrase { + my $self = shift; + my @text; + while (my $token = $self->get_token) { + my $type = $token->[0]; + if ($type eq "T") { + my $text = $token->[1]; + decode_entities($text) unless $token->[2]; + push(@text, $text); + } elsif ($type =~ /^[SE]$/) { + my $tag = $token->[1]; + if ($type eq "S") { + if (defined(my $text = _textify($self, $token))) { + push(@text, $text); + next; + } + } + if (!$HTML::Tagset::isPhraseMarkup{$tag}) { + $self->unget_token($token); + last; + } + push(@text, " ") if $tag eq "br"; + } + } + my $text = join("", @text); + $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; + $text; +} + +1; + + +__END__ + +=head1 NAME + +HTML::TokeParser - Alternative HTML::Parser interface + +=head1 SYNOPSIS + + require HTML::TokeParser; + $p = HTML::TokeParser->new("index.html") || + die "Can't open: $!"; + $p->empty_element_tags(1); # configure its behaviour + + while (my $token = $p->get_token) { + #... + } + +=head1 DESCRIPTION + +The C<HTML::TokeParser> is an alternative interface to the +C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a +predeclared set of token types. If you wish the tokens to be reported +differently you probably want to use the C<HTML::PullParser> directly. + +The following methods are available: + +=over 4 + +=item $p = HTML::TokeParser->new( $filename, %opt ); + +=item $p = HTML::TokeParser->new( $filehandle, %opt ); + +=item $p = HTML::TokeParser->new( \$document, %opt ); + +The object constructor argument is either a file name, a file handle +object, or the complete document to be parsed. Extra options can be +provided as key/value pairs and are processed as documented by the base +classes. + +If the argument is a plain scalar, then it is taken as the name of a +file to be opened and parsed. If the file can't be opened for +reading, then the constructor will return C<undef> and $! will tell +you why it failed. + +If the argument is a reference to a plain scalar, then this scalar is +taken to be the literal document to parse. The value of this +scalar should not be changed before all tokens have been extracted. + +Otherwise the argument is taken to be some object that the +C<HTML::TokeParser> can read() from when it needs more data. Typically +it will be a filehandle of some kind. The stream will be read() until +EOF, but not closed. + +A newly constructed C<HTML::TokeParser> differ from its base classes +by having the C<unbroken_text> attribute enabled by default. See +L<HTML::Parser> for a description of this and other attributes that +influence how the document is parsed. It is often a good idea to enable +C<empty_element_tags> behaviour. + +Note that the parsing result will likely not be valid if raw undecoded +UTF-8 is used as a source. When parsing UTF-8 encoded files turn +on UTF-8 decoding: + + open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!"; + my $p = HTML::TokeParser->new( $fh ); + # ... + +If a $filename is passed to the constructor the file will be opened in +raw mode and the parsing result will only be valid if its content is +Latin-1 or pure ASCII. + +If parsing from an UTF-8 encoded string buffer decode it first: + + utf8::decode($document); + my $p = HTML::TokeParser->new( \$document ); + # ... + +=item $p->get_token + +This method will return the next I<token> found in the HTML document, +or C<undef> at the end of the document. The token is returned as an +array reference. The first element of the array will be a string +denoting the type of this token: "S" for start tag, "E" for end tag, +"T" for text, "C" for comment, "D" for declaration, and "PI" for +process instructions. The rest of the token array depend on the type +like this: + + ["S", $tag, $attr, $attrseq, $text] + ["E", $tag, $text] + ["T", $text, $is_data] + ["C", $text] + ["D", $text] + ["PI", $token0, $text] + +where $attr is a hash reference, $attrseq is an array reference and +the rest are plain scalars. The L<HTML::Parser/Argspec> explains the +details. + +=item $p->unget_token( @tokens ) + +If you find you have read too many tokens you can push them back, +so that they are returned the next time $p->get_token is called. + +=item $p->get_tag + +=item $p->get_tag( @tags ) + +This method returns the next start or end tag (skipping any other +tokens), or C<undef> if there are no more tags in the document. If +one or more arguments are given, then we skip tokens until one of the +specified tag types is found. For example: + + $p->get_tag("font", "/font"); + +will find the next start or end tag for a font-element. + +The tag information is returned as an array reference in the same form +as for $p->get_token above, but the type code (first element) is +missing. A start tag will be returned like this: + + [$tag, $attr, $attrseq, $text] + +The tagname of end tags are prefixed with "/", i.e. end tag is +returned like this: + + ["/$tag", $text] + +=item $p->get_text + +=item $p->get_text( @endtags ) + +This method returns all text found at the current position. It will +return a zero length string if the next token is not text. Any +entities will be converted to their corresponding character. + +If one or more arguments are given, then we return all text occurring +before the first of the specified tags found. For example: + + $p->get_text("p", "br"); + +will return the text up to either a paragraph of linebreak element. + +The text might span tags that should be I<textified>. This is +controlled by the $p->{textify} attribute, which is a hash that +defines how certain tags can be treated as text. If the name of a +start tag matches a key in this hash then this tag is converted to +text. The hash value is used to specify which tag attribute to obtain +the text from. If this tag attribute is missing, then the upper case +name of the tag enclosed in brackets is returned, e.g. "[IMG]". The +hash value can also be a subroutine reference. In this case the +routine is called with the start tag token content as its argument and +the return value is treated as the text. + +The default $p->{textify} value is: + + {img => "alt", applet => "alt"} + +This means that <IMG> and <APPLET> tags are treated as text, and that +the text to substitute can be found in the ALT attribute. + +=item $p->get_trimmed_text + +=item $p->get_trimmed_text( @endtags ) + +Same as $p->get_text above, but will collapse any sequences of white +space to a single space character. Leading and trailing white space is +removed. + +=item $p->get_phrase + +This will return all text found at the current position ignoring any +phrasal-level tags. Text is extracted until the first non +phrasal-level tag. Textification of tags is the same as for +get_text(). This method will collapse white space in the same way as +get_trimmed_text() does. + +The definition of <i>phrasal-level tags</i> is obtained from the +HTML::Tagset module. + +=back + +=head1 EXAMPLES + +This example extracts all links from a document. It will print one +line for each link, containing the URL and the textual description +between the <A>...</A> tags: + + use HTML::TokeParser; + $p = HTML::TokeParser->new(shift||"index.html"); + + while (my $token = $p->get_tag("a")) { + my $url = $token->[1]{href} || "-"; + my $text = $p->get_trimmed_text("/a"); + print "$url\t$text\n"; + } + +This example extract the <TITLE> from the document: + + use HTML::TokeParser; + $p = HTML::TokeParser->new(shift||"index.html"); + if ($p->get_tag("title")) { + my $title = $p->get_trimmed_text; + print "Title: $title\n"; + } + +=head1 SEE ALSO + +L<HTML::PullParser>, L<HTML::Parser> + +=head1 COPYRIGHT + +Copyright 1998-2005 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut |