diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2013-05-08 22:21:52 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2013-05-08 22:21:52 +0000 |
commit | 2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch) | |
tree | 4734ccd522c71dd455879162006742002f8c1565 /eg | |
download | HTML-Parser-tarball-master.tar.gz |
HTML-Parser-3.71HEADHTML-Parser-3.71master
Diffstat (limited to 'eg')
-rwxr-xr-x | eg/hanchors | 48 | ||||
-rwxr-xr-x | eg/hdump | 23 | ||||
-rwxr-xr-x | eg/hform | 83 | ||||
-rwxr-xr-x | eg/hlc | 20 | ||||
-rwxr-xr-x | eg/hrefsub | 93 | ||||
-rwxr-xr-x | eg/hstrip | 65 | ||||
-rwxr-xr-x | eg/htext | 29 | ||||
-rwxr-xr-x | eg/htextsub | 28 | ||||
-rwxr-xr-x | eg/htitle | 21 |
9 files changed, 410 insertions, 0 deletions
diff --git a/eg/hanchors b/eg/hanchors new file mode 100755 index 0000000..c7693fd --- /dev/null +++ b/eg/hanchors @@ -0,0 +1,48 @@ +#!/usr/bin/perl -w + +# This program will print out all <a href=".."> links in a +# document together with the text that goes with it. +# +# See also HTML::LinkExtor + +use HTML::Parser; + +my $p = HTML::Parser->new(api_version => 3, + start_h => [\&a_start_handler, "self,tagname,attr"], + report_tags => [qw(a img)], + ); +$p->parse_file(shift || die) || die $!; + +sub a_start_handler +{ + my($self, $tag, $attr) = @_; + return unless $tag eq "a"; + return unless exists $attr->{href}; + print "A $attr->{href}\n"; + + $self->handler(text => [], '@{dtext}' ); + $self->handler(start => \&img_handler); + $self->handler(end => \&a_end_handler, "self,tagname"); +} + +sub img_handler +{ + my($self, $tag, $attr) = @_; + return unless $tag eq "img"; + push(@{$self->handler("text")}, $attr->{alt} || "[IMG]"); +} + +sub a_end_handler +{ + my($self, $tag) = @_; + my $text = join("", @{$self->handler("text")}); + $text =~ s/^\s+//; + $text =~ s/\s+$//; + $text =~ s/\s+/ /g; + print "T $text\n"; + + $self->handler("text", undef); + $self->handler("start", \&a_start_handler); + $self->handler("end", undef); +} + diff --git a/eg/hdump b/eg/hdump new file mode 100755 index 0000000..2174584 --- /dev/null +++ b/eg/hdump @@ -0,0 +1,23 @@ +#!/usr/bin/perl -w + +use HTML::Parser (); +use Data::Dump (); + +sub h { + my($event, $line, $column, $text, $tagname, $attr) = @_; + + my @d = (uc(substr($event,0,1)) . " L$line C$column"); + substr($text, 40) = "..." if length($text) > 40; + push(@d, $text); + push(@d, $tagname) if defined $tagname; + push(@d, $attr) if $attr; + + print Data::Dump::dump(@d), "\n"; +} + +my $p = HTML::Parser->new(api_version => 3); +$p->handler(default => \&h, "event, line, column, text, tagname, attr"); + +$p->parse_file(@ARGV ? shift : *STDIN); + + diff --git a/eg/hform b/eg/hform new file mode 100755 index 0000000..d2599ed --- /dev/null +++ b/eg/hform @@ -0,0 +1,83 @@ +#!/usr/bin/perl -w + +# See also HTML::Form module + +use HTML::PullParser (); +use HTML::Entities qw(decode_entities); +use Data::Dump qw(dump); + +my @FORM_TAGS = qw(form input textarea button select option); + +my $p = HTML::PullParser->new(file => shift || "xxx.html", + start => 'tag, attr', + end => 'tag', + text => '@{text}', + report_tags => \@FORM_TAGS, + ) || die "$!"; + +# a little helper function +sub get_text { + my($p, $stop) = @_; + my $text; + while (defined(my $t = $p->get_token)) { + if (ref $t) { + $p->unget_token($t) unless $t->[0] eq $stop; + last; + } + else { + $text .= $t; + } + } + return $text; +} + +my @forms; +while (defined(my $t = $p->get_token)) { + next unless ref $t; # skip text + if ($t->[0] eq "form") { + shift @$t; + push(@forms, $t); + while (defined(my $t = $p->get_token)) { + next unless ref $t; # skip text + last if $t->[0] eq "/form"; + if ($t->[0] eq "select") { + my $sel = $t; + push(@{$forms[-1]}, $t); + while (defined(my $t = $p->get_token)) { + next unless ref $t; # skip text + last if $t->[0] eq "/select"; + #print "select ", dump($t), "\n"; + if ($t->[0] eq "option") { + my $value = $t->[1]->{value}; + my $text = get_text($p, "/option"); + unless (defined $value) { + $value = decode_entities($text); + } + push(@$sel, $value); + } + else { + warn "$t->[0] inside select"; + } + } + } + elsif ($t->[0] =~ /^\/?option$/) { + warn "option tag outside select"; + } + elsif ($t->[0] eq "textarea") { + push(@{$forms[-1]}, $t); + $t->[1]{value} = get_text($p, "/textarea"); + } + elsif ($t->[0] =~ m,^/,) { + warn "stray $t->[0] tag"; + } + else { + push(@{$forms[-1]}, $t); + } + } + } + else { + warn "form tag $t->[0] outside form"; + } +} + +print dump(\@forms), "\n"; @@ -0,0 +1,20 @@ +#!/usr/bin/perl -w + +use strict; +use HTML::Parser (); + +HTML::Parser->new(start_h => [ \&start_lc, "tokenpos, text" ], + end_h => [ sub { print lc shift }, "text" ], + default_h => [ sub { print shift }, "text" ], + ) + ->parse_file(shift) || die "Can't open file: $!\n"; + +sub start_lc { + my($tpos, $text) = @_; + for (my $i = 0; $i < @$tpos; $i += 2) { + next if $i && ($i/2) % 2 == 0; # skip attribute values + $_ = lc $_ for substr($text, $tpos->[$i], $tpos->[$i+1]); + } + print $text; +} + diff --git a/eg/hrefsub b/eg/hrefsub new file mode 100755 index 0000000..fe14159 --- /dev/null +++ b/eg/hrefsub @@ -0,0 +1,93 @@ +#!/usr/bin/perl + +# Perform transformations on link attributes in an HTML document. +# Examples: +# +# $ hrefsub 's/foo/bar/g' index.html +# $ hrefsub '$_=URI->new_abs($_, "http://foo")' index.html +# +# The first argument is a perl expression that might modify $_. +# It is called for each link in the document with $_ set to +# the original value of the link URI. The variables $tag and +# $attr can be used to access the tagname and attributename +# within the tag where the current link is found. +# +# The second argument is the name of a file to process. + +use strict; +use HTML::Parser (); +use URI; + +# Construct a hash of tag names that may have links. +my %link_attr; +{ + # To simplify things, reformat the %HTML::Tagset::linkElements + # hash so that it is always a hash of hashes. + require HTML::Tagset; + while (my($k,$v) = each %HTML::Tagset::linkElements) { + if (ref($v)) { + $v = { map {$_ => 1} @$v }; + } + else { + $v = { $v => 1}; + } + $link_attr{$k} = $v; + } + # Uncomment this to see what HTML::Tagset::linkElements thinks are + # the tags with link attributes + #use Data::Dump; Data::Dump::dump(\%link_attr); exit; +} + +# Create a subroutine named 'edit' to perform the operation +# passed in from the command line. The code should modify $_ +# to change things. +my $code = shift; +my $code = 'sub edit { local $_ = shift; my($attr, $tag) = @_; no strict; ' . + $code . + '; $_; }'; +#print $code; +eval $code; +die $@ if $@; + +# Set up the parser. +my $p = HTML::Parser->new(api_version => 3); + +# The default is to print everything as is. +$p->handler(default => sub { print @_ }, "text"); + +# All links are found in start tags. This handler will evaluate +# &edit for each link attribute found. +$p->handler(start => sub { + my($tagname, $pos, $text) = @_; + if (my $link_attr = $link_attr{$tagname}) { + while (4 <= @$pos) { + # use attribute sets from right to left + # to avoid invalidating the offsets + # when replacing the values + my($k_offset, $k_len, $v_offset, $v_len) = + splice(@$pos, -4); + my $attrname = lc(substr($text, $k_offset, $k_len)); + next unless $link_attr->{$attrname}; + next unless $v_offset; # 0 v_offset means no value + my $v = substr($text, $v_offset, $v_len); + $v =~ s/^([\'\"])(.*)\1$/$2/; + my $new_v = edit($v, $attrname, $tagname); + next if $new_v eq $v; + $new_v =~ s/\"/"/g; # since we quote with "" + substr($text, $v_offset, $v_len) = qq("$new_v"); + } + } + print $text; + }, + "tagname, tokenpos, text"); + +# Parse the file passed in from the command line +my $file = shift || usage(); +$p->parse_file($file) || die "Can't open file $file: $!\n"; + +sub usage +{ + my $progname = $0; + $progname =~ s,^.*/,,; + die "Usage: $progname <perlexpr> <filename>\n"; +} diff --git a/eg/hstrip b/eg/hstrip new file mode 100755 index 0000000..b94df3c --- /dev/null +++ b/eg/hstrip @@ -0,0 +1,65 @@ +#!/usr/bin/perl -w + +# This script cleans up an HTML document + +use strict; +use HTML::Parser (); + +# configure these values +my @ignore_attr = + qw(bgcolor background color face style link alink vlink text + onblur onchange onclick ondblclick onfocus onkeydown onkeyup onload + onmousedown onmousemove onmouseout onmouseover onmouseup + onreset onselect onunload + ); +my @ignore_tags = qw(font big small b i); +my @ignore_elements = qw(script style); + +# make it easier to look up attributes +my %ignore_attr = map { $_ => 1} @ignore_attr; + +sub tag +{ + my($pos, $text) = @_; + if (@$pos >= 4) { + # kill some attributes + my($k_offset, $k_len, $v_offset, $v_len) = @{$pos}[-4 .. -1]; + my $next_attr = $v_offset ? $v_offset + $v_len : $k_offset + $k_len; + my $edited; + while (@$pos >= 4) { + ($k_offset, $k_len, $v_offset, $v_len) = splice @$pos, -4; + if ($ignore_attr{lc substr($text, $k_offset, $k_len)}) { + substr($text, $k_offset, $next_attr - $k_offset) = ""; + $edited++; + } + $next_attr = $k_offset; + } + # if we killed all attributed, kill any extra whitespace too + $text =~ s/^(<\w+)\s+>$/$1>/ if $edited; + } + print $text; +} + +sub decl +{ + my $type = shift; + print shift if $type eq "doctype"; +} + +sub text +{ + print shift; +} + +HTML::Parser->new(api_version => 3, + start_h => [\&tag, "tokenpos, text"], + process_h => ["", ""], + comment_h => ["", ""], + declaration_h => [\&decl, "tagname, text"], + default_h => [\&text, "text"], + + ignore_tags => \@ignore_tags, + ignore_elements => \@ignore_elements, + ) + ->parse_file(shift) || die "Can't open file: $!\n"; + diff --git a/eg/htext b/eg/htext new file mode 100755 index 0000000..e4d276d --- /dev/null +++ b/eg/htext @@ -0,0 +1,29 @@ +#!/usr/bin/perl -w + +# Extract all plain text from an HTML file + +use strict; +use HTML::Parser 3.00 (); + +my %inside; + +sub tag +{ + my($tag, $num) = @_; + $inside{$tag} += $num; + print " "; # not for all tags +} + +sub text +{ + return if $inside{script} || $inside{style}; + print $_[0]; +} + +HTML::Parser->new(api_version => 3, + handlers => [start => [\&tag, "tagname, '+1'"], + end => [\&tag, "tagname, '-1'"], + text => [\&text, "dtext"], + ], + marked_sections => 1, + )->parse_file(shift) || die "Can't open file: $!\n";; diff --git a/eg/htextsub b/eg/htextsub new file mode 100755 index 0000000..5091273 --- /dev/null +++ b/eg/htextsub @@ -0,0 +1,28 @@ +#!/usr/bin/perl -w + +# Shows how to mangle all plain text in an HTML document, using an arbitrary +# Perl expression. Plain text is all text not within a tag declaration, i.e. +# not in <p ...>, but possibly between <p> and </p> + +use strict; +my $code = shift || usage(); +$code = 'sub edit_print { local $_ = shift; ' . $code . '; print }'; +#print $code; +eval $code; +die $@ if $@; + +use HTML::Parser 3.05; +my $p = HTML::Parser->new(unbroken_text => 1, + default_h => [ sub { print @_; }, "text" ], + text_h => [ \&edit_print, "text" ], + ); + +my $file = shift || usage(); +$p->parse_file($file) || die "Can't open file $file: $!\n"; + +sub usage +{ + my $progname = $0; + $progname =~ s,^.*/,,; + die "Usage: $progname <perlexpr> <filename>\n"; +} diff --git a/eg/htitle b/eg/htitle new file mode 100755 index 0000000..38da5d6 --- /dev/null +++ b/eg/htitle @@ -0,0 +1,21 @@ +#!/usr/bin/perl + +# This program will print out the title of an HTML document. + +use strict; +use HTML::Parser (); + +sub title_handler +{ + my $self = shift; + $self->handler(text => sub { print @_ }, "dtext"); + $self->handler(end => "eof", "self"); +} + +my $p = HTML::Parser->new(api_version => 3, + start_h => [\&title_handler, "self"], + report_tags => ['title'], + ); +$p->parse_file(shift || die) || die $!; +print "\n"; + |