summaryrefslogtreecommitdiff
path: root/eg
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2013-05-08 22:21:52 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2013-05-08 22:21:52 +0000
commit2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree4734ccd522c71dd455879162006742002f8c1565 /eg
downloadHTML-Parser-tarball-master.tar.gz
Diffstat (limited to 'eg')
-rwxr-xr-xeg/hanchors48
-rwxr-xr-xeg/hdump23
-rwxr-xr-xeg/hform83
-rwxr-xr-xeg/hlc20
-rwxr-xr-xeg/hrefsub93
-rwxr-xr-xeg/hstrip65
-rwxr-xr-xeg/htext29
-rwxr-xr-xeg/htextsub28
-rwxr-xr-xeg/htitle21
9 files changed, 410 insertions, 0 deletions
diff --git a/eg/hanchors b/eg/hanchors
new file mode 100755
index 0000000..c7693fd
--- /dev/null
+++ b/eg/hanchors
@@ -0,0 +1,48 @@
+#!/usr/bin/perl -w
+
+# This program will print out all <a href=".."> links in a
+# document together with the text that goes with it.
+#
+# See also HTML::LinkExtor
+
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3,
+ start_h => [\&a_start_handler, "self,tagname,attr"],
+ report_tags => [qw(a img)],
+ );
+$p->parse_file(shift || die) || die $!;
+
+sub a_start_handler
+{
+ my($self, $tag, $attr) = @_;
+ return unless $tag eq "a";
+ return unless exists $attr->{href};
+ print "A $attr->{href}\n";
+
+ $self->handler(text => [], '@{dtext}' );
+ $self->handler(start => \&img_handler);
+ $self->handler(end => \&a_end_handler, "self,tagname");
+}
+
+sub img_handler
+{
+ my($self, $tag, $attr) = @_;
+ return unless $tag eq "img";
+ push(@{$self->handler("text")}, $attr->{alt} || "[IMG]");
+}
+
+sub a_end_handler
+{
+ my($self, $tag) = @_;
+ my $text = join("", @{$self->handler("text")});
+ $text =~ s/^\s+//;
+ $text =~ s/\s+$//;
+ $text =~ s/\s+/ /g;
+ print "T $text\n";
+
+ $self->handler("text", undef);
+ $self->handler("start", \&a_start_handler);
+ $self->handler("end", undef);
+}
+
diff --git a/eg/hdump b/eg/hdump
new file mode 100755
index 0000000..2174584
--- /dev/null
+++ b/eg/hdump
@@ -0,0 +1,23 @@
+#!/usr/bin/perl -w
+
+use HTML::Parser ();
+use Data::Dump ();
+
+sub h {
+ my($event, $line, $column, $text, $tagname, $attr) = @_;
+
+ my @d = (uc(substr($event,0,1)) . " L$line C$column");
+ substr($text, 40) = "..." if length($text) > 40;
+ push(@d, $text);
+ push(@d, $tagname) if defined $tagname;
+ push(@d, $attr) if $attr;
+
+ print Data::Dump::dump(@d), "\n";
+}
+
+my $p = HTML::Parser->new(api_version => 3);
+$p->handler(default => \&h, "event, line, column, text, tagname, attr");
+
+$p->parse_file(@ARGV ? shift : *STDIN);
+
+
diff --git a/eg/hform b/eg/hform
new file mode 100755
index 0000000..d2599ed
--- /dev/null
+++ b/eg/hform
@@ -0,0 +1,83 @@
+#!/usr/bin/perl -w
+
+# See also HTML::Form module
+
+use HTML::PullParser ();
+use HTML::Entities qw(decode_entities);
+use Data::Dump qw(dump);
+
+my @FORM_TAGS = qw(form input textarea button select option);
+
+my $p = HTML::PullParser->new(file => shift || "xxx.html",
+ start => 'tag, attr',
+ end => 'tag',
+ text => '@{text}',
+ report_tags => \@FORM_TAGS,
+ ) || die "$!";
+
+# a little helper function
+sub get_text {
+ my($p, $stop) = @_;
+ my $text;
+ while (defined(my $t = $p->get_token)) {
+ if (ref $t) {
+ $p->unget_token($t) unless $t->[0] eq $stop;
+ last;
+ }
+ else {
+ $text .= $t;
+ }
+ }
+ return $text;
+}
+
+my @forms;
+while (defined(my $t = $p->get_token)) {
+ next unless ref $t; # skip text
+ if ($t->[0] eq "form") {
+ shift @$t;
+ push(@forms, $t);
+ while (defined(my $t = $p->get_token)) {
+ next unless ref $t; # skip text
+ last if $t->[0] eq "/form";
+ if ($t->[0] eq "select") {
+ my $sel = $t;
+ push(@{$forms[-1]}, $t);
+ while (defined(my $t = $p->get_token)) {
+ next unless ref $t; # skip text
+ last if $t->[0] eq "/select";
+ #print "select ", dump($t), "\n";
+ if ($t->[0] eq "option") {
+ my $value = $t->[1]->{value};
+ my $text = get_text($p, "/option");
+ unless (defined $value) {
+ $value = decode_entities($text);
+ }
+ push(@$sel, $value);
+ }
+ else {
+ warn "$t->[0] inside select";
+ }
+ }
+ }
+ elsif ($t->[0] =~ /^\/?option$/) {
+ warn "option tag outside select";
+ }
+ elsif ($t->[0] eq "textarea") {
+ push(@{$forms[-1]}, $t);
+ $t->[1]{value} = get_text($p, "/textarea");
+ }
+ elsif ($t->[0] =~ m,^/,) {
+ warn "stray $t->[0] tag";
+ }
+ else {
+ push(@{$forms[-1]}, $t);
+ }
+ }
+ }
+ else {
+ warn "form tag $t->[0] outside form";
+ }
+}
+
+print dump(\@forms), "\n";
diff --git a/eg/hlc b/eg/hlc
new file mode 100755
index 0000000..664e1e9
--- /dev/null
+++ b/eg/hlc
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+
+use strict;
+use HTML::Parser ();
+
+HTML::Parser->new(start_h => [ \&start_lc, "tokenpos, text" ],
+ end_h => [ sub { print lc shift }, "text" ],
+ default_h => [ sub { print shift }, "text" ],
+ )
+ ->parse_file(shift) || die "Can't open file: $!\n";
+
+sub start_lc {
+ my($tpos, $text) = @_;
+ for (my $i = 0; $i < @$tpos; $i += 2) {
+ next if $i && ($i/2) % 2 == 0; # skip attribute values
+ $_ = lc $_ for substr($text, $tpos->[$i], $tpos->[$i+1]);
+ }
+ print $text;
+}
+
diff --git a/eg/hrefsub b/eg/hrefsub
new file mode 100755
index 0000000..fe14159
--- /dev/null
+++ b/eg/hrefsub
@@ -0,0 +1,93 @@
+#!/usr/bin/perl
+
+# Perform transformations on link attributes in an HTML document.
+# Examples:
+#
+# $ hrefsub 's/foo/bar/g' index.html
+# $ hrefsub '$_=URI->new_abs($_, "http://foo")' index.html
+#
+# The first argument is a perl expression that might modify $_.
+# It is called for each link in the document with $_ set to
+# the original value of the link URI. The variables $tag and
+# $attr can be used to access the tagname and attributename
+# within the tag where the current link is found.
+#
+# The second argument is the name of a file to process.
+
+use strict;
+use HTML::Parser ();
+use URI;
+
+# Construct a hash of tag names that may have links.
+my %link_attr;
+{
+ # To simplify things, reformat the %HTML::Tagset::linkElements
+ # hash so that it is always a hash of hashes.
+ require HTML::Tagset;
+ while (my($k,$v) = each %HTML::Tagset::linkElements) {
+ if (ref($v)) {
+ $v = { map {$_ => 1} @$v };
+ }
+ else {
+ $v = { $v => 1};
+ }
+ $link_attr{$k} = $v;
+ }
+ # Uncomment this to see what HTML::Tagset::linkElements thinks are
+ # the tags with link attributes
+ #use Data::Dump; Data::Dump::dump(\%link_attr); exit;
+}
+
+# Create a subroutine named 'edit' to perform the operation
+# passed in from the command line. The code should modify $_
+# to change things.
+my $code = shift;
+my $code = 'sub edit { local $_ = shift; my($attr, $tag) = @_; no strict; ' .
+ $code .
+ '; $_; }';
+#print $code;
+eval $code;
+die $@ if $@;
+
+# Set up the parser.
+my $p = HTML::Parser->new(api_version => 3);
+
+# The default is to print everything as is.
+$p->handler(default => sub { print @_ }, "text");
+
+# All links are found in start tags. This handler will evaluate
+# &edit for each link attribute found.
+$p->handler(start => sub {
+ my($tagname, $pos, $text) = @_;
+ if (my $link_attr = $link_attr{$tagname}) {
+ while (4 <= @$pos) {
+ # use attribute sets from right to left
+ # to avoid invalidating the offsets
+ # when replacing the values
+ my($k_offset, $k_len, $v_offset, $v_len) =
+ splice(@$pos, -4);
+ my $attrname = lc(substr($text, $k_offset, $k_len));
+ next unless $link_attr->{$attrname};
+ next unless $v_offset; # 0 v_offset means no value
+ my $v = substr($text, $v_offset, $v_len);
+ $v =~ s/^([\'\"])(.*)\1$/$2/;
+ my $new_v = edit($v, $attrname, $tagname);
+ next if $new_v eq $v;
+ $new_v =~ s/\"/&quot;/g; # since we quote with ""
+ substr($text, $v_offset, $v_len) = qq("$new_v");
+ }
+ }
+ print $text;
+ },
+ "tagname, tokenpos, text");
+
+# Parse the file passed in from the command line
+my $file = shift || usage();
+$p->parse_file($file) || die "Can't open file $file: $!\n";
+
+sub usage
+{
+ my $progname = $0;
+ $progname =~ s,^.*/,,;
+ die "Usage: $progname <perlexpr> <filename>\n";
+}
diff --git a/eg/hstrip b/eg/hstrip
new file mode 100755
index 0000000..b94df3c
--- /dev/null
+++ b/eg/hstrip
@@ -0,0 +1,65 @@
+#!/usr/bin/perl -w
+
+# This script cleans up an HTML document
+
+use strict;
+use HTML::Parser ();
+
+# configure these values
+my @ignore_attr =
+ qw(bgcolor background color face style link alink vlink text
+ onblur onchange onclick ondblclick onfocus onkeydown onkeyup onload
+ onmousedown onmousemove onmouseout onmouseover onmouseup
+ onreset onselect onunload
+ );
+my @ignore_tags = qw(font big small b i);
+my @ignore_elements = qw(script style);
+
+# make it easier to look up attributes
+my %ignore_attr = map { $_ => 1} @ignore_attr;
+
+sub tag
+{
+ my($pos, $text) = @_;
+ if (@$pos >= 4) {
+ # kill some attributes
+ my($k_offset, $k_len, $v_offset, $v_len) = @{$pos}[-4 .. -1];
+ my $next_attr = $v_offset ? $v_offset + $v_len : $k_offset + $k_len;
+ my $edited;
+ while (@$pos >= 4) {
+ ($k_offset, $k_len, $v_offset, $v_len) = splice @$pos, -4;
+ if ($ignore_attr{lc substr($text, $k_offset, $k_len)}) {
+ substr($text, $k_offset, $next_attr - $k_offset) = "";
+ $edited++;
+ }
+ $next_attr = $k_offset;
+ }
+ # if we killed all attributed, kill any extra whitespace too
+ $text =~ s/^(<\w+)\s+>$/$1>/ if $edited;
+ }
+ print $text;
+}
+
+sub decl
+{
+ my $type = shift;
+ print shift if $type eq "doctype";
+}
+
+sub text
+{
+ print shift;
+}
+
+HTML::Parser->new(api_version => 3,
+ start_h => [\&tag, "tokenpos, text"],
+ process_h => ["", ""],
+ comment_h => ["", ""],
+ declaration_h => [\&decl, "tagname, text"],
+ default_h => [\&text, "text"],
+
+ ignore_tags => \@ignore_tags,
+ ignore_elements => \@ignore_elements,
+ )
+ ->parse_file(shift) || die "Can't open file: $!\n";
+
diff --git a/eg/htext b/eg/htext
new file mode 100755
index 0000000..e4d276d
--- /dev/null
+++ b/eg/htext
@@ -0,0 +1,29 @@
+#!/usr/bin/perl -w
+
+# Extract all plain text from an HTML file
+
+use strict;
+use HTML::Parser 3.00 ();
+
+my %inside;
+
+sub tag
+{
+ my($tag, $num) = @_;
+ $inside{$tag} += $num;
+ print " "; # not for all tags
+}
+
+sub text
+{
+ return if $inside{script} || $inside{style};
+ print $_[0];
+}
+
+HTML::Parser->new(api_version => 3,
+ handlers => [start => [\&tag, "tagname, '+1'"],
+ end => [\&tag, "tagname, '-1'"],
+ text => [\&text, "dtext"],
+ ],
+ marked_sections => 1,
+ )->parse_file(shift) || die "Can't open file: $!\n";;
diff --git a/eg/htextsub b/eg/htextsub
new file mode 100755
index 0000000..5091273
--- /dev/null
+++ b/eg/htextsub
@@ -0,0 +1,28 @@
+#!/usr/bin/perl -w
+
+# Shows how to mangle all plain text in an HTML document, using an arbitrary
+# Perl expression. Plain text is all text not within a tag declaration, i.e.
+# not in <p ...>, but possibly between <p> and </p>
+
+use strict;
+my $code = shift || usage();
+$code = 'sub edit_print { local $_ = shift; ' . $code . '; print }';
+#print $code;
+eval $code;
+die $@ if $@;
+
+use HTML::Parser 3.05;
+my $p = HTML::Parser->new(unbroken_text => 1,
+ default_h => [ sub { print @_; }, "text" ],
+ text_h => [ \&edit_print, "text" ],
+ );
+
+my $file = shift || usage();
+$p->parse_file($file) || die "Can't open file $file: $!\n";
+
+sub usage
+{
+ my $progname = $0;
+ $progname =~ s,^.*/,,;
+ die "Usage: $progname <perlexpr> <filename>\n";
+}
diff --git a/eg/htitle b/eg/htitle
new file mode 100755
index 0000000..38da5d6
--- /dev/null
+++ b/eg/htitle
@@ -0,0 +1,21 @@
+#!/usr/bin/perl
+
+# This program will print out the title of an HTML document.
+
+use strict;
+use HTML::Parser ();
+
+sub title_handler
+{
+ my $self = shift;
+ $self->handler(text => sub { print @_ }, "dtext");
+ $self->handler(end => "eof", "self");
+}
+
+my $p = HTML::Parser->new(api_version => 3,
+ start_h => [\&title_handler, "self"],
+ report_tags => ['title'],
+ );
+$p->parse_file(shift || die) || die $!;
+print "\n";
+