HTML-Parser-3.71HEAD HTML-Parser-3.71 master

author: Lorry Tar Creator <lorry-tar-importer@lorry> 2013-05-08 22:21:52 +0000
committer: Lorry Tar Creator <lorry-tar-importer@lorry> 2013-05-08 22:21:52 +0000
commit: 2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree: 4734ccd522c71dd455879162006742002f8c1565 /eg
download: HTML-Parser-tarball-master.tar.gz
9 files changed, 410 insertions, 0 deletions
diff --git a/eg/hanchors b/eg/hanchors
new file mode 100755
index 0000000..c7693fd
--- /dev/null
+++ b/eg/hanchors
@@ -0,0 +1,48 @@
+#!/usr/bin/perl -w
+
+# This program will print out all <a href=".."> links in a
+# document together with the text that goes with it.
+#
+# See also HTML::LinkExtor
+
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3,
+     start_h => [\&a_start_handler, "self,tagname,attr"],
+     report_tags => [qw(a img)],
+    );
+$p->parse_file(shift || die) || die $!;
+
+sub a_start_handler
+{
+    my($self, $tag, $attr) = @_;
+    return unless $tag eq "a";
+    return unless exists $attr->{href};
+    print "A $attr->{href}\n";
+
+    $self->handler(text  => [], '@{dtext}' );
+    $self->handler(start => \&img_handler);
+    $self->handler(end   => \&a_end_handler, "self,tagname");
+}
+
+sub img_handler
+{
+    my($self, $tag, $attr) = @_;
+    return unless $tag eq "img";
+    push(@{$self->handler("text")}, $attr->{alt} || "[IMG]");
+}
+
+sub a_end_handler
+{
+    my($self, $tag) = @_;
+    my $text = join("", @{$self->handler("text")});
+    $text =~ s/^\s+//;
+    $text =~ s/\s+$//;
+    $text =~ s/\s+/ /g;
+    print "T $text\n";
+
+    $self->handler("text", undef);
+    $self->handler("start", \&a_start_handler);
+    $self->handler("end", undef);
+}
+
diff --git a/eg/hdump b/eg/hdump
new file mode 100755
index 0000000..2174584
--- /dev/null
+++ b/eg/hdump
@@ -0,0 +1,23 @@
+#!/usr/bin/perl -w
+
+use HTML::Parser ();
+use Data::Dump ();
+
+sub h {
+    my($event, $line, $column, $text, $tagname, $attr) = @_;
+
+    my @d = (uc(substr($event,0,1)) . " L$line C$column");
+    substr($text, 40) = "..." if length($text) > 40;
+    push(@d, $text);
+    push(@d, $tagname) if defined $tagname;
+    push(@d, $attr) if $attr;
+
+    print Data::Dump::dump(@d), "\n";
+}
+
+my $p = HTML::Parser->new(api_version => 3);
+$p->handler(default => \&h, "event, line, column, text, tagname, attr");
+
+$p->parse_file(@ARGV ? shift : *STDIN);
+
+
diff --git a/eg/hform b/eg/hform
new file mode 100755
index 0000000..d2599ed
--- /dev/null
+++ b/eg/hform
@@ -0,0 +1,83 @@
+#!/usr/bin/perl -w
+
+# See also HTML::Form module
+
+use HTML::PullParser ();
+use HTML::Entities qw(decode_entities);
+use Data::Dump qw(dump);
+
+my @FORM_TAGS = qw(form input textarea button select option);
+
+my $p = HTML::PullParser->new(file => shift || "xxx.html",
+			      start => 'tag, attr',
+			      end   => 'tag',
+			      text  => '@{text}',
+			      report_tags => \@FORM_TAGS,
+			     ) || die "$!";
+
+# a little helper function
+sub get_text {
+    my($p, $stop) = @_;
+    my $text;
+    while (defined(my $t = $p->get_token)) {
+	if (ref $t) {
+	    $p->unget_token($t) unless $t->[0] eq $stop;
+	    last;
+	}
+	else {
+	    $text .= $t;
+	}
+    }
+    return $text;
+}
+
+my @forms;
+while (defined(my $t = $p->get_token)) {
+    next unless ref $t; # skip text
+    if ($t->[0] eq "form") {
+	shift @$t;
+	push(@forms, $t);
+	while (defined(my $t = $p->get_token)) {
+	    next unless ref $t;  # skip text
+	    last if $t->[0] eq "/form";
+	    if ($t->[0] eq "select") {
+		my $sel = $t;
+		push(@{$forms[-1]}, $t);
+		while (defined(my $t = $p->get_token)) {
+		    next unless ref $t; # skip text
+		    last if $t->[0] eq "/select";
+		    #print "select ", dump($t), "\n";
+		    if ($t->[0] eq "option") {
+			my $value = $t->[1]->{value};
+			my $text = get_text($p, "/option");
+			unless (defined $value) {
+			    $value = decode_entities($text);
+			}
+			push(@$sel, $value);
+		    }
+		    else {
+			warn "$t->[0] inside select";
+		    }
+		}
+	    }
+	    elsif ($t->[0] =~ /^\/?option$/) {
+		warn "option tag outside select";
+	    }
+	    elsif ($t->[0] eq "textarea") {
+		push(@{$forms[-1]}, $t);
+		$t->[1]{value} = get_text($p, "/textarea");
+	    }
+	    elsif ($t->[0] =~ m,^/,) {
+		warn "stray $t->[0] tag";
+	    }
+	    else {
+		push(@{$forms[-1]}, $t);
+	    }
+	}
+    }
+    else {
+	warn "form tag $t->[0] outside form";
+    }
+}
+
+print dump(\@forms), "\n";
diff --git a/eg/hlc b/eg/hlc
new file mode 100755
index 0000000..664e1e9
--- /dev/null
+++ b/eg/hlc
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+
+use strict;
+use HTML::Parser ();
+
+HTML::Parser->new(start_h   => [ \&start_lc, "tokenpos, text" ],
+	          end_h     => [ sub { print lc shift }, "text" ],
+                  default_h => [ sub { print shift }, "text" ],
+                 )
+    ->parse_file(shift) || die "Can't open file: $!\n";
+
+sub start_lc {
+    my($tpos, $text) = @_;
+    for (my $i = 0; $i < @$tpos; $i += 2) {
+	next if $i && ($i/2) % 2 == 0;  # skip attribute values
+	$_ = lc $_ for substr($text, $tpos->[$i], $tpos->[$i+1]);
+    }
+    print $text;
+}
+
diff --git a/eg/hrefsub b/eg/hrefsub
new file mode 100755
index 0000000..fe14159
--- /dev/null
+++ b/eg/hrefsub
@@ -0,0 +1,93 @@
+#!/usr/bin/perl
+
+# Perform transformations on link attributes in an HTML document.
+# Examples:
+#
+#  $ hrefsub 's/foo/bar/g' index.html
+#  $ hrefsub '$_=URI->new_abs($_, "http://foo")' index.html
+#
+# The first argument is a perl expression that might modify $_.
+# It is called for each link in the document with $_ set to
+# the original value of the link URI.  The variables $tag and
+# $attr can be used to access the tagname and attributename
+# within the tag where the current link is found.
+#
+# The second argument is the name of a file to process.
+
+use strict;
+use HTML::Parser ();
+use URI;
+
+# Construct a hash of tag names that may have links.
+my %link_attr;
+{
+    # To simplify things, reformat the %HTML::Tagset::linkElements
+    # hash so that it is always a hash of hashes.
+    require HTML::Tagset;
+    while (my($k,$v) = each %HTML::Tagset::linkElements) {
+	if (ref($v)) {
+	    $v = { map {$_ => 1} @$v };
+	}
+	else {
+	    $v = { $v => 1};
+	}
+	$link_attr{$k} = $v;
+    }
+    # Uncomment this to see what HTML::Tagset::linkElements thinks are
+    # the tags with link attributes
+    #use Data::Dump; Data::Dump::dump(\%link_attr); exit;
+}
+
+# Create a subroutine named 'edit' to perform the operation
+# passed in from the command line.  The code should modify $_
+# to change things.
+my $code = shift;
+my $code = 'sub edit { local $_ = shift; my($attr, $tag) = @_; no strict; ' .
+           $code .
+           '; $_; }';
+#print $code;
+eval $code;
+die $@ if $@;
+
+# Set up the parser.
+my $p = HTML::Parser->new(api_version => 3);
+
+# The default is to print everything as is.
+$p->handler(default => sub { print @_ }, "text");
+
+# All links are found in start tags.  This handler will evaluate
+# &edit for each link attribute found.
+$p->handler(start => sub {
+		my($tagname, $pos, $text) = @_;
+		if (my $link_attr = $link_attr{$tagname}) {
+		    while (4 <= @$pos) {
+			# use attribute sets from right to left
+			# to avoid invalidating the offsets
+			# when replacing the values
+			my($k_offset, $k_len, $v_offset, $v_len) =
+			    splice(@$pos, -4);
+			my $attrname = lc(substr($text, $k_offset, $k_len));
+			next unless $link_attr->{$attrname};
+			next unless $v_offset; # 0 v_offset means no value
+			my $v = substr($text, $v_offset, $v_len);
+			$v =~ s/^([\'\"])(.*)\1$/$2/;
+			my $new_v = edit($v, $attrname, $tagname);
+			next if $new_v eq $v;
+			$new_v =~ s/\"/&quot;/g;  # since we quote with ""
+			substr($text, $v_offset, $v_len) = qq("$new_v");
+		    }
+		}
+		print $text;
+	    },
+	    "tagname, tokenpos, text");
+
+# Parse the file passed in from the command line
+my $file = shift || usage();
+$p->parse_file($file) || die "Can't open file $file: $!\n";
+
+sub usage
+{
+    my $progname = $0;
+    $progname =~ s,^.*/,,;
+    die "Usage: $progname <perlexpr> <filename>\n";
+}
diff --git a/eg/hstrip b/eg/hstrip
new file mode 100755
index 0000000..b94df3c
--- /dev/null
+++ b/eg/hstrip
@@ -0,0 +1,65 @@
+#!/usr/bin/perl -w
+
+# This script cleans up an HTML document
+
+use strict;
+use HTML::Parser ();
+
+# configure these values
+my @ignore_attr =
+    qw(bgcolor background color face style link alink vlink text
+       onblur onchange onclick ondblclick onfocus onkeydown onkeyup onload
+       onmousedown onmousemove onmouseout onmouseover onmouseup
+       onreset onselect onunload
+      );
+my @ignore_tags = qw(font big small b i);
+my @ignore_elements = qw(script style);
+
+# make it easier to look up attributes
+my %ignore_attr = map { $_ => 1} @ignore_attr;
+
+sub tag
+{
+    my($pos, $text) = @_;
+    if (@$pos >= 4) {
+	# kill some attributes
+	my($k_offset, $k_len, $v_offset, $v_len) = @{$pos}[-4 .. -1];
+	my $next_attr = $v_offset ? $v_offset + $v_len : $k_offset + $k_len;
+	my $edited;
+	while (@$pos >= 4) {
+	    ($k_offset, $k_len, $v_offset, $v_len) = splice @$pos, -4;
+	    if ($ignore_attr{lc substr($text, $k_offset, $k_len)}) {
+		substr($text, $k_offset, $next_attr - $k_offset) = "";
+		$edited++;
+	    }
+	    $next_attr = $k_offset;
+	}
+	# if we killed all attributed, kill any extra whitespace too
+	$text =~ s/^(<\w+)\s+>$/$1>/ if $edited;
+    }
+    print $text;
+}
+
+sub decl
+{
+    my $type = shift;
+    print shift if $type eq "doctype";
+}
+
+sub text
+{
+    print shift;
+}
+
+HTML::Parser->new(api_version   => 3,
+		  start_h       => [\&tag,   "tokenpos, text"],
+                  process_h     => ["", ""],
+		  comment_h     => ["", ""],
+                  declaration_h => [\&decl,   "tagname, text"],
+                  default_h     => [\&text,   "text"],
+
+		  ignore_tags   => \@ignore_tags,
+		  ignore_elements => \@ignore_elements,
+                 )
+    ->parse_file(shift) || die "Can't open file: $!\n";
+
diff --git a/eg/htext b/eg/htext
new file mode 100755
index 0000000..e4d276d
--- /dev/null
+++ b/eg/htext
@@ -0,0 +1,29 @@
+#!/usr/bin/perl -w
+
+# Extract all plain text from an HTML file
+
+use strict;
+use HTML::Parser 3.00 ();
+
+my %inside;
+
+sub tag
+{
+   my($tag, $num) = @_;
+   $inside{$tag} += $num;
+   print " ";  # not for all tags
+}
+
+sub text
+{
+    return if $inside{script} || $inside{style};
+    print $_[0];
+}
+
+HTML::Parser->new(api_version => 3,
+		  handlers    => [start => [\&tag, "tagname, '+1'"],
+				  end   => [\&tag, "tagname, '-1'"],
+				  text  => [\&text, "dtext"],
+				 ],
+		  marked_sections => 1,
+	)->parse_file(shift) || die "Can't open file: $!\n";;
diff --git a/eg/htextsub b/eg/htextsub
new file mode 100755
index 0000000..5091273
--- /dev/null
+++ b/eg/htextsub
@@ -0,0 +1,28 @@
+#!/usr/bin/perl -w
+
+# Shows how to mangle all plain  text in an HTML document, using an arbitrary
+# Perl expression. Plain text is all text not within a tag declaration, i.e.
+# not in <p ...>, but possibly between <p> and </p>
+
+use strict;
+my $code = shift || usage();
+$code = 'sub edit_print { local $_ = shift; ' . $code . '; print }';
+#print $code;
+eval $code;
+die $@ if $@;
+
+use HTML::Parser 3.05;
+my $p = HTML::Parser->new(unbroken_text => 1,
+			  default_h => [ sub { print @_; }, "text" ],
+                	  text_h    => [ \&edit_print,      "text" ],
+        	         );
+
+my $file = shift || usage();
+$p->parse_file($file) || die "Can't open file $file: $!\n";
+
+sub usage
+{
+    my $progname = $0;
+    $progname =~ s,^.*/,,;
+    die "Usage: $progname <perlexpr> <filename>\n";
+}
diff --git a/eg/htitle b/eg/htitle
new file mode 100755
index 0000000..38da5d6
--- /dev/null
+++ b/eg/htitle
@@ -0,0 +1,21 @@
+#!/usr/bin/perl
+
+# This program will print out the title of an HTML document.
+
+use strict;
+use HTML::Parser ();
+
+sub title_handler
+{
+    my $self = shift;
+    $self->handler(text => sub { print @_ }, "dtext");
+    $self->handler(end  => "eof", "self");
+}
+
+my $p = HTML::Parser->new(api_version => 3,
+			  start_h => [\&title_handler, "self"],
+			  report_tags => ['title'],
+			 );
+$p->parse_file(shift || die) || die $!;
+print "\n";
+
author	Lorry Tar Creator <lorry-tar-importer@lorry>	2013-05-08 22:21:52 +0000
committer	Lorry Tar Creator <lorry-tar-importer@lorry>	2013-05-08 22:21:52 +0000
commit	2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree	4734ccd522c71dd455879162006742002f8c1565 /eg
download	HTML-Parser-tarball-master.tar.gz