HTML-Parser-3.71HEAD HTML-Parser-3.71 master

author: Lorry Tar Creator <lorry-tar-importer@lorry> 2013-05-08 22:21:52 +0000
committer: Lorry Tar Creator <lorry-tar-importer@lorry> 2013-05-08 22:21:52 +0000
commit: 2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree: 4734ccd522c71dd455879162006742002f8c1565 /t
download: HTML-Parser-tarball-master.tar.gz
48 files changed, 3447 insertions, 0 deletions
diff --git a/t/api_version.t b/t/api_version.t
new file mode 100644
index 0000000..9803121
--- /dev/null
+++ b/t/api_version.t
@@ -0,0 +1,22 @@
+use Test::More tests => 4;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+ok(!$p->handler("start"), "API version 3");
+
+my $failed;
+eval {
+   my $p = HTML::Parser->new(api_version => 4);
+   $failed++;
+};
+like($@, qr/^API version 4 not supported/);
+ok(!$failed, "API version 4");
+
+$p = HTML::Parser->new(api_version => 2);
+
+is($p->handler("start"), "start", "API version 2");
+
+
diff --git a/t/argspec-bad.t b/t/argspec-bad.t
new file mode 100644
index 0000000..8c0b199
--- /dev/null
+++ b/t/argspec-bad.t
@@ -0,0 +1,40 @@
+use Test::More tests => 6;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+eval {
+   $p->handler(end => "end", q(xyzzy));
+};
+like($@, qr/^Unrecognized identifier xyzzy in argspec/);
+
+
+eval {
+   $p->handler(end => "end", q(tagname text));
+};
+like($@, qr/^Missing comma separator in argspec/);
+
+
+eval {
+   $p->handler(end => "end", q(tagname, "text));
+};
+like($@, qr/^Unterminated literal string in argspec/);
+
+
+eval {
+   $p->handler(end => "end", q(tagname, "t\\t"));
+};
+like($@, qr/^Backslash reserved for literal string in argspec/);
+
+eval {
+   $p->handler(end => "end", '"' . ("x" x 256) . '"');
+};
+like($@, qr/^Literal string is longer than 255 chars in argspec/);
+
+$p->handler(end => sub { is(length(shift), 255) },
+	           '"' . ("x" x 255) . '"');
+$p->parse("</x>");
+
+
diff --git a/t/argspec.t b/t/argspec.t
new file mode 100644
index 0000000..e8aa7a5
--- /dev/null
+++ b/t/argspec.t
@@ -0,0 +1,148 @@
+
+use strict;
+require HTML::Parser;
+
+my $decl = '<!ENTITY nbsp   CDATA "&#160;" -- no-break space -->';
+my $com1 = '<!-- Comment -->';
+my $com2 = '<!-- Comment -- -- Comment -->';
+my $start = '<a href="foo">';
+my $end = '</a>';
+my $empty = "<IMG SRC='foo'/>";
+my $proc = '<? something completely different ?>';
+
+my @argspec = qw( self offset length
+		  event tagname tag token0
+	          text
+	          is_cdata dtext
+	          tokens
+	          tokenpos
+	          attr
+	          attrseq );
+
+my @result = ();
+my $p = HTML::Parser -> new(default_h => [\@result, join(',', @argspec)],
+			    strict_comment => 1, xml_mode => 1);
+
+my @tests =
+    ( # string, expected results
+      $decl  => [[$p, 0, 52, 'declaration', 'ENTITY', '!ENTITY', 'ENTITY',
+		 '<!ENTITY nbsp   CDATA "&#160;" -- no-break space -->',
+		 undef, undef,
+	       ['ENTITY', 'nbsp', 'CDATA', '"&#160;"', '-- no-break space --'],
+		 [2, 6, 9, 4, 16, 5, 22, 8, 31, 20],
+		 undef, undef ]],
+      $com1  => [[$p, 0, 16, 'comment', ' Comment ', '# Comment ', ' Comment ',
+		 '<!-- Comment -->', 
+		 undef, undef,
+		 [' Comment '],
+		 [4, 9],
+		 undef, undef ]],
+      $com2  => [[$p, 0, 30, 'comment', ' Comment ', '# Comment ', ' Comment ',
+		 '<!-- Comment -- -- Comment -->',
+		 undef, undef,
+		 [' Comment ', ' Comment '],
+		 [4, 9, 18, 9],
+		 undef, undef ]],
+      $start => [[$p, 0, 14, 'start', 'a', 'a', 'a',
+		 '<a href="foo">', 
+		 undef, undef,
+		 ['a', 'href', '"foo"'],
+		 [1, 1, 3, 4, 8, 5],
+		 {'href', 'foo'}, ['href'] ]],
+      $end   => [[$p, 0, 4, 'end', 'a', '/a', 'a',
+		 '</a>',
+		 undef, undef,
+		 ['a'],
+		 [2, 1],
+		 undef, undef ]],
+      $empty => [[$p, 0, 16, 'start', 'IMG', 'IMG', 'IMG',
+		  "<IMG SRC='foo'/>",
+		  undef, undef,
+		  ['IMG', 'SRC', "'foo'"],
+		  [1, 3, 5, 3, 9, 5],
+		  {'SRC', 'foo'}, ['SRC'] ],
+		 [$p, 16, 0, 'end', 'IMG', '/IMG', 'IMG',
+		  '',
+		  undef, undef,
+		  ['IMG'],
+		  undef,
+		  undef, undef ],
+		 ],
+       $proc  => [[$p, 0, 36, 'process', ' something completely different ',
+		  '? something completely different ',
+		  ' something completely different ',
+		  '<? something completely different ?>',
+		  undef, undef,
+		  [' something completely different '],
+		  [2, 32],
+		  undef, undef ]],
+      "$end\n$end"   => [[$p, 0, 4, 'end', 'a', '/a', 'a',
+			  '</a>',
+			  undef, undef,
+			  ['a'],
+			  [2, 1],
+			  undef, undef],
+			 [$p, 4, 1, 'text', undef, undef, undef,
+			  "\n",
+			  '', "\n",
+			  undef,
+			  undef,
+			  undef, undef],
+			 [$p, 5, 4, 'end', 'a', '/a', 'a',
+			  '</a>',
+			  undef, undef,
+			  ['a'],
+			  [2, 1],
+			  undef, undef ]],
+      );
+
+use Test::More;
+plan tests => @tests / 2;
+
+sub string_tag {
+    my (@pieces) = @_;
+    my $part;
+    foreach $part ( @pieces ) {
+	if (!defined $part) {
+	    $part = 'undef';
+	}
+	elsif (!ref $part) {
+	    $part = "'$part'" if $part !~ /^\d+$/;
+	}
+	elsif ('ARRAY' eq ref $part ) {
+	    $part = '[' . join(', ', string_tag(@$part)) . ']';
+	}
+	elsif ('HASH' eq ref $part ) {
+	    $part = '{' . join(',', string_tag(%$part)) . '}';
+	}
+	else {
+	    $part = '<' . ref($part) . '>';
+	}
+    }
+    return join(", ", @pieces );
+}
+
+my $i = 0;
+TEST:
+while (@tests) {
+    my($html, $expected) = splice @tests, 0, 2;
+    ++$i;
+
+    @result = ();
+    $p->parse($html)->eof;
+
+    shift(@result) if $result[0][3] eq "start_document";
+    pop(@result)   if $result[-1][3] eq "end_document";
+
+    # Compare results for each element expected
+    foreach (@$expected) {
+	my $want = string_tag($_);
+	my $got = string_tag(shift @result);
+	if ($want ne $got) {
+           is($want, $got);
+           next TEST;
+        }
+    }
+
+    pass;
+}
diff --git a/t/argspec2.t b/t/argspec2.t
new file mode 100644
index 0000000..6f594b9
--- /dev/null
+++ b/t/argspec2.t
@@ -0,0 +1,21 @@
+use Test::More tests => 2;
+
+use strict;
+use HTML::Parser;
+
+my @start;
+my @text;
+
+my $p = HTML::Parser->new(api_version => 3);
+$p->handler(start => \@start, '@{tagname, @attr}');
+$p->handler(text  => \@text,  '@{dtext}');
+$p->parse(<<EOT)->eof;
+Hi
+<a href="abc">Foo</a><b>:-)</b>
+EOT
+
+is("@start", "a href abc b");
+
+is(join("", @text), "Hi\nFoo:-)\n");
+
+
diff --git a/t/attr-encoded.t b/t/attr-encoded.t
new file mode 100644
index 0000000..4d458eb
--- /dev/null
+++ b/t/attr-encoded.t
@@ -0,0 +1,32 @@
+use strict;
+use Test::More tests => 2;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new();
+$p->attr_encoded(1);
+
+my $text = "";
+$p->handler(start =>
+	    sub {
+		 my($tag, $attr) = @_;
+		 $text .= "S[$tag";
+		 for my $k (sort keys %$attr) {
+		     my $v =  $attr->{$k};
+		     $text .= " $k=$v";
+		 }
+		 $text .= "]";
+	     }, "tagname,attr");
+
+my $html = <<'EOT';
+<tag arg="&amp;&lt;&gt">
+EOT
+
+$p->parse($html)->eof;
+
+is($text, 'S[tag arg=&amp;&lt;&gt]');
+
+$text = "";
+$p->attr_encoded(0);
+$p->parse($html)->eof;
+
+is($text, 'S[tag arg=&<>]');
diff --git a/t/callback.t b/t/callback.t
new file mode 100644
index 0000000..7a456cf
--- /dev/null
+++ b/t/callback.t
@@ -0,0 +1,49 @@
+use Test::More tests => 47;
+
+use strict;
+use HTML::Parser;
+
+my @expected;
+my $p = HTML::Parser->new(api_version => 3,
+                          unbroken_text => 1,
+	                  default_h => [\@expected, '@{event, text}'],
+			 );
+
+my $doc = <<'EOT';
+<title>Hi</title>
+<h1>Ho ho</h1>
+<--comment->
+EOT
+
+$p->parse($doc)->eof;
+#use Data::Dump; Data::Dump::dump(@expected);
+
+for my $i (1..length($doc)) {
+     my @t;
+     $p->handler(default => \@t);
+     $p->parse(chunk($doc, $i));
+
+     # check that we got the same stuff
+     #diag "X:", join(":", @t);
+     #diag "Y:", join(":", @expected);
+     is(join(":", @t), join(":", @expected));
+}
+
+sub chunk {
+    my $str = shift;
+    my $size = shift || 1;
+    sub {
+	my $res = substr($str, 0, $size);
+        #diag "...$res";
+        substr($str, 0, $size) = "";
+	$res;
+    }
+}
+
+# Test croking behaviour
+$p->handler(default => []);
+
+eval {
+   $p->parse(sub { die "Hi" });
+};
+like($@, qr/^Hi/);
diff --git a/t/case-sensitive.t b/t/case-sensitive.t
new file mode 100644
index 0000000..565b20b
--- /dev/null
+++ b/t/case-sensitive.t
@@ -0,0 +1,85 @@
+use strict;
+use Test::More tests => 8;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new();
+$p->case_sensitive(1);
+
+my $text = "";
+$p->handler(start =>
+	    sub {
+		 my($tag, $attr, $attrseq) = @_;
+		 $text .= "S[$tag";
+		 for my $k (sort keys %$attr) {
+		     my $v =  $attr->{$k};
+		     $text .= " $k=$v";
+		 }
+		 if (@$attrseq) { $text.=" Order:" ; }
+		 for my $k (@$attrseq) {
+		     $text .= " $k";
+		 }
+		 $text .= "]";
+	     }, "tagname,attr,attrseq");
+$p->handler(end =>
+	    sub {
+		 my ($tag) = @_;
+		 $text .= "E[$tag]";
+	     }, "tagname");
+
+my $html = <<'EOT';
+<tAg aRg="Value" arg="other value"></tAg>
+EOT
+my $cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]';
+my $ci = 'S[tag arg=Value Order: arg arg]E[tag]';
+
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(0);
+$p->parse($html)->eof;
+is($text, $ci);
+
+$text = "";
+$p->case_sensitive(1);
+$p->xml_mode(1);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(0);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$html = <<'EOT';
+<tAg aRg="Value" arg="other value"></tAg>
+<iGnOrE></ignore>
+EOT
+$p->ignore_tags('ignore');
+$cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]S[iGnOrE]';
+$ci = 'S[tag arg=Value Order: arg arg]E[tag]';
+
+$text = "";
+$p->case_sensitive(0);
+$p->xml_mode(0);
+$p->parse($html)->eof;
+is($text, $ci);
+ 
+$text = "";
+$p->case_sensitive(1);
+$p->xml_mode(0);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(0);
+$p->xml_mode(1);
+$p->parse($html)->eof;
+is($text, $cs);
+ 
+$text = "";
+$p->case_sensitive(1);
+$p->xml_mode(1);
+$p->parse($html)->eof;
+is($text, $cs);
+ 
diff --git a/t/cases.t b/t/cases.t
new file mode 100644
index 0000000..a537279
--- /dev/null
+++ b/t/cases.t
@@ -0,0 +1,105 @@
+use Test::More;
+
+require HTML::Parser;
+
+package P; @ISA = qw(HTML::Parser);
+
+my @result;
+sub start
+{
+    my($self, $tag, $attr) = @_;
+    push @result, "START[$tag]";
+    for (sort keys %$attr) {
+        push @result, "\t$_: " . $attr->{$_};
+    }
+    $start++;
+}
+
+sub end
+{
+    my($self, $tag) = @_;
+    push @result, "END[$tag]";
+    $end++;
+}
+
+sub text
+{
+    my $self = shift;
+    push @result, "TEXT[$_[0]]";
+    $text++;
+}
+
+sub comment
+{
+    my $self = shift;
+    push @result, "COMMENT[$_[0]]";
+    $comment++;
+}
+
+sub declaration
+{
+    my $self = shift;
+    push @result, "DECLARATION[$_[0]]";
+    $declaration++;
+}
+
+package main;
+
+
+@tests =
+    (
+     '<a ">' => ['START[a]', "\t\": \""],
+     '<a/>' => ['START[a/]',],
+     '<a />' => ['START[a]', "\t/: /"],
+     '<a a/>' => ['START[a]', "\ta/: a/"],
+     '<a a/=/>' => ['START[a]', "\ta/: /"],
+     '<a x="foo&nbsp;bar">' => ['START[a]', "\tx: foo\xA0bar"],
+     '<a x="foo&nbspbar">' => ['START[a]', "\tx: foo&nbspbar"],
+     '<� >' => ['TEXT[<�]', 'TEXT[ >]'],
+     '2 < 5' => ['TEXT[2 ]', 'TEXT[<]', 'TEXT[ 5]'],
+     '2 <5> 2' => ['TEXT[2 ]', 'TEXT[<5>]', 'TEXT[ 2]'],
+     '2 <a' => ['TEXT[2 ]', 'TEXT[<a]'],
+     '2 <a> 2' => ['TEXT[2 ]', 'START[a]', 'TEXT[ 2]'],
+     '2 <a href=foo' => ['TEXT[2 ]', 'TEXT[<a href=foo]'],
+     "2 <a href='foo bar'> 2" =>
+         ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'],
+     '2 <a href=foo bar> 2' =>
+         ['TEXT[2 ]', 'START[a]', "\tbar: bar", "\thref: foo", 'TEXT[ 2]'],
+     '2 <a href="foo bar"> 2' =>
+         ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'],
+     '2 <a href="foo\'bar"> 2' =>
+         ['TEXT[2 ]', 'START[a]', "\thref: foo'bar", 'TEXT[ 2]'],
+     "2 <a href='foo\"bar'> 2" =>
+         ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'],
+     "2 <a href='foo&quot;bar'> 2" =>
+         ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'],
+     '2 <a.b> 2' => ['TEXT[2 ]', 'START[a.b]', 'TEXT[ 2]'],
+     '2 <a.b-12 a.b = 2 a> 2' =>
+         ['TEXT[2 ]', 'START[a.b-12]', "\ta: a", "\ta.b: 2", 'TEXT[ 2]'],
+     '2 <a_b> 2' => ['TEXT[2 ]', 'START[a_b]', 'TEXT[ 2]'],
+     '<!ENTITY nbsp   CDATA "&#160;" -- no-break space -->' =>
+         ['DECLARATION[ENTITY nbsp   CDATA "&#160;" -- no-break space --]'],
+     '<!-- comment -->' => ['COMMENT[ comment ]'],
+     '<!-- comment -- --- comment -->' =>
+         ['COMMENT[ comment ]', 'COMMENT[- comment ]'],
+     '<!-- comment <!-- not comment --> comment -->' =>
+         ['COMMENT[ comment <!]', 'COMMENT[> comment ]'],
+     '<!-- <a href="foo"> -->' => ['COMMENT[ <a href="foo"> ]'],
+     );
+
+plan tests => @tests / 2;
+
+my $i = 0;
+TEST:
+while (@tests) {
+    ++$i;
+    my ($html, $expected) = splice @tests, 0, 2;
+    @result = ();
+
+    $p = new P;
+    $p->strict_comment(1);
+    $p->parse($html)->eof;
+
+    ok(eq_array($expected, \@result)) or diag("Expected: @$expected\n",
+					      "Got:      @result\n");
+}
diff --git a/t/comment.t b/t/comment.t
new file mode 100644
index 0000000..303449e
--- /dev/null
+++ b/t/comment.t
@@ -0,0 +1,24 @@
+use Test::More tests => 1;
+
+use strict;
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3);
+my @com;
+$p->handler(comment => sub { push(@com, shift) }, "token0");
+$p->handler(default => sub { push(@com, shift() . "[" . shift() . "]") }, "event, text");
+
+$p->parse("<foo><><!><!-><!--><!---><!----><!-----><!------>");
+$p->parse("<!--+--");
+$p->parse("\n\n");
+$p->parse(">");
+$p->parse("<!a'b>");
+$p->parse("<!--foo--->");
+$p->parse("<!--foo---->");
+$p->parse("<!--foo----->-->");
+$p->parse("<foo>");
+$p->parse("<!3453><!-3456><!FOO><>");
+$p->eof;
+
+my $com = join(":", @com);
+is($com, "start_document[]:start[<foo>]:text[<>]::-:><!-::-:--:+:a'b:foo-:foo--:foo---:text[-->]:start[<foo>]:3453:-3456:FOO:text[<>]:end_document[]");
diff --git a/t/crashme.t b/t/crashme.t
new file mode 100644
index 0000000..1a1e8e4
--- /dev/null
+++ b/t/crashme.t
@@ -0,0 +1,43 @@
+#!/usr/bin/perl
+
+# This test will simply run the parser on random junk.
+
+my $no_tests = shift || 3;
+use Test::More;
+plan tests => $no_tests;
+
+use HTML::Parser ();
+
+my $file = "junk$$.html";
+die if -e $file;
+
+for (1..$no_tests) {
+
+    open(JUNK, ">$file") || die;
+    for (1 .. rand(5000)) {
+        for (1 .. rand(200)) {
+            print JUNK pack("N", rand(2**32));
+        }
+        print JUNK ("<", "&", ">")[rand(3)];  # make these a bit more likely
+    }
+    close(JUNK);
+
+    #diag "Parse @{[-s $file]} bytes of junk";
+
+    HTML::Parser->new->parse_file($file);
+    pass();
+
+    #print_mem();
+}
+
+unlink($file);
+
+
+sub print_mem
+{
+    # this probably only works on Linux
+    open(STAT, "/proc/self/status") || return;
+    while (<STAT>) {
+        diag $_ if /^VmSize/;
+    }
+}
diff --git a/t/declaration.t b/t/declaration.t
new file mode 100644
index 0000000..17de561
--- /dev/null
+++ b/t/declaration.t
@@ -0,0 +1,62 @@
+use Test::More tests => 2;
+
+use HTML::Parser;
+my $res = "";
+
+sub decl
+{
+    my $t = shift;
+    $res .= "[" . join("\n", map "<$_>", @$t) . "]";
+}
+
+sub text
+{
+    $res .= shift;
+}
+
+my $p = HTML::Parser->new(declaration_h => [\&decl, "tokens"],
+			  default_h     => [\&text, "text"],
+	                 );
+
+$p->parse(<<EOT)->eof;
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" --<comment>--
+  "http://www.w3.org/TR/html40/strict.dtd">
+
+<!ENTITY foo "<!-- foo -->">
+<!Entity foo "<!-- foo -->">
+
+<!row --> foo
+EOT
+
+is($res, <<EOT);
+[<DOCTYPE>
+<HTML>
+<PUBLIC>
+<"-//W3C//DTD HTML 4.01//EN">
+<--<comment>-->
+<"http://www.w3.org/TR/html40/strict.dtd">]
+
+[<ENTITY>
+<foo>
+<"<!-- foo -->">]
+[<Entity>
+<foo>
+<"<!-- foo -->">]
+
+<!row --> foo
+EOT
+
+$res = "";
+$p->parse(<<EOT)->eof;
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[]>
+EOT
+is($res, <<EOT);
+[<DOCTYPE>
+<html>
+<PUBLIC>
+<"-//W3C//DTD XHTML 1.0 Strict//EN">
+<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<[]>]
+EOT
+
diff --git a/t/default.t b/t/default.t
new file mode 100644
index 0000000..4b5ed79
--- /dev/null
+++ b/t/default.t
@@ -0,0 +1,43 @@
+use strict;
+use Test::More tests => 3;
+
+my $text = "";
+use HTML::Parser ();
+my $p = HTML::Parser->new(default_h => [sub { $text .= shift }, "text"],
+                         );
+
+my $html = <<'EOT';
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+                       "http://www.w3.org/TR/html40/strict.dtd">
+
+<title>foo</title>
+<!-- comment <a> -->
+<?process instruction>
+
+EOT
+
+$p->parse($html)->eof;
+
+is($text, $html);
+
+$text = "";
+$p->handler(start => sub { }, "");
+$p->handler(declaration => sub { }, "");
+$p->parse($html)->eof;
+
+my $html2;
+$html2 = $html;
+$html2 =~ s/<title>//;
+$html2 =~ s/<!DOCTYPE[^>]*>//;
+
+is($text, $html2);
+
+$text = "";
+$p->handler(start => undef);
+$p->parse($html)->eof;
+
+$html2 = $html;
+$html2 =~ s/<!DOCTYPE[^>]*>//;
+
+is($text, $html2);
diff --git a/t/document.t b/t/document.t
new file mode 100644
index 0000000..6696939
--- /dev/null
+++ b/t/document.t
@@ -0,0 +1,41 @@
+#!perl -w
+
+use Test;
+plan tests => 6;
+
+
+use HTML::Parser;
+use File::Spec;
+
+my $events;
+my $p = HTML::Parser->new(default_h => [sub { $events .= "$_[0]\n";}, "event"]);
+
+$events = "";
+$p->eof;
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse_file(File::Spec->devnull);
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse("");
+$p->eof;
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse("");
+$p->parse("");
+$p->eof;
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse("");
+$p->parse("<a>");
+$p->eof;
+ok($events, "start_document\nstart\nend_document\n");
+
+$events = "";
+$p->parse("<a> ");
+$p->eof;
+ok($events, "start_document\nstart\ntext\nend_document\n");
diff --git a/t/dtext.t b/t/dtext.t
new file mode 100644
index 0000000..883c61f
--- /dev/null
+++ b/t/dtext.t
@@ -0,0 +1,72 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 2;
+
+use HTML::Parser ();
+
+my $dtext = "";
+my $text  = "";
+
+sub append
+{
+    $dtext .= shift;
+    $text .= shift;
+}
+
+my $p = HTML::Parser->new(text_h    => [\&append, "dtext, text"],
+			  default_h => [\&append, "text,  text" ],
+			 );
+
+my $doc = <<'EOT';
+<title>&aring</title>
+<a href="foo&aring">&aring&aring;&#65&#65;&lt&#65&gt;&#x41&#X41;</a>
+<?&aring>
+foo&nbsp;bar
+foo&nbspbar
+&xyzzy
+&xyzzy;
+<!-- &#0; -->
+&#1;
+&#255;
+&#xFF
+&#xFFG
+<!-- &#256; -->
+&#40000000000000000000000000000;
+&#x400000000000000000000000000000000;
+&
+&#
+&#x
+<xmp>&aring</xmp>
+<script>&aring</script>
+<ScRIPT>&aring</scRIPT>
+<skript>&aring</script>
+EOT
+
+$p->parse($doc)->eof;
+
+is($text, $doc);
+is($dtext, <<"EOT");
+<title>�</title>
+<a href="foo&aring">��AA<A>AA</a>
+<?&aring>
+foo\240bar
+foo\240bar
+&xyzzy
+&xyzzy;
+<!-- &#0; -->
+\1
+\377
+\377
+\377G
+<!-- &#256; -->
+&#40000000000000000000000000000;
+&#x400000000000000000000000000000000;
+&
+&#
+&#x
+<xmp>&aring</xmp>
+<script>&aring</script>
+<ScRIPT>&aring</scRIPT>
+<skript>�</script>
+EOT
diff --git a/t/entities.t b/t/entities.t
new file mode 100644
index 0000000..f12d2fd
--- /dev/null
+++ b/t/entities.t
@@ -0,0 +1,213 @@
+use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);
+
+use Test::More tests => 20;
+
+$a = "V&aring;re norske tegn b&oslash;r &#230res";
+
+decode_entities($a);
+
+is($a, "V�re norske tegn b�r �res");
+
+encode_entities($a);
+
+is($a, "V&aring;re norske tegn b&oslash;r &aelig;res");
+
+decode_entities($a);
+encode_entities_numeric($a);
+
+is($a, "V&#xE5;re norske tegn b&#xF8;r &#xE6;res");
+
+$a = "<&>\"'";
+is(encode_entities($a), "&lt;&amp;&gt;&quot;&#39;");
+is(encode_entities_numeric($a), "&#x3C;&#x26;&#x3E;&#x22;&#x27;");
+
+$a = "abcdef";
+is(encode_entities($a, 'a-c'), "&#97;&#98;&#99;def");
+
+$a = "[24/7]\\";
+is(encode_entities($a, '/'), "[24&#47;7]\\");
+is(encode_entities($a, '\\/'), "[24&#47;7]\\");
+is(encode_entities($a, '\\'), "[24/7]&#92;");
+is(encode_entities($a, ']\\'), "[24/7&#93;&#92;");
+
+# See how well it does against rfc1866...
+$ent = $plain = "";
+while (<DATA>) {
+    next unless /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/;
+    $ent .= "&$1;";
+    $plain .= chr($2);
+}
+
+$a = $ent;
+decode_entities($a);
+is($a, $plain);
+
+# Try decoding when the ";" are left out
+$a = $ent,
+$a =~ s/;//g;
+decode_entities($a);
+is($a, $plain);
+
+
+$a = $plain;
+encode_entities($a);
+is($a, $ent);
+
+{   #RT #84144 - https://rt.cpan.org/Public/Bug/Display.html?id=84144
+
+    my %hash= (
+        "V&aring;re norske tegn b&oslash;r &#230res" => "V�re norske tegn b�r �res"
+    );
+
+    my ($got, $eval_ok);
+    $eval_ok= eval { $got= decode_entities((keys %hash)[0]); 1 };
+    is( $eval_ok, 1, "decode_entitites() when processing a key as input");
+    is( $got, (values %hash)[0], "decode_entities() decodes a key properly");
+}
+
+# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au>
+# Subject: HTML entities problem with 5.11
+# To: libwww-perl@ics.uci.edu
+# Date: Fri, 05 Sep 1997 16:56:55 +1000
+# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU>
+#
+# Hi. I've got a problem that has surfaced with the changes to 
+# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08).  It's happening 
+# in the process of encoding then decoding special entities.  Eg, what goes 
+# in as "abc&def&ghi" comes out as "abc&def;&ghi;".
+
+is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;");
+
+# Decoding of &apos;
+is(decode_entities("&apos;"), "'");
+is(encode_entities("'", "'"), "&#39;");
+
+is(decode_entities("Attention Home&#959&#969n&#1257rs...1&#1109t T&#1110&#1084e E&#957&#1257&#1075"),
+  "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}");
+is(decode_entities("{&#38;amp;&#x26;amp;&amp; also &#x42f;&#339;}"),
+    "{&amp;&amp;& also \x{42F}\x{153}}");
+
+__END__
+# Quoted from rfc1866.txt
+
+14. Proposed Entities
+
+   The HTML DTD references the "Added Latin 1" entity set, which only
+   supplies named entities for a subset of the non-ASCII characters in
+   [ISO-8859-1], namely the accented characters. The following entities
+   should be supported so that all ISO 8859-1 characters may only be
+   referenced symbolically. The names for these entities are taken from
+   the appendixes of [SGML].
+
+    <!ENTITY nbsp   CDATA "&#160;" -- no-break space -->
+    <!ENTITY iexcl  CDATA "&#161;" -- inverted exclamation mark -->
+    <!ENTITY cent   CDATA "&#162;" -- cent sign -->
+    <!ENTITY pound  CDATA "&#163;" -- pound sterling sign -->
+    <!ENTITY curren CDATA "&#164;" -- general currency sign -->
+    <!ENTITY yen    CDATA "&#165;" -- yen sign -->
+    <!ENTITY brvbar CDATA "&#166;" -- broken (vertical) bar -->
+    <!ENTITY sect   CDATA "&#167;" -- section sign -->
+    <!ENTITY uml    CDATA "&#168;" -- umlaut (dieresis) -->
+    <!ENTITY copy   CDATA "&#169;" -- copyright sign -->
+    <!ENTITY ordf   CDATA "&#170;" -- ordinal indicator, feminine -->
+    <!ENTITY laquo  CDATA "&#171;" -- angle quotation mark, left -->
+    <!ENTITY not    CDATA "&#172;" -- not sign -->
+    <!ENTITY shy    CDATA "&#173;" -- soft hyphen -->
+    <!ENTITY reg    CDATA "&#174;" -- registered sign -->
+    <!ENTITY macr   CDATA "&#175;" -- macron -->
+    <!ENTITY deg    CDATA "&#176;" -- degree sign -->
+    <!ENTITY plusmn CDATA "&#177;" -- plus-or-minus sign -->
+    <!ENTITY sup2   CDATA "&#178;" -- superscript two -->
+    <!ENTITY sup3   CDATA "&#179;" -- superscript three -->
+    <!ENTITY acute  CDATA "&#180;" -- acute accent -->
+    <!ENTITY micro  CDATA "&#181;" -- micro sign -->
+    <!ENTITY para   CDATA "&#182;" -- pilcrow (paragraph sign) -->
+    <!ENTITY middot CDATA "&#183;" -- middle dot -->
+    <!ENTITY cedil  CDATA "&#184;" -- cedilla -->
+    <!ENTITY sup1   CDATA "&#185;" -- superscript one -->
+    <!ENTITY ordm   CDATA "&#186;" -- ordinal indicator, masculine -->
+    <!ENTITY raquo  CDATA "&#187;" -- angle quotation mark, right -->
+    <!ENTITY frac14 CDATA "&#188;" -- fraction one-quarter -->
+    <!ENTITY frac12 CDATA "&#189;" -- fraction one-half -->
+    <!ENTITY frac34 CDATA "&#190;" -- fraction three-quarters -->
+    <!ENTITY iquest CDATA "&#191;" -- inverted question mark -->
+    <!ENTITY Agrave CDATA "&#192;" -- capital A, grave accent -->
+    <!ENTITY Aacute CDATA "&#193;" -- capital A, acute accent -->
+    <!ENTITY Acirc  CDATA "&#194;" -- capital A, circumflex accent -->
+
+
+
+Berners-Lee & Connolly      Standards Track                    [Page 75]
+
+RFC 1866            Hypertext Markup Language - 2.0        November 1995
+
+
+    <!ENTITY Atilde CDATA "&#195;" -- capital A, tilde -->
+    <!ENTITY Auml   CDATA "&#196;" -- capital A, dieresis or umlaut mark -->
+    <!ENTITY Aring  CDATA "&#197;" -- capital A, ring -->
+    <!ENTITY AElig  CDATA "&#198;" -- capital AE diphthong (ligature) -->
+    <!ENTITY Ccedil CDATA "&#199;" -- capital C, cedilla -->
+    <!ENTITY Egrave CDATA "&#200;" -- capital E, grave accent -->
+    <!ENTITY Eacute CDATA "&#201;" -- capital E, acute accent -->
+    <!ENTITY Ecirc  CDATA "&#202;" -- capital E, circumflex accent -->
+    <!ENTITY Euml   CDATA "&#203;" -- capital E, dieresis or umlaut mark -->
+    <!ENTITY Igrave CDATA "&#204;" -- capital I, grave accent -->
+    <!ENTITY Iacute CDATA "&#205;" -- capital I, acute accent -->
+    <!ENTITY Icirc  CDATA "&#206;" -- capital I, circumflex accent -->
+    <!ENTITY Iuml   CDATA "&#207;" -- capital I, dieresis or umlaut mark -->
+    <!ENTITY ETH    CDATA "&#208;" -- capital Eth, Icelandic -->
+    <!ENTITY Ntilde CDATA "&#209;" -- capital N, tilde -->
+    <!ENTITY Ograve CDATA "&#210;" -- capital O, grave accent -->
+    <!ENTITY Oacute CDATA "&#211;" -- capital O, acute accent -->
+    <!ENTITY Ocirc  CDATA "&#212;" -- capital O, circumflex accent -->
+    <!ENTITY Otilde CDATA "&#213;" -- capital O, tilde -->
+    <!ENTITY Ouml   CDATA "&#214;" -- capital O, dieresis or umlaut mark -->
+    <!ENTITY times  CDATA "&#215;" -- multiply sign -->
+    <!ENTITY Oslash CDATA "&#216;" -- capital O, slash -->
+    <!ENTITY Ugrave CDATA "&#217;" -- capital U, grave accent -->
+    <!ENTITY Uacute CDATA "&#218;" -- capital U, acute accent -->
+    <!ENTITY Ucirc  CDATA "&#219;" -- capital U, circumflex accent -->
+    <!ENTITY Uuml   CDATA "&#220;" -- capital U, dieresis or umlaut mark -->
+    <!ENTITY Yacute CDATA "&#221;" -- capital Y, acute accent -->
+    <!ENTITY THORN  CDATA "&#222;" -- capital THORN, Icelandic -->
+    <!ENTITY szlig  CDATA "&#223;" -- small sharp s, German (sz ligature) -->
+    <!ENTITY agrave CDATA "&#224;" -- small a, grave accent -->
+    <!ENTITY aacute CDATA "&#225;" -- small a, acute accent -->
+    <!ENTITY acirc  CDATA "&#226;" -- small a, circumflex accent -->
+    <!ENTITY atilde CDATA "&#227;" -- small a, tilde -->
+    <!ENTITY auml   CDATA "&#228;" -- small a, dieresis or umlaut mark -->
+    <!ENTITY aring  CDATA "&#229;" -- small a, ring -->
+    <!ENTITY aelig  CDATA "&#230;" -- small ae diphthong (ligature) -->
+    <!ENTITY ccedil CDATA "&#231;" -- small c, cedilla -->
+    <!ENTITY egrave CDATA "&#232;" -- small e, grave accent -->
+    <!ENTITY eacute CDATA "&#233;" -- small e, acute accent -->
+    <!ENTITY ecirc  CDATA "&#234;" -- small e, circumflex accent -->
+    <!ENTITY euml   CDATA "&#235;" -- small e, dieresis or umlaut mark -->
+    <!ENTITY igrave CDATA "&#236;" -- small i, grave accent -->
+    <!ENTITY iacute CDATA "&#237;" -- small i, acute accent -->
+    <!ENTITY icirc  CDATA "&#238;" -- small i, circumflex accent -->
+    <!ENTITY iuml   CDATA "&#239;" -- small i, dieresis or umlaut mark -->
+    <!ENTITY eth    CDATA "&#240;" -- small eth, Icelandic -->
+    <!ENTITY ntilde CDATA "&#241;" -- small n, tilde -->
+    <!ENTITY ograve CDATA "&#242;" -- small o, grave accent -->
+
+
+
+Berners-Lee & Connolly      Standards Track                    [Page 76]
+
+RFC 1866            Hypertext Markup Language - 2.0        November 1995
+
+
+    <!ENTITY oacute CDATA "&#243;" -- small o, acute accent -->
+    <!ENTITY ocirc  CDATA "&#244;" -- small o, circumflex accent -->
+    <!ENTITY otilde CDATA "&#245;" -- small o, tilde -->
+    <!ENTITY ouml   CDATA "&#246;" -- small o, dieresis or umlaut mark -->
+    <!ENTITY divide CDATA "&#247;" -- divide sign -->
+    <!ENTITY oslash CDATA "&#248;" -- small o, slash -->
+    <!ENTITY ugrave CDATA "&#249;" -- small u, grave accent -->
+    <!ENTITY uacute CDATA "&#250;" -- small u, acute accent -->
+    <!ENTITY ucirc  CDATA "&#251;" -- small u, circumflex accent -->
+    <!ENTITY uuml   CDATA "&#252;" -- small u, dieresis or umlaut mark -->
+    <!ENTITY yacute CDATA "&#253;" -- small y, acute accent -->
+    <!ENTITY thorn  CDATA "&#254;" -- small thorn, Icelandic -->
+    <!ENTITY yuml   CDATA "&#255;" -- small y, dieresis or umlaut mark -->
diff --git a/t/entities2.t b/t/entities2.t
new file mode 100644
index 0000000..537ac78
--- /dev/null
+++ b/t/entities2.t
@@ -0,0 +1,57 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 9;
+
+use HTML::Entities qw(_decode_entities);
+
+eval {
+    _decode_entities("&lt;", undef);
+};
+like($@, qr/^(?:Can't inline decode readonly string|Modification of a read-only value attempted)/);
+
+eval {
+    my $a = "";
+    _decode_entities($a, $a);
+};
+like($@, qr/^2nd argument must be hash reference/);
+
+eval {
+    my $a = "";
+    _decode_entities($a, []);
+};
+like($@, qr/^2nd argument must be hash reference/);
+
+$a = "&lt;";
+_decode_entities($a, undef);
+is($a, "&lt;");
+
+_decode_entities($a, { "lt" => "<" });
+is($a, "<");
+
+my $x = "x" x 20;
+
+my $err;
+for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") {
+    my $a = $_;
+    $a =~ s/:/&a;/g;
+    my $b = $_;
+    $b =~ s/:/$x/g;
+    _decode_entities($a, { "a" => $x });
+    if ($a ne $b) {
+	diag "Something went wrong with '$_'";
+	$err++;
+    }
+}
+ok(!$err);
+
+$a = "foo&nbsp;bar";
+_decode_entities($a, \%HTML::Entities::entity2char);
+is($a, "foo\xA0bar");
+
+$a = "foo&nbspbar";
+_decode_entities($a, \%HTML::Entities::entity2char);
+is($a, "foo&nbspbar");
+
+_decode_entities($a, \%HTML::Entities::entity2char, 1);
+is($a, "foo\xA0bar");
diff --git a/t/filter-methods.t b/t/filter-methods.t
new file mode 100644
index 0000000..9eccaf1
--- /dev/null
+++ b/t/filter-methods.t
@@ -0,0 +1,205 @@
+#!/usr/bin/perl -w
+
+use Test::More tests => 12;
+use strict;
+
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]);
+$p->ignore_elements("script");
+$p->unbroken_text(1);
+
+$p->handler(default => [], "event, text");
+$p->parse(<<"EOT")->eof;
+<html><head><title>foo</title><Script language="Perl">
+   while (<B>) {
+      # ...
+   }
+</Script><body>
+This is an <i>italic</i> and <b>bold</b> text.
+</body>
+</html>
+EOT
+
+my $t = join("||", map join("|", @$_), @{$p->handler("default")});
+#diag $t;
+
+is($t, "start_document|||start|<html>||start|<head>||start|<title>||text|foo||end|</title>||start|<body>||text|
+This is an italic and bold text.
+||end|</body>||text|
+||end|</html>||text|
+||end_document|", 'ignore_elements');
+
+
+#------------------------------------------------------
+
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags("a");
+$p->handler(start => sub {
+		my($tagname, %attr) = @_;
+		ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start');
+            }, 'tagname, @attr');
+$p->handler(end => sub {
+		my $tagname = shift;
+		is($tagname, "a", 'report_tags end');
+            }, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+
+This is <a href="#a">very nice</a> example.
+
+EOT
+
+
+#------------------------------------------------------
+
+my @tags;
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(a em));
+$p->ignore_tags(qw(em));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+
+This is <em>yet another</em> <a href="#a">very nice</a> example.
+
+EOT
+is(join('|', @tags), 'a', 'report_tags followed by ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset report_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->report_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+<h3>Next example</h3>
+
+EOT
+is(join('|', @tags), 'h3', 'report_tags replaces filter');
+
+
+#------------------------------------------------------
+
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h1 h2));
+$p->ignore_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+<h3>Next example</h3>
+
+EOT
+is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->ignore_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1', 'ignore_tags before report_tags');
+#------------------------------------------------------
+
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_elements("script");
+my $res="";
+$p->handler(default=> sub {$res.=$_[0];}, 'text');
+$p->parse(<<'EOT')->eof;
+A <script> B </script> C </script> D <script> E </script> F
+EOT
+is($res,"A  C  D  F\n","ignore </script> without <script> correctly");
diff --git a/t/filter.t b/t/filter.t
new file mode 100644
index 0000000..3b18f9e
--- /dev/null
+++ b/t/filter.t
@@ -0,0 +1,60 @@
+use Test::More tests => 3;
+
+my $HTML = <<EOT;
+
+<!DOCTYPE HTML>
+<!-- comment
+<h1>Foo</h1>
+-->
+
+<H1
+>Bar</H1
+>
+
+<Table><tr><td>1<td>2<td>3
+<tr>
+</table>
+
+<?process>
+
+EOT
+
+use HTML::Filter;
+use SelectSaver;
+
+my $tmpfile = "test-$$.htm";
+die "$tmpfile already exists" if -e $tmpfile;
+
+open(HTML, ">$tmpfile") or die "$!";
+
+{
+    my $save = new SelectSaver(HTML);
+    HTML::Filter->new->parse($HTML)->eof;
+}
+close(HTML);
+
+open(HTML, $tmpfile) or die "$!";
+local($/) = undef;
+my $FILTERED = <HTML>;
+close(HTML);
+
+#print $FILTERED;
+is($FILTERED, $HTML);
+
+{
+    package MyFilter;
+    @ISA=qw(HTML::Filter);
+    sub comment {}
+    sub output { push(@{$_[0]->{fhtml}}, $_[1]) }
+    sub filtered_html { join("", @{$_[0]->{fhtml}}) }
+}
+
+my $f2 = MyFilter->new->parse_file($tmpfile)->filtered_html;
+unlink($tmpfile) or warn "Can't unlink $tmpfile: $!";
+
+#diag $f2;
+
+unlike($f2, qr/Foo/);
+like($f2, qr/Bar/);
+
+
diff --git a/t/handler-eof.t b/t/handler-eof.t
new file mode 100644
index 0000000..39419dc
--- /dev/null
+++ b/t/handler-eof.t
@@ -0,0 +1,54 @@
+use Test::More tests => 6;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->handler(start => sub { my $attr = shift; is($attr->{testno}, 1) },
+		     "attr");
+$p->handler(end => sub { shift->eof }, "self");
+my $text;
+$p->handler(text => sub { $text = shift }, "text");
+
+is($p->parse("<foo testno=1>"), $p);
+
+$text = '';
+ok(!$p->parse("</foo><foo testno=999>"));
+ok(!$text);
+
+$p->handler(end => sub { $p->parse("foo"); }, "");
+eval {
+    $p->parse("</foo>");
+};
+like($@,  qr/Parse loop not allowed/);
+
+# We used to get into an infinite loop if the eof triggered
+# handler called ->eof
+
+use HTML::Parser;
+$p = HTML::Parser->new(api_version => 3);
+
+my $i;
+$p->handler("default" =>
+	    sub {
+		my $p=shift;
+	        #++$i; diag "$i @_";
+		$p->eof;
+	    }, "self, event");
+$p->parse("Foo");
+$p->eof;
+
+# We used to sometimes trigger events after a handler signaled eof
+my $title='';
+$p = HTML::Parser->new(api_version => 3,);
+$p->handler(start=> \&title_handler, 'tagname, self');
+$p->parse("<head><title>foo</title>\n</head>");
+is($title, "foo");
+
+sub title_handler {
+    return if shift ne 'title';
+    my $self = shift; 
+    $self->handler(text => sub { $title .= shift}, 'dtext');
+    $self->handler(end => sub { shift->eof if shift eq 'title' }, 'tagname, self');
+}
diff --git a/t/handler.t b/t/handler.t
new file mode 100644
index 0000000..8d7bbc5
--- /dev/null
+++ b/t/handler.t
@@ -0,0 +1,67 @@
+# Test handler method
+
+use Test::More tests => 11;
+
+my $testno;
+
+use HTML::Parser;
+{
+    package MyParser;
+    use vars qw(@ISA);
+    @ISA=(HTML::Parser);
+    
+    sub foo
+    {
+	Test::More::is($_[1]{testno}, Test::More->builder->current_test + 1);
+    }
+
+    sub bar
+    {
+	Test::More::is($_[1], Test::More->builder->current_test + 1);
+    }
+}
+
+$p = MyParser->new(api_version => 3);
+
+eval {
+    $p->handler(foo => "foo", "foo");
+};
+
+like($@, qr/^No handler for foo events/);
+
+eval {
+   $p->handler(start => "foo", "foo");
+};
+like($@, qr/^Unrecognized identifier foo in argspec/);
+
+my $h = $p->handler(start => "foo", "self,tagname");
+ok(!defined($h));
+
+$x = \substr("xfoo", 1);
+$p->handler(start => $$x, "self,attr");
+$p->parse("<a testno=4>");
+
+$p->handler(start => \&MyParser::foo, "self,attr");
+$p->parse("<a testno=5>");
+
+$p->handler(start => "foo");
+$p->parse("<a testno=6>");
+
+$p->handler(start => "bar", "self,'7'");
+$p->parse("<a>");
+
+eval {
+    $p->handler(start => {}, "self");
+};
+like($@, qr/^Only code or array references allowed as handler/);
+
+$a = [];
+$p->handler(start => $a);
+$h = $p->handler("start");
+is($p->handler("start", "foo"), $a);
+
+is($p->handler("start", \&MyParser::foo, ""), "foo");
+
+is($p->handler("start"), \&MyParser::foo);
+
+
diff --git a/t/headparser-http.t b/t/headparser-http.t
new file mode 100644
index 0000000..b722c64
--- /dev/null
+++ b/t/headparser-http.t
@@ -0,0 +1,20 @@
+use Test::More tests => 1;
+
+eval {
+   require HTML::HeadParser;
+   $p = HTML::HeadParser->new;
+};
+
+SKIP: {
+skip $@, 1 if $@ =~ /^Can't locate HTTP/;
+
+$p = HTML::HeadParser->new($h);
+$p->parse(<<EOT);
+<title>Stupid example</title>
+<base href="http://www.sn.no/libwww-perl/">
+Normal text starts here.
+EOT
+$h = $p->header;
+undef $p;
+is($h->title, "Stupid example");
+}
diff --git a/t/headparser.t b/t/headparser.t
new file mode 100644
index 0000000..1b4b810
--- /dev/null
+++ b/t/headparser.t
@@ -0,0 +1,200 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 17;
+
+{ package H;
+  sub new { bless {}, shift; }
+
+  sub header {
+     my $self = shift;
+     my $key  = uc(shift);
+     die if $key =~ /:/;
+     my $old = $self->{$key};
+     if (@_) { $self->{$key} = shift; }
+     $old;
+  }
+
+  sub push_header {
+     my($self, $k, $v) = @_;
+     $k = uc($k);
+     die if $k =~ /:/;
+     if (exists $self->{$k}) {
+        $self->{$k} = [ $self->{$k} ] unless ref $self->{$k};
+	push(@{$self->{$k}}, $v);
+     } else {
+	$self->{$k} = $v;
+     }
+  }
+
+  sub as_string {
+     my $self = shift;
+     my $str = "";
+     for (sort keys %$self) {
+         if (ref($self->{$_})) {
+            my $v;
+            for $v (@{$self->{$_}}) {
+	        $str .= "$_: $v\n";
+            }
+         } else {
+            $str .= "$_: $self->{$_}\n";
+         }
+     }
+     $str;
+  }
+}
+
+
+my $HTML = <<'EOT';
+
+<title>&Aring v�re eller &#229; ikke v�re</title>
+<meta http-equiv="Expires" content="Soon">
+<meta http-equiv="Foo" content="Bar">
+<meta name='twitter:card' content='photo' />
+<link href="mailto:gisle@aas.no" rev=made title="Gisle Aas">
+
+<script>
+
+    ignore this
+
+</script>
+<noscript> ... and this </noscript>
+
+<object classid="foo">
+
+<base href="http://www.sn.no">
+<meta name="Keywords" content="test, test, test,...">
+<meta name="Keywords" content="more">
+<meta charset="ISO-8859-1"><!-- HTML 5 -->
+
+Dette er vanlig tekst.  Denne teksten definerer ogs� slutten p�
+&lt;head> delen av dokumentet.
+
+<style>
+
+   ignore this too
+
+</style>
+
+<isindex>
+
+Dette er ogs� vanlig tekst som ikke skal blir parset i det hele tatt.
+
+EOT
+
+$| = 1;
+
+#$HTML::HeadParser::DEBUG = 1;
+require HTML::HeadParser;
+my $p = HTML::HeadParser->new( H->new );
+
+if ($p->parse($HTML)) {
+    fail("Need more data which should not happen");
+} else {
+    #diag $p->as_string;
+    pass();
+}
+
+like($p->header('Title'), qr/� v�re eller � ikke v�re/);
+is($p->header('Expires'), 'Soon');
+is($p->header('Content-Base'), 'http://www.sn.no');
+is_deeply($p->header('X-Meta-Keywords'), ['test, test, test,...', 'more']);
+is($p->header('X-Meta-Charset'), 'ISO-8859-1');
+is($p->header('X-Meta-Twitter-Card'), 'photo');
+like($p->header('Link'), qr/<mailto:gisle\@aas.no>/);
+
+# This header should not be present because the head ended
+ok(!$p->header('Isindex'));
+
+
+# Try feeding one char at a time
+my $expected = $p->as_string;
+my $nl = 1;
+$p = HTML::HeadParser->new(H->new);
+while ($HTML =~ /(.)/sg) {
+    #print STDERR '#' if $nl;
+    #print STDERR $1;
+    $nl = $1 eq "\n";
+    $p->parse($1) or last;
+}
+is($p->as_string, $expected);
+
+
+# Try reading it from a file
+my $file = "hptest$$.html";
+die "$file already exists" if -e $file;
+
+open(FILE, ">$file") or die "Can't create $file: $!";
+binmode(FILE);
+print FILE $HTML;
+print FILE "<p>This is more content...</p>\n" x 2000;
+print FILE "<title>Buuuh!</title>\n" x 200;
+close FILE or die "Can't close $file: $!";
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+is($p->header("Title"), "� v�re eller � ikke v�re");
+
+
+# We got into an infinite loop on data without tags and no EOL.
+# This was actually a HTML::Parser bug.
+open(FILE, ">$file") or die "Can't create $file: $!";
+print FILE "Foo";
+close(FILE);
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+ok(!$p->as_string);
+
+SKIP: {
+  skip "Need Unicode support", 5 if $] < 5.008;
+
+  # Test that the Unicode BOM does not confuse us?
+  $p = HTML::HeadParser->new(H->new);
+  ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>"));
+  $p->eof;
+
+  is($p->header("title"), "Hi <foo>");
+
+  $p = HTML::HeadParser->new(H->new);
+  $p->utf8_mode(1);
+  $p->parse(<<"EOT");  # example from http://rt.cpan.org/Ticket/Display.html?id=27522
+\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+ <head>
+ <title>
+Parkinson's disease</title>
+ <meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications">
+ </meta>
+ \t
+\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" />
+\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" />
+\t
+\t </head>
+ <body>
+EOT
+  $p->eof;
+
+  is($p->header("title"), "Parkinson's disease");
+  is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"');
+
+  $p = HTML::HeadParser->new(H->new);
+  $p->utf8_mode(1);
+  $p->parse(<<"EOT");   # example from http://www.mjw.com.pl/
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r
+\r
+<head profile="http://gmpg.org/xfn/11">\r
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r
+\r
+<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa &#8211; MJW</title>\r
+<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r
+
+EOT
+    $p->eof;
+    is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW");
+}
diff --git a/t/ignore.t b/t/ignore.t
new file mode 100644
index 0000000..008739e
--- /dev/null
+++ b/t/ignore.t
@@ -0,0 +1,27 @@
+
+use Test::More tests => 4;
+
+use strict;
+use HTML::Parser ();
+
+my $html = '<A href="foo">text</A>';
+
+my $text = '';
+my $p = HTML::Parser->new(default_h => [sub {$text .= shift;}, 'text']);
+$p->parse($html)->eof;
+is($text, $html);
+
+$text = '';
+$p->handler(start => "");
+$p->parse($html)->eof;
+is($text, 'text</A>');
+
+$text = '';
+$p->handler(end => 0);
+$p->parse($html)->eof;
+is($text, 'text');
+
+$text = '';
+$p->handler(start => undef);
+$p->parse($html)->eof;
+is($text, '<A href="foo">text');
diff --git a/t/largetags.t b/t/largetags.t
new file mode 100644
index 0000000..a9ed3ff
--- /dev/null
+++ b/t/largetags.t
@@ -0,0 +1,38 @@
+# Exercise the tokenpos buffer allocation routines by feeding it
+# very large tags.
+
+use Test::More tests => 2;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->handler("start" =>
+	    sub {
+		my $tp = shift;
+		#diag int(@$tp), " - ", join(", ", @$tp);
+		is(@$tp, 2 + 26 * 6 * 4);
+	    }, "tokenpos");
+
+$p->handler("declaration" =>
+	    sub {
+		my $t = shift;
+		#diag int(@$t), " - @$t";
+		is(@$t, 26 * 6 * 2 + 1);
+	    }, "tokens");
+
+$p->parse("<a ");
+for ("aa" .. "fz") {
+    $p->parse("$_=1 ");
+}
+$p->parse(">");
+
+$p->parse("<!DOCTYPE ");
+for ("aa" .. "fz") {
+    $p->parse("$_ -- $_ -- ");
+}
+$p->parse(">");
+$p->eof;
+exit;
+
diff --git a/t/linkextor-base.t b/t/linkextor-base.t
new file mode 100644
index 0000000..7ef8f02
--- /dev/null
+++ b/t/linkextor-base.t
@@ -0,0 +1,41 @@
+# This test that HTML::LinkExtor really absolutize links correctly
+# when a base URL is given to the constructor.
+
+use Test::More tests => 5;
+require HTML::LinkExtor;
+
+SKIP: {
+eval {
+   require URI;
+};
+skip $@, 5 if $@;
+
+# Try with base URL and the $p->links interface.
+$p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html");
+$p->parse(<<HTML)->eof;
+<head>
+<base href="http://www.sn.no/">
+</head>
+<body background="http://www.sn.no/sn.gif">
+
+This is <A HREF="link.html">link</a> and an <img SRC="img.jpg"
+lowsrc="img.gif" alt="Image">.
+HTML
+
+@p = $p->links;
+
+# There should be 4 links in the document
+is(@p, 4);
+
+for (@p) {
+    ($t, %attr) = @$_ if $_->[0] eq 'img';
+}
+
+is($t, 'img');
+
+is(delete $attr{src}, "http://www.sn.no/foo/img.jpg");
+
+is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif");
+
+ok(!scalar(keys %attr)); # there should be no more attributes
+}
diff --git a/t/linkextor-rel.t b/t/linkextor-rel.t
new file mode 100644
index 0000000..1190a96
--- /dev/null
+++ b/t/linkextor-rel.t
@@ -0,0 +1,36 @@
+use Test::More tests => 4;
+
+require HTML::LinkExtor;
+
+$HTML = <<HTML;
+<head>
+<base href="http://www.sn.no/">
+</head>
+<body background="http://www.sn.no/sn.gif">
+
+This is <A HREF="link.html">link</a> and an <img SRC="img.jpg"
+lowsrc="img.gif" alt="Image">.
+HTML
+
+
+# Try the callback interface
+$links = "";
+$p = HTML::LinkExtor->new(
+  sub {
+      my($tag, %links) = @_;
+      #diag "$tag @{[%links]}";
+      $links .= "$tag @{[%links]}\n";
+  });
+
+$p->parse($HTML); $p->eof;
+
+ok($links =~ m|^base href http://www\.sn\.no/$|m);
+ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m);
+ok($links =~ m|^a href link\.html$|m);
+
+# Used to be problems when using the links method on a document with
+# no links it it.  This is a test to prove that it works.
+$p = new HTML::LinkExtor;
+$p->parse("this is a document with no links"); $p->eof;
+@a = $p->links;
+is(@a, 0);
diff --git a/t/magic.t b/t/magic.t
new file mode 100644
index 0000000..366f275
--- /dev/null
+++ b/t/magic.t
@@ -0,0 +1,41 @@
+# Check that the magic signature at the top of struct p_state works and that we
+# catch modifications to _hparser_xs_state gracefully
+
+use Test::More tests => 5;
+
+use HTML::Parser;
+
+$p = HTML::Parser->new(api_version => 3);
+
+$p->xml_mode(1);
+
+# We should not be able to simply modify this stuff
+eval {
+    ${$p->{_hparser_xs_state}} += 4;
+};
+like($@, qr/^Modification of a read-only value attempted/);
+
+
+my $x = delete $p->{_hparser_xs_state};
+
+eval {
+    $p->xml_mode(1);
+};
+like($@, qr/^Can't find '_hparser_xs_state'/);
+
+$p->{_hparser_xs_state} = \($$x + 16);
+
+eval {
+    $p->xml_mode(1);
+};
+like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/);
+
+$p->{_hparser_xs_state} = 33;
+eval {
+    $p->xml_mode(1);
+};
+like($@,  qr/^_hparser_xs_state element is not a reference/);
+
+$p->{_hparser_xs_state} = $x;
+
+ok($p->xml_mode(0));
diff --git a/t/marked-sect.t b/t/marked-sect.t
new file mode 100644
index 0000000..6a63478
--- /dev/null
+++ b/t/marked-sect.t
@@ -0,0 +1,121 @@
+#!/usr/bin/perl -w
+
+use strict;
+my $tag;
+my $text;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(start_h => [sub { $tag = shift  }, "tagname"],
+	                  text_h  => [sub { $text .= shift }, "dtext"],
+                         );
+
+
+use Test::More tests => 14;
+
+SKIP: {
+eval {
+    $p->marked_sections(1);
+};
+skip $@, 14 if $@;
+
+$p->parse("<![[foo]]>");
+is($text, "foo");
+
+$p->parse("<![TEMP INCLUDE[bar]]>");
+is($text, "foobar");
+
+$p->parse("<![ INCLUDE -- IGNORE -- [foo<![IGNORE[bar]]>]]>\n<br>");
+is($text, "foobarfoo\n");
+
+$text = "";
+$p->parse("<![  CDATA   [&lt;foo");
+$p->parse("<![IGNORE[bar]]>,bar&gt;]]><br>");
+is($text, "&lt;foo<![IGNORE[bar,bar>]]>");
+
+$text = "";
+$p->parse("<![ RCDATA [&aring;<a>]]><![CDATA[&aring;<a>]]>&aring;<a><br>");
+is($text, "�<a>&aring;<a>�");
+is($tag, "br");
+
+$text = "";
+$p->parse("<![INCLUDE RCDATA CDATA IGNORE [foo&aring;<a>]]><br>");
+is($text,  "");
+
+$text = "";
+$p->parse("<![INCLUDE RCDATA CDATA [foo&aring;<a>]]><br>");
+is($text, "foo&aring;<a>");
+
+$text = "";
+$p->parse("<![INCLUDE RCDATA [foo&aring;<a>]]><br>");
+is($text, "foo�<a>");
+
+$text = "";
+$p->parse("<![INCLUDE [foo&aring;<a>]]><br>");
+is($text, "foo�");
+
+$text = "";
+$p->parse("<![[foo&aring;<a>]]><br>");
+is($text, "foo�");
+
+# offsets/line/column numbers
+$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"],
+		       marked_sections => 1,
+		      );
+$p->parse(<<'EOT')->eof;
+<title>Test</title>
+<![CDATA
+  [foo&aring;<a>
+]]>
+<![[
+INCLUDE
+STUFF
+]]>
+  <h1>Test</h1>
+EOT
+
+my @x;
+sub x {
+    my($line, $col, $offset, $event, $text) = @_;
+    $text =~ s/\n/\\n/g;
+    $text =~ s/ /./g;
+    push(@x, "$line.$col:$offset $event \"$text\"\n");
+}
+
+#diag @x;
+is(join("", @x), <<'EOT');
+1.0:0 start_document ""
+1.0:0 start "<title>"
+1.7:7 text "Test"
+1.11:11 end "</title>"
+1.19:19 text "\n"
+3.3:32 text "foo&aring;<a>\n"
+4.3:49 text "\n"
+5.4:54 text "\nINCLUDE\nSTUFF\n"
+8.3:72 text "\n.."
+9.2:75 start "<h1>"
+9.6:79 text "Test"
+9.10:83 end "</h1>"
+9.15:88 text "\n"
+10.0:89 end_document ""
+EOT
+
+my $doc = "<Tag><![CDATA[This is cdata]]></Tag>";
+my $result = "";
+$p = HTML::Parser->new(
+    marked_sections => 1,
+    handlers => {
+        default => [ sub { $result .= join("",@_); }, "skipped_text,text" ]
+    }
+)->parse($doc)->eof;
+is($doc, $result);
+
+$text = "";
+$p = HTML::Parser->new(
+    text_h => [sub { $text .= shift }, "dtext"],
+    marked_sections => 1,
+);
+
+$p->parse("<![CDATA[foo [1]]]>");
+is($text, "foo [1]", "CDATA text ending in square bracket");
+
+} # SKIP
diff --git a/t/msie-compat.t b/t/msie-compat.t
new file mode 100644
index 0000000..a297f1e
--- /dev/null
+++ b/t/msie-compat.t
@@ -0,0 +1,79 @@
+#!perl -w
+
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 4;
+
+my $TEXT = "";
+sub h
+{
+    my($event, $tagname, $text, @attr) = @_;
+    for ($event, $tagname, $text, @attr) {
+        if (defined) {
+	    s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
+	}
+	else {
+	    $_ = "<undef>";
+	}
+    }
+
+    $TEXT .= "[$event,$tagname,$text," . join(":", @attr) . "]\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text,\@attr"]);
+$p->parse("<a>");
+$p->parse("</a f>");
+$p->parse("</a 'foo<>' 'bar>' x>");
+$p->parse("</a \"foo<>\"");
+$p->parse(" \"bar>\" x>");
+$p->parse("</ foo bar>");
+$p->parse("</ \"<>\" >");
+$p->parse("<!--comment>text<!--comment><p");
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,,]
+[start,a,<a>,]
+[end,a,</a f>,]
+[end,a,</a 'foo<>' 'bar>' x>,]
+[end,a,</a "foo<>" "bar>" x>,]
+[comment, foo bar,</ foo bar>,]
+[comment, "<>" ,</ "<>" >,]
+[comment,comment,<!--comment>,]
+[text,<undef>,text,]
+[comment,comment,<!--comment>,]
+[comment,p,<p,]
+[end_document,<undef>,,]
+EOT
+
+$TEXT = "";
+$p->parse("<!comment>");
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,,]
+[comment,comment,<!comment>,]
+[end_document,<undef>,,]
+EOT
+
+$TEXT = "";
+$p->parse(q(<a name=`foo bar`>));
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,,]
+[start,a,<a name=`foo bar`>,name:`foo:bar`:bar`]
+[end_document,<undef>,,]
+EOT
+
+$p->backquote(1);
+$TEXT = "";
+$p->parse(q(<a name=`foo bar`>));
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,,]
+[start,a,<a name=`foo bar`>,name:foo bar]
+[end_document,<undef>,,]
+EOT
diff --git a/t/offset.t b/t/offset.t
new file mode 100644
index 0000000..840728d
--- /dev/null
+++ b/t/offset.t
@@ -0,0 +1,58 @@
+use strict;
+use HTML::Parser ();
+use Test::More tests => 1;
+
+my $HTML = <<'EOT';
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html40/strict.dtd">
+
+<foo bar baz=3>heisan
+</foo> <?process>
+<!-- comment -->
+<xmp>xmp</xmp>
+
+EOT
+
+my $p = HTML::Parser->new(api_version => 3);
+
+my $sum_len = 0;
+my $count = 0;
+my $err;
+
+$p->handler(default =>
+	    sub {
+		my($offset, $length, $offset_end, $line, $col, $text) = @_;
+		my $copy = $text;
+		$copy =~ s/\n/\\n/g;
+		substr($copy, 30) = "..." if length($copy) > 32;
+		#diag sprintf ">>> %d.%d %s", $line, $col, $copy;
+		if ($offset != $sum_len) {
+		   diag "offset mismatch $offset vs $sum_len";
+		   $err++;
+                }
+		if ($offset_end != $offset + $length) {
+		   diag "offset_end $offset_end wrong";
+		   $err++;
+                }
+		if ($length != length($text)) {
+		   diag "length mismatch";
+		   $err++;
+		}
+                if (substr($HTML, $offset, $length) ne $text) {
+		   diag "content mismatch";
+		   $err++;
+		}
+		$sum_len += $length;
+		$count++;
+	    },
+	    'offset,length,offset_end,line,column,text');
+
+for (split(//, $HTML)) {
+   $p->parse($_);
+}
+$p->eof;
+
+ok($count > 5 && !$err);
+
+
diff --git a/t/options.t b/t/options.t
new file mode 100644
index 0000000..ff5f7db
--- /dev/null
+++ b/t/options.t
@@ -0,0 +1,36 @@
+# Test option setting methods
+
+use Test::More tests => 10;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3,
+			  xml_mode => 1);
+my $old;
+
+$old = $p->boolean_attribute_value("foo");
+ok(!defined $old);
+
+$old = $p->boolean_attribute_value();
+is($old, "foo");
+
+$old = $p->boolean_attribute_value(undef);
+is($old, "foo");
+ok(!defined($p->boolean_attribute_value));
+
+ok($p->xml_mode(0));
+ok(!$p->xml_mode);
+
+my $seen_buggy_comment_warning;
+$SIG{__WARN__} =
+    sub {
+	local $_ = shift;
+	$seen_buggy_comment_warning++
+	    if /^netscape_buggy_comment\(\) is deprecated/;
+    };
+
+ok(!$p->strict_comment(1));
+ok($p->strict_comment);
+ok(!$p->netscape_buggy_comment);
+ok($seen_buggy_comment_warning);
diff --git a/t/parsefile.t b/t/parsefile.t
new file mode 100644
index 0000000..f373f06
--- /dev/null
+++ b/t/parsefile.t
@@ -0,0 +1,45 @@
+use Test::More tests => 6;
+
+my $filename = "file$$.htm";
+die "$filename is already there" if -e $filename;
+open(FILE, ">$filename") || die "Can't create $filename: $!";
+print FILE <<'EOT'; close(FILE);
+<title>Heisan</title>
+EOT
+
+{
+    package MyParser;
+    require HTML::Parser;
+    @ISA=qw(HTML::Parser);
+
+    sub start
+    {
+	my($self, $tag, $attr) = @_;
+	Test::More::is($tag, "title");
+    }
+}
+
+MyParser->new->parse_file($filename);
+open(FILE, $filename) || die;
+MyParser->new->parse_file(*FILE);
+seek(FILE, 0, 0) || die;
+MyParser->new->parse_file(\*FILE);
+close(FILE);
+
+require IO::File;
+my $io = IO::File->new($filename) || die;
+MyParser->new->parse_file($io);
+$io->seek(0, 0) || die;
+MyParser->new->parse_file(*$io);
+
+my $text = '';
+$io->seek(0, 0) || die;
+MyParser->new(
+    start_h => [ sub{ shift->eof; }, "self" ],
+    text_h =>  [ sub{ $text = shift; }, "text" ])->parse_file(*$io);
+ok(!$text);
+
+close($io);  # needed because of bug in perl
+undef($io);
+
+unlink($filename) or warn "Can't unlink $filename: $!";
diff --git a/t/parser.t b/t/parser.t
new file mode 100644
index 0000000..0ce4d95
--- /dev/null
+++ b/t/parser.t
@@ -0,0 +1,184 @@
+use Test::More tests => 7;
+
+$HTML = <<'HTML';
+
+<!DOCTYPE HTML>
+
+<body>
+
+Various entities.  The parser must never break them in the middle:
+
+&#x2F
+&#x2F;
+&#200
+&#3030;
+&#XFFFF;
+&aring-&Aring;
+
+<ul>
+<li><a href="foo 'bar' baz>" id=33>This is a link</a>
+<li><a href='foo "bar" baz> &aring' id=34>This is another one</a>
+</ul>
+
+<p><div align="center"><img src="http://www.perl.com/perl.gif"
+alt="camel"></div>
+
+<!-- this is
+a comment --> and this is not.
+
+<!-- this is the kind of >comment< -- --> that Netscape hates -->
+
+< this > was not a tag. <this is/not either>
+
+</body>
+
+HTML
+
+#-------------------------------------------------------------------
+
+{
+    package P;
+    require HTML::Parser;
+    @ISA=qw(HTML::Parser);
+    $OUT='';
+    $COUNT=0;
+
+    sub new
+    {
+	my $class = shift;
+	my $self = $class->SUPER::new;
+	$OUT = '';
+        die "Can only have one" if $COUNT++;
+	$self;
+    }
+
+    sub DESTROY
+    {
+	my $self = shift;
+	eval { $self->SUPER::DESTROY; };
+	$COUNT--;
+    }
+
+    sub declaration
+    {
+	my($self, $decl) = @_;
+	$OUT .= "[[$decl]]|";
+    }
+
+    sub start
+    {
+	my($self, $tag, $attr) = @_;
+	$attr = join("/", map "$_=$attr->{$_}", sort keys %$attr);
+	$attr = "/$attr" if length $attr;
+	$OUT .= "<<$tag$attr>>|";
+    }
+
+    sub end
+    {
+	my($self, $tag) = @_;
+	$OUT .= ">>$tag<<|";
+    }
+
+    sub comment
+    {
+	my($self, $comment) = @_;
+	$OUT .= "##$comment##|";
+    }
+
+    sub text
+    {
+	my($self, $text) = @_;
+	#$text =~ s/\n/\\n/g;
+	#$text =~ s/\t/\\t/g;
+	#$text =~ s/ /�/g;
+	$OUT .= "$text|";
+    }
+
+    sub result
+    {
+	$OUT;
+    }
+}
+
+for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") {
+#for $chunksize (1) {
+    if ($chunksize =~ /^file/) {
+        #print "Parsing from $chunksize";
+    } else {
+        #print "Parsing using $chunksize byte chunks";
+    }
+    my $p = P->new;
+
+    if ($chunksize =~ /^file/) {
+	# First we must create the file
+	my $tmpfile = "tmp-$$.html";
+	my $file = $tmpfile;
+	die "$file already exists" if -e $file;
+	open(FILE, ">$file") or die "Can't create $file: $!";
+        binmode FILE;
+        print FILE $HTML;
+        close(FILE);
+
+	if ($chunksize eq "filehandle") {
+	    require FileHandle;
+	    my $fh = FileHandle->new($file) || die "Can't open $file: $!";
+	    $file = $fh;
+	}
+
+        # then we can parse it.
+        $p->parse_file($file);
+        close $file if $chunksize eq "filehandle";
+        unlink($tmpfile) || warn "Can't unlink $tmpfile: $!";
+    } else {
+	my $copy = $HTML;
+	while (length $copy) {
+	    my $chunk = substr($copy, 0, $chunksize);
+	    substr($copy, 0, $chunksize) = '';
+	    $p->parse($chunk);
+	}
+	$p->eof;
+    }
+
+    my $res = $p->result;
+    my $bad;
+    
+    # Then we start looking for things that should not happen
+    if ($res =~ /\s\|\s/) {
+	diag "broken space";
+	$bad++;
+    }
+    for (
+	 # Make sure entities are not broken
+	 '&#x2F', '&#x2F;', '&#200', '&#3030;', '&#XFFFF;', '&aring', '&Aring',
+
+         # Some elements that should be produced
+         "|[[DOCTYPE HTML]]|",
+         "|## this is\na comment ##|",
+         "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|",
+	 '|<<li>>|<<a/href=foo "bar" baz> �/id=34>>',
+         "|>>ul<<|", "|>>body<<|\n\n|",
+        )
+   {
+        if (index($res, $_) < 0) {
+	    diag "Can't find '$_' in parsed document";
+	    $bad++;
+        }
+    }
+
+    diag $res if $bad || $ENV{PRINT_RESULTS};
+
+    # And we check that we get the same result all the time
+    $res =~ s/\|//g;  # remove all break marks
+    if ($last_res && $res ne $last_res) {
+        diag "The result is not the same as last time";
+        $bad++;
+    }
+    $last_res = $res;
+
+    unless ($res =~ /Various entities/) {
+	diag "Some text must be missing";
+	$bad++;
+    }
+
+    ok(!$bad);
+}
diff --git a/t/plaintext.t b/t/plaintext.t
new file mode 100644
index 0000000..9a53a78
--- /dev/null
+++ b/t/plaintext.t
@@ -0,0 +1,58 @@
+use Test::More tests => 3;
+
+use strict;
+use HTML::Parser;
+
+my @a;
+my $p = HTML::Parser->new(api_version => 3);
+$p->handler(default => \@a, '@{event, text, is_cdata}');
+$p->parse(<<EOT)->eof;
+<xmp><foo></xmp>x<plaintext><foo>
+</plaintext>
+foo
+EOT
+
+for (@a) {
+    $_ = "" unless defined;
+}
+
+my $doc = join(":", @a);
+
+#diag $doc;
+
+is($doc, "start_document:::start:<xmp>::text:<foo>:1:end:</xmp>::text:x::start:<plaintext>::text:<foo>
+</plaintext>
+foo
+:1:end_document::");
+
+@a = ();
+$p->closing_plaintext('yep, emulate gecko');
+$p->parse(<<EOT)->eof;
+<plaintext><foo>
+</plaintext>foo<b></b>
+EOT
+
+for (@a) {
+    $_ = "" unless defined;
+}
+
+$doc = join(":", @a);
+
+#diag $doc;
+
+is($doc, "start_document:::start:<plaintext>::text:<foo>
+:1:end:</plaintext>::text:foo::start:<b>::end:</b>::text:
+::end_document::");
+
+@a = ();
+$p->closing_plaintext('yep, emulate gecko (2)');
+$p->parse(<<EOT)->eof;
+<plaintext><foo>
+foo<b></b>
+EOT
+
+$doc = join(":", map { defined $_ ? $_ : "" } @a);
+
+is($doc, "start_document:::start:<plaintext>::text:<foo>
+foo<b></b>
+:1:end_document::");
diff --git a/t/pod.t b/t/pod.t
new file mode 100644
index 0000000..437887a
--- /dev/null
+++ b/t/pod.t
@@ -0,0 +1,4 @@
+use Test::More;
+eval "use Test::Pod 1.00";
+plan skip_all => "Test::Pod 1.00 required for testing POD" if $@;
+all_pod_files_ok();
diff --git a/t/process.t b/t/process.t
new file mode 100644
index 0000000..9d27250
--- /dev/null
+++ b/t/process.t
@@ -0,0 +1,43 @@
+use strict;
+
+use Test::More tests => 12;
+
+my $pi;
+my $orig;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(process_h => [sub { $pi = shift; $orig = shift; },
+				        "token0,text"]
+	                 );
+
+$p->parse("<a><?foo><a>");
+
+is($pi, "foo");
+is($orig, "<?foo>");
+
+$p->parse("<a><?><a>");
+is($pi, "");
+is($orig, "<?>");
+
+$p->parse("<a><?
+foo
+><a>");
+is($pi, "\nfoo\n");
+is($orig, "<?\nfoo\n>");
+
+for (qw(< a > < ? b a r > < a >)) {
+   $p->parse($_);
+}
+
+is($pi, "bar");
+is($orig, "<?bar>");
+
+$p->xml_mode(1);
+
+$p->parse("<a><?foo>bar??><a>");
+is($pi, "foo>bar?");
+is($orig, "<?foo>bar??>");
+
+$p->parse("<a><??></a>");
+is($pi, "");
+is($orig, "<??>");
diff --git a/t/pullparser.t b/t/pullparser.t
new file mode 100644
index 0000000..80a186b
--- /dev/null
+++ b/t/pullparser.t
@@ -0,0 +1,55 @@
+use Test::More tests => 3;
+
+use HTML::PullParser;
+
+my $doc = <<'EOT';
+<title>Title</title>
+<style> h1 { background: white }
+<foo>
+</style>
+<H1 ID="3">Heading</H1>
+<!-- ignore this -->
+
+This is a text with a <A HREF="http://www.sol.no" name="l1">link</a>.
+EOT
+
+my $p = HTML::PullParser->new(doc   => $doc,
+			      start => 'event,tagname,@attr',
+                              end   => 'event,tagname',
+			      text  => 'event,dtext',
+
+                              ignore_elements         => [qw(script style)],
+			      unbroken_text           => 1,
+			      boolean_attribute_value => 1,
+			     );
+
+my $t = $p->get_token;
+is($t->[0], "start");
+is($t->[1], "title");
+$p->unget_token($t);
+
+my @a;
+while (my $t = $p->get_token) {
+    for (@$t) {
+	s/\s/./g;
+    }
+    push(@a, join("|", @$t));
+}
+
+my $res = join("\n", @a, "");
+#diag $res;
+is($res, <<'EOT');
+start|title
+text|Title
+end|title
+text|..
+start|h1|id|3
+text|Heading
+end|h1
+text|...This.is.a.text.with.a.
+start|a|href|http://www.sol.no|name|l1
+text|link
+end|a
+text|..
+EOT
+
diff --git a/t/script.t b/t/script.t
new file mode 100644
index 0000000..2a75ccb
--- /dev/null
+++ b/t/script.t
@@ -0,0 +1,41 @@
+#!perl -w
+
+use strict;
+use Test;
+plan tests => 1;
+
+use HTML::Parser;
+
+my $TEXT = "";
+sub h
+{
+    my($event, $tagname, $text) = @_;
+    for ($event, $tagname, $text) {
+        if (defined) {
+	    s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
+	}
+	else {
+	    $_ = "<undef>";
+	}
+    }
+
+    $TEXT .= "[$event,$tagname,$text]\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"], empty_element_tags => 1);
+$p->parse(q(<tr><td align="center" height="100"><script src="whatever"/><SCRIPT language="JavaScript1.1">bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');</SCRIPT></td></tr>));
+$p->eof;
+
+ok($TEXT, <<'EOT');
+[start_document,<undef>,]
+[start,tr,<tr>]
+[start,td,<td align="center" height="100">]
+[start,script,<script src="whatever"/>]
+[end,script,]
+[start,script,<SCRIPT language="JavaScript1.1">]
+[text,<undef>,bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');]
+[end,script,</SCRIPT>]
+[end,td,</td>]
+[end,tr,</tr>]
+[end_document,<undef>,]
+EOT
diff --git a/t/skipped-text.t b/t/skipped-text.t
new file mode 100644
index 0000000..bc39915
--- /dev/null
+++ b/t/skipped-text.t
@@ -0,0 +1,89 @@
+use Test::More tests => 4;
+
+use strict;
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->report_tags("a");
+
+my @doc;
+
+$p->handler(start => \&a_handler, "skipped_text, text");
+$p->handler(end_document => \@doc, '@{skipped_text}');
+
+$p->parse(<<EOT)->eof;
+<title>hi</title>
+<h1><a href="foo">link</a></h1>
+and <a foo="">some</a> text.
+EOT
+
+sub a_handler {
+    push(@doc, shift);
+    my $text = shift;
+    push(@doc, uc($text));
+}
+
+
+is(join("", @doc), <<'EOT');
+<title>hi</title>
+<h1><A HREF="FOO">link</a></h1>
+and <A FOO="">some</a> text.
+EOT
+
+#
+# Comment stripper.  Interaction with "" handlers.
+#
+my $doc = <<EOT;
+<html>text</html>
+<!-- comment -->
+and some more <b>text</b>.
+EOT
+(my $expected = $doc) =~ s/<!--.*?-->//;
+
+$p = HTML::Parser->new(api_version => 3);
+$p->handler(comment => "");
+$p->handler(end_document => sub {
+		                my $stripped = shift;
+				#diag $stripped;
+	                        is($stripped, $expected);
+			    }, "skipped_text");
+for (split(//, $doc)) {
+    $p->parse($_);
+}
+$p->eof;
+
+#
+# Interaction with unbroken text
+#
+my @x;
+$p = HTML::Parser->new(api_version => 3, unbroken_text => 1);
+$p->handler(text => \@x, '@{"X", skipped_text, text}');
+$p->handler(end => "");
+$p->handler(end_document => \@x, '@{"Y", skipped_text}');
+
+$doc = "a a<a>b b</a>c c<x>d d</x>e";
+
+for (split(//, $doc)) {
+   $p->parse($_);
+}
+$p->eof;
+
+#diag join(":", @x);
+is(join(":", @x), "X::a a:X:<a>:b bc c:X:<x>:d de:Y:");
+
+#
+# The crash that Chip found
+#
+
+my $skipped;
+$p = HTML::Parser->new(
+    ignore_tags => ["foo"],
+    start_h => [sub {$skipped = shift}, "skipped_text"],
+);
+
+$p->parse("\x{100}<foo>");
+$p->parse("plain");
+$p->parse("<bar>");
+$p->eof;
+is($skipped, "\x{100}<foo>plain");
diff --git a/t/stack-realloc.t b/t/stack-realloc.t
new file mode 100644
index 0000000..46c7d35
--- /dev/null
+++ b/t/stack-realloc.t
@@ -0,0 +1,17 @@
+#!perl -w
+
+# HTML-Parser 3.33 and older used to core dump on this program because
+# of missing SPAGAIN calls in parse() XS code.  It was not prepared for
+# the stack to get realloced.
+
+$| = 1;
+
+use Test::More tests => 1;
+
+use HTML::Parser;
+my $x = HTML::Parser->new(api_version => 3);
+my @row;
+$x->handler(end => sub { push(@row, (1) x 505); 1 },   "tagname");
+$x->parse("</TD>");
+
+pass;
diff --git a/t/textarea.t b/t/textarea.t
new file mode 100644
index 0000000..120f79b
--- /dev/null
+++ b/t/textarea.t
@@ -0,0 +1,70 @@
+use Test::More tests => 1;
+
+use strict;
+use HTML::Parser;
+
+my $html = <<'EOT';
+<html>
+<title>This is a <nice> title</title>
+<!--comment-->
+<script language="perl">while (<DATA>) { &amp; }</script>
+
+<FORM>
+
+<textarea name="foo" cols=50 rows=10>
+
+foo
+<foo>
+<!--comment-->
+&amp;
+foo
+</FORM>
+
+</textarea>
+
+</FORM>
+
+</html>
+EOT
+
+my $dump = "";
+sub tdump {
+   my @a = @_;
+   for (@a) {
+      $_ = "<undef>" unless defined;
+      s/\n/\\n/g;
+   }
+   $dump .= join("|", @a) . "\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]);
+$p->parse($html)->eof;
+
+#diag $dump;
+
+is($dump, <<'EOT');
+start_document||<undef>|<undef>
+start|<html>|<undef>|<undef>
+text|\n|\n|
+start|<title>|<undef>|<undef>
+text|This is a <nice> title|This is a <nice> title|
+end|</title>|<undef>|<undef>
+text|\n|\n|
+comment|<!--comment-->|<undef>|<undef>
+text|\n|\n|
+start|<script language="perl">|<undef>|<undef>
+text|while (<DATA>) { &amp; }|while (<DATA>) { &amp; }|1
+end|</script>|<undef>|<undef>
+text|\n\n|\n\n|
+start|<FORM>|<undef>|<undef>
+text|\n\n|\n\n|
+start|<textarea name="foo" cols=50 rows=10>|<undef>|<undef>
+text|\n\nfoo\n<foo>\n<!--comment-->\n&amp;\nfoo\n</FORM>\n\n|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n|
+end|</textarea>|<undef>|<undef>
+text|\n\n|\n\n|
+end|</FORM>|<undef>|<undef>
+text|\n\n|\n\n|
+end|</html>|<undef>|<undef>
+text|\n|\n|
+end_document||<undef>|<undef>
+EOT
diff --git a/t/threads.t b/t/threads.t
new file mode 100644
index 0000000..8da91e9
--- /dev/null
+++ b/t/threads.t
@@ -0,0 +1,39 @@
+# Verify thread safety.
+
+use Config;
+use Test::More;
+
+BEGIN {
+    plan(skip_all => "Not configured for threads")
+	unless $Config{useithreads} && $] >= 5.008;
+    plan(tests => 1);
+}
+
+use threads;
+use HTML::Parser;
+
+my $ok=0;
+
+sub start
+{
+    my($tag,$attr)=@_;
+
+    $ok += ($tag eq "foo");
+    $ok += (defined($attr->{param}) && $attr->{param} eq "bar");
+}
+
+my $p = HTML::Parser->new
+    (api_version => 3,
+     handlers => {
+	 start => [\&start, "tagname,attr"],
+     });
+
+$p->parse("<foo pa");
+
+$ok=async {
+    $p->parse("ram=bar>");
+    $ok;
+}->join();
+
+is($ok,2);
+
diff --git a/t/tokeparser.t b/t/tokeparser.t
new file mode 100644
index 0000000..2084201
--- /dev/null
+++ b/t/tokeparser.t
@@ -0,0 +1,164 @@
+use Test::More tests => 17;
+
+use strict;
+use HTML::TokeParser;
+
+# First we create an HTML document to test
+
+my $file = "ttest$$.htm";
+die "$file already exists" if -e $file;
+
+open(F, ">$file") or die "Can't create $file: $!";
+print F <<'EOT';  close(F);
+
+<!--This is a test-->
+<html><head><title>
+  This is the &lt;title&gt;
+</title>
+
+  <base href="http://www.perl.com">
+</head>
+
+<body background="bg.gif">
+
+    <h1>This is the <b>title</b> again
+    </h1>
+
+    And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl">&nbsp;<!--nice isn't it-->Institute</a>
+
+   <br/><? process instruction >
+
+</body>
+</html>
+
+EOT
+
+END { unlink($file) || warn "Can't unlink $file: $!"; }
+
+
+my $p;
+
+
+$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
+ok($p->unbroken_text);
+if ($p->get_tag("foo", "title")) {
+    my $title = $p->get_trimmed_text;
+    #diag "Title: $title";
+    is($title, "This is the <title>");
+}
+undef($p);
+
+# Test with reference to glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(\*F);
+my $scount = 0;
+my $ecount = 0;
+my $tcount = 0;
+my $pcount = 0;
+while (my $token = $p->get_token) {
+    $scount++ if $token->[0] eq "S";
+    $ecount++ if $token->[0] eq "E";
+    $pcount++ if $token->[0] eq "PI";
+}
+undef($p);
+close F;
+
+# Test with glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(*F);
+$tcount++ while $p->get_tag;
+undef($p);
+close F;
+
+# Test with plain file name
+$p = HTML::TokeParser->new($file) || die;
+$tcount++ while $p->get_tag;
+undef($p);
+
+#diag "Number of tokens found: $tcount/2 = $scount + $ecount";
+is($tcount, 34);
+is($scount, 10);
+is($ecount, 7);
+is($pcount, 1);
+is($tcount/2, $scount + $ecount);
+
+ok(!HTML::TokeParser->new("/noT/thEre/$$"));
+
+
+$p = HTML::TokeParser->new($file) || die;
+$p->get_tag("a");
+my $atext = $p->get_text;
+undef($p);
+
+is($atext, "Perl\240Institute");
+
+# test parsing of embeded document
+$p = HTML::TokeParser->new(\<<HTML);
+<title>Title</title>
+<H1>
+Heading
+</h1>
+HTML
+
+ok($p->get_tag("h1"));
+is($p->get_trimmed_text, "Heading");
+undef($p);
+
+# test parsing of large embedded documents
+my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022;
+
+#use Time::HiRes qw(time);
+my $start = time;
+$p = HTML::TokeParser->new(\$doc);
+#diag "Construction time: ", time - $start;
+
+my $count;
+while (my $t = $p->get_token) {
+    $count++ if $t->[0] eq "S";
+}
+#diag "Parse time: ", time - $start;
+
+is($count, 2022);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+<H1>This is a heading</H1>
+This is s<b>o</b>me<hr>text.
+<br />
+This is some more text.
+<p>
+This is even some more.
+EOT
+
+$p->get_tag("/h1");
+
+my $t = $p->get_trimmed_text("br", "p");
+is($t, "This is some text.");
+
+$p->get_tag;
+
+$t = $p->get_trimmed_text("br", "p");
+is($t,"This is some more text.");
+
+undef($p);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+<H1>This is a <b>bold</b> heading</H1>
+This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>.
+<p>
+This is even some more.
+EOT
+
+$p->get_tag("h1");
+
+$t = $p->get_phrase;
+is($t, "This is a bold heading");
+
+$t = $p->get_phrase;
+is($t, "");
+
+$p->get_tag;
+
+$t = $p->get_phrase;
+is($t, "This is some italic text. This is some more text.");
+
+undef($p);
diff --git a/t/uentities.t b/t/uentities.t
new file mode 100644
index 0000000..36d5179
--- /dev/null
+++ b/t/uentities.t
@@ -0,0 +1,65 @@
+# Test Unicode entities
+
+use HTML::Entities;
+
+use Test::More tests => 26;
+
+SKIP: {
+skip "This perl does not support Unicode or Unicode entities not selected",
+  27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT;
+
+is(decode_entities("&euro"), "&euro");
+is(decode_entities("&euro;"), "\x{20AC}");
+
+is(decode_entities("&aring"), "�");
+is(decode_entities("&aring;"), "�");
+
+is(decode_entities("&#500000"), chr(500000));
+
+is(decode_entities("&#x10FFFD"), "\x{10FFFD}");
+
+is(decode_entities("&#xFFFC"), "\x{FFFC}");
+
+
+is(decode_entities("&#xFDD0"), "\x{FFFD}");
+is(decode_entities("&#xFDD1"), "\x{FFFD}");
+is(decode_entities("&#xFDE0"), "\x{FFFD}");
+is(decode_entities("&#xFDEF"), "\x{FFFD}");
+is(decode_entities("&#xFFFF"), "&#xFFFF");
+is(decode_entities("&#x10FFFF"), "\x{FFFD}");
+is(decode_entities("&#x110000"), "&#x110000");
+is(decode_entities("&#XFFFFFFFF"), "&#XFFFFFFFF");
+
+is(decode_entities("&#0"), "&#0");
+is(decode_entities("&#0;"), "&#0;");
+is(decode_entities("&#x0"), "&#x0");
+is(decode_entities("&#X0;"), "&#X0;");
+
+is(decode_entities("&#&aring&#229&#229;&#xFFF"), "&#���\x{FFF}");
+
+# This might fail when we get more than 64 bit UVs
+is(decode_entities("&#0009999999999999999999999999999;"), "&#0009999999999999999999999999999;");
+is(decode_entities("&#xFFFF0000FFFF0000FFFF1"), "&#xFFFF0000FFFF0000FFFF1");
+
+my $err;
+for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) {
+    my $a = join("", map chr, $_->[0] .. $_->[1]);
+
+    my $e = encode_entities($a);
+    my $d = decode_entities($e);
+
+    unless ($d eq $a) {
+	diag "Wrong decoding in range $_->[0] .. $_->[1]";
+	# use Devel::Peek; Dump($a); Dump($d);
+	$err++;
+    }
+}
+ok(!$err);
+
+
+is(decode_entities("&#56256;&#56453;"), chr(0x100085));
+
+is(decode_entities("&#56256"), chr(0xFFFD));
+
+is(decode_entities("\260&rsquo;\260"), "\x{b0}\x{2019}\x{b0}");
+}
diff --git a/t/unbroken-text.t b/t/unbroken-text.t
new file mode 100644
index 0000000..7de85a9
--- /dev/null
+++ b/t/unbroken-text.t
@@ -0,0 +1,60 @@
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 3;
+
+my $text = "";
+sub text
+{
+    my $cdata = shift() ? "CDATA" : "TEXT";
+    my($offset, $line, $col, $t) = @_;
+    $text .= "[$cdata:$offset:$line.$col:$t]";
+}
+
+sub tag
+{
+    $text .= shift;
+}
+
+my $p = HTML::Parser->new(unbroken_text => 1,
+			  text_h =>  [\&text, "is_cdata,offset,line,column,text"],
+			  start_h => [\&tag, "text"],
+			  end_h   => [\&tag, "text"],
+			 );
+
+$p->parse("foo ");
+$p->parse("bar ");
+$p->parse("<foo>");
+$p->parse("bar\n");
+$p->parse("</foo>");
+$p->parse("<xmp>xmp</xmp>");
+$p->parse("atend");
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foo bar ]<foo>[TEXT:13:1.13:bar\n]</foo><xmp>[CDATA:28:2.11:xmp]</xmp>");
+
+$text = "";
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:37:2.20:atend]");
+
+
+$p = HTML::Parser->new(unbroken_text => 1,
+		       text_h => [\&text, "is_cdata,offset,line,column,text"],
+		      );
+
+$text = "";
+$p->parse("foo");
+$p->parse("<foo");
+$p->parse(">bar\n");
+$p->parse("foo<xm");
+$p->parse("p>xmp");
+$p->parse("</xmp");
+$p->parse(">bar");
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]");
+
+
diff --git a/t/unicode-bom.t b/t/unicode-bom.t
new file mode 100644
index 0000000..b7398cf
--- /dev/null
+++ b/t/unicode-bom.t
@@ -0,0 +1,63 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 2;
+use HTML::Parser;
+
+SKIP: {
+skip "This perl does not support Unicode", 2 if $] < 5.008;
+
+my @parsed;
+my $p = HTML::Parser->new(
+  api_version => 3,
+  start_h => [\@parsed, 'tag, attr'],
+);
+
+my @warn;
+$SIG{__WARN__} = sub {
+    push(@warn, $_[0]);
+};
+
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>");
+$p->eof;
+
+#use Encode;
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>" . chr(0x263A));
+$p->eof;
+
+$p->parse("\xFF\xFE<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\xFE\xFF<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\0\0\xFF\xFE<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\xFE\xFF\0\0<head>Hi there</head>");
+$p->eof;
+
+for (@warn) {
+    s/line (\d+)/line ##/g;
+}
+
+is(join("", @warn), <<EOT);
+Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##.
+Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##.
+Parsing of undecoded UTF-16 at $0 line ##.
+Parsing of undecoded UTF-16 at $0 line ##.
+Parsing of undecoded UTF-32 at $0 line ##.
+Parsing of undecoded UTF-32 at $0 line ##.
+EOT
+
+@warn = ();
+
+$p = HTML::Parser->new(
+  api_version => 3,
+  start_h => [\@parsed, 'tag'],
+);
+
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>");
+$p->eof;
+ok(!@warn);
+}
diff --git a/t/unicode.t b/t/unicode.t
new file mode 100644
index 0000000..911c547
--- /dev/null
+++ b/t/unicode.t
@@ -0,0 +1,198 @@
+#!perl -w
+
+use strict;
+use HTML::Parser;
+use Test::More;
+BEGIN {
+  plan skip_all => "This perl does not support Unicode" if $] < 5.008;
+}
+
+plan tests => 105;
+
+my @warn;
+$SIG{__WARN__} = sub {
+    push(@warn, $_[0]);
+};
+
+my @parsed;
+my $p = HTML::Parser->new(
+  api_version => 3,
+  default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'],
+);
+
+my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile &#x263a</h1>\x{0420}";
+is(length($doc), 46);
+
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "<title>");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 0x263A);
+is($parsed[2][2], chr(0x263A));
+is($parsed[2][3], 7);
+is($parsed[2][4], 1);
+is($parsed[2][5], 8);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "</title>");
+is($parsed[3][3], 8);
+is($parsed[3][6], 8);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "<h1 id=\x{2600} f>");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
+is($parsed[4][8]{id}, "\x{2600}");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile &#x263a");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[7][0], "text");
+is($parsed[7][1], "\x{0420}");
+is($parsed[7][2], "\x{0420}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+is(@warn, 0);
+
+# Try to parse it as an UTF8 encoded string
+utf8::encode($doc);
+is(length($doc), 51);
+
+@parsed = ();
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "<title>");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 226);
+is($parsed[2][1], "\xE2\x98\xBA");
+is($parsed[2][2], "\xE2\x98\xBA");
+is($parsed[2][3], 7);
+is($parsed[2][4], 3);
+is($parsed[2][5], 10);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "</title>");
+is($parsed[3][3], 10);
+is($parsed[3][6], 10);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
+is($parsed[4][8]{id}, "\xE2\x98\x80");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile &#x263a");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+my $file = "test-$$.html";
+open(my $fh, ">:utf8", $file) || die;
+print $fh <<EOT;
+\x{FEFF}
+<title>\x{263A} Love! </title>
+<h1 id=&hearts;\x{2665}>&hearts; Love \x{2665}<h1>
+EOT
+close($fh) || die;
+
+@warn = ();
+@parsed = ();
+$p->parse_file($file);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
+is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5");  # expected garbage
+is($parsed[10][3], -s $file);
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw:utf8", $file) || die;
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\x{2665}");
+is($parsed[7][0], "text");
+is($parsed[7][1], "&hearts; Love \x{2665}");
+is($parsed[7][2], "\x{2665} Love \x{2665}");
+is($parsed[10][3], (-s $file) - 2 * 4);
+is(@warn, 0);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw", $file) || die;
+$p->utf8_mode(1);
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
+is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
+is($parsed[10][3], -s $file);
+is(@warn, 0);
+
+unlink($file);
+
+@parsed = ();
+$p->parse(q(<a href="a=1&lang=2&times=3">foo</a>))->eof;
+is(@parsed, "5");
+is($parsed[1][0], "start");
+is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
+
+ok(!HTML::Entities::_probably_utf8_chunk(""));
+ok(!HTML::Entities::_probably_utf8_chunk("f"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"));
+
+$p = HTML::Parser->new(
+    api_version => 3,
+    default_h => [\@parsed, 'event, text, tag, attr'],
+    attr_encoded => 1,
+);
+
+@warn = ();
+@parsed = ();
+
+$p->parse($doc)->eof;
+
+ok(!@warn);
+is(@parsed, 9);
diff --git a/t/xml-mode.t b/t/xml-mode.t
new file mode 100644
index 0000000..cdfc5b0
--- /dev/null
+++ b/t/xml-mode.t
@@ -0,0 +1,112 @@
+use strict;
+use Test::More tests => 8;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(xml_mode => 1,
+			 );
+
+my $text = "";
+$p->handler(start =>
+	    sub {
+		 my($tag, $attr) = @_;
+		 $text .= "S[$tag";
+		 for my $k (sort keys %$attr) {
+		     my $v =  $attr->{$k};
+		     $text .= " $k=$v";
+		 }
+		 $text .= "]";
+	     }, "tagname,attr");
+$p->handler(end =>
+	     sub {
+		 $text .= "E[" . shift() . "]";
+	     }, "tagname");
+$p->handler(process => 
+	     sub {
+		 $text .= "PI[" . shift() . "]";
+	     }, "token0");
+$p->handler(text =>
+	     sub {
+		 $text .= shift;
+	     }, "text");
+
+my $xml = <<'EOT';
+<?xml version="1.0"?>
+<?IS10744:arch name="html"?><!-- comment -->
+<DOC>
+<title html="h1">My first architectual document</title>
+<author html="address">Geir Ove Gronmo, grove@infotek.no</author>
+<para>This is the first paragraph in this document</para>
+<para html="p">This is the second paragraph</para>
+<para/>
+<xmp><foo></foo></xmp>
+</DOC>
+EOT
+
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"]
+PI[IS10744:arch name="html"]
+S[DOC]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para]E[para]
+S[xmp]S[foo]E[foo]E[xmp]
+E[DOC]
+EOT
+
+$text = "";
+$p->xml_mode(0);
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"?]
+PI[IS10744:arch name="html"?]
+S[doc]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para/]
+S[xmp]<foo></foo>E[xmp]
+E[doc]
+EOT
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+	               xml_mode => 1);
+
+$p->handler("end" =>
+	    sub {
+		my($tagname, $text) = @_;
+		is($tagname, "Xyzzy");
+	        ok(!length($text));
+	    }, "tagname,text");
+$p->parse("<Xyzzy foo=bar/>and some more")->eof;
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+	               empty_element_tags => 1);
+
+$p->handler("end" =>
+	    sub {
+		my($tagname, $text) = @_;
+		is($tagname, "xyzzy");
+	        ok(!length($text));
+	    }, "tagname,text");
+$p->parse("<Xyzzy foo=bar/>and some more")->eof;
+
+$p = HTML::Parser->new(
+    api_version => 3,
+    xml_pic => 1,
+);
+
+$p->handler(
+    "process" => sub {
+	my($text, $t0) = @_;
+	is($text, "<?foo > bar?>");
+	is($t0, "foo > bar");
+    }, "text, token0");
+$p->parse("<?foo > bar?> and then")->eof;
author	Lorry Tar Creator <lorry-tar-importer@lorry>	2013-05-08 22:21:52 +0000
committer	Lorry Tar Creator <lorry-tar-importer@lorry>	2013-05-08 22:21:52 +0000
commit	2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree	4734ccd522c71dd455879162006742002f8c1565 /t
download	HTML-Parser-tarball-master.tar.gz