summaryrefslogtreecommitdiff
path: root/t
diff options
context:
space:
mode:
authorLorry Tar Creator <lorry-tar-importer@lorry>2013-05-08 22:21:52 +0000
committerLorry Tar Creator <lorry-tar-importer@lorry>2013-05-08 22:21:52 +0000
commit2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch)
tree4734ccd522c71dd455879162006742002f8c1565 /t
downloadHTML-Parser-tarball-master.tar.gz
Diffstat (limited to 't')
-rw-r--r--t/api_version.t22
-rw-r--r--t/argspec-bad.t40
-rw-r--r--t/argspec.t148
-rw-r--r--t/argspec2.t21
-rw-r--r--t/attr-encoded.t32
-rw-r--r--t/callback.t49
-rw-r--r--t/case-sensitive.t85
-rw-r--r--t/cases.t105
-rw-r--r--t/comment.t24
-rw-r--r--t/crashme.t43
-rw-r--r--t/declaration.t62
-rw-r--r--t/default.t43
-rw-r--r--t/document.t41
-rw-r--r--t/dtext.t72
-rw-r--r--t/entities.t213
-rw-r--r--t/entities2.t57
-rw-r--r--t/filter-methods.t205
-rw-r--r--t/filter.t60
-rw-r--r--t/handler-eof.t54
-rw-r--r--t/handler.t67
-rw-r--r--t/headparser-http.t20
-rw-r--r--t/headparser.t200
-rw-r--r--t/ignore.t27
-rw-r--r--t/largetags.t38
-rw-r--r--t/linkextor-base.t41
-rw-r--r--t/linkextor-rel.t36
-rw-r--r--t/magic.t41
-rw-r--r--t/marked-sect.t121
-rw-r--r--t/msie-compat.t79
-rw-r--r--t/offset.t58
-rw-r--r--t/options.t36
-rw-r--r--t/parsefile.t45
-rw-r--r--t/parser.t184
-rw-r--r--t/plaintext.t58
-rw-r--r--t/pod.t4
-rw-r--r--t/process.t43
-rw-r--r--t/pullparser.t55
-rw-r--r--t/script.t41
-rw-r--r--t/skipped-text.t89
-rw-r--r--t/stack-realloc.t17
-rw-r--r--t/textarea.t70
-rw-r--r--t/threads.t39
-rw-r--r--t/tokeparser.t164
-rw-r--r--t/uentities.t65
-rw-r--r--t/unbroken-text.t60
-rw-r--r--t/unicode-bom.t63
-rw-r--r--t/unicode.t198
-rw-r--r--t/xml-mode.t112
48 files changed, 3447 insertions, 0 deletions
diff --git a/t/api_version.t b/t/api_version.t
new file mode 100644
index 0000000..9803121
--- /dev/null
+++ b/t/api_version.t
@@ -0,0 +1,22 @@
+use Test::More tests => 4;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+ok(!$p->handler("start"), "API version 3");
+
+my $failed;
+eval {
+ my $p = HTML::Parser->new(api_version => 4);
+ $failed++;
+};
+like($@, qr/^API version 4 not supported/);
+ok(!$failed, "API version 4");
+
+$p = HTML::Parser->new(api_version => 2);
+
+is($p->handler("start"), "start", "API version 2");
+
+
diff --git a/t/argspec-bad.t b/t/argspec-bad.t
new file mode 100644
index 0000000..8c0b199
--- /dev/null
+++ b/t/argspec-bad.t
@@ -0,0 +1,40 @@
+use Test::More tests => 6;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+eval {
+ $p->handler(end => "end", q(xyzzy));
+};
+like($@, qr/^Unrecognized identifier xyzzy in argspec/);
+
+
+eval {
+ $p->handler(end => "end", q(tagname text));
+};
+like($@, qr/^Missing comma separator in argspec/);
+
+
+eval {
+ $p->handler(end => "end", q(tagname, "text));
+};
+like($@, qr/^Unterminated literal string in argspec/);
+
+
+eval {
+ $p->handler(end => "end", q(tagname, "t\\t"));
+};
+like($@, qr/^Backslash reserved for literal string in argspec/);
+
+eval {
+ $p->handler(end => "end", '"' . ("x" x 256) . '"');
+};
+like($@, qr/^Literal string is longer than 255 chars in argspec/);
+
+$p->handler(end => sub { is(length(shift), 255) },
+ '"' . ("x" x 255) . '"');
+$p->parse("</x>");
+
+
diff --git a/t/argspec.t b/t/argspec.t
new file mode 100644
index 0000000..e8aa7a5
--- /dev/null
+++ b/t/argspec.t
@@ -0,0 +1,148 @@
+
+use strict;
+require HTML::Parser;
+
+my $decl = '<!ENTITY nbsp CDATA "&#160;" -- no-break space -->';
+my $com1 = '<!-- Comment -->';
+my $com2 = '<!-- Comment -- -- Comment -->';
+my $start = '<a href="foo">';
+my $end = '</a>';
+my $empty = "<IMG SRC='foo'/>";
+my $proc = '<? something completely different ?>';
+
+my @argspec = qw( self offset length
+ event tagname tag token0
+ text
+ is_cdata dtext
+ tokens
+ tokenpos
+ attr
+ attrseq );
+
+my @result = ();
+my $p = HTML::Parser -> new(default_h => [\@result, join(',', @argspec)],
+ strict_comment => 1, xml_mode => 1);
+
+my @tests =
+ ( # string, expected results
+ $decl => [[$p, 0, 52, 'declaration', 'ENTITY', '!ENTITY', 'ENTITY',
+ '<!ENTITY nbsp CDATA "&#160;" -- no-break space -->',
+ undef, undef,
+ ['ENTITY', 'nbsp', 'CDATA', '"&#160;"', '-- no-break space --'],
+ [2, 6, 9, 4, 16, 5, 22, 8, 31, 20],
+ undef, undef ]],
+ $com1 => [[$p, 0, 16, 'comment', ' Comment ', '# Comment ', ' Comment ',
+ '<!-- Comment -->',
+ undef, undef,
+ [' Comment '],
+ [4, 9],
+ undef, undef ]],
+ $com2 => [[$p, 0, 30, 'comment', ' Comment ', '# Comment ', ' Comment ',
+ '<!-- Comment -- -- Comment -->',
+ undef, undef,
+ [' Comment ', ' Comment '],
+ [4, 9, 18, 9],
+ undef, undef ]],
+ $start => [[$p, 0, 14, 'start', 'a', 'a', 'a',
+ '<a href="foo">',
+ undef, undef,
+ ['a', 'href', '"foo"'],
+ [1, 1, 3, 4, 8, 5],
+ {'href', 'foo'}, ['href'] ]],
+ $end => [[$p, 0, 4, 'end', 'a', '/a', 'a',
+ '</a>',
+ undef, undef,
+ ['a'],
+ [2, 1],
+ undef, undef ]],
+ $empty => [[$p, 0, 16, 'start', 'IMG', 'IMG', 'IMG',
+ "<IMG SRC='foo'/>",
+ undef, undef,
+ ['IMG', 'SRC', "'foo'"],
+ [1, 3, 5, 3, 9, 5],
+ {'SRC', 'foo'}, ['SRC'] ],
+ [$p, 16, 0, 'end', 'IMG', '/IMG', 'IMG',
+ '',
+ undef, undef,
+ ['IMG'],
+ undef,
+ undef, undef ],
+ ],
+ $proc => [[$p, 0, 36, 'process', ' something completely different ',
+ '? something completely different ',
+ ' something completely different ',
+ '<? something completely different ?>',
+ undef, undef,
+ [' something completely different '],
+ [2, 32],
+ undef, undef ]],
+ "$end\n$end" => [[$p, 0, 4, 'end', 'a', '/a', 'a',
+ '</a>',
+ undef, undef,
+ ['a'],
+ [2, 1],
+ undef, undef],
+ [$p, 4, 1, 'text', undef, undef, undef,
+ "\n",
+ '', "\n",
+ undef,
+ undef,
+ undef, undef],
+ [$p, 5, 4, 'end', 'a', '/a', 'a',
+ '</a>',
+ undef, undef,
+ ['a'],
+ [2, 1],
+ undef, undef ]],
+ );
+
+use Test::More;
+plan tests => @tests / 2;
+
+sub string_tag {
+ my (@pieces) = @_;
+ my $part;
+ foreach $part ( @pieces ) {
+ if (!defined $part) {
+ $part = 'undef';
+ }
+ elsif (!ref $part) {
+ $part = "'$part'" if $part !~ /^\d+$/;
+ }
+ elsif ('ARRAY' eq ref $part ) {
+ $part = '[' . join(', ', string_tag(@$part)) . ']';
+ }
+ elsif ('HASH' eq ref $part ) {
+ $part = '{' . join(',', string_tag(%$part)) . '}';
+ }
+ else {
+ $part = '<' . ref($part) . '>';
+ }
+ }
+ return join(", ", @pieces );
+}
+
+my $i = 0;
+TEST:
+while (@tests) {
+ my($html, $expected) = splice @tests, 0, 2;
+ ++$i;
+
+ @result = ();
+ $p->parse($html)->eof;
+
+ shift(@result) if $result[0][3] eq "start_document";
+ pop(@result) if $result[-1][3] eq "end_document";
+
+ # Compare results for each element expected
+ foreach (@$expected) {
+ my $want = string_tag($_);
+ my $got = string_tag(shift @result);
+ if ($want ne $got) {
+ is($want, $got);
+ next TEST;
+ }
+ }
+
+ pass;
+}
diff --git a/t/argspec2.t b/t/argspec2.t
new file mode 100644
index 0000000..6f594b9
--- /dev/null
+++ b/t/argspec2.t
@@ -0,0 +1,21 @@
+use Test::More tests => 2;
+
+use strict;
+use HTML::Parser;
+
+my @start;
+my @text;
+
+my $p = HTML::Parser->new(api_version => 3);
+$p->handler(start => \@start, '@{tagname, @attr}');
+$p->handler(text => \@text, '@{dtext}');
+$p->parse(<<EOT)->eof;
+Hi
+<a href="abc">Foo</a><b>:-)</b>
+EOT
+
+is("@start", "a href abc b");
+
+is(join("", @text), "Hi\nFoo:-)\n");
+
+
diff --git a/t/attr-encoded.t b/t/attr-encoded.t
new file mode 100644
index 0000000..4d458eb
--- /dev/null
+++ b/t/attr-encoded.t
@@ -0,0 +1,32 @@
+use strict;
+use Test::More tests => 2;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new();
+$p->attr_encoded(1);
+
+my $text = "";
+$p->handler(start =>
+ sub {
+ my($tag, $attr) = @_;
+ $text .= "S[$tag";
+ for my $k (sort keys %$attr) {
+ my $v = $attr->{$k};
+ $text .= " $k=$v";
+ }
+ $text .= "]";
+ }, "tagname,attr");
+
+my $html = <<'EOT';
+<tag arg="&amp;&lt;&gt">
+EOT
+
+$p->parse($html)->eof;
+
+is($text, 'S[tag arg=&amp;&lt;&gt]');
+
+$text = "";
+$p->attr_encoded(0);
+$p->parse($html)->eof;
+
+is($text, 'S[tag arg=&<>]');
diff --git a/t/callback.t b/t/callback.t
new file mode 100644
index 0000000..7a456cf
--- /dev/null
+++ b/t/callback.t
@@ -0,0 +1,49 @@
+use Test::More tests => 47;
+
+use strict;
+use HTML::Parser;
+
+my @expected;
+my $p = HTML::Parser->new(api_version => 3,
+ unbroken_text => 1,
+ default_h => [\@expected, '@{event, text}'],
+ );
+
+my $doc = <<'EOT';
+<title>Hi</title>
+<h1>Ho ho</h1>
+<--comment->
+EOT
+
+$p->parse($doc)->eof;
+#use Data::Dump; Data::Dump::dump(@expected);
+
+for my $i (1..length($doc)) {
+ my @t;
+ $p->handler(default => \@t);
+ $p->parse(chunk($doc, $i));
+
+ # check that we got the same stuff
+ #diag "X:", join(":", @t);
+ #diag "Y:", join(":", @expected);
+ is(join(":", @t), join(":", @expected));
+}
+
+sub chunk {
+ my $str = shift;
+ my $size = shift || 1;
+ sub {
+ my $res = substr($str, 0, $size);
+ #diag "...$res";
+ substr($str, 0, $size) = "";
+ $res;
+ }
+}
+
+# Test croking behaviour
+$p->handler(default => []);
+
+eval {
+ $p->parse(sub { die "Hi" });
+};
+like($@, qr/^Hi/);
diff --git a/t/case-sensitive.t b/t/case-sensitive.t
new file mode 100644
index 0000000..565b20b
--- /dev/null
+++ b/t/case-sensitive.t
@@ -0,0 +1,85 @@
+use strict;
+use Test::More tests => 8;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new();
+$p->case_sensitive(1);
+
+my $text = "";
+$p->handler(start =>
+ sub {
+ my($tag, $attr, $attrseq) = @_;
+ $text .= "S[$tag";
+ for my $k (sort keys %$attr) {
+ my $v = $attr->{$k};
+ $text .= " $k=$v";
+ }
+ if (@$attrseq) { $text.=" Order:" ; }
+ for my $k (@$attrseq) {
+ $text .= " $k";
+ }
+ $text .= "]";
+ }, "tagname,attr,attrseq");
+$p->handler(end =>
+ sub {
+ my ($tag) = @_;
+ $text .= "E[$tag]";
+ }, "tagname");
+
+my $html = <<'EOT';
+<tAg aRg="Value" arg="other value"></tAg>
+EOT
+my $cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]';
+my $ci = 'S[tag arg=Value Order: arg arg]E[tag]';
+
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(0);
+$p->parse($html)->eof;
+is($text, $ci);
+
+$text = "";
+$p->case_sensitive(1);
+$p->xml_mode(1);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(0);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$html = <<'EOT';
+<tAg aRg="Value" arg="other value"></tAg>
+<iGnOrE></ignore>
+EOT
+$p->ignore_tags('ignore');
+$cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]S[iGnOrE]';
+$ci = 'S[tag arg=Value Order: arg arg]E[tag]';
+
+$text = "";
+$p->case_sensitive(0);
+$p->xml_mode(0);
+$p->parse($html)->eof;
+is($text, $ci);
+
+$text = "";
+$p->case_sensitive(1);
+$p->xml_mode(0);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(0);
+$p->xml_mode(1);
+$p->parse($html)->eof;
+is($text, $cs);
+
+$text = "";
+$p->case_sensitive(1);
+$p->xml_mode(1);
+$p->parse($html)->eof;
+is($text, $cs);
+
diff --git a/t/cases.t b/t/cases.t
new file mode 100644
index 0000000..a537279
--- /dev/null
+++ b/t/cases.t
@@ -0,0 +1,105 @@
+use Test::More;
+
+require HTML::Parser;
+
+package P; @ISA = qw(HTML::Parser);
+
+my @result;
+sub start
+{
+ my($self, $tag, $attr) = @_;
+ push @result, "START[$tag]";
+ for (sort keys %$attr) {
+ push @result, "\t$_: " . $attr->{$_};
+ }
+ $start++;
+}
+
+sub end
+{
+ my($self, $tag) = @_;
+ push @result, "END[$tag]";
+ $end++;
+}
+
+sub text
+{
+ my $self = shift;
+ push @result, "TEXT[$_[0]]";
+ $text++;
+}
+
+sub comment
+{
+ my $self = shift;
+ push @result, "COMMENT[$_[0]]";
+ $comment++;
+}
+
+sub declaration
+{
+ my $self = shift;
+ push @result, "DECLARATION[$_[0]]";
+ $declaration++;
+}
+
+package main;
+
+
+@tests =
+ (
+ '<a ">' => ['START[a]', "\t\": \""],
+ '<a/>' => ['START[a/]',],
+ '<a />' => ['START[a]', "\t/: /"],
+ '<a a/>' => ['START[a]', "\ta/: a/"],
+ '<a a/=/>' => ['START[a]', "\ta/: /"],
+ '<a x="foo&nbsp;bar">' => ['START[a]', "\tx: foo\xA0bar"],
+ '<a x="foo&nbspbar">' => ['START[a]', "\tx: foo&nbspbar"],
+ '<å >' => ['TEXT[<å]', 'TEXT[ >]'],
+ '2 < 5' => ['TEXT[2 ]', 'TEXT[<]', 'TEXT[ 5]'],
+ '2 <5> 2' => ['TEXT[2 ]', 'TEXT[<5>]', 'TEXT[ 2]'],
+ '2 <a' => ['TEXT[2 ]', 'TEXT[<a]'],
+ '2 <a> 2' => ['TEXT[2 ]', 'START[a]', 'TEXT[ 2]'],
+ '2 <a href=foo' => ['TEXT[2 ]', 'TEXT[<a href=foo]'],
+ "2 <a href='foo bar'> 2" =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'],
+ '2 <a href=foo bar> 2' =>
+ ['TEXT[2 ]', 'START[a]', "\tbar: bar", "\thref: foo", 'TEXT[ 2]'],
+ '2 <a href="foo bar"> 2' =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'],
+ '2 <a href="foo\'bar"> 2' =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo'bar", 'TEXT[ 2]'],
+ "2 <a href='foo\"bar'> 2" =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'],
+ "2 <a href='foo&quot;bar'> 2" =>
+ ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'],
+ '2 <a.b> 2' => ['TEXT[2 ]', 'START[a.b]', 'TEXT[ 2]'],
+ '2 <a.b-12 a.b = 2 a> 2' =>
+ ['TEXT[2 ]', 'START[a.b-12]', "\ta: a", "\ta.b: 2", 'TEXT[ 2]'],
+ '2 <a_b> 2' => ['TEXT[2 ]', 'START[a_b]', 'TEXT[ 2]'],
+ '<!ENTITY nbsp CDATA "&#160;" -- no-break space -->' =>
+ ['DECLARATION[ENTITY nbsp CDATA "&#160;" -- no-break space --]'],
+ '<!-- comment -->' => ['COMMENT[ comment ]'],
+ '<!-- comment -- --- comment -->' =>
+ ['COMMENT[ comment ]', 'COMMENT[- comment ]'],
+ '<!-- comment <!-- not comment --> comment -->' =>
+ ['COMMENT[ comment <!]', 'COMMENT[> comment ]'],
+ '<!-- <a href="foo"> -->' => ['COMMENT[ <a href="foo"> ]'],
+ );
+
+plan tests => @tests / 2;
+
+my $i = 0;
+TEST:
+while (@tests) {
+ ++$i;
+ my ($html, $expected) = splice @tests, 0, 2;
+ @result = ();
+
+ $p = new P;
+ $p->strict_comment(1);
+ $p->parse($html)->eof;
+
+ ok(eq_array($expected, \@result)) or diag("Expected: @$expected\n",
+ "Got: @result\n");
+}
diff --git a/t/comment.t b/t/comment.t
new file mode 100644
index 0000000..303449e
--- /dev/null
+++ b/t/comment.t
@@ -0,0 +1,24 @@
+use Test::More tests => 1;
+
+use strict;
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3);
+my @com;
+$p->handler(comment => sub { push(@com, shift) }, "token0");
+$p->handler(default => sub { push(@com, shift() . "[" . shift() . "]") }, "event, text");
+
+$p->parse("<foo><><!><!-><!--><!---><!----><!-----><!------>");
+$p->parse("<!--+--");
+$p->parse("\n\n");
+$p->parse(">");
+$p->parse("<!a'b>");
+$p->parse("<!--foo--->");
+$p->parse("<!--foo---->");
+$p->parse("<!--foo----->-->");
+$p->parse("<foo>");
+$p->parse("<!3453><!-3456><!FOO><>");
+$p->eof;
+
+my $com = join(":", @com);
+is($com, "start_document[]:start[<foo>]:text[<>]::-:><!-::-:--:+:a'b:foo-:foo--:foo---:text[-->]:start[<foo>]:3453:-3456:FOO:text[<>]:end_document[]");
diff --git a/t/crashme.t b/t/crashme.t
new file mode 100644
index 0000000..1a1e8e4
--- /dev/null
+++ b/t/crashme.t
@@ -0,0 +1,43 @@
+#!/usr/bin/perl
+
+# This test will simply run the parser on random junk.
+
+my $no_tests = shift || 3;
+use Test::More;
+plan tests => $no_tests;
+
+use HTML::Parser ();
+
+my $file = "junk$$.html";
+die if -e $file;
+
+for (1..$no_tests) {
+
+ open(JUNK, ">$file") || die;
+ for (1 .. rand(5000)) {
+ for (1 .. rand(200)) {
+ print JUNK pack("N", rand(2**32));
+ }
+ print JUNK ("<", "&", ">")[rand(3)]; # make these a bit more likely
+ }
+ close(JUNK);
+
+ #diag "Parse @{[-s $file]} bytes of junk";
+
+ HTML::Parser->new->parse_file($file);
+ pass();
+
+ #print_mem();
+}
+
+unlink($file);
+
+
+sub print_mem
+{
+ # this probably only works on Linux
+ open(STAT, "/proc/self/status") || return;
+ while (<STAT>) {
+ diag $_ if /^VmSize/;
+ }
+}
diff --git a/t/declaration.t b/t/declaration.t
new file mode 100644
index 0000000..17de561
--- /dev/null
+++ b/t/declaration.t
@@ -0,0 +1,62 @@
+use Test::More tests => 2;
+
+use HTML::Parser;
+my $res = "";
+
+sub decl
+{
+ my $t = shift;
+ $res .= "[" . join("\n", map "<$_>", @$t) . "]";
+}
+
+sub text
+{
+ $res .= shift;
+}
+
+my $p = HTML::Parser->new(declaration_h => [\&decl, "tokens"],
+ default_h => [\&text, "text"],
+ );
+
+$p->parse(<<EOT)->eof;
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" --<comment>--
+ "http://www.w3.org/TR/html40/strict.dtd">
+
+<!ENTITY foo "<!-- foo -->">
+<!Entity foo "<!-- foo -->">
+
+<!row --> foo
+EOT
+
+is($res, <<EOT);
+[<DOCTYPE>
+<HTML>
+<PUBLIC>
+<"-//W3C//DTD HTML 4.01//EN">
+<--<comment>-->
+<"http://www.w3.org/TR/html40/strict.dtd">]
+
+[<ENTITY>
+<foo>
+<"<!-- foo -->">]
+[<Entity>
+<foo>
+<"<!-- foo -->">]
+
+<!row --> foo
+EOT
+
+$res = "";
+$p->parse(<<EOT)->eof;
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[]>
+EOT
+is($res, <<EOT);
+[<DOCTYPE>
+<html>
+<PUBLIC>
+<"-//W3C//DTD XHTML 1.0 Strict//EN">
+<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<[]>]
+EOT
+
diff --git a/t/default.t b/t/default.t
new file mode 100644
index 0000000..4b5ed79
--- /dev/null
+++ b/t/default.t
@@ -0,0 +1,43 @@
+use strict;
+use Test::More tests => 3;
+
+my $text = "";
+use HTML::Parser ();
+my $p = HTML::Parser->new(default_h => [sub { $text .= shift }, "text"],
+ );
+
+my $html = <<'EOT';
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html40/strict.dtd">
+
+<title>foo</title>
+<!-- comment <a> -->
+<?process instruction>
+
+EOT
+
+$p->parse($html)->eof;
+
+is($text, $html);
+
+$text = "";
+$p->handler(start => sub { }, "");
+$p->handler(declaration => sub { }, "");
+$p->parse($html)->eof;
+
+my $html2;
+$html2 = $html;
+$html2 =~ s/<title>//;
+$html2 =~ s/<!DOCTYPE[^>]*>//;
+
+is($text, $html2);
+
+$text = "";
+$p->handler(start => undef);
+$p->parse($html)->eof;
+
+$html2 = $html;
+$html2 =~ s/<!DOCTYPE[^>]*>//;
+
+is($text, $html2);
diff --git a/t/document.t b/t/document.t
new file mode 100644
index 0000000..6696939
--- /dev/null
+++ b/t/document.t
@@ -0,0 +1,41 @@
+#!perl -w
+
+use Test;
+plan tests => 6;
+
+
+use HTML::Parser;
+use File::Spec;
+
+my $events;
+my $p = HTML::Parser->new(default_h => [sub { $events .= "$_[0]\n";}, "event"]);
+
+$events = "";
+$p->eof;
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse_file(File::Spec->devnull);
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse("");
+$p->eof;
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse("");
+$p->parse("");
+$p->eof;
+ok($events, "start_document\nend_document\n");
+
+$events = "";
+$p->parse("");
+$p->parse("<a>");
+$p->eof;
+ok($events, "start_document\nstart\nend_document\n");
+
+$events = "";
+$p->parse("<a> ");
+$p->eof;
+ok($events, "start_document\nstart\ntext\nend_document\n");
diff --git a/t/dtext.t b/t/dtext.t
new file mode 100644
index 0000000..883c61f
--- /dev/null
+++ b/t/dtext.t
@@ -0,0 +1,72 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 2;
+
+use HTML::Parser ();
+
+my $dtext = "";
+my $text = "";
+
+sub append
+{
+ $dtext .= shift;
+ $text .= shift;
+}
+
+my $p = HTML::Parser->new(text_h => [\&append, "dtext, text"],
+ default_h => [\&append, "text, text" ],
+ );
+
+my $doc = <<'EOT';
+<title>&aring</title>
+<a href="foo&aring">&aring&aring;&#65&#65;&lt&#65&gt;&#x41&#X41;</a>
+<?&aring>
+foo&nbsp;bar
+foo&nbspbar
+&xyzzy
+&xyzzy;
+<!-- &#0; -->
+&#1;
+&#255;
+&#xFF
+&#xFFG
+<!-- &#256; -->
+&#40000000000000000000000000000;
+&#x400000000000000000000000000000000;
+&
+&#
+&#x
+<xmp>&aring</xmp>
+<script>&aring</script>
+<ScRIPT>&aring</scRIPT>
+<skript>&aring</script>
+EOT
+
+$p->parse($doc)->eof;
+
+is($text, $doc);
+is($dtext, <<"EOT");
+<title>å</title>
+<a href="foo&aring">ååAA<A>AA</a>
+<?&aring>
+foo\240bar
+foo\240bar
+&xyzzy
+&xyzzy;
+<!-- &#0; -->
+\1
+\377
+\377
+\377G
+<!-- &#256; -->
+&#40000000000000000000000000000;
+&#x400000000000000000000000000000000;
+&
+&#
+&#x
+<xmp>&aring</xmp>
+<script>&aring</script>
+<ScRIPT>&aring</scRIPT>
+<skript>å</script>
+EOT
diff --git a/t/entities.t b/t/entities.t
new file mode 100644
index 0000000..f12d2fd
--- /dev/null
+++ b/t/entities.t
@@ -0,0 +1,213 @@
+use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric);
+
+use Test::More tests => 20;
+
+$a = "V&aring;re norske tegn b&oslash;r &#230res";
+
+decode_entities($a);
+
+is($a, "Våre norske tegn bør æres");
+
+encode_entities($a);
+
+is($a, "V&aring;re norske tegn b&oslash;r &aelig;res");
+
+decode_entities($a);
+encode_entities_numeric($a);
+
+is($a, "V&#xE5;re norske tegn b&#xF8;r &#xE6;res");
+
+$a = "<&>\"'";
+is(encode_entities($a), "&lt;&amp;&gt;&quot;&#39;");
+is(encode_entities_numeric($a), "&#x3C;&#x26;&#x3E;&#x22;&#x27;");
+
+$a = "abcdef";
+is(encode_entities($a, 'a-c'), "&#97;&#98;&#99;def");
+
+$a = "[24/7]\\";
+is(encode_entities($a, '/'), "[24&#47;7]\\");
+is(encode_entities($a, '\\/'), "[24&#47;7]\\");
+is(encode_entities($a, '\\'), "[24/7]&#92;");
+is(encode_entities($a, ']\\'), "[24/7&#93;&#92;");
+
+# See how well it does against rfc1866...
+$ent = $plain = "";
+while (<DATA>) {
+ next unless /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/;
+ $ent .= "&$1;";
+ $plain .= chr($2);
+}
+
+$a = $ent;
+decode_entities($a);
+is($a, $plain);
+
+# Try decoding when the ";" are left out
+$a = $ent,
+$a =~ s/;//g;
+decode_entities($a);
+is($a, $plain);
+
+
+$a = $plain;
+encode_entities($a);
+is($a, $ent);
+
+{ #RT #84144 - https://rt.cpan.org/Public/Bug/Display.html?id=84144
+
+ my %hash= (
+ "V&aring;re norske tegn b&oslash;r &#230res" => "Våre norske tegn bør æres"
+ );
+
+ my ($got, $eval_ok);
+ $eval_ok= eval { $got= decode_entities((keys %hash)[0]); 1 };
+ is( $eval_ok, 1, "decode_entitites() when processing a key as input");
+ is( $got, (values %hash)[0], "decode_entities() decodes a key properly");
+}
+
+# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au>
+# Subject: HTML entities problem with 5.11
+# To: libwww-perl@ics.uci.edu
+# Date: Fri, 05 Sep 1997 16:56:55 +1000
+# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU>
+#
+# Hi. I've got a problem that has surfaced with the changes to
+# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening
+# in the process of encoding then decoding special entities. Eg, what goes
+# in as "abc&def&ghi" comes out as "abc&def;&ghi;".
+
+is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;");
+
+# Decoding of &apos;
+is(decode_entities("&apos;"), "'");
+is(encode_entities("'", "'"), "&#39;");
+
+is(decode_entities("Attention Home&#959&#969n&#1257rs...1&#1109t T&#1110&#1084e E&#957&#1257&#1075"),
+ "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}");
+is(decode_entities("{&#38;amp;&#x26;amp;&amp; also &#x42f;&#339;}"),
+ "{&amp;&amp;& also \x{42F}\x{153}}");
+
+__END__
+# Quoted from rfc1866.txt
+
+14. Proposed Entities
+
+ The HTML DTD references the "Added Latin 1" entity set, which only
+ supplies named entities for a subset of the non-ASCII characters in
+ [ISO-8859-1], namely the accented characters. The following entities
+ should be supported so that all ISO 8859-1 characters may only be
+ referenced symbolically. The names for these entities are taken from
+ the appendixes of [SGML].
+
+ <!ENTITY nbsp CDATA "&#160;" -- no-break space -->
+ <!ENTITY iexcl CDATA "&#161;" -- inverted exclamation mark -->
+ <!ENTITY cent CDATA "&#162;" -- cent sign -->
+ <!ENTITY pound CDATA "&#163;" -- pound sterling sign -->
+ <!ENTITY curren CDATA "&#164;" -- general currency sign -->
+ <!ENTITY yen CDATA "&#165;" -- yen sign -->
+ <!ENTITY brvbar CDATA "&#166;" -- broken (vertical) bar -->
+ <!ENTITY sect CDATA "&#167;" -- section sign -->
+ <!ENTITY uml CDATA "&#168;" -- umlaut (dieresis) -->
+ <!ENTITY copy CDATA "&#169;" -- copyright sign -->
+ <!ENTITY ordf CDATA "&#170;" -- ordinal indicator, feminine -->
+ <!ENTITY laquo CDATA "&#171;" -- angle quotation mark, left -->
+ <!ENTITY not CDATA "&#172;" -- not sign -->
+ <!ENTITY shy CDATA "&#173;" -- soft hyphen -->
+ <!ENTITY reg CDATA "&#174;" -- registered sign -->
+ <!ENTITY macr CDATA "&#175;" -- macron -->
+ <!ENTITY deg CDATA "&#176;" -- degree sign -->
+ <!ENTITY plusmn CDATA "&#177;" -- plus-or-minus sign -->
+ <!ENTITY sup2 CDATA "&#178;" -- superscript two -->
+ <!ENTITY sup3 CDATA "&#179;" -- superscript three -->
+ <!ENTITY acute CDATA "&#180;" -- acute accent -->
+ <!ENTITY micro CDATA "&#181;" -- micro sign -->
+ <!ENTITY para CDATA "&#182;" -- pilcrow (paragraph sign) -->
+ <!ENTITY middot CDATA "&#183;" -- middle dot -->
+ <!ENTITY cedil CDATA "&#184;" -- cedilla -->
+ <!ENTITY sup1 CDATA "&#185;" -- superscript one -->
+ <!ENTITY ordm CDATA "&#186;" -- ordinal indicator, masculine -->
+ <!ENTITY raquo CDATA "&#187;" -- angle quotation mark, right -->
+ <!ENTITY frac14 CDATA "&#188;" -- fraction one-quarter -->
+ <!ENTITY frac12 CDATA "&#189;" -- fraction one-half -->
+ <!ENTITY frac34 CDATA "&#190;" -- fraction three-quarters -->
+ <!ENTITY iquest CDATA "&#191;" -- inverted question mark -->
+ <!ENTITY Agrave CDATA "&#192;" -- capital A, grave accent -->
+ <!ENTITY Aacute CDATA "&#193;" -- capital A, acute accent -->
+ <!ENTITY Acirc CDATA "&#194;" -- capital A, circumflex accent -->
+
+
+
+Berners-Lee & Connolly Standards Track [Page 75]
+
+RFC 1866 Hypertext Markup Language - 2.0 November 1995
+
+
+ <!ENTITY Atilde CDATA "&#195;" -- capital A, tilde -->
+ <!ENTITY Auml CDATA "&#196;" -- capital A, dieresis or umlaut mark -->
+ <!ENTITY Aring CDATA "&#197;" -- capital A, ring -->
+ <!ENTITY AElig CDATA "&#198;" -- capital AE diphthong (ligature) -->
+ <!ENTITY Ccedil CDATA "&#199;" -- capital C, cedilla -->
+ <!ENTITY Egrave CDATA "&#200;" -- capital E, grave accent -->
+ <!ENTITY Eacute CDATA "&#201;" -- capital E, acute accent -->
+ <!ENTITY Ecirc CDATA "&#202;" -- capital E, circumflex accent -->
+ <!ENTITY Euml CDATA "&#203;" -- capital E, dieresis or umlaut mark -->
+ <!ENTITY Igrave CDATA "&#204;" -- capital I, grave accent -->
+ <!ENTITY Iacute CDATA "&#205;" -- capital I, acute accent -->
+ <!ENTITY Icirc CDATA "&#206;" -- capital I, circumflex accent -->
+ <!ENTITY Iuml CDATA "&#207;" -- capital I, dieresis or umlaut mark -->
+ <!ENTITY ETH CDATA "&#208;" -- capital Eth, Icelandic -->
+ <!ENTITY Ntilde CDATA "&#209;" -- capital N, tilde -->
+ <!ENTITY Ograve CDATA "&#210;" -- capital O, grave accent -->
+ <!ENTITY Oacute CDATA "&#211;" -- capital O, acute accent -->
+ <!ENTITY Ocirc CDATA "&#212;" -- capital O, circumflex accent -->
+ <!ENTITY Otilde CDATA "&#213;" -- capital O, tilde -->
+ <!ENTITY Ouml CDATA "&#214;" -- capital O, dieresis or umlaut mark -->
+ <!ENTITY times CDATA "&#215;" -- multiply sign -->
+ <!ENTITY Oslash CDATA "&#216;" -- capital O, slash -->
+ <!ENTITY Ugrave CDATA "&#217;" -- capital U, grave accent -->
+ <!ENTITY Uacute CDATA "&#218;" -- capital U, acute accent -->
+ <!ENTITY Ucirc CDATA "&#219;" -- capital U, circumflex accent -->
+ <!ENTITY Uuml CDATA "&#220;" -- capital U, dieresis or umlaut mark -->
+ <!ENTITY Yacute CDATA "&#221;" -- capital Y, acute accent -->
+ <!ENTITY THORN CDATA "&#222;" -- capital THORN, Icelandic -->
+ <!ENTITY szlig CDATA "&#223;" -- small sharp s, German (sz ligature) -->
+ <!ENTITY agrave CDATA "&#224;" -- small a, grave accent -->
+ <!ENTITY aacute CDATA "&#225;" -- small a, acute accent -->
+ <!ENTITY acirc CDATA "&#226;" -- small a, circumflex accent -->
+ <!ENTITY atilde CDATA "&#227;" -- small a, tilde -->
+ <!ENTITY auml CDATA "&#228;" -- small a, dieresis or umlaut mark -->
+ <!ENTITY aring CDATA "&#229;" -- small a, ring -->
+ <!ENTITY aelig CDATA "&#230;" -- small ae diphthong (ligature) -->
+ <!ENTITY ccedil CDATA "&#231;" -- small c, cedilla -->
+ <!ENTITY egrave CDATA "&#232;" -- small e, grave accent -->
+ <!ENTITY eacute CDATA "&#233;" -- small e, acute accent -->
+ <!ENTITY ecirc CDATA "&#234;" -- small e, circumflex accent -->
+ <!ENTITY euml CDATA "&#235;" -- small e, dieresis or umlaut mark -->
+ <!ENTITY igrave CDATA "&#236;" -- small i, grave accent -->
+ <!ENTITY iacute CDATA "&#237;" -- small i, acute accent -->
+ <!ENTITY icirc CDATA "&#238;" -- small i, circumflex accent -->
+ <!ENTITY iuml CDATA "&#239;" -- small i, dieresis or umlaut mark -->
+ <!ENTITY eth CDATA "&#240;" -- small eth, Icelandic -->
+ <!ENTITY ntilde CDATA "&#241;" -- small n, tilde -->
+ <!ENTITY ograve CDATA "&#242;" -- small o, grave accent -->
+
+
+
+Berners-Lee & Connolly Standards Track [Page 76]
+
+RFC 1866 Hypertext Markup Language - 2.0 November 1995
+
+
+ <!ENTITY oacute CDATA "&#243;" -- small o, acute accent -->
+ <!ENTITY ocirc CDATA "&#244;" -- small o, circumflex accent -->
+ <!ENTITY otilde CDATA "&#245;" -- small o, tilde -->
+ <!ENTITY ouml CDATA "&#246;" -- small o, dieresis or umlaut mark -->
+ <!ENTITY divide CDATA "&#247;" -- divide sign -->
+ <!ENTITY oslash CDATA "&#248;" -- small o, slash -->
+ <!ENTITY ugrave CDATA "&#249;" -- small u, grave accent -->
+ <!ENTITY uacute CDATA "&#250;" -- small u, acute accent -->
+ <!ENTITY ucirc CDATA "&#251;" -- small u, circumflex accent -->
+ <!ENTITY uuml CDATA "&#252;" -- small u, dieresis or umlaut mark -->
+ <!ENTITY yacute CDATA "&#253;" -- small y, acute accent -->
+ <!ENTITY thorn CDATA "&#254;" -- small thorn, Icelandic -->
+ <!ENTITY yuml CDATA "&#255;" -- small y, dieresis or umlaut mark -->
diff --git a/t/entities2.t b/t/entities2.t
new file mode 100644
index 0000000..537ac78
--- /dev/null
+++ b/t/entities2.t
@@ -0,0 +1,57 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 9;
+
+use HTML::Entities qw(_decode_entities);
+
+eval {
+ _decode_entities("&lt;", undef);
+};
+like($@, qr/^(?:Can't inline decode readonly string|Modification of a read-only value attempted)/);
+
+eval {
+ my $a = "";
+ _decode_entities($a, $a);
+};
+like($@, qr/^2nd argument must be hash reference/);
+
+eval {
+ my $a = "";
+ _decode_entities($a, []);
+};
+like($@, qr/^2nd argument must be hash reference/);
+
+$a = "&lt;";
+_decode_entities($a, undef);
+is($a, "&lt;");
+
+_decode_entities($a, { "lt" => "<" });
+is($a, "<");
+
+my $x = "x" x 20;
+
+my $err;
+for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") {
+ my $a = $_;
+ $a =~ s/:/&a;/g;
+ my $b = $_;
+ $b =~ s/:/$x/g;
+ _decode_entities($a, { "a" => $x });
+ if ($a ne $b) {
+ diag "Something went wrong with '$_'";
+ $err++;
+ }
+}
+ok(!$err);
+
+$a = "foo&nbsp;bar";
+_decode_entities($a, \%HTML::Entities::entity2char);
+is($a, "foo\xA0bar");
+
+$a = "foo&nbspbar";
+_decode_entities($a, \%HTML::Entities::entity2char);
+is($a, "foo&nbspbar");
+
+_decode_entities($a, \%HTML::Entities::entity2char, 1);
+is($a, "foo\xA0bar");
diff --git a/t/filter-methods.t b/t/filter-methods.t
new file mode 100644
index 0000000..9eccaf1
--- /dev/null
+++ b/t/filter-methods.t
@@ -0,0 +1,205 @@
+#!/usr/bin/perl -w
+
+use Test::More tests => 12;
+use strict;
+
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]);
+$p->ignore_elements("script");
+$p->unbroken_text(1);
+
+$p->handler(default => [], "event, text");
+$p->parse(<<"EOT")->eof;
+<html><head><title>foo</title><Script language="Perl">
+ while (<B>) {
+ # ...
+ }
+</Script><body>
+This is an <i>italic</i> and <b>bold</b> text.
+</body>
+</html>
+EOT
+
+my $t = join("||", map join("|", @$_), @{$p->handler("default")});
+#diag $t;
+
+is($t, "start_document|||start|<html>||start|<head>||start|<title>||text|foo||end|</title>||start|<body>||text|
+This is an italic and bold text.
+||end|</body>||text|
+||end|</html>||text|
+||end_document|", 'ignore_elements');
+
+
+#------------------------------------------------------
+
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags("a");
+$p->handler(start => sub {
+ my($tagname, %attr) = @_;
+ ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start');
+ }, 'tagname, @attr');
+$p->handler(end => sub {
+ my $tagname = shift;
+ is($tagname, "a", 'report_tags end');
+ }, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+
+This is <a href="#a">very nice</a> example.
+
+EOT
+
+
+#------------------------------------------------------
+
+my @tags;
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(a em));
+$p->ignore_tags(qw(em));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+
+This is <em>yet another</em> <a href="#a">very nice</a> example.
+
+EOT
+is(join('|', @tags), 'a', 'report_tags followed by ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset report_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->ignore_tags(qw(h2));
+$p->report_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->report_tags(qw(h1 h2));
+$p->report_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+<h3>Next example</h3>
+
+EOT
+is(join('|', @tags), 'h3', 'report_tags replaces filter');
+
+
+#------------------------------------------------------
+
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h1 h2));
+$p->ignore_tags(qw(h3));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+<h3>Next example</h3>
+
+EOT
+is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->ignore_tags();
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter');
+
+
+#------------------------------------------------------
+
+@tags = ();
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_tags(qw(h2));
+$p->report_tags(qw(h1 h2));
+$p->handler(end => sub {push @tags, @_;}, 'tagname');
+
+$p->parse(<<EOT)->eof;
+
+<h1>Next example</h1>
+<h2>Next example</h2>
+
+EOT
+is(join('|', @tags), 'h1', 'ignore_tags before report_tags');
+#------------------------------------------------------
+
+$p = HTML::Parser->new(api_version => 3);
+$p->ignore_elements("script");
+my $res="";
+$p->handler(default=> sub {$res.=$_[0];}, 'text');
+$p->parse(<<'EOT')->eof;
+A <script> B </script> C </script> D <script> E </script> F
+EOT
+is($res,"A C D F\n","ignore </script> without <script> correctly");
diff --git a/t/filter.t b/t/filter.t
new file mode 100644
index 0000000..3b18f9e
--- /dev/null
+++ b/t/filter.t
@@ -0,0 +1,60 @@
+use Test::More tests => 3;
+
+my $HTML = <<EOT;
+
+<!DOCTYPE HTML>
+<!-- comment
+<h1>Foo</h1>
+-->
+
+<H1
+>Bar</H1
+>
+
+<Table><tr><td>1<td>2<td>3
+<tr>
+</table>
+
+<?process>
+
+EOT
+
+use HTML::Filter;
+use SelectSaver;
+
+my $tmpfile = "test-$$.htm";
+die "$tmpfile already exists" if -e $tmpfile;
+
+open(HTML, ">$tmpfile") or die "$!";
+
+{
+ my $save = new SelectSaver(HTML);
+ HTML::Filter->new->parse($HTML)->eof;
+}
+close(HTML);
+
+open(HTML, $tmpfile) or die "$!";
+local($/) = undef;
+my $FILTERED = <HTML>;
+close(HTML);
+
+#print $FILTERED;
+is($FILTERED, $HTML);
+
+{
+ package MyFilter;
+ @ISA=qw(HTML::Filter);
+ sub comment {}
+ sub output { push(@{$_[0]->{fhtml}}, $_[1]) }
+ sub filtered_html { join("", @{$_[0]->{fhtml}}) }
+}
+
+my $f2 = MyFilter->new->parse_file($tmpfile)->filtered_html;
+unlink($tmpfile) or warn "Can't unlink $tmpfile: $!";
+
+#diag $f2;
+
+unlike($f2, qr/Foo/);
+like($f2, qr/Bar/);
+
+
diff --git a/t/handler-eof.t b/t/handler-eof.t
new file mode 100644
index 0000000..39419dc
--- /dev/null
+++ b/t/handler-eof.t
@@ -0,0 +1,54 @@
+use Test::More tests => 6;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->handler(start => sub { my $attr = shift; is($attr->{testno}, 1) },
+ "attr");
+$p->handler(end => sub { shift->eof }, "self");
+my $text;
+$p->handler(text => sub { $text = shift }, "text");
+
+is($p->parse("<foo testno=1>"), $p);
+
+$text = '';
+ok(!$p->parse("</foo><foo testno=999>"));
+ok(!$text);
+
+$p->handler(end => sub { $p->parse("foo"); }, "");
+eval {
+ $p->parse("</foo>");
+};
+like($@, qr/Parse loop not allowed/);
+
+# We used to get into an infinite loop if the eof triggered
+# handler called ->eof
+
+use HTML::Parser;
+$p = HTML::Parser->new(api_version => 3);
+
+my $i;
+$p->handler("default" =>
+ sub {
+ my $p=shift;
+ #++$i; diag "$i @_";
+ $p->eof;
+ }, "self, event");
+$p->parse("Foo");
+$p->eof;
+
+# We used to sometimes trigger events after a handler signaled eof
+my $title='';
+$p = HTML::Parser->new(api_version => 3,);
+$p->handler(start=> \&title_handler, 'tagname, self');
+$p->parse("<head><title>foo</title>\n</head>");
+is($title, "foo");
+
+sub title_handler {
+ return if shift ne 'title';
+ my $self = shift;
+ $self->handler(text => sub { $title .= shift}, 'dtext');
+ $self->handler(end => sub { shift->eof if shift eq 'title' }, 'tagname, self');
+}
diff --git a/t/handler.t b/t/handler.t
new file mode 100644
index 0000000..8d7bbc5
--- /dev/null
+++ b/t/handler.t
@@ -0,0 +1,67 @@
+# Test handler method
+
+use Test::More tests => 11;
+
+my $testno;
+
+use HTML::Parser;
+{
+ package MyParser;
+ use vars qw(@ISA);
+ @ISA=(HTML::Parser);
+
+ sub foo
+ {
+ Test::More::is($_[1]{testno}, Test::More->builder->current_test + 1);
+ }
+
+ sub bar
+ {
+ Test::More::is($_[1], Test::More->builder->current_test + 1);
+ }
+}
+
+$p = MyParser->new(api_version => 3);
+
+eval {
+ $p->handler(foo => "foo", "foo");
+};
+
+like($@, qr/^No handler for foo events/);
+
+eval {
+ $p->handler(start => "foo", "foo");
+};
+like($@, qr/^Unrecognized identifier foo in argspec/);
+
+my $h = $p->handler(start => "foo", "self,tagname");
+ok(!defined($h));
+
+$x = \substr("xfoo", 1);
+$p->handler(start => $$x, "self,attr");
+$p->parse("<a testno=4>");
+
+$p->handler(start => \&MyParser::foo, "self,attr");
+$p->parse("<a testno=5>");
+
+$p->handler(start => "foo");
+$p->parse("<a testno=6>");
+
+$p->handler(start => "bar", "self,'7'");
+$p->parse("<a>");
+
+eval {
+ $p->handler(start => {}, "self");
+};
+like($@, qr/^Only code or array references allowed as handler/);
+
+$a = [];
+$p->handler(start => $a);
+$h = $p->handler("start");
+is($p->handler("start", "foo"), $a);
+
+is($p->handler("start", \&MyParser::foo, ""), "foo");
+
+is($p->handler("start"), \&MyParser::foo);
+
+
diff --git a/t/headparser-http.t b/t/headparser-http.t
new file mode 100644
index 0000000..b722c64
--- /dev/null
+++ b/t/headparser-http.t
@@ -0,0 +1,20 @@
+use Test::More tests => 1;
+
+eval {
+ require HTML::HeadParser;
+ $p = HTML::HeadParser->new;
+};
+
+SKIP: {
+skip $@, 1 if $@ =~ /^Can't locate HTTP/;
+
+$p = HTML::HeadParser->new($h);
+$p->parse(<<EOT);
+<title>Stupid example</title>
+<base href="http://www.sn.no/libwww-perl/">
+Normal text starts here.
+EOT
+$h = $p->header;
+undef $p;
+is($h->title, "Stupid example");
+}
diff --git a/t/headparser.t b/t/headparser.t
new file mode 100644
index 0000000..1b4b810
--- /dev/null
+++ b/t/headparser.t
@@ -0,0 +1,200 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 17;
+
+{ package H;
+ sub new { bless {}, shift; }
+
+ sub header {
+ my $self = shift;
+ my $key = uc(shift);
+ die if $key =~ /:/;
+ my $old = $self->{$key};
+ if (@_) { $self->{$key} = shift; }
+ $old;
+ }
+
+ sub push_header {
+ my($self, $k, $v) = @_;
+ $k = uc($k);
+ die if $k =~ /:/;
+ if (exists $self->{$k}) {
+ $self->{$k} = [ $self->{$k} ] unless ref $self->{$k};
+ push(@{$self->{$k}}, $v);
+ } else {
+ $self->{$k} = $v;
+ }
+ }
+
+ sub as_string {
+ my $self = shift;
+ my $str = "";
+ for (sort keys %$self) {
+ if (ref($self->{$_})) {
+ my $v;
+ for $v (@{$self->{$_}}) {
+ $str .= "$_: $v\n";
+ }
+ } else {
+ $str .= "$_: $self->{$_}\n";
+ }
+ }
+ $str;
+ }
+}
+
+
+my $HTML = <<'EOT';
+
+<title>&Aring være eller &#229; ikke være</title>
+<meta http-equiv="Expires" content="Soon">
+<meta http-equiv="Foo" content="Bar">
+<meta name='twitter:card' content='photo' />
+<link href="mailto:gisle@aas.no" rev=made title="Gisle Aas">
+
+<script>
+
+ ignore this
+
+</script>
+<noscript> ... and this </noscript>
+
+<object classid="foo">
+
+<base href="http://www.sn.no">
+<meta name="Keywords" content="test, test, test,...">
+<meta name="Keywords" content="more">
+<meta charset="ISO-8859-1"><!-- HTML 5 -->
+
+Dette er vanlig tekst. Denne teksten definerer også slutten på
+&lt;head> delen av dokumentet.
+
+<style>
+
+ ignore this too
+
+</style>
+
+<isindex>
+
+Dette er også vanlig tekst som ikke skal blir parset i det hele tatt.
+
+EOT
+
+$| = 1;
+
+#$HTML::HeadParser::DEBUG = 1;
+require HTML::HeadParser;
+my $p = HTML::HeadParser->new( H->new );
+
+if ($p->parse($HTML)) {
+ fail("Need more data which should not happen");
+} else {
+ #diag $p->as_string;
+ pass();
+}
+
+like($p->header('Title'), qr/Å være eller å ikke være/);
+is($p->header('Expires'), 'Soon');
+is($p->header('Content-Base'), 'http://www.sn.no');
+is_deeply($p->header('X-Meta-Keywords'), ['test, test, test,...', 'more']);
+is($p->header('X-Meta-Charset'), 'ISO-8859-1');
+is($p->header('X-Meta-Twitter-Card'), 'photo');
+like($p->header('Link'), qr/<mailto:gisle\@aas.no>/);
+
+# This header should not be present because the head ended
+ok(!$p->header('Isindex'));
+
+
+# Try feeding one char at a time
+my $expected = $p->as_string;
+my $nl = 1;
+$p = HTML::HeadParser->new(H->new);
+while ($HTML =~ /(.)/sg) {
+ #print STDERR '#' if $nl;
+ #print STDERR $1;
+ $nl = $1 eq "\n";
+ $p->parse($1) or last;
+}
+is($p->as_string, $expected);
+
+
+# Try reading it from a file
+my $file = "hptest$$.html";
+die "$file already exists" if -e $file;
+
+open(FILE, ">$file") or die "Can't create $file: $!";
+binmode(FILE);
+print FILE $HTML;
+print FILE "<p>This is more content...</p>\n" x 2000;
+print FILE "<title>Buuuh!</title>\n" x 200;
+close FILE or die "Can't close $file: $!";
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+is($p->header("Title"), "Å være eller å ikke være");
+
+
+# We got into an infinite loop on data without tags and no EOL.
+# This was actually a HTML::Parser bug.
+open(FILE, ">$file") or die "Can't create $file: $!";
+print FILE "Foo";
+close(FILE);
+
+$p = HTML::HeadParser->new(H->new);
+$p->parse_file($file);
+unlink($file) or warn "Can't unlink $file: $!";
+
+ok(!$p->as_string);
+
+SKIP: {
+ skip "Need Unicode support", 5 if $] < 5.008;
+
+ # Test that the Unicode BOM does not confuse us?
+ $p = HTML::HeadParser->new(H->new);
+ ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>"));
+ $p->eof;
+
+ is($p->header("title"), "Hi <foo>");
+
+ $p = HTML::HeadParser->new(H->new);
+ $p->utf8_mode(1);
+ $p->parse(<<"EOT"); # example from http://rt.cpan.org/Ticket/Display.html?id=27522
+\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html>
+ <head>
+ <title>
+Parkinson's disease</title>
+ <meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications">
+ </meta>
+ \t
+\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" />
+\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" />
+\t
+\t </head>
+ <body>
+EOT
+ $p->eof;
+
+ is($p->header("title"), "Parkinson's disease");
+ is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"');
+
+ $p = HTML::HeadParser->new(H->new);
+ $p->utf8_mode(1);
+ $p->parse(<<"EOT"); # example from http://www.mjw.com.pl/
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r
+\r
+<head profile="http://gmpg.org/xfn/11">\r
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r
+\r
+<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa &#8211; MJW</title>\r
+<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r
+
+EOT
+ $p->eof;
+ is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW");
+}
diff --git a/t/ignore.t b/t/ignore.t
new file mode 100644
index 0000000..008739e
--- /dev/null
+++ b/t/ignore.t
@@ -0,0 +1,27 @@
+
+use Test::More tests => 4;
+
+use strict;
+use HTML::Parser ();
+
+my $html = '<A href="foo">text</A>';
+
+my $text = '';
+my $p = HTML::Parser->new(default_h => [sub {$text .= shift;}, 'text']);
+$p->parse($html)->eof;
+is($text, $html);
+
+$text = '';
+$p->handler(start => "");
+$p->parse($html)->eof;
+is($text, 'text</A>');
+
+$text = '';
+$p->handler(end => 0);
+$p->parse($html)->eof;
+is($text, 'text');
+
+$text = '';
+$p->handler(start => undef);
+$p->parse($html)->eof;
+is($text, '<A href="foo">text');
diff --git a/t/largetags.t b/t/largetags.t
new file mode 100644
index 0000000..a9ed3ff
--- /dev/null
+++ b/t/largetags.t
@@ -0,0 +1,38 @@
+# Exercise the tokenpos buffer allocation routines by feeding it
+# very large tags.
+
+use Test::More tests => 2;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->handler("start" =>
+ sub {
+ my $tp = shift;
+ #diag int(@$tp), " - ", join(", ", @$tp);
+ is(@$tp, 2 + 26 * 6 * 4);
+ }, "tokenpos");
+
+$p->handler("declaration" =>
+ sub {
+ my $t = shift;
+ #diag int(@$t), " - @$t";
+ is(@$t, 26 * 6 * 2 + 1);
+ }, "tokens");
+
+$p->parse("<a ");
+for ("aa" .. "fz") {
+ $p->parse("$_=1 ");
+}
+$p->parse(">");
+
+$p->parse("<!DOCTYPE ");
+for ("aa" .. "fz") {
+ $p->parse("$_ -- $_ -- ");
+}
+$p->parse(">");
+$p->eof;
+exit;
+
diff --git a/t/linkextor-base.t b/t/linkextor-base.t
new file mode 100644
index 0000000..7ef8f02
--- /dev/null
+++ b/t/linkextor-base.t
@@ -0,0 +1,41 @@
+# This test that HTML::LinkExtor really absolutize links correctly
+# when a base URL is given to the constructor.
+
+use Test::More tests => 5;
+require HTML::LinkExtor;
+
+SKIP: {
+eval {
+ require URI;
+};
+skip $@, 5 if $@;
+
+# Try with base URL and the $p->links interface.
+$p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html");
+$p->parse(<<HTML)->eof;
+<head>
+<base href="http://www.sn.no/">
+</head>
+<body background="http://www.sn.no/sn.gif">
+
+This is <A HREF="link.html">link</a> and an <img SRC="img.jpg"
+lowsrc="img.gif" alt="Image">.
+HTML
+
+@p = $p->links;
+
+# There should be 4 links in the document
+is(@p, 4);
+
+for (@p) {
+ ($t, %attr) = @$_ if $_->[0] eq 'img';
+}
+
+is($t, 'img');
+
+is(delete $attr{src}, "http://www.sn.no/foo/img.jpg");
+
+is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif");
+
+ok(!scalar(keys %attr)); # there should be no more attributes
+}
diff --git a/t/linkextor-rel.t b/t/linkextor-rel.t
new file mode 100644
index 0000000..1190a96
--- /dev/null
+++ b/t/linkextor-rel.t
@@ -0,0 +1,36 @@
+use Test::More tests => 4;
+
+require HTML::LinkExtor;
+
+$HTML = <<HTML;
+<head>
+<base href="http://www.sn.no/">
+</head>
+<body background="http://www.sn.no/sn.gif">
+
+This is <A HREF="link.html">link</a> and an <img SRC="img.jpg"
+lowsrc="img.gif" alt="Image">.
+HTML
+
+
+# Try the callback interface
+$links = "";
+$p = HTML::LinkExtor->new(
+ sub {
+ my($tag, %links) = @_;
+ #diag "$tag @{[%links]}";
+ $links .= "$tag @{[%links]}\n";
+ });
+
+$p->parse($HTML); $p->eof;
+
+ok($links =~ m|^base href http://www\.sn\.no/$|m);
+ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m);
+ok($links =~ m|^a href link\.html$|m);
+
+# Used to be problems when using the links method on a document with
+# no links it it. This is a test to prove that it works.
+$p = new HTML::LinkExtor;
+$p->parse("this is a document with no links"); $p->eof;
+@a = $p->links;
+is(@a, 0);
diff --git a/t/magic.t b/t/magic.t
new file mode 100644
index 0000000..366f275
--- /dev/null
+++ b/t/magic.t
@@ -0,0 +1,41 @@
+# Check that the magic signature at the top of struct p_state works and that we
+# catch modifications to _hparser_xs_state gracefully
+
+use Test::More tests => 5;
+
+use HTML::Parser;
+
+$p = HTML::Parser->new(api_version => 3);
+
+$p->xml_mode(1);
+
+# We should not be able to simply modify this stuff
+eval {
+ ${$p->{_hparser_xs_state}} += 4;
+};
+like($@, qr/^Modification of a read-only value attempted/);
+
+
+my $x = delete $p->{_hparser_xs_state};
+
+eval {
+ $p->xml_mode(1);
+};
+like($@, qr/^Can't find '_hparser_xs_state'/);
+
+$p->{_hparser_xs_state} = \($$x + 16);
+
+eval {
+ $p->xml_mode(1);
+};
+like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/);
+
+$p->{_hparser_xs_state} = 33;
+eval {
+ $p->xml_mode(1);
+};
+like($@, qr/^_hparser_xs_state element is not a reference/);
+
+$p->{_hparser_xs_state} = $x;
+
+ok($p->xml_mode(0));
diff --git a/t/marked-sect.t b/t/marked-sect.t
new file mode 100644
index 0000000..6a63478
--- /dev/null
+++ b/t/marked-sect.t
@@ -0,0 +1,121 @@
+#!/usr/bin/perl -w
+
+use strict;
+my $tag;
+my $text;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"],
+ text_h => [sub { $text .= shift }, "dtext"],
+ );
+
+
+use Test::More tests => 14;
+
+SKIP: {
+eval {
+ $p->marked_sections(1);
+};
+skip $@, 14 if $@;
+
+$p->parse("<![[foo]]>");
+is($text, "foo");
+
+$p->parse("<![TEMP INCLUDE[bar]]>");
+is($text, "foobar");
+
+$p->parse("<![ INCLUDE -- IGNORE -- [foo<![IGNORE[bar]]>]]>\n<br>");
+is($text, "foobarfoo\n");
+
+$text = "";
+$p->parse("<![ CDATA [&lt;foo");
+$p->parse("<![IGNORE[bar]]>,bar&gt;]]><br>");
+is($text, "&lt;foo<![IGNORE[bar,bar>]]>");
+
+$text = "";
+$p->parse("<![ RCDATA [&aring;<a>]]><![CDATA[&aring;<a>]]>&aring;<a><br>");
+is($text, "å<a>&aring;<a>å");
+is($tag, "br");
+
+$text = "";
+$p->parse("<![INCLUDE RCDATA CDATA IGNORE [foo&aring;<a>]]><br>");
+is($text, "");
+
+$text = "";
+$p->parse("<![INCLUDE RCDATA CDATA [foo&aring;<a>]]><br>");
+is($text, "foo&aring;<a>");
+
+$text = "";
+$p->parse("<![INCLUDE RCDATA [foo&aring;<a>]]><br>");
+is($text, "fooå<a>");
+
+$text = "";
+$p->parse("<![INCLUDE [foo&aring;<a>]]><br>");
+is($text, "fooå");
+
+$text = "";
+$p->parse("<![[foo&aring;<a>]]><br>");
+is($text, "fooå");
+
+# offsets/line/column numbers
+$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"],
+ marked_sections => 1,
+ );
+$p->parse(<<'EOT')->eof;
+<title>Test</title>
+<![CDATA
+ [foo&aring;<a>
+]]>
+<![[
+INCLUDE
+STUFF
+]]>
+ <h1>Test</h1>
+EOT
+
+my @x;
+sub x {
+ my($line, $col, $offset, $event, $text) = @_;
+ $text =~ s/\n/\\n/g;
+ $text =~ s/ /./g;
+ push(@x, "$line.$col:$offset $event \"$text\"\n");
+}
+
+#diag @x;
+is(join("", @x), <<'EOT');
+1.0:0 start_document ""
+1.0:0 start "<title>"
+1.7:7 text "Test"
+1.11:11 end "</title>"
+1.19:19 text "\n"
+3.3:32 text "foo&aring;<a>\n"
+4.3:49 text "\n"
+5.4:54 text "\nINCLUDE\nSTUFF\n"
+8.3:72 text "\n.."
+9.2:75 start "<h1>"
+9.6:79 text "Test"
+9.10:83 end "</h1>"
+9.15:88 text "\n"
+10.0:89 end_document ""
+EOT
+
+my $doc = "<Tag><![CDATA[This is cdata]]></Tag>";
+my $result = "";
+$p = HTML::Parser->new(
+ marked_sections => 1,
+ handlers => {
+ default => [ sub { $result .= join("",@_); }, "skipped_text,text" ]
+ }
+)->parse($doc)->eof;
+is($doc, $result);
+
+$text = "";
+$p = HTML::Parser->new(
+ text_h => [sub { $text .= shift }, "dtext"],
+ marked_sections => 1,
+);
+
+$p->parse("<![CDATA[foo [1]]]>");
+is($text, "foo [1]", "CDATA text ending in square bracket");
+
+} # SKIP
diff --git a/t/msie-compat.t b/t/msie-compat.t
new file mode 100644
index 0000000..a297f1e
--- /dev/null
+++ b/t/msie-compat.t
@@ -0,0 +1,79 @@
+#!perl -w
+
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 4;
+
+my $TEXT = "";
+sub h
+{
+ my($event, $tagname, $text, @attr) = @_;
+ for ($event, $tagname, $text, @attr) {
+ if (defined) {
+ s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
+ }
+ else {
+ $_ = "<undef>";
+ }
+ }
+
+ $TEXT .= "[$event,$tagname,$text," . join(":", @attr) . "]\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text,\@attr"]);
+$p->parse("<a>");
+$p->parse("</a f>");
+$p->parse("</a 'foo<>' 'bar>' x>");
+$p->parse("</a \"foo<>\"");
+$p->parse(" \"bar>\" x>");
+$p->parse("</ foo bar>");
+$p->parse("</ \"<>\" >");
+$p->parse("<!--comment>text<!--comment><p");
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,,]
+[start,a,<a>,]
+[end,a,</a f>,]
+[end,a,</a 'foo<>' 'bar>' x>,]
+[end,a,</a "foo<>" "bar>" x>,]
+[comment, foo bar,</ foo bar>,]
+[comment, "<>" ,</ "<>" >,]
+[comment,comment,<!--comment>,]
+[text,<undef>,text,]
+[comment,comment,<!--comment>,]
+[comment,p,<p,]
+[end_document,<undef>,,]
+EOT
+
+$TEXT = "";
+$p->parse("<!comment>");
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,,]
+[comment,comment,<!comment>,]
+[end_document,<undef>,,]
+EOT
+
+$TEXT = "";
+$p->parse(q(<a name=`foo bar`>));
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,,]
+[start,a,<a name=`foo bar`>,name:`foo:bar`:bar`]
+[end_document,<undef>,,]
+EOT
+
+$p->backquote(1);
+$TEXT = "";
+$p->parse(q(<a name=`foo bar`>));
+$p->eof;
+
+is($TEXT, <<'EOT');
+[start_document,<undef>,,]
+[start,a,<a name=`foo bar`>,name:foo bar]
+[end_document,<undef>,,]
+EOT
diff --git a/t/offset.t b/t/offset.t
new file mode 100644
index 0000000..840728d
--- /dev/null
+++ b/t/offset.t
@@ -0,0 +1,58 @@
+use strict;
+use HTML::Parser ();
+use Test::More tests => 1;
+
+my $HTML = <<'EOT';
+
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
+ "http://www.w3.org/TR/html40/strict.dtd">
+
+<foo bar baz=3>heisan
+</foo> <?process>
+<!-- comment -->
+<xmp>xmp</xmp>
+
+EOT
+
+my $p = HTML::Parser->new(api_version => 3);
+
+my $sum_len = 0;
+my $count = 0;
+my $err;
+
+$p->handler(default =>
+ sub {
+ my($offset, $length, $offset_end, $line, $col, $text) = @_;
+ my $copy = $text;
+ $copy =~ s/\n/\\n/g;
+ substr($copy, 30) = "..." if length($copy) > 32;
+ #diag sprintf ">>> %d.%d %s", $line, $col, $copy;
+ if ($offset != $sum_len) {
+ diag "offset mismatch $offset vs $sum_len";
+ $err++;
+ }
+ if ($offset_end != $offset + $length) {
+ diag "offset_end $offset_end wrong";
+ $err++;
+ }
+ if ($length != length($text)) {
+ diag "length mismatch";
+ $err++;
+ }
+ if (substr($HTML, $offset, $length) ne $text) {
+ diag "content mismatch";
+ $err++;
+ }
+ $sum_len += $length;
+ $count++;
+ },
+ 'offset,length,offset_end,line,column,text');
+
+for (split(//, $HTML)) {
+ $p->parse($_);
+}
+$p->eof;
+
+ok($count > 5 && !$err);
+
+
diff --git a/t/options.t b/t/options.t
new file mode 100644
index 0000000..ff5f7db
--- /dev/null
+++ b/t/options.t
@@ -0,0 +1,36 @@
+# Test option setting methods
+
+use Test::More tests => 10;
+
+use strict;
+use HTML::Parser ();
+
+my $p = HTML::Parser->new(api_version => 3,
+ xml_mode => 1);
+my $old;
+
+$old = $p->boolean_attribute_value("foo");
+ok(!defined $old);
+
+$old = $p->boolean_attribute_value();
+is($old, "foo");
+
+$old = $p->boolean_attribute_value(undef);
+is($old, "foo");
+ok(!defined($p->boolean_attribute_value));
+
+ok($p->xml_mode(0));
+ok(!$p->xml_mode);
+
+my $seen_buggy_comment_warning;
+$SIG{__WARN__} =
+ sub {
+ local $_ = shift;
+ $seen_buggy_comment_warning++
+ if /^netscape_buggy_comment\(\) is deprecated/;
+ };
+
+ok(!$p->strict_comment(1));
+ok($p->strict_comment);
+ok(!$p->netscape_buggy_comment);
+ok($seen_buggy_comment_warning);
diff --git a/t/parsefile.t b/t/parsefile.t
new file mode 100644
index 0000000..f373f06
--- /dev/null
+++ b/t/parsefile.t
@@ -0,0 +1,45 @@
+use Test::More tests => 6;
+
+my $filename = "file$$.htm";
+die "$filename is already there" if -e $filename;
+open(FILE, ">$filename") || die "Can't create $filename: $!";
+print FILE <<'EOT'; close(FILE);
+<title>Heisan</title>
+EOT
+
+{
+ package MyParser;
+ require HTML::Parser;
+ @ISA=qw(HTML::Parser);
+
+ sub start
+ {
+ my($self, $tag, $attr) = @_;
+ Test::More::is($tag, "title");
+ }
+}
+
+MyParser->new->parse_file($filename);
+open(FILE, $filename) || die;
+MyParser->new->parse_file(*FILE);
+seek(FILE, 0, 0) || die;
+MyParser->new->parse_file(\*FILE);
+close(FILE);
+
+require IO::File;
+my $io = IO::File->new($filename) || die;
+MyParser->new->parse_file($io);
+$io->seek(0, 0) || die;
+MyParser->new->parse_file(*$io);
+
+my $text = '';
+$io->seek(0, 0) || die;
+MyParser->new(
+ start_h => [ sub{ shift->eof; }, "self" ],
+ text_h => [ sub{ $text = shift; }, "text" ])->parse_file(*$io);
+ok(!$text);
+
+close($io); # needed because of bug in perl
+undef($io);
+
+unlink($filename) or warn "Can't unlink $filename: $!";
diff --git a/t/parser.t b/t/parser.t
new file mode 100644
index 0000000..0ce4d95
--- /dev/null
+++ b/t/parser.t
@@ -0,0 +1,184 @@
+use Test::More tests => 7;
+
+$HTML = <<'HTML';
+
+<!DOCTYPE HTML>
+
+<body>
+
+Various entities. The parser must never break them in the middle:
+
+&#x2F
+&#x2F;
+&#200
+&#3030;
+&#XFFFF;
+&aring-&Aring;
+
+<ul>
+<li><a href="foo 'bar' baz>" id=33>This is a link</a>
+<li><a href='foo "bar" baz> &aring' id=34>This is another one</a>
+</ul>
+
+<p><div align="center"><img src="http://www.perl.com/perl.gif"
+alt="camel"></div>
+
+<!-- this is
+a comment --> and this is not.
+
+<!-- this is the kind of >comment< -- --> that Netscape hates -->
+
+< this > was not a tag. <this is/not either>
+
+</body>
+
+HTML
+
+#-------------------------------------------------------------------
+
+{
+ package P;
+ require HTML::Parser;
+ @ISA=qw(HTML::Parser);
+ $OUT='';
+ $COUNT=0;
+
+ sub new
+ {
+ my $class = shift;
+ my $self = $class->SUPER::new;
+ $OUT = '';
+ die "Can only have one" if $COUNT++;
+ $self;
+ }
+
+ sub DESTROY
+ {
+ my $self = shift;
+ eval { $self->SUPER::DESTROY; };
+ $COUNT--;
+ }
+
+ sub declaration
+ {
+ my($self, $decl) = @_;
+ $OUT .= "[[$decl]]|";
+ }
+
+ sub start
+ {
+ my($self, $tag, $attr) = @_;
+ $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr);
+ $attr = "/$attr" if length $attr;
+ $OUT .= "<<$tag$attr>>|";
+ }
+
+ sub end
+ {
+ my($self, $tag) = @_;
+ $OUT .= ">>$tag<<|";
+ }
+
+ sub comment
+ {
+ my($self, $comment) = @_;
+ $OUT .= "##$comment##|";
+ }
+
+ sub text
+ {
+ my($self, $text) = @_;
+ #$text =~ s/\n/\\n/g;
+ #$text =~ s/\t/\\t/g;
+ #$text =~ s/ /·/g;
+ $OUT .= "$text|";
+ }
+
+ sub result
+ {
+ $OUT;
+ }
+}
+
+for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") {
+#for $chunksize (1) {
+ if ($chunksize =~ /^file/) {
+ #print "Parsing from $chunksize";
+ } else {
+ #print "Parsing using $chunksize byte chunks";
+ }
+ my $p = P->new;
+
+ if ($chunksize =~ /^file/) {
+ # First we must create the file
+ my $tmpfile = "tmp-$$.html";
+ my $file = $tmpfile;
+ die "$file already exists" if -e $file;
+ open(FILE, ">$file") or die "Can't create $file: $!";
+ binmode FILE;
+ print FILE $HTML;
+ close(FILE);
+
+ if ($chunksize eq "filehandle") {
+ require FileHandle;
+ my $fh = FileHandle->new($file) || die "Can't open $file: $!";
+ $file = $fh;
+ }
+
+ # then we can parse it.
+ $p->parse_file($file);
+ close $file if $chunksize eq "filehandle";
+ unlink($tmpfile) || warn "Can't unlink $tmpfile: $!";
+ } else {
+ my $copy = $HTML;
+ while (length $copy) {
+ my $chunk = substr($copy, 0, $chunksize);
+ substr($copy, 0, $chunksize) = '';
+ $p->parse($chunk);
+ }
+ $p->eof;
+ }
+
+ my $res = $p->result;
+ my $bad;
+
+ # Then we start looking for things that should not happen
+ if ($res =~ /\s\|\s/) {
+ diag "broken space";
+ $bad++;
+ }
+ for (
+ # Make sure entities are not broken
+ '&#x2F', '&#x2F;', '&#200', '&#3030;', '&#XFFFF;', '&aring', '&Aring',
+
+ # Some elements that should be produced
+ "|[[DOCTYPE HTML]]|",
+ "|## this is\na comment ##|",
+ "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|",
+ '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>',
+ "|>>ul<<|", "|>>body<<|\n\n|",
+ )
+ {
+ if (index($res, $_) < 0) {
+ diag "Can't find '$_' in parsed document";
+ $bad++;
+ }
+ }
+
+ diag $res if $bad || $ENV{PRINT_RESULTS};
+
+ # And we check that we get the same result all the time
+ $res =~ s/\|//g; # remove all break marks
+ if ($last_res && $res ne $last_res) {
+ diag "The result is not the same as last time";
+ $bad++;
+ }
+ $last_res = $res;
+
+ unless ($res =~ /Various entities/) {
+ diag "Some text must be missing";
+ $bad++;
+ }
+
+ ok(!$bad);
+}
diff --git a/t/plaintext.t b/t/plaintext.t
new file mode 100644
index 0000000..9a53a78
--- /dev/null
+++ b/t/plaintext.t
@@ -0,0 +1,58 @@
+use Test::More tests => 3;
+
+use strict;
+use HTML::Parser;
+
+my @a;
+my $p = HTML::Parser->new(api_version => 3);
+$p->handler(default => \@a, '@{event, text, is_cdata}');
+$p->parse(<<EOT)->eof;
+<xmp><foo></xmp>x<plaintext><foo>
+</plaintext>
+foo
+EOT
+
+for (@a) {
+ $_ = "" unless defined;
+}
+
+my $doc = join(":", @a);
+
+#diag $doc;
+
+is($doc, "start_document:::start:<xmp>::text:<foo>:1:end:</xmp>::text:x::start:<plaintext>::text:<foo>
+</plaintext>
+foo
+:1:end_document::");
+
+@a = ();
+$p->closing_plaintext('yep, emulate gecko');
+$p->parse(<<EOT)->eof;
+<plaintext><foo>
+</plaintext>foo<b></b>
+EOT
+
+for (@a) {
+ $_ = "" unless defined;
+}
+
+$doc = join(":", @a);
+
+#diag $doc;
+
+is($doc, "start_document:::start:<plaintext>::text:<foo>
+:1:end:</plaintext>::text:foo::start:<b>::end:</b>::text:
+::end_document::");
+
+@a = ();
+$p->closing_plaintext('yep, emulate gecko (2)');
+$p->parse(<<EOT)->eof;
+<plaintext><foo>
+foo<b></b>
+EOT
+
+$doc = join(":", map { defined $_ ? $_ : "" } @a);
+
+is($doc, "start_document:::start:<plaintext>::text:<foo>
+foo<b></b>
+:1:end_document::");
diff --git a/t/pod.t b/t/pod.t
new file mode 100644
index 0000000..437887a
--- /dev/null
+++ b/t/pod.t
@@ -0,0 +1,4 @@
+use Test::More;
+eval "use Test::Pod 1.00";
+plan skip_all => "Test::Pod 1.00 required for testing POD" if $@;
+all_pod_files_ok();
diff --git a/t/process.t b/t/process.t
new file mode 100644
index 0000000..9d27250
--- /dev/null
+++ b/t/process.t
@@ -0,0 +1,43 @@
+use strict;
+
+use Test::More tests => 12;
+
+my $pi;
+my $orig;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(process_h => [sub { $pi = shift; $orig = shift; },
+ "token0,text"]
+ );
+
+$p->parse("<a><?foo><a>");
+
+is($pi, "foo");
+is($orig, "<?foo>");
+
+$p->parse("<a><?><a>");
+is($pi, "");
+is($orig, "<?>");
+
+$p->parse("<a><?
+foo
+><a>");
+is($pi, "\nfoo\n");
+is($orig, "<?\nfoo\n>");
+
+for (qw(< a > < ? b a r > < a >)) {
+ $p->parse($_);
+}
+
+is($pi, "bar");
+is($orig, "<?bar>");
+
+$p->xml_mode(1);
+
+$p->parse("<a><?foo>bar??><a>");
+is($pi, "foo>bar?");
+is($orig, "<?foo>bar??>");
+
+$p->parse("<a><??></a>");
+is($pi, "");
+is($orig, "<??>");
diff --git a/t/pullparser.t b/t/pullparser.t
new file mode 100644
index 0000000..80a186b
--- /dev/null
+++ b/t/pullparser.t
@@ -0,0 +1,55 @@
+use Test::More tests => 3;
+
+use HTML::PullParser;
+
+my $doc = <<'EOT';
+<title>Title</title>
+<style> h1 { background: white }
+<foo>
+</style>
+<H1 ID="3">Heading</H1>
+<!-- ignore this -->
+
+This is a text with a <A HREF="http://www.sol.no" name="l1">link</a>.
+EOT
+
+my $p = HTML::PullParser->new(doc => $doc,
+ start => 'event,tagname,@attr',
+ end => 'event,tagname',
+ text => 'event,dtext',
+
+ ignore_elements => [qw(script style)],
+ unbroken_text => 1,
+ boolean_attribute_value => 1,
+ );
+
+my $t = $p->get_token;
+is($t->[0], "start");
+is($t->[1], "title");
+$p->unget_token($t);
+
+my @a;
+while (my $t = $p->get_token) {
+ for (@$t) {
+ s/\s/./g;
+ }
+ push(@a, join("|", @$t));
+}
+
+my $res = join("\n", @a, "");
+#diag $res;
+is($res, <<'EOT');
+start|title
+text|Title
+end|title
+text|..
+start|h1|id|3
+text|Heading
+end|h1
+text|...This.is.a.text.with.a.
+start|a|href|http://www.sol.no|name|l1
+text|link
+end|a
+text|..
+EOT
+
diff --git a/t/script.t b/t/script.t
new file mode 100644
index 0000000..2a75ccb
--- /dev/null
+++ b/t/script.t
@@ -0,0 +1,41 @@
+#!perl -w
+
+use strict;
+use Test;
+plan tests => 1;
+
+use HTML::Parser;
+
+my $TEXT = "";
+sub h
+{
+ my($event, $tagname, $text) = @_;
+ for ($event, $tagname, $text) {
+ if (defined) {
+ s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge;
+ }
+ else {
+ $_ = "<undef>";
+ }
+ }
+
+ $TEXT .= "[$event,$tagname,$text]\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"], empty_element_tags => 1);
+$p->parse(q(<tr><td align="center" height="100"><script src="whatever"/><SCRIPT language="JavaScript1.1">bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');</SCRIPT></td></tr>));
+$p->eof;
+
+ok($TEXT, <<'EOT');
+[start_document,<undef>,]
+[start,tr,<tr>]
+[start,td,<td align="center" height="100">]
+[start,script,<script src="whatever"/>]
+[end,script,]
+[start,script,<SCRIPT language="JavaScript1.1">]
+[text,<undef>,bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');]
+[end,script,</SCRIPT>]
+[end,td,</td>]
+[end,tr,</tr>]
+[end_document,<undef>,]
+EOT
diff --git a/t/skipped-text.t b/t/skipped-text.t
new file mode 100644
index 0000000..bc39915
--- /dev/null
+++ b/t/skipped-text.t
@@ -0,0 +1,89 @@
+use Test::More tests => 4;
+
+use strict;
+use HTML::Parser;
+
+my $p = HTML::Parser->new(api_version => 3);
+
+$p->report_tags("a");
+
+my @doc;
+
+$p->handler(start => \&a_handler, "skipped_text, text");
+$p->handler(end_document => \@doc, '@{skipped_text}');
+
+$p->parse(<<EOT)->eof;
+<title>hi</title>
+<h1><a href="foo">link</a></h1>
+and <a foo="">some</a> text.
+EOT
+
+sub a_handler {
+ push(@doc, shift);
+ my $text = shift;
+ push(@doc, uc($text));
+}
+
+
+is(join("", @doc), <<'EOT');
+<title>hi</title>
+<h1><A HREF="FOO">link</a></h1>
+and <A FOO="">some</a> text.
+EOT
+
+#
+# Comment stripper. Interaction with "" handlers.
+#
+my $doc = <<EOT;
+<html>text</html>
+<!-- comment -->
+and some more <b>text</b>.
+EOT
+(my $expected = $doc) =~ s/<!--.*?-->//;
+
+$p = HTML::Parser->new(api_version => 3);
+$p->handler(comment => "");
+$p->handler(end_document => sub {
+ my $stripped = shift;
+ #diag $stripped;
+ is($stripped, $expected);
+ }, "skipped_text");
+for (split(//, $doc)) {
+ $p->parse($_);
+}
+$p->eof;
+
+#
+# Interaction with unbroken text
+#
+my @x;
+$p = HTML::Parser->new(api_version => 3, unbroken_text => 1);
+$p->handler(text => \@x, '@{"X", skipped_text, text}');
+$p->handler(end => "");
+$p->handler(end_document => \@x, '@{"Y", skipped_text}');
+
+$doc = "a a<a>b b</a>c c<x>d d</x>e";
+
+for (split(//, $doc)) {
+ $p->parse($_);
+}
+$p->eof;
+
+#diag join(":", @x);
+is(join(":", @x), "X::a a:X:<a>:b bc c:X:<x>:d de:Y:");
+
+#
+# The crash that Chip found
+#
+
+my $skipped;
+$p = HTML::Parser->new(
+ ignore_tags => ["foo"],
+ start_h => [sub {$skipped = shift}, "skipped_text"],
+);
+
+$p->parse("\x{100}<foo>");
+$p->parse("plain");
+$p->parse("<bar>");
+$p->eof;
+is($skipped, "\x{100}<foo>plain");
diff --git a/t/stack-realloc.t b/t/stack-realloc.t
new file mode 100644
index 0000000..46c7d35
--- /dev/null
+++ b/t/stack-realloc.t
@@ -0,0 +1,17 @@
+#!perl -w
+
+# HTML-Parser 3.33 and older used to core dump on this program because
+# of missing SPAGAIN calls in parse() XS code. It was not prepared for
+# the stack to get realloced.
+
+$| = 1;
+
+use Test::More tests => 1;
+
+use HTML::Parser;
+my $x = HTML::Parser->new(api_version => 3);
+my @row;
+$x->handler(end => sub { push(@row, (1) x 505); 1 }, "tagname");
+$x->parse("</TD>");
+
+pass;
diff --git a/t/textarea.t b/t/textarea.t
new file mode 100644
index 0000000..120f79b
--- /dev/null
+++ b/t/textarea.t
@@ -0,0 +1,70 @@
+use Test::More tests => 1;
+
+use strict;
+use HTML::Parser;
+
+my $html = <<'EOT';
+<html>
+<title>This is a <nice> title</title>
+<!--comment-->
+<script language="perl">while (<DATA>) { &amp; }</script>
+
+<FORM>
+
+<textarea name="foo" cols=50 rows=10>
+
+foo
+<foo>
+<!--comment-->
+&amp;
+foo
+</FORM>
+
+</textarea>
+
+</FORM>
+
+</html>
+EOT
+
+my $dump = "";
+sub tdump {
+ my @a = @_;
+ for (@a) {
+ $_ = "<undef>" unless defined;
+ s/\n/\\n/g;
+ }
+ $dump .= join("|", @a) . "\n";
+}
+
+my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]);
+$p->parse($html)->eof;
+
+#diag $dump;
+
+is($dump, <<'EOT');
+start_document||<undef>|<undef>
+start|<html>|<undef>|<undef>
+text|\n|\n|
+start|<title>|<undef>|<undef>
+text|This is a <nice> title|This is a <nice> title|
+end|</title>|<undef>|<undef>
+text|\n|\n|
+comment|<!--comment-->|<undef>|<undef>
+text|\n|\n|
+start|<script language="perl">|<undef>|<undef>
+text|while (<DATA>) { &amp; }|while (<DATA>) { &amp; }|1
+end|</script>|<undef>|<undef>
+text|\n\n|\n\n|
+start|<FORM>|<undef>|<undef>
+text|\n\n|\n\n|
+start|<textarea name="foo" cols=50 rows=10>|<undef>|<undef>
+text|\n\nfoo\n<foo>\n<!--comment-->\n&amp;\nfoo\n</FORM>\n\n|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n|
+end|</textarea>|<undef>|<undef>
+text|\n\n|\n\n|
+end|</FORM>|<undef>|<undef>
+text|\n\n|\n\n|
+end|</html>|<undef>|<undef>
+text|\n|\n|
+end_document||<undef>|<undef>
+EOT
diff --git a/t/threads.t b/t/threads.t
new file mode 100644
index 0000000..8da91e9
--- /dev/null
+++ b/t/threads.t
@@ -0,0 +1,39 @@
+# Verify thread safety.
+
+use Config;
+use Test::More;
+
+BEGIN {
+ plan(skip_all => "Not configured for threads")
+ unless $Config{useithreads} && $] >= 5.008;
+ plan(tests => 1);
+}
+
+use threads;
+use HTML::Parser;
+
+my $ok=0;
+
+sub start
+{
+ my($tag,$attr)=@_;
+
+ $ok += ($tag eq "foo");
+ $ok += (defined($attr->{param}) && $attr->{param} eq "bar");
+}
+
+my $p = HTML::Parser->new
+ (api_version => 3,
+ handlers => {
+ start => [\&start, "tagname,attr"],
+ });
+
+$p->parse("<foo pa");
+
+$ok=async {
+ $p->parse("ram=bar>");
+ $ok;
+}->join();
+
+is($ok,2);
+
diff --git a/t/tokeparser.t b/t/tokeparser.t
new file mode 100644
index 0000000..2084201
--- /dev/null
+++ b/t/tokeparser.t
@@ -0,0 +1,164 @@
+use Test::More tests => 17;
+
+use strict;
+use HTML::TokeParser;
+
+# First we create an HTML document to test
+
+my $file = "ttest$$.htm";
+die "$file already exists" if -e $file;
+
+open(F, ">$file") or die "Can't create $file: $!";
+print F <<'EOT'; close(F);
+
+<!--This is a test-->
+<html><head><title>
+ This is the &lt;title&gt;
+</title>
+
+ <base href="http://www.perl.com">
+</head>
+
+<body background="bg.gif">
+
+ <h1>This is the <b>title</b> again
+ </h1>
+
+ And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl">&nbsp;<!--nice isn't it-->Institute</a>
+
+ <br/><? process instruction >
+
+</body>
+</html>
+
+EOT
+
+END { unlink($file) || warn "Can't unlink $file: $!"; }
+
+
+my $p;
+
+
+$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
+ok($p->unbroken_text);
+if ($p->get_tag("foo", "title")) {
+ my $title = $p->get_trimmed_text;
+ #diag "Title: $title";
+ is($title, "This is the <title>");
+}
+undef($p);
+
+# Test with reference to glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(\*F);
+my $scount = 0;
+my $ecount = 0;
+my $tcount = 0;
+my $pcount = 0;
+while (my $token = $p->get_token) {
+ $scount++ if $token->[0] eq "S";
+ $ecount++ if $token->[0] eq "E";
+ $pcount++ if $token->[0] eq "PI";
+}
+undef($p);
+close F;
+
+# Test with glob
+open(F, $file) || die "Can't open $file: $!";
+$p = HTML::TokeParser->new(*F);
+$tcount++ while $p->get_tag;
+undef($p);
+close F;
+
+# Test with plain file name
+$p = HTML::TokeParser->new($file) || die;
+$tcount++ while $p->get_tag;
+undef($p);
+
+#diag "Number of tokens found: $tcount/2 = $scount + $ecount";
+is($tcount, 34);
+is($scount, 10);
+is($ecount, 7);
+is($pcount, 1);
+is($tcount/2, $scount + $ecount);
+
+ok(!HTML::TokeParser->new("/noT/thEre/$$"));
+
+
+$p = HTML::TokeParser->new($file) || die;
+$p->get_tag("a");
+my $atext = $p->get_text;
+undef($p);
+
+is($atext, "Perl\240Institute");
+
+# test parsing of embeded document
+$p = HTML::TokeParser->new(\<<HTML);
+<title>Title</title>
+<H1>
+Heading
+</h1>
+HTML
+
+ok($p->get_tag("h1"));
+is($p->get_trimmed_text, "Heading");
+undef($p);
+
+# test parsing of large embedded documents
+my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022;
+
+#use Time::HiRes qw(time);
+my $start = time;
+$p = HTML::TokeParser->new(\$doc);
+#diag "Construction time: ", time - $start;
+
+my $count;
+while (my $t = $p->get_token) {
+ $count++ if $t->[0] eq "S";
+}
+#diag "Parse time: ", time - $start;
+
+is($count, 2022);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+<H1>This is a heading</H1>
+This is s<b>o</b>me<hr>text.
+<br />
+This is some more text.
+<p>
+This is even some more.
+EOT
+
+$p->get_tag("/h1");
+
+my $t = $p->get_trimmed_text("br", "p");
+is($t, "This is some text.");
+
+$p->get_tag;
+
+$t = $p->get_trimmed_text("br", "p");
+is($t,"This is some more text.");
+
+undef($p);
+
+$p = HTML::TokeParser->new(\<<'EOT');
+<H1>This is a <b>bold</b> heading</H1>
+This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>.
+<p>
+This is even some more.
+EOT
+
+$p->get_tag("h1");
+
+$t = $p->get_phrase;
+is($t, "This is a bold heading");
+
+$t = $p->get_phrase;
+is($t, "");
+
+$p->get_tag;
+
+$t = $p->get_phrase;
+is($t, "This is some italic text. This is some more text.");
+
+undef($p);
diff --git a/t/uentities.t b/t/uentities.t
new file mode 100644
index 0000000..36d5179
--- /dev/null
+++ b/t/uentities.t
@@ -0,0 +1,65 @@
+# Test Unicode entities
+
+use HTML::Entities;
+
+use Test::More tests => 26;
+
+SKIP: {
+skip "This perl does not support Unicode or Unicode entities not selected",
+ 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT;
+
+is(decode_entities("&euro"), "&euro");
+is(decode_entities("&euro;"), "\x{20AC}");
+
+is(decode_entities("&aring"), "å");
+is(decode_entities("&aring;"), "å");
+
+is(decode_entities("&#500000"), chr(500000));
+
+is(decode_entities("&#x10FFFD"), "\x{10FFFD}");
+
+is(decode_entities("&#xFFFC"), "\x{FFFC}");
+
+
+is(decode_entities("&#xFDD0"), "\x{FFFD}");
+is(decode_entities("&#xFDD1"), "\x{FFFD}");
+is(decode_entities("&#xFDE0"), "\x{FFFD}");
+is(decode_entities("&#xFDEF"), "\x{FFFD}");
+is(decode_entities("&#xFFFF"), "&#xFFFF");
+is(decode_entities("&#x10FFFF"), "\x{FFFD}");
+is(decode_entities("&#x110000"), "&#x110000");
+is(decode_entities("&#XFFFFFFFF"), "&#XFFFFFFFF");
+
+is(decode_entities("&#0"), "&#0");
+is(decode_entities("&#0;"), "&#0;");
+is(decode_entities("&#x0"), "&#x0");
+is(decode_entities("&#X0;"), "&#X0;");
+
+is(decode_entities("&#&aring&#229&#229;&#xFFF"), "&#ååå\x{FFF}");
+
+# This might fail when we get more than 64 bit UVs
+is(decode_entities("&#0009999999999999999999999999999;"), "&#0009999999999999999999999999999;");
+is(decode_entities("&#xFFFF0000FFFF0000FFFF1"), "&#xFFFF0000FFFF0000FFFF1");
+
+my $err;
+for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) {
+ my $a = join("", map chr, $_->[0] .. $_->[1]);
+
+ my $e = encode_entities($a);
+ my $d = decode_entities($e);
+
+ unless ($d eq $a) {
+ diag "Wrong decoding in range $_->[0] .. $_->[1]";
+ # use Devel::Peek; Dump($a); Dump($d);
+ $err++;
+ }
+}
+ok(!$err);
+
+
+is(decode_entities("&#56256;&#56453;"), chr(0x100085));
+
+is(decode_entities("&#56256"), chr(0xFFFD));
+
+is(decode_entities("\260&rsquo;\260"), "\x{b0}\x{2019}\x{b0}");
+}
diff --git a/t/unbroken-text.t b/t/unbroken-text.t
new file mode 100644
index 0000000..7de85a9
--- /dev/null
+++ b/t/unbroken-text.t
@@ -0,0 +1,60 @@
+use strict;
+use HTML::Parser;
+
+use Test::More tests => 3;
+
+my $text = "";
+sub text
+{
+ my $cdata = shift() ? "CDATA" : "TEXT";
+ my($offset, $line, $col, $t) = @_;
+ $text .= "[$cdata:$offset:$line.$col:$t]";
+}
+
+sub tag
+{
+ $text .= shift;
+}
+
+my $p = HTML::Parser->new(unbroken_text => 1,
+ text_h => [\&text, "is_cdata,offset,line,column,text"],
+ start_h => [\&tag, "text"],
+ end_h => [\&tag, "text"],
+ );
+
+$p->parse("foo ");
+$p->parse("bar ");
+$p->parse("<foo>");
+$p->parse("bar\n");
+$p->parse("</foo>");
+$p->parse("<xmp>xmp</xmp>");
+$p->parse("atend");
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foo bar ]<foo>[TEXT:13:1.13:bar\n]</foo><xmp>[CDATA:28:2.11:xmp]</xmp>");
+
+$text = "";
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:37:2.20:atend]");
+
+
+$p = HTML::Parser->new(unbroken_text => 1,
+ text_h => [\&text, "is_cdata,offset,line,column,text"],
+ );
+
+$text = "";
+$p->parse("foo");
+$p->parse("<foo");
+$p->parse(">bar\n");
+$p->parse("foo<xm");
+$p->parse("p>xmp");
+$p->parse("</xmp");
+$p->parse(">bar");
+$p->eof;
+
+#diag $text;
+is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]");
+
+
diff --git a/t/unicode-bom.t b/t/unicode-bom.t
new file mode 100644
index 0000000..b7398cf
--- /dev/null
+++ b/t/unicode-bom.t
@@ -0,0 +1,63 @@
+#!perl -w
+
+use strict;
+use Test::More tests => 2;
+use HTML::Parser;
+
+SKIP: {
+skip "This perl does not support Unicode", 2 if $] < 5.008;
+
+my @parsed;
+my $p = HTML::Parser->new(
+ api_version => 3,
+ start_h => [\@parsed, 'tag, attr'],
+);
+
+my @warn;
+$SIG{__WARN__} = sub {
+ push(@warn, $_[0]);
+};
+
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>");
+$p->eof;
+
+#use Encode;
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>" . chr(0x263A));
+$p->eof;
+
+$p->parse("\xFF\xFE<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\xFE\xFF<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\0\0\xFF\xFE<head>Hi there</head>");
+$p->eof;
+
+$p->parse("\xFE\xFF\0\0<head>Hi there</head>");
+$p->eof;
+
+for (@warn) {
+ s/line (\d+)/line ##/g;
+}
+
+is(join("", @warn), <<EOT);
+Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##.
+Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##.
+Parsing of undecoded UTF-16 at $0 line ##.
+Parsing of undecoded UTF-16 at $0 line ##.
+Parsing of undecoded UTF-32 at $0 line ##.
+Parsing of undecoded UTF-32 at $0 line ##.
+EOT
+
+@warn = ();
+
+$p = HTML::Parser->new(
+ api_version => 3,
+ start_h => [\@parsed, 'tag'],
+);
+
+$p->parse("\xEF\xBB\xBF<head>Hi there</head>");
+$p->eof;
+ok(!@warn);
+}
diff --git a/t/unicode.t b/t/unicode.t
new file mode 100644
index 0000000..911c547
--- /dev/null
+++ b/t/unicode.t
@@ -0,0 +1,198 @@
+#!perl -w
+
+use strict;
+use HTML::Parser;
+use Test::More;
+BEGIN {
+ plan skip_all => "This perl does not support Unicode" if $] < 5.008;
+}
+
+plan tests => 105;
+
+my @warn;
+$SIG{__WARN__} = sub {
+ push(@warn, $_[0]);
+};
+
+my @parsed;
+my $p = HTML::Parser->new(
+ api_version => 3,
+ default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'],
+);
+
+my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile &#x263a</h1>\x{0420}";
+is(length($doc), 46);
+
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "<title>");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 0x263A);
+is($parsed[2][2], chr(0x263A));
+is($parsed[2][3], 7);
+is($parsed[2][4], 1);
+is($parsed[2][5], 8);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "</title>");
+is($parsed[3][3], 8);
+is($parsed[3][6], 8);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "<h1 id=\x{2600} f>");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0");
+is($parsed[4][8]{id}, "\x{2600}");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile &#x263a");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[7][0], "text");
+is($parsed[7][1], "\x{0420}");
+is($parsed[7][2], "\x{0420}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+is(@warn, 0);
+
+# Try to parse it as an UTF8 encoded string
+utf8::encode($doc);
+is(length($doc), 51);
+
+@parsed = ();
+$p->parse($doc)->eof;
+
+#use Data::Dump; Data::Dump::dump(@parsed);
+
+is(@parsed, 9);
+is($parsed[0][0], "start_document");
+
+is($parsed[1][0], "start");
+is($parsed[1][1], "<title>");
+SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") };
+is($parsed[1][3], 0);
+is($parsed[1][4], 7);
+
+is($parsed[2][0], "text");
+is(ord($parsed[2][1]), 226);
+is($parsed[2][1], "\xE2\x98\xBA");
+is($parsed[2][2], "\xE2\x98\xBA");
+is($parsed[2][3], 7);
+is($parsed[2][4], 3);
+is($parsed[2][5], 10);
+is($parsed[2][6], 7);
+
+is($parsed[3][0], "end");
+is($parsed[3][1], "</title>");
+is($parsed[3][3], 10);
+is($parsed[3][6], 10);
+
+is($parsed[4][0], "start");
+is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>");
+is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0");
+is($parsed[4][8]{id}, "\xE2\x98\x80");
+
+is($parsed[5][0], "text");
+is($parsed[5][1], "Smile &#x263a");
+is($parsed[5][2], "Smile \x{263A}");
+
+is($parsed[8][0], "end_document");
+is($parsed[8][3], length($doc));
+is($parsed[8][5], length($doc));
+is($parsed[8][6], length($doc));
+
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+my $file = "test-$$.html";
+open(my $fh, ">:utf8", $file) || die;
+print $fh <<EOT;
+\x{FEFF}
+<title>\x{263A} Love! </title>
+<h1 id=&hearts;\x{2665}>&hearts; Love \x{2665}<h1>
+EOT
+close($fh) || die;
+
+@warn = ();
+@parsed = ();
+$p->parse_file($file);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
+is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage
+is($parsed[10][3], -s $file);
+is(@warn, 1);
+like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw:utf8", $file) || die;
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\x{2665}\x{2665}");
+is($parsed[7][0], "text");
+is($parsed[7][1], "&hearts; Love \x{2665}");
+is($parsed[7][2], "\x{2665} Love \x{2665}");
+is($parsed[10][3], (-s $file) - 2 * 4);
+is(@warn, 0);
+
+@warn = ();
+@parsed = ();
+open($fh, "<:raw", $file) || die;
+$p->utf8_mode(1);
+$p->parse_file($fh);
+is(@parsed, "11");
+is($parsed[6][0], "start");
+is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5");
+is($parsed[7][0], "text");
+is($parsed[7][1], "&hearts; Love \xE2\x99\xA5");
+is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5");
+is($parsed[10][3], -s $file);
+is(@warn, 0);
+
+unlink($file);
+
+@parsed = ();
+$p->parse(q(<a href="a=1&lang=2&times=3">foo</a>))->eof;
+is(@parsed, "5");
+is($parsed[1][0], "start");
+is($parsed[1][8]{href}, "a=1&lang=2\xd7=3");
+
+ok(!HTML::Entities::_probably_utf8_chunk(""));
+ok(!HTML::Entities::_probably_utf8_chunk("f"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2"));
+ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2"));
+ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99"));
+
+$p = HTML::Parser->new(
+ api_version => 3,
+ default_h => [\@parsed, 'event, text, tag, attr'],
+ attr_encoded => 1,
+);
+
+@warn = ();
+@parsed = ();
+
+$p->parse($doc)->eof;
+
+ok(!@warn);
+is(@parsed, 9);
diff --git a/t/xml-mode.t b/t/xml-mode.t
new file mode 100644
index 0000000..cdfc5b0
--- /dev/null
+++ b/t/xml-mode.t
@@ -0,0 +1,112 @@
+use strict;
+use Test::More tests => 8;
+
+use HTML::Parser ();
+my $p = HTML::Parser->new(xml_mode => 1,
+ );
+
+my $text = "";
+$p->handler(start =>
+ sub {
+ my($tag, $attr) = @_;
+ $text .= "S[$tag";
+ for my $k (sort keys %$attr) {
+ my $v = $attr->{$k};
+ $text .= " $k=$v";
+ }
+ $text .= "]";
+ }, "tagname,attr");
+$p->handler(end =>
+ sub {
+ $text .= "E[" . shift() . "]";
+ }, "tagname");
+$p->handler(process =>
+ sub {
+ $text .= "PI[" . shift() . "]";
+ }, "token0");
+$p->handler(text =>
+ sub {
+ $text .= shift;
+ }, "text");
+
+my $xml = <<'EOT';
+<?xml version="1.0"?>
+<?IS10744:arch name="html"?><!-- comment -->
+<DOC>
+<title html="h1">My first architectual document</title>
+<author html="address">Geir Ove Gronmo, grove@infotek.no</author>
+<para>This is the first paragraph in this document</para>
+<para html="p">This is the second paragraph</para>
+<para/>
+<xmp><foo></foo></xmp>
+</DOC>
+EOT
+
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"]
+PI[IS10744:arch name="html"]
+S[DOC]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para]E[para]
+S[xmp]S[foo]E[foo]E[xmp]
+E[DOC]
+EOT
+
+$text = "";
+$p->xml_mode(0);
+$p->parse($xml)->eof;
+
+is($text, <<'EOT');
+PI[xml version="1.0"?]
+PI[IS10744:arch name="html"?]
+S[doc]
+S[title html=h1]My first architectual documentE[title]
+S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author]
+S[para]This is the first paragraph in this documentE[para]
+S[para html=p]This is the second paragraphE[para]
+S[para/]
+S[xmp]<foo></foo>E[xmp]
+E[doc]
+EOT
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+ xml_mode => 1);
+
+$p->handler("end" =>
+ sub {
+ my($tagname, $text) = @_;
+ is($tagname, "Xyzzy");
+ ok(!length($text));
+ }, "tagname,text");
+$p->parse("<Xyzzy foo=bar/>and some more")->eof;
+
+# Test that we get an empty tag back
+$p = HTML::Parser->new(api_version => 3,
+ empty_element_tags => 1);
+
+$p->handler("end" =>
+ sub {
+ my($tagname, $text) = @_;
+ is($tagname, "xyzzy");
+ ok(!length($text));
+ }, "tagname,text");
+$p->parse("<Xyzzy foo=bar/>and some more")->eof;
+
+$p = HTML::Parser->new(
+ api_version => 3,
+ xml_pic => 1,
+);
+
+$p->handler(
+ "process" => sub {
+ my($text, $t0) = @_;
+ is($text, "<?foo > bar?>");
+ is($t0, "foo > bar");
+ }, "text, token0");
+$p->parse("<?foo > bar?> and then")->eof;