diff options
author | Lorry Tar Creator <lorry-tar-importer@lorry> | 2013-05-08 22:21:52 +0000 |
---|---|---|
committer | Lorry Tar Creator <lorry-tar-importer@lorry> | 2013-05-08 22:21:52 +0000 |
commit | 2f253cfc85ffd55a8acb988e91f0bc5ab348124c (patch) | |
tree | 4734ccd522c71dd455879162006742002f8c1565 /t | |
download | HTML-Parser-tarball-master.tar.gz |
HTML-Parser-3.71HEADHTML-Parser-3.71master
Diffstat (limited to 't')
-rw-r--r-- | t/api_version.t | 22 | ||||
-rw-r--r-- | t/argspec-bad.t | 40 | ||||
-rw-r--r-- | t/argspec.t | 148 | ||||
-rw-r--r-- | t/argspec2.t | 21 | ||||
-rw-r--r-- | t/attr-encoded.t | 32 | ||||
-rw-r--r-- | t/callback.t | 49 | ||||
-rw-r--r-- | t/case-sensitive.t | 85 | ||||
-rw-r--r-- | t/cases.t | 105 | ||||
-rw-r--r-- | t/comment.t | 24 | ||||
-rw-r--r-- | t/crashme.t | 43 | ||||
-rw-r--r-- | t/declaration.t | 62 | ||||
-rw-r--r-- | t/default.t | 43 | ||||
-rw-r--r-- | t/document.t | 41 | ||||
-rw-r--r-- | t/dtext.t | 72 | ||||
-rw-r--r-- | t/entities.t | 213 | ||||
-rw-r--r-- | t/entities2.t | 57 | ||||
-rw-r--r-- | t/filter-methods.t | 205 | ||||
-rw-r--r-- | t/filter.t | 60 | ||||
-rw-r--r-- | t/handler-eof.t | 54 | ||||
-rw-r--r-- | t/handler.t | 67 | ||||
-rw-r--r-- | t/headparser-http.t | 20 | ||||
-rw-r--r-- | t/headparser.t | 200 | ||||
-rw-r--r-- | t/ignore.t | 27 | ||||
-rw-r--r-- | t/largetags.t | 38 | ||||
-rw-r--r-- | t/linkextor-base.t | 41 | ||||
-rw-r--r-- | t/linkextor-rel.t | 36 | ||||
-rw-r--r-- | t/magic.t | 41 | ||||
-rw-r--r-- | t/marked-sect.t | 121 | ||||
-rw-r--r-- | t/msie-compat.t | 79 | ||||
-rw-r--r-- | t/offset.t | 58 | ||||
-rw-r--r-- | t/options.t | 36 | ||||
-rw-r--r-- | t/parsefile.t | 45 | ||||
-rw-r--r-- | t/parser.t | 184 | ||||
-rw-r--r-- | t/plaintext.t | 58 | ||||
-rw-r--r-- | t/pod.t | 4 | ||||
-rw-r--r-- | t/process.t | 43 | ||||
-rw-r--r-- | t/pullparser.t | 55 | ||||
-rw-r--r-- | t/script.t | 41 | ||||
-rw-r--r-- | t/skipped-text.t | 89 | ||||
-rw-r--r-- | t/stack-realloc.t | 17 | ||||
-rw-r--r-- | t/textarea.t | 70 | ||||
-rw-r--r-- | t/threads.t | 39 | ||||
-rw-r--r-- | t/tokeparser.t | 164 | ||||
-rw-r--r-- | t/uentities.t | 65 | ||||
-rw-r--r-- | t/unbroken-text.t | 60 | ||||
-rw-r--r-- | t/unicode-bom.t | 63 | ||||
-rw-r--r-- | t/unicode.t | 198 | ||||
-rw-r--r-- | t/xml-mode.t | 112 |
48 files changed, 3447 insertions, 0 deletions
diff --git a/t/api_version.t b/t/api_version.t new file mode 100644 index 0000000..9803121 --- /dev/null +++ b/t/api_version.t @@ -0,0 +1,22 @@ +use Test::More tests => 4; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3); + +ok(!$p->handler("start"), "API version 3"); + +my $failed; +eval { + my $p = HTML::Parser->new(api_version => 4); + $failed++; +}; +like($@, qr/^API version 4 not supported/); +ok(!$failed, "API version 4"); + +$p = HTML::Parser->new(api_version => 2); + +is($p->handler("start"), "start", "API version 2"); + + diff --git a/t/argspec-bad.t b/t/argspec-bad.t new file mode 100644 index 0000000..8c0b199 --- /dev/null +++ b/t/argspec-bad.t @@ -0,0 +1,40 @@ +use Test::More tests => 6; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3); + +eval { + $p->handler(end => "end", q(xyzzy)); +}; +like($@, qr/^Unrecognized identifier xyzzy in argspec/); + + +eval { + $p->handler(end => "end", q(tagname text)); +}; +like($@, qr/^Missing comma separator in argspec/); + + +eval { + $p->handler(end => "end", q(tagname, "text)); +}; +like($@, qr/^Unterminated literal string in argspec/); + + +eval { + $p->handler(end => "end", q(tagname, "t\\t")); +}; +like($@, qr/^Backslash reserved for literal string in argspec/); + +eval { + $p->handler(end => "end", '"' . ("x" x 256) . '"'); +}; +like($@, qr/^Literal string is longer than 255 chars in argspec/); + +$p->handler(end => sub { is(length(shift), 255) }, + '"' . ("x" x 255) . '"'); +$p->parse("</x>"); + + diff --git a/t/argspec.t b/t/argspec.t new file mode 100644 index 0000000..e8aa7a5 --- /dev/null +++ b/t/argspec.t @@ -0,0 +1,148 @@ + +use strict; +require HTML::Parser; + +my $decl = '<!ENTITY nbsp CDATA " " -- no-break space -->'; +my $com1 = '<!-- Comment -->'; +my $com2 = '<!-- Comment -- -- Comment -->'; +my $start = '<a href="foo">'; +my $end = '</a>'; +my $empty = "<IMG SRC='foo'/>"; +my $proc = '<? something completely different ?>'; + +my @argspec = qw( self offset length + event tagname tag token0 + text + is_cdata dtext + tokens + tokenpos + attr + attrseq ); + +my @result = (); +my $p = HTML::Parser -> new(default_h => [\@result, join(',', @argspec)], + strict_comment => 1, xml_mode => 1); + +my @tests = + ( # string, expected results + $decl => [[$p, 0, 52, 'declaration', 'ENTITY', '!ENTITY', 'ENTITY', + '<!ENTITY nbsp CDATA " " -- no-break space -->', + undef, undef, + ['ENTITY', 'nbsp', 'CDATA', '" "', '-- no-break space --'], + [2, 6, 9, 4, 16, 5, 22, 8, 31, 20], + undef, undef ]], + $com1 => [[$p, 0, 16, 'comment', ' Comment ', '# Comment ', ' Comment ', + '<!-- Comment -->', + undef, undef, + [' Comment '], + [4, 9], + undef, undef ]], + $com2 => [[$p, 0, 30, 'comment', ' Comment ', '# Comment ', ' Comment ', + '<!-- Comment -- -- Comment -->', + undef, undef, + [' Comment ', ' Comment '], + [4, 9, 18, 9], + undef, undef ]], + $start => [[$p, 0, 14, 'start', 'a', 'a', 'a', + '<a href="foo">', + undef, undef, + ['a', 'href', '"foo"'], + [1, 1, 3, 4, 8, 5], + {'href', 'foo'}, ['href'] ]], + $end => [[$p, 0, 4, 'end', 'a', '/a', 'a', + '</a>', + undef, undef, + ['a'], + [2, 1], + undef, undef ]], + $empty => [[$p, 0, 16, 'start', 'IMG', 'IMG', 'IMG', + "<IMG SRC='foo'/>", + undef, undef, + ['IMG', 'SRC', "'foo'"], + [1, 3, 5, 3, 9, 5], + {'SRC', 'foo'}, ['SRC'] ], + [$p, 16, 0, 'end', 'IMG', '/IMG', 'IMG', + '', + undef, undef, + ['IMG'], + undef, + undef, undef ], + ], + $proc => [[$p, 0, 36, 'process', ' something completely different ', + '? something completely different ', + ' something completely different ', + '<? something completely different ?>', + undef, undef, + [' something completely different '], + [2, 32], + undef, undef ]], + "$end\n$end" => [[$p, 0, 4, 'end', 'a', '/a', 'a', + '</a>', + undef, undef, + ['a'], + [2, 1], + undef, undef], + [$p, 4, 1, 'text', undef, undef, undef, + "\n", + '', "\n", + undef, + undef, + undef, undef], + [$p, 5, 4, 'end', 'a', '/a', 'a', + '</a>', + undef, undef, + ['a'], + [2, 1], + undef, undef ]], + ); + +use Test::More; +plan tests => @tests / 2; + +sub string_tag { + my (@pieces) = @_; + my $part; + foreach $part ( @pieces ) { + if (!defined $part) { + $part = 'undef'; + } + elsif (!ref $part) { + $part = "'$part'" if $part !~ /^\d+$/; + } + elsif ('ARRAY' eq ref $part ) { + $part = '[' . join(', ', string_tag(@$part)) . ']'; + } + elsif ('HASH' eq ref $part ) { + $part = '{' . join(',', string_tag(%$part)) . '}'; + } + else { + $part = '<' . ref($part) . '>'; + } + } + return join(", ", @pieces ); +} + +my $i = 0; +TEST: +while (@tests) { + my($html, $expected) = splice @tests, 0, 2; + ++$i; + + @result = (); + $p->parse($html)->eof; + + shift(@result) if $result[0][3] eq "start_document"; + pop(@result) if $result[-1][3] eq "end_document"; + + # Compare results for each element expected + foreach (@$expected) { + my $want = string_tag($_); + my $got = string_tag(shift @result); + if ($want ne $got) { + is($want, $got); + next TEST; + } + } + + pass; +} diff --git a/t/argspec2.t b/t/argspec2.t new file mode 100644 index 0000000..6f594b9 --- /dev/null +++ b/t/argspec2.t @@ -0,0 +1,21 @@ +use Test::More tests => 2; + +use strict; +use HTML::Parser; + +my @start; +my @text; + +my $p = HTML::Parser->new(api_version => 3); +$p->handler(start => \@start, '@{tagname, @attr}'); +$p->handler(text => \@text, '@{dtext}'); +$p->parse(<<EOT)->eof; +Hi +<a href="abc">Foo</a><b>:-)</b> +EOT + +is("@start", "a href abc b"); + +is(join("", @text), "Hi\nFoo:-)\n"); + + diff --git a/t/attr-encoded.t b/t/attr-encoded.t new file mode 100644 index 0000000..4d458eb --- /dev/null +++ b/t/attr-encoded.t @@ -0,0 +1,32 @@ +use strict; +use Test::More tests => 2; + +use HTML::Parser (); +my $p = HTML::Parser->new(); +$p->attr_encoded(1); + +my $text = ""; +$p->handler(start => + sub { + my($tag, $attr) = @_; + $text .= "S[$tag"; + for my $k (sort keys %$attr) { + my $v = $attr->{$k}; + $text .= " $k=$v"; + } + $text .= "]"; + }, "tagname,attr"); + +my $html = <<'EOT'; +<tag arg="&<>"> +EOT + +$p->parse($html)->eof; + +is($text, 'S[tag arg=&<>]'); + +$text = ""; +$p->attr_encoded(0); +$p->parse($html)->eof; + +is($text, 'S[tag arg=&<>]'); diff --git a/t/callback.t b/t/callback.t new file mode 100644 index 0000000..7a456cf --- /dev/null +++ b/t/callback.t @@ -0,0 +1,49 @@ +use Test::More tests => 47; + +use strict; +use HTML::Parser; + +my @expected; +my $p = HTML::Parser->new(api_version => 3, + unbroken_text => 1, + default_h => [\@expected, '@{event, text}'], + ); + +my $doc = <<'EOT'; +<title>Hi</title> +<h1>Ho ho</h1> +<--comment-> +EOT + +$p->parse($doc)->eof; +#use Data::Dump; Data::Dump::dump(@expected); + +for my $i (1..length($doc)) { + my @t; + $p->handler(default => \@t); + $p->parse(chunk($doc, $i)); + + # check that we got the same stuff + #diag "X:", join(":", @t); + #diag "Y:", join(":", @expected); + is(join(":", @t), join(":", @expected)); +} + +sub chunk { + my $str = shift; + my $size = shift || 1; + sub { + my $res = substr($str, 0, $size); + #diag "...$res"; + substr($str, 0, $size) = ""; + $res; + } +} + +# Test croking behaviour +$p->handler(default => []); + +eval { + $p->parse(sub { die "Hi" }); +}; +like($@, qr/^Hi/); diff --git a/t/case-sensitive.t b/t/case-sensitive.t new file mode 100644 index 0000000..565b20b --- /dev/null +++ b/t/case-sensitive.t @@ -0,0 +1,85 @@ +use strict; +use Test::More tests => 8; + +use HTML::Parser (); +my $p = HTML::Parser->new(); +$p->case_sensitive(1); + +my $text = ""; +$p->handler(start => + sub { + my($tag, $attr, $attrseq) = @_; + $text .= "S[$tag"; + for my $k (sort keys %$attr) { + my $v = $attr->{$k}; + $text .= " $k=$v"; + } + if (@$attrseq) { $text.=" Order:" ; } + for my $k (@$attrseq) { + $text .= " $k"; + } + $text .= "]"; + }, "tagname,attr,attrseq"); +$p->handler(end => + sub { + my ($tag) = @_; + $text .= "E[$tag]"; + }, "tagname"); + +my $html = <<'EOT'; +<tAg aRg="Value" arg="other value"></tAg> +EOT +my $cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]'; +my $ci = 'S[tag arg=Value Order: arg arg]E[tag]'; + +$p->parse($html)->eof; +is($text, $cs); + +$text = ""; +$p->case_sensitive(0); +$p->parse($html)->eof; +is($text, $ci); + +$text = ""; +$p->case_sensitive(1); +$p->xml_mode(1); +$p->parse($html)->eof; +is($text, $cs); + +$text = ""; +$p->case_sensitive(0); +$p->parse($html)->eof; +is($text, $cs); + +$html = <<'EOT'; +<tAg aRg="Value" arg="other value"></tAg> +<iGnOrE></ignore> +EOT +$p->ignore_tags('ignore'); +$cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]S[iGnOrE]'; +$ci = 'S[tag arg=Value Order: arg arg]E[tag]'; + +$text = ""; +$p->case_sensitive(0); +$p->xml_mode(0); +$p->parse($html)->eof; +is($text, $ci); + +$text = ""; +$p->case_sensitive(1); +$p->xml_mode(0); +$p->parse($html)->eof; +is($text, $cs); + +$text = ""; +$p->case_sensitive(0); +$p->xml_mode(1); +$p->parse($html)->eof; +is($text, $cs); + +$text = ""; +$p->case_sensitive(1); +$p->xml_mode(1); +$p->parse($html)->eof; +is($text, $cs); + diff --git a/t/cases.t b/t/cases.t new file mode 100644 index 0000000..a537279 --- /dev/null +++ b/t/cases.t @@ -0,0 +1,105 @@ +use Test::More; + +require HTML::Parser; + +package P; @ISA = qw(HTML::Parser); + +my @result; +sub start +{ + my($self, $tag, $attr) = @_; + push @result, "START[$tag]"; + for (sort keys %$attr) { + push @result, "\t$_: " . $attr->{$_}; + } + $start++; +} + +sub end +{ + my($self, $tag) = @_; + push @result, "END[$tag]"; + $end++; +} + +sub text +{ + my $self = shift; + push @result, "TEXT[$_[0]]"; + $text++; +} + +sub comment +{ + my $self = shift; + push @result, "COMMENT[$_[0]]"; + $comment++; +} + +sub declaration +{ + my $self = shift; + push @result, "DECLARATION[$_[0]]"; + $declaration++; +} + +package main; + + +@tests = + ( + '<a ">' => ['START[a]', "\t\": \""], + '<a/>' => ['START[a/]',], + '<a />' => ['START[a]', "\t/: /"], + '<a a/>' => ['START[a]', "\ta/: a/"], + '<a a/=/>' => ['START[a]', "\ta/: /"], + '<a x="foo bar">' => ['START[a]', "\tx: foo\xA0bar"], + '<a x="foo bar">' => ['START[a]', "\tx: foo bar"], + '<å >' => ['TEXT[<å]', 'TEXT[ >]'], + '2 < 5' => ['TEXT[2 ]', 'TEXT[<]', 'TEXT[ 5]'], + '2 <5> 2' => ['TEXT[2 ]', 'TEXT[<5>]', 'TEXT[ 2]'], + '2 <a' => ['TEXT[2 ]', 'TEXT[<a]'], + '2 <a> 2' => ['TEXT[2 ]', 'START[a]', 'TEXT[ 2]'], + '2 <a href=foo' => ['TEXT[2 ]', 'TEXT[<a href=foo]'], + "2 <a href='foo bar'> 2" => + ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], + '2 <a href=foo bar> 2' => + ['TEXT[2 ]', 'START[a]', "\tbar: bar", "\thref: foo", 'TEXT[ 2]'], + '2 <a href="foo bar"> 2' => + ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], + '2 <a href="foo\'bar"> 2' => + ['TEXT[2 ]', 'START[a]', "\thref: foo'bar", 'TEXT[ 2]'], + "2 <a href='foo\"bar'> 2" => + ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'], + "2 <a href='foo"bar'> 2" => + ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'], + '2 <a.b> 2' => ['TEXT[2 ]', 'START[a.b]', 'TEXT[ 2]'], + '2 <a.b-12 a.b = 2 a> 2' => + ['TEXT[2 ]', 'START[a.b-12]', "\ta: a", "\ta.b: 2", 'TEXT[ 2]'], + '2 <a_b> 2' => ['TEXT[2 ]', 'START[a_b]', 'TEXT[ 2]'], + '<!ENTITY nbsp CDATA " " -- no-break space -->' => + ['DECLARATION[ENTITY nbsp CDATA " " -- no-break space --]'], + '<!-- comment -->' => ['COMMENT[ comment ]'], + '<!-- comment -- --- comment -->' => + ['COMMENT[ comment ]', 'COMMENT[- comment ]'], + '<!-- comment <!-- not comment --> comment -->' => + ['COMMENT[ comment <!]', 'COMMENT[> comment ]'], + '<!-- <a href="foo"> -->' => ['COMMENT[ <a href="foo"> ]'], + ); + +plan tests => @tests / 2; + +my $i = 0; +TEST: +while (@tests) { + ++$i; + my ($html, $expected) = splice @tests, 0, 2; + @result = (); + + $p = new P; + $p->strict_comment(1); + $p->parse($html)->eof; + + ok(eq_array($expected, \@result)) or diag("Expected: @$expected\n", + "Got: @result\n"); +} diff --git a/t/comment.t b/t/comment.t new file mode 100644 index 0000000..303449e --- /dev/null +++ b/t/comment.t @@ -0,0 +1,24 @@ +use Test::More tests => 1; + +use strict; +use HTML::Parser; + +my $p = HTML::Parser->new(api_version => 3); +my @com; +$p->handler(comment => sub { push(@com, shift) }, "token0"); +$p->handler(default => sub { push(@com, shift() . "[" . shift() . "]") }, "event, text"); + +$p->parse("<foo><><!><!-><!--><!---><!----><!-----><!------>"); +$p->parse("<!--+--"); +$p->parse("\n\n"); +$p->parse(">"); +$p->parse("<!a'b>"); +$p->parse("<!--foo--->"); +$p->parse("<!--foo---->"); +$p->parse("<!--foo----->-->"); +$p->parse("<foo>"); +$p->parse("<!3453><!-3456><!FOO><>"); +$p->eof; + +my $com = join(":", @com); +is($com, "start_document[]:start[<foo>]:text[<>]::-:><!-::-:--:+:a'b:foo-:foo--:foo---:text[-->]:start[<foo>]:3453:-3456:FOO:text[<>]:end_document[]"); diff --git a/t/crashme.t b/t/crashme.t new file mode 100644 index 0000000..1a1e8e4 --- /dev/null +++ b/t/crashme.t @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +# This test will simply run the parser on random junk. + +my $no_tests = shift || 3; +use Test::More; +plan tests => $no_tests; + +use HTML::Parser (); + +my $file = "junk$$.html"; +die if -e $file; + +for (1..$no_tests) { + + open(JUNK, ">$file") || die; + for (1 .. rand(5000)) { + for (1 .. rand(200)) { + print JUNK pack("N", rand(2**32)); + } + print JUNK ("<", "&", ">")[rand(3)]; # make these a bit more likely + } + close(JUNK); + + #diag "Parse @{[-s $file]} bytes of junk"; + + HTML::Parser->new->parse_file($file); + pass(); + + #print_mem(); +} + +unlink($file); + + +sub print_mem +{ + # this probably only works on Linux + open(STAT, "/proc/self/status") || return; + while (<STAT>) { + diag $_ if /^VmSize/; + } +} diff --git a/t/declaration.t b/t/declaration.t new file mode 100644 index 0000000..17de561 --- /dev/null +++ b/t/declaration.t @@ -0,0 +1,62 @@ +use Test::More tests => 2; + +use HTML::Parser; +my $res = ""; + +sub decl +{ + my $t = shift; + $res .= "[" . join("\n", map "<$_>", @$t) . "]"; +} + +sub text +{ + $res .= shift; +} + +my $p = HTML::Parser->new(declaration_h => [\&decl, "tokens"], + default_h => [\&text, "text"], + ); + +$p->parse(<<EOT)->eof; +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" --<comment>-- + "http://www.w3.org/TR/html40/strict.dtd"> + +<!ENTITY foo "<!-- foo -->"> +<!Entity foo "<!-- foo -->"> + +<!row --> foo +EOT + +is($res, <<EOT); +[<DOCTYPE> +<HTML> +<PUBLIC> +<"-//W3C//DTD HTML 4.01//EN"> +<--<comment>--> +<"http://www.w3.org/TR/html40/strict.dtd">] + +[<ENTITY> +<foo> +<"<!-- foo -->">] +[<Entity> +<foo> +<"<!-- foo -->">] + +<!row --> foo +EOT + +$res = ""; +$p->parse(<<EOT)->eof; +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[]> +EOT +is($res, <<EOT); +[<DOCTYPE> +<html> +<PUBLIC> +<"-//W3C//DTD XHTML 1.0 Strict//EN"> +<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<[]>] +EOT + diff --git a/t/default.t b/t/default.t new file mode 100644 index 0000000..4b5ed79 --- /dev/null +++ b/t/default.t @@ -0,0 +1,43 @@ +use strict; +use Test::More tests => 3; + +my $text = ""; +use HTML::Parser (); +my $p = HTML::Parser->new(default_h => [sub { $text .= shift }, "text"], + ); + +my $html = <<'EOT'; + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html40/strict.dtd"> + +<title>foo</title> +<!-- comment <a> --> +<?process instruction> + +EOT + +$p->parse($html)->eof; + +is($text, $html); + +$text = ""; +$p->handler(start => sub { }, ""); +$p->handler(declaration => sub { }, ""); +$p->parse($html)->eof; + +my $html2; +$html2 = $html; +$html2 =~ s/<title>//; +$html2 =~ s/<!DOCTYPE[^>]*>//; + +is($text, $html2); + +$text = ""; +$p->handler(start => undef); +$p->parse($html)->eof; + +$html2 = $html; +$html2 =~ s/<!DOCTYPE[^>]*>//; + +is($text, $html2); diff --git a/t/document.t b/t/document.t new file mode 100644 index 0000000..6696939 --- /dev/null +++ b/t/document.t @@ -0,0 +1,41 @@ +#!perl -w + +use Test; +plan tests => 6; + + +use HTML::Parser; +use File::Spec; + +my $events; +my $p = HTML::Parser->new(default_h => [sub { $events .= "$_[0]\n";}, "event"]); + +$events = ""; +$p->eof; +ok($events, "start_document\nend_document\n"); + +$events = ""; +$p->parse_file(File::Spec->devnull); +ok($events, "start_document\nend_document\n"); + +$events = ""; +$p->parse(""); +$p->eof; +ok($events, "start_document\nend_document\n"); + +$events = ""; +$p->parse(""); +$p->parse(""); +$p->eof; +ok($events, "start_document\nend_document\n"); + +$events = ""; +$p->parse(""); +$p->parse("<a>"); +$p->eof; +ok($events, "start_document\nstart\nend_document\n"); + +$events = ""; +$p->parse("<a> "); +$p->eof; +ok($events, "start_document\nstart\ntext\nend_document\n"); diff --git a/t/dtext.t b/t/dtext.t new file mode 100644 index 0000000..883c61f --- /dev/null +++ b/t/dtext.t @@ -0,0 +1,72 @@ +#!perl -w + +use strict; +use Test::More tests => 2; + +use HTML::Parser (); + +my $dtext = ""; +my $text = ""; + +sub append +{ + $dtext .= shift; + $text .= shift; +} + +my $p = HTML::Parser->new(text_h => [\&append, "dtext, text"], + default_h => [\&append, "text, text" ], + ); + +my $doc = <<'EOT'; +<title>å</title> +<a href="fooå">ååAA<A>AA</a> +<?å> +foo bar +foo bar +&xyzzy +&xyzzy; +<!-- � --> + +ÿ +ÿ +ÿG +<!-- Ā --> +� +� +& +&# +&#x +<xmp>å</xmp> +<script>å</script> +<ScRIPT>å</scRIPT> +<skript>å</script> +EOT + +$p->parse($doc)->eof; + +is($text, $doc); +is($dtext, <<"EOT"); +<title>å</title> +<a href="fooå">ååAA<A>AA</a> +<?å> +foo\240bar +foo\240bar +&xyzzy +&xyzzy; +<!-- � --> +\1 +\377 +\377 +\377G +<!-- Ā --> +� +� +& +&# +&#x +<xmp>å</xmp> +<script>å</script> +<ScRIPT>å</scRIPT> +<skript>å</script> +EOT diff --git a/t/entities.t b/t/entities.t new file mode 100644 index 0000000..f12d2fd --- /dev/null +++ b/t/entities.t @@ -0,0 +1,213 @@ +use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric); + +use Test::More tests => 20; + +$a = "Våre norske tegn bør æres"; + +decode_entities($a); + +is($a, "Våre norske tegn bør æres"); + +encode_entities($a); + +is($a, "Våre norske tegn bør æres"); + +decode_entities($a); +encode_entities_numeric($a); + +is($a, "Våre norske tegn bør æres"); + +$a = "<&>\"'"; +is(encode_entities($a), "<&>"'"); +is(encode_entities_numeric($a), "<&>"'"); + +$a = "abcdef"; +is(encode_entities($a, 'a-c'), "abcdef"); + +$a = "[24/7]\\"; +is(encode_entities($a, '/'), "[24/7]\\"); +is(encode_entities($a, '\\/'), "[24/7]\\"); +is(encode_entities($a, '\\'), "[24/7]\"); +is(encode_entities($a, ']\\'), "[24/7]\"); + +# See how well it does against rfc1866... +$ent = $plain = ""; +while (<DATA>) { + next unless /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/; + $ent .= "&$1;"; + $plain .= chr($2); +} + +$a = $ent; +decode_entities($a); +is($a, $plain); + +# Try decoding when the ";" are left out +$a = $ent, +$a =~ s/;//g; +decode_entities($a); +is($a, $plain); + + +$a = $plain; +encode_entities($a); +is($a, $ent); + +{ #RT #84144 - https://rt.cpan.org/Public/Bug/Display.html?id=84144 + + my %hash= ( + "Våre norske tegn bør æres" => "Våre norske tegn bør æres" + ); + + my ($got, $eval_ok); + $eval_ok= eval { $got= decode_entities((keys %hash)[0]); 1 }; + is( $eval_ok, 1, "decode_entitites() when processing a key as input"); + is( $got, (values %hash)[0], "decode_entities() decodes a key properly"); +} + +# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au> +# Subject: HTML entities problem with 5.11 +# To: libwww-perl@ics.uci.edu +# Date: Fri, 05 Sep 1997 16:56:55 +1000 +# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU> +# +# Hi. I've got a problem that has surfaced with the changes to +# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening +# in the process of encoding then decoding special entities. Eg, what goes +# in as "abc&def&ghi" comes out as "abc&def;&ghi;". + +is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;"); + +# Decoding of ' +is(decode_entities("'"), "'"); +is(encode_entities("'", "'"), "'"); + +is(decode_entities("Attention Homeοωnөrs...1ѕt Tімe Eνөг"), + "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}"); +is(decode_entities("{&amp;&amp;& also Яœ}"), + "{&&& also \x{42F}\x{153}}"); + +__END__ +# Quoted from rfc1866.txt + +14. Proposed Entities + + The HTML DTD references the "Added Latin 1" entity set, which only + supplies named entities for a subset of the non-ASCII characters in + [ISO-8859-1], namely the accented characters. The following entities + should be supported so that all ISO 8859-1 characters may only be + referenced symbolically. The names for these entities are taken from + the appendixes of [SGML]. + + <!ENTITY nbsp CDATA " " -- no-break space --> + <!ENTITY iexcl CDATA "¡" -- inverted exclamation mark --> + <!ENTITY cent CDATA "¢" -- cent sign --> + <!ENTITY pound CDATA "£" -- pound sterling sign --> + <!ENTITY curren CDATA "¤" -- general currency sign --> + <!ENTITY yen CDATA "¥" -- yen sign --> + <!ENTITY brvbar CDATA "¦" -- broken (vertical) bar --> + <!ENTITY sect CDATA "§" -- section sign --> + <!ENTITY uml CDATA "¨" -- umlaut (dieresis) --> + <!ENTITY copy CDATA "©" -- copyright sign --> + <!ENTITY ordf CDATA "ª" -- ordinal indicator, feminine --> + <!ENTITY laquo CDATA "«" -- angle quotation mark, left --> + <!ENTITY not CDATA "¬" -- not sign --> + <!ENTITY shy CDATA "­" -- soft hyphen --> + <!ENTITY reg CDATA "®" -- registered sign --> + <!ENTITY macr CDATA "¯" -- macron --> + <!ENTITY deg CDATA "°" -- degree sign --> + <!ENTITY plusmn CDATA "±" -- plus-or-minus sign --> + <!ENTITY sup2 CDATA "²" -- superscript two --> + <!ENTITY sup3 CDATA "³" -- superscript three --> + <!ENTITY acute CDATA "´" -- acute accent --> + <!ENTITY micro CDATA "µ" -- micro sign --> + <!ENTITY para CDATA "¶" -- pilcrow (paragraph sign) --> + <!ENTITY middot CDATA "·" -- middle dot --> + <!ENTITY cedil CDATA "¸" -- cedilla --> + <!ENTITY sup1 CDATA "¹" -- superscript one --> + <!ENTITY ordm CDATA "º" -- ordinal indicator, masculine --> + <!ENTITY raquo CDATA "»" -- angle quotation mark, right --> + <!ENTITY frac14 CDATA "¼" -- fraction one-quarter --> + <!ENTITY frac12 CDATA "½" -- fraction one-half --> + <!ENTITY frac34 CDATA "¾" -- fraction three-quarters --> + <!ENTITY iquest CDATA "¿" -- inverted question mark --> + <!ENTITY Agrave CDATA "À" -- capital A, grave accent --> + <!ENTITY Aacute CDATA "Á" -- capital A, acute accent --> + <!ENTITY Acirc CDATA "Â" -- capital A, circumflex accent --> + + + +Berners-Lee & Connolly Standards Track [Page 75] + +RFC 1866 Hypertext Markup Language - 2.0 November 1995 + + + <!ENTITY Atilde CDATA "Ã" -- capital A, tilde --> + <!ENTITY Auml CDATA "Ä" -- capital A, dieresis or umlaut mark --> + <!ENTITY Aring CDATA "Å" -- capital A, ring --> + <!ENTITY AElig CDATA "Æ" -- capital AE diphthong (ligature) --> + <!ENTITY Ccedil CDATA "Ç" -- capital C, cedilla --> + <!ENTITY Egrave CDATA "È" -- capital E, grave accent --> + <!ENTITY Eacute CDATA "É" -- capital E, acute accent --> + <!ENTITY Ecirc CDATA "Ê" -- capital E, circumflex accent --> + <!ENTITY Euml CDATA "Ë" -- capital E, dieresis or umlaut mark --> + <!ENTITY Igrave CDATA "Ì" -- capital I, grave accent --> + <!ENTITY Iacute CDATA "Í" -- capital I, acute accent --> + <!ENTITY Icirc CDATA "Î" -- capital I, circumflex accent --> + <!ENTITY Iuml CDATA "Ï" -- capital I, dieresis or umlaut mark --> + <!ENTITY ETH CDATA "Ð" -- capital Eth, Icelandic --> + <!ENTITY Ntilde CDATA "Ñ" -- capital N, tilde --> + <!ENTITY Ograve CDATA "Ò" -- capital O, grave accent --> + <!ENTITY Oacute CDATA "Ó" -- capital O, acute accent --> + <!ENTITY Ocirc CDATA "Ô" -- capital O, circumflex accent --> + <!ENTITY Otilde CDATA "Õ" -- capital O, tilde --> + <!ENTITY Ouml CDATA "Ö" -- capital O, dieresis or umlaut mark --> + <!ENTITY times CDATA "×" -- multiply sign --> + <!ENTITY Oslash CDATA "Ø" -- capital O, slash --> + <!ENTITY Ugrave CDATA "Ù" -- capital U, grave accent --> + <!ENTITY Uacute CDATA "Ú" -- capital U, acute accent --> + <!ENTITY Ucirc CDATA "Û" -- capital U, circumflex accent --> + <!ENTITY Uuml CDATA "Ü" -- capital U, dieresis or umlaut mark --> + <!ENTITY Yacute CDATA "Ý" -- capital Y, acute accent --> + <!ENTITY THORN CDATA "Þ" -- capital THORN, Icelandic --> + <!ENTITY szlig CDATA "ß" -- small sharp s, German (sz ligature) --> + <!ENTITY agrave CDATA "à" -- small a, grave accent --> + <!ENTITY aacute CDATA "á" -- small a, acute accent --> + <!ENTITY acirc CDATA "â" -- small a, circumflex accent --> + <!ENTITY atilde CDATA "ã" -- small a, tilde --> + <!ENTITY auml CDATA "ä" -- small a, dieresis or umlaut mark --> + <!ENTITY aring CDATA "å" -- small a, ring --> + <!ENTITY aelig CDATA "æ" -- small ae diphthong (ligature) --> + <!ENTITY ccedil CDATA "ç" -- small c, cedilla --> + <!ENTITY egrave CDATA "è" -- small e, grave accent --> + <!ENTITY eacute CDATA "é" -- small e, acute accent --> + <!ENTITY ecirc CDATA "ê" -- small e, circumflex accent --> + <!ENTITY euml CDATA "ë" -- small e, dieresis or umlaut mark --> + <!ENTITY igrave CDATA "ì" -- small i, grave accent --> + <!ENTITY iacute CDATA "í" -- small i, acute accent --> + <!ENTITY icirc CDATA "î" -- small i, circumflex accent --> + <!ENTITY iuml CDATA "ï" -- small i, dieresis or umlaut mark --> + <!ENTITY eth CDATA "ð" -- small eth, Icelandic --> + <!ENTITY ntilde CDATA "ñ" -- small n, tilde --> + <!ENTITY ograve CDATA "ò" -- small o, grave accent --> + + + +Berners-Lee & Connolly Standards Track [Page 76] + +RFC 1866 Hypertext Markup Language - 2.0 November 1995 + + + <!ENTITY oacute CDATA "ó" -- small o, acute accent --> + <!ENTITY ocirc CDATA "ô" -- small o, circumflex accent --> + <!ENTITY otilde CDATA "õ" -- small o, tilde --> + <!ENTITY ouml CDATA "ö" -- small o, dieresis or umlaut mark --> + <!ENTITY divide CDATA "÷" -- divide sign --> + <!ENTITY oslash CDATA "ø" -- small o, slash --> + <!ENTITY ugrave CDATA "ù" -- small u, grave accent --> + <!ENTITY uacute CDATA "ú" -- small u, acute accent --> + <!ENTITY ucirc CDATA "û" -- small u, circumflex accent --> + <!ENTITY uuml CDATA "ü" -- small u, dieresis or umlaut mark --> + <!ENTITY yacute CDATA "ý" -- small y, acute accent --> + <!ENTITY thorn CDATA "þ" -- small thorn, Icelandic --> + <!ENTITY yuml CDATA "ÿ" -- small y, dieresis or umlaut mark --> diff --git a/t/entities2.t b/t/entities2.t new file mode 100644 index 0000000..537ac78 --- /dev/null +++ b/t/entities2.t @@ -0,0 +1,57 @@ +#!perl -w + +use strict; +use Test::More tests => 9; + +use HTML::Entities qw(_decode_entities); + +eval { + _decode_entities("<", undef); +}; +like($@, qr/^(?:Can't inline decode readonly string|Modification of a read-only value attempted)/); + +eval { + my $a = ""; + _decode_entities($a, $a); +}; +like($@, qr/^2nd argument must be hash reference/); + +eval { + my $a = ""; + _decode_entities($a, []); +}; +like($@, qr/^2nd argument must be hash reference/); + +$a = "<"; +_decode_entities($a, undef); +is($a, "<"); + +_decode_entities($a, { "lt" => "<" }); +is($a, "<"); + +my $x = "x" x 20; + +my $err; +for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") { + my $a = $_; + $a =~ s/:/&a;/g; + my $b = $_; + $b =~ s/:/$x/g; + _decode_entities($a, { "a" => $x }); + if ($a ne $b) { + diag "Something went wrong with '$_'"; + $err++; + } +} +ok(!$err); + +$a = "foo bar"; +_decode_entities($a, \%HTML::Entities::entity2char); +is($a, "foo\xA0bar"); + +$a = "foo bar"; +_decode_entities($a, \%HTML::Entities::entity2char); +is($a, "foo bar"); + +_decode_entities($a, \%HTML::Entities::entity2char, 1); +is($a, "foo\xA0bar"); diff --git a/t/filter-methods.t b/t/filter-methods.t new file mode 100644 index 0000000..9eccaf1 --- /dev/null +++ b/t/filter-methods.t @@ -0,0 +1,205 @@ +#!/usr/bin/perl -w + +use Test::More tests => 12; +use strict; + +use HTML::Parser; + +my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]); +$p->ignore_elements("script"); +$p->unbroken_text(1); + +$p->handler(default => [], "event, text"); +$p->parse(<<"EOT")->eof; +<html><head><title>foo</title><Script language="Perl"> + while (<B>) { + # ... + } +</Script><body> +This is an <i>italic</i> and <b>bold</b> text. +</body> +</html> +EOT + +my $t = join("||", map join("|", @$_), @{$p->handler("default")}); +#diag $t; + +is($t, "start_document|||start|<html>||start|<head>||start|<title>||text|foo||end|</title>||start|<body>||text| +This is an italic and bold text. +||end|</body>||text| +||end|</html>||text| +||end_document|", 'ignore_elements'); + + +#------------------------------------------------------ + +$p = HTML::Parser->new(api_version => 3); +$p->report_tags("a"); +$p->handler(start => sub { + my($tagname, %attr) = @_; + ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start'); + }, 'tagname, @attr'); +$p->handler(end => sub { + my $tagname = shift; + is($tagname, "a", 'report_tags end'); + }, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> + +This is <a href="#a">very nice</a> example. + +EOT + + +#------------------------------------------------------ + +my @tags; +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(a em)); +$p->ignore_tags(qw(em)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> + +This is <em>yet another</em> <a href="#a">very nice</a> example. + +EOT +is(join('|', @tags), 'a', 'report_tags followed by ignore_tags'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1)); +$p->report_tags(); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1|h2', 'reset report_tags filter'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1 h2)); +$p->ignore_tags(qw(h2)); +$p->report_tags(qw(h1 h2)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1 h2)); +$p->ignore_tags(qw(h2)); +$p->report_tags(); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1 h2)); +$p->report_tags(qw(h3)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> +<h3>Next example</h3> + +EOT +is(join('|', @tags), 'h3', 'report_tags replaces filter'); + + +#------------------------------------------------------ + + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->ignore_tags(qw(h1 h2)); +$p->ignore_tags(qw(h3)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> +<h3>Next example</h3> + +EOT +is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->ignore_tags(qw(h2)); +$p->ignore_tags(); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->ignore_tags(qw(h2)); +$p->report_tags(qw(h1 h2)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1', 'ignore_tags before report_tags'); +#------------------------------------------------------ + +$p = HTML::Parser->new(api_version => 3); +$p->ignore_elements("script"); +my $res=""; +$p->handler(default=> sub {$res.=$_[0];}, 'text'); +$p->parse(<<'EOT')->eof; +A <script> B </script> C </script> D <script> E </script> F +EOT +is($res,"A C D F\n","ignore </script> without <script> correctly"); diff --git a/t/filter.t b/t/filter.t new file mode 100644 index 0000000..3b18f9e --- /dev/null +++ b/t/filter.t @@ -0,0 +1,60 @@ +use Test::More tests => 3; + +my $HTML = <<EOT; + +<!DOCTYPE HTML> +<!-- comment +<h1>Foo</h1> +--> + +<H1 +>Bar</H1 +> + +<Table><tr><td>1<td>2<td>3 +<tr> +</table> + +<?process> + +EOT + +use HTML::Filter; +use SelectSaver; + +my $tmpfile = "test-$$.htm"; +die "$tmpfile already exists" if -e $tmpfile; + +open(HTML, ">$tmpfile") or die "$!"; + +{ + my $save = new SelectSaver(HTML); + HTML::Filter->new->parse($HTML)->eof; +} +close(HTML); + +open(HTML, $tmpfile) or die "$!"; +local($/) = undef; +my $FILTERED = <HTML>; +close(HTML); + +#print $FILTERED; +is($FILTERED, $HTML); + +{ + package MyFilter; + @ISA=qw(HTML::Filter); + sub comment {} + sub output { push(@{$_[0]->{fhtml}}, $_[1]) } + sub filtered_html { join("", @{$_[0]->{fhtml}}) } +} + +my $f2 = MyFilter->new->parse_file($tmpfile)->filtered_html; +unlink($tmpfile) or warn "Can't unlink $tmpfile: $!"; + +#diag $f2; + +unlike($f2, qr/Foo/); +like($f2, qr/Bar/); + + diff --git a/t/handler-eof.t b/t/handler-eof.t new file mode 100644 index 0000000..39419dc --- /dev/null +++ b/t/handler-eof.t @@ -0,0 +1,54 @@ +use Test::More tests => 6; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3); + +$p->handler(start => sub { my $attr = shift; is($attr->{testno}, 1) }, + "attr"); +$p->handler(end => sub { shift->eof }, "self"); +my $text; +$p->handler(text => sub { $text = shift }, "text"); + +is($p->parse("<foo testno=1>"), $p); + +$text = ''; +ok(!$p->parse("</foo><foo testno=999>")); +ok(!$text); + +$p->handler(end => sub { $p->parse("foo"); }, ""); +eval { + $p->parse("</foo>"); +}; +like($@, qr/Parse loop not allowed/); + +# We used to get into an infinite loop if the eof triggered +# handler called ->eof + +use HTML::Parser; +$p = HTML::Parser->new(api_version => 3); + +my $i; +$p->handler("default" => + sub { + my $p=shift; + #++$i; diag "$i @_"; + $p->eof; + }, "self, event"); +$p->parse("Foo"); +$p->eof; + +# We used to sometimes trigger events after a handler signaled eof +my $title=''; +$p = HTML::Parser->new(api_version => 3,); +$p->handler(start=> \&title_handler, 'tagname, self'); +$p->parse("<head><title>foo</title>\n</head>"); +is($title, "foo"); + +sub title_handler { + return if shift ne 'title'; + my $self = shift; + $self->handler(text => sub { $title .= shift}, 'dtext'); + $self->handler(end => sub { shift->eof if shift eq 'title' }, 'tagname, self'); +} diff --git a/t/handler.t b/t/handler.t new file mode 100644 index 0000000..8d7bbc5 --- /dev/null +++ b/t/handler.t @@ -0,0 +1,67 @@ +# Test handler method + +use Test::More tests => 11; + +my $testno; + +use HTML::Parser; +{ + package MyParser; + use vars qw(@ISA); + @ISA=(HTML::Parser); + + sub foo + { + Test::More::is($_[1]{testno}, Test::More->builder->current_test + 1); + } + + sub bar + { + Test::More::is($_[1], Test::More->builder->current_test + 1); + } +} + +$p = MyParser->new(api_version => 3); + +eval { + $p->handler(foo => "foo", "foo"); +}; + +like($@, qr/^No handler for foo events/); + +eval { + $p->handler(start => "foo", "foo"); +}; +like($@, qr/^Unrecognized identifier foo in argspec/); + +my $h = $p->handler(start => "foo", "self,tagname"); +ok(!defined($h)); + +$x = \substr("xfoo", 1); +$p->handler(start => $$x, "self,attr"); +$p->parse("<a testno=4>"); + +$p->handler(start => \&MyParser::foo, "self,attr"); +$p->parse("<a testno=5>"); + +$p->handler(start => "foo"); +$p->parse("<a testno=6>"); + +$p->handler(start => "bar", "self,'7'"); +$p->parse("<a>"); + +eval { + $p->handler(start => {}, "self"); +}; +like($@, qr/^Only code or array references allowed as handler/); + +$a = []; +$p->handler(start => $a); +$h = $p->handler("start"); +is($p->handler("start", "foo"), $a); + +is($p->handler("start", \&MyParser::foo, ""), "foo"); + +is($p->handler("start"), \&MyParser::foo); + + diff --git a/t/headparser-http.t b/t/headparser-http.t new file mode 100644 index 0000000..b722c64 --- /dev/null +++ b/t/headparser-http.t @@ -0,0 +1,20 @@ +use Test::More tests => 1; + +eval { + require HTML::HeadParser; + $p = HTML::HeadParser->new; +}; + +SKIP: { +skip $@, 1 if $@ =~ /^Can't locate HTTP/; + +$p = HTML::HeadParser->new($h); +$p->parse(<<EOT); +<title>Stupid example</title> +<base href="http://www.sn.no/libwww-perl/"> +Normal text starts here. +EOT +$h = $p->header; +undef $p; +is($h->title, "Stupid example"); +} diff --git a/t/headparser.t b/t/headparser.t new file mode 100644 index 0000000..1b4b810 --- /dev/null +++ b/t/headparser.t @@ -0,0 +1,200 @@ +#!perl -w + +use strict; +use Test::More tests => 17; + +{ package H; + sub new { bless {}, shift; } + + sub header { + my $self = shift; + my $key = uc(shift); + die if $key =~ /:/; + my $old = $self->{$key}; + if (@_) { $self->{$key} = shift; } + $old; + } + + sub push_header { + my($self, $k, $v) = @_; + $k = uc($k); + die if $k =~ /:/; + if (exists $self->{$k}) { + $self->{$k} = [ $self->{$k} ] unless ref $self->{$k}; + push(@{$self->{$k}}, $v); + } else { + $self->{$k} = $v; + } + } + + sub as_string { + my $self = shift; + my $str = ""; + for (sort keys %$self) { + if (ref($self->{$_})) { + my $v; + for $v (@{$self->{$_}}) { + $str .= "$_: $v\n"; + } + } else { + $str .= "$_: $self->{$_}\n"; + } + } + $str; + } +} + + +my $HTML = <<'EOT'; + +<title>Å være eller å ikke være</title> +<meta http-equiv="Expires" content="Soon"> +<meta http-equiv="Foo" content="Bar"> +<meta name='twitter:card' content='photo' /> +<link href="mailto:gisle@aas.no" rev=made title="Gisle Aas"> + +<script> + + ignore this + +</script> +<noscript> ... and this </noscript> + +<object classid="foo"> + +<base href="http://www.sn.no"> +<meta name="Keywords" content="test, test, test,..."> +<meta name="Keywords" content="more"> +<meta charset="ISO-8859-1"><!-- HTML 5 --> + +Dette er vanlig tekst. Denne teksten definerer også slutten på +<head> delen av dokumentet. + +<style> + + ignore this too + +</style> + +<isindex> + +Dette er også vanlig tekst som ikke skal blir parset i det hele tatt. + +EOT + +$| = 1; + +#$HTML::HeadParser::DEBUG = 1; +require HTML::HeadParser; +my $p = HTML::HeadParser->new( H->new ); + +if ($p->parse($HTML)) { + fail("Need more data which should not happen"); +} else { + #diag $p->as_string; + pass(); +} + +like($p->header('Title'), qr/Å være eller å ikke være/); +is($p->header('Expires'), 'Soon'); +is($p->header('Content-Base'), 'http://www.sn.no'); +is_deeply($p->header('X-Meta-Keywords'), ['test, test, test,...', 'more']); +is($p->header('X-Meta-Charset'), 'ISO-8859-1'); +is($p->header('X-Meta-Twitter-Card'), 'photo'); +like($p->header('Link'), qr/<mailto:gisle\@aas.no>/); + +# This header should not be present because the head ended +ok(!$p->header('Isindex')); + + +# Try feeding one char at a time +my $expected = $p->as_string; +my $nl = 1; +$p = HTML::HeadParser->new(H->new); +while ($HTML =~ /(.)/sg) { + #print STDERR '#' if $nl; + #print STDERR $1; + $nl = $1 eq "\n"; + $p->parse($1) or last; +} +is($p->as_string, $expected); + + +# Try reading it from a file +my $file = "hptest$$.html"; +die "$file already exists" if -e $file; + +open(FILE, ">$file") or die "Can't create $file: $!"; +binmode(FILE); +print FILE $HTML; +print FILE "<p>This is more content...</p>\n" x 2000; +print FILE "<title>Buuuh!</title>\n" x 200; +close FILE or die "Can't close $file: $!"; + +$p = HTML::HeadParser->new(H->new); +$p->parse_file($file); +unlink($file) or warn "Can't unlink $file: $!"; + +is($p->header("Title"), "Å være eller å ikke være"); + + +# We got into an infinite loop on data without tags and no EOL. +# This was actually a HTML::Parser bug. +open(FILE, ">$file") or die "Can't create $file: $!"; +print FILE "Foo"; +close(FILE); + +$p = HTML::HeadParser->new(H->new); +$p->parse_file($file); +unlink($file) or warn "Can't unlink $file: $!"; + +ok(!$p->as_string); + +SKIP: { + skip "Need Unicode support", 5 if $] < 5.008; + + # Test that the Unicode BOM does not confuse us? + $p = HTML::HeadParser->new(H->new); + ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>")); + $p->eof; + + is($p->header("title"), "Hi <foo>"); + + $p = HTML::HeadParser->new(H->new); + $p->utf8_mode(1); + $p->parse(<<"EOT"); # example from http://rt.cpan.org/Ticket/Display.html?id=27522 +\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html> + <head> + <title> +Parkinson's disease</title> + <meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications"> + </meta> + \t +\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" /> +\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" /> +\t +\t </head> + <body> +EOT + $p->eof; + + is($p->header("title"), "Parkinson's disease"); + is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"'); + + $p = HTML::HeadParser->new(H->new); + $p->utf8_mode(1); + $p->parse(<<"EOT"); # example from http://www.mjw.com.pl/ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r +\r +<head profile="http://gmpg.org/xfn/11">\r +<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r +\r +<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa – MJW</title>\r +<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r + +EOT + $p->eof; + is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW"); +} diff --git a/t/ignore.t b/t/ignore.t new file mode 100644 index 0000000..008739e --- /dev/null +++ b/t/ignore.t @@ -0,0 +1,27 @@ + +use Test::More tests => 4; + +use strict; +use HTML::Parser (); + +my $html = '<A href="foo">text</A>'; + +my $text = ''; +my $p = HTML::Parser->new(default_h => [sub {$text .= shift;}, 'text']); +$p->parse($html)->eof; +is($text, $html); + +$text = ''; +$p->handler(start => ""); +$p->parse($html)->eof; +is($text, 'text</A>'); + +$text = ''; +$p->handler(end => 0); +$p->parse($html)->eof; +is($text, 'text'); + +$text = ''; +$p->handler(start => undef); +$p->parse($html)->eof; +is($text, '<A href="foo">text'); diff --git a/t/largetags.t b/t/largetags.t new file mode 100644 index 0000000..a9ed3ff --- /dev/null +++ b/t/largetags.t @@ -0,0 +1,38 @@ +# Exercise the tokenpos buffer allocation routines by feeding it +# very large tags. + +use Test::More tests => 2; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3); + +$p->handler("start" => + sub { + my $tp = shift; + #diag int(@$tp), " - ", join(", ", @$tp); + is(@$tp, 2 + 26 * 6 * 4); + }, "tokenpos"); + +$p->handler("declaration" => + sub { + my $t = shift; + #diag int(@$t), " - @$t"; + is(@$t, 26 * 6 * 2 + 1); + }, "tokens"); + +$p->parse("<a "); +for ("aa" .. "fz") { + $p->parse("$_=1 "); +} +$p->parse(">"); + +$p->parse("<!DOCTYPE "); +for ("aa" .. "fz") { + $p->parse("$_ -- $_ -- "); +} +$p->parse(">"); +$p->eof; +exit; + diff --git a/t/linkextor-base.t b/t/linkextor-base.t new file mode 100644 index 0000000..7ef8f02 --- /dev/null +++ b/t/linkextor-base.t @@ -0,0 +1,41 @@ +# This test that HTML::LinkExtor really absolutize links correctly +# when a base URL is given to the constructor. + +use Test::More tests => 5; +require HTML::LinkExtor; + +SKIP: { +eval { + require URI; +}; +skip $@, 5 if $@; + +# Try with base URL and the $p->links interface. +$p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html"); +$p->parse(<<HTML)->eof; +<head> +<base href="http://www.sn.no/"> +</head> +<body background="http://www.sn.no/sn.gif"> + +This is <A HREF="link.html">link</a> and an <img SRC="img.jpg" +lowsrc="img.gif" alt="Image">. +HTML + +@p = $p->links; + +# There should be 4 links in the document +is(@p, 4); + +for (@p) { + ($t, %attr) = @$_ if $_->[0] eq 'img'; +} + +is($t, 'img'); + +is(delete $attr{src}, "http://www.sn.no/foo/img.jpg"); + +is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif"); + +ok(!scalar(keys %attr)); # there should be no more attributes +} diff --git a/t/linkextor-rel.t b/t/linkextor-rel.t new file mode 100644 index 0000000..1190a96 --- /dev/null +++ b/t/linkextor-rel.t @@ -0,0 +1,36 @@ +use Test::More tests => 4; + +require HTML::LinkExtor; + +$HTML = <<HTML; +<head> +<base href="http://www.sn.no/"> +</head> +<body background="http://www.sn.no/sn.gif"> + +This is <A HREF="link.html">link</a> and an <img SRC="img.jpg" +lowsrc="img.gif" alt="Image">. +HTML + + +# Try the callback interface +$links = ""; +$p = HTML::LinkExtor->new( + sub { + my($tag, %links) = @_; + #diag "$tag @{[%links]}"; + $links .= "$tag @{[%links]}\n"; + }); + +$p->parse($HTML); $p->eof; + +ok($links =~ m|^base href http://www\.sn\.no/$|m); +ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m); +ok($links =~ m|^a href link\.html$|m); + +# Used to be problems when using the links method on a document with +# no links it it. This is a test to prove that it works. +$p = new HTML::LinkExtor; +$p->parse("this is a document with no links"); $p->eof; +@a = $p->links; +is(@a, 0); diff --git a/t/magic.t b/t/magic.t new file mode 100644 index 0000000..366f275 --- /dev/null +++ b/t/magic.t @@ -0,0 +1,41 @@ +# Check that the magic signature at the top of struct p_state works and that we +# catch modifications to _hparser_xs_state gracefully + +use Test::More tests => 5; + +use HTML::Parser; + +$p = HTML::Parser->new(api_version => 3); + +$p->xml_mode(1); + +# We should not be able to simply modify this stuff +eval { + ${$p->{_hparser_xs_state}} += 4; +}; +like($@, qr/^Modification of a read-only value attempted/); + + +my $x = delete $p->{_hparser_xs_state}; + +eval { + $p->xml_mode(1); +}; +like($@, qr/^Can't find '_hparser_xs_state'/); + +$p->{_hparser_xs_state} = \($$x + 16); + +eval { + $p->xml_mode(1); +}; +like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/); + +$p->{_hparser_xs_state} = 33; +eval { + $p->xml_mode(1); +}; +like($@, qr/^_hparser_xs_state element is not a reference/); + +$p->{_hparser_xs_state} = $x; + +ok($p->xml_mode(0)); diff --git a/t/marked-sect.t b/t/marked-sect.t new file mode 100644 index 0000000..6a63478 --- /dev/null +++ b/t/marked-sect.t @@ -0,0 +1,121 @@ +#!/usr/bin/perl -w + +use strict; +my $tag; +my $text; + +use HTML::Parser (); +my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"], + text_h => [sub { $text .= shift }, "dtext"], + ); + + +use Test::More tests => 14; + +SKIP: { +eval { + $p->marked_sections(1); +}; +skip $@, 14 if $@; + +$p->parse("<![[foo]]>"); +is($text, "foo"); + +$p->parse("<![TEMP INCLUDE[bar]]>"); +is($text, "foobar"); + +$p->parse("<![ INCLUDE -- IGNORE -- [foo<![IGNORE[bar]]>]]>\n<br>"); +is($text, "foobarfoo\n"); + +$text = ""; +$p->parse("<![ CDATA [<foo"); +$p->parse("<![IGNORE[bar]]>,bar>]]><br>"); +is($text, "<foo<![IGNORE[bar,bar>]]>"); + +$text = ""; +$p->parse("<![ RCDATA [å<a>]]><![CDATA[å<a>]]>å<a><br>"); +is($text, "å<a>å<a>å"); +is($tag, "br"); + +$text = ""; +$p->parse("<![INCLUDE RCDATA CDATA IGNORE [fooå<a>]]><br>"); +is($text, ""); + +$text = ""; +$p->parse("<![INCLUDE RCDATA CDATA [fooå<a>]]><br>"); +is($text, "fooå<a>"); + +$text = ""; +$p->parse("<![INCLUDE RCDATA [fooå<a>]]><br>"); +is($text, "fooå<a>"); + +$text = ""; +$p->parse("<![INCLUDE [fooå<a>]]><br>"); +is($text, "fooå"); + +$text = ""; +$p->parse("<![[fooå<a>]]><br>"); +is($text, "fooå"); + +# offsets/line/column numbers +$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"], + marked_sections => 1, + ); +$p->parse(<<'EOT')->eof; +<title>Test</title> +<![CDATA + [fooå<a> +]]> +<![[ +INCLUDE +STUFF +]]> + <h1>Test</h1> +EOT + +my @x; +sub x { + my($line, $col, $offset, $event, $text) = @_; + $text =~ s/\n/\\n/g; + $text =~ s/ /./g; + push(@x, "$line.$col:$offset $event \"$text\"\n"); +} + +#diag @x; +is(join("", @x), <<'EOT'); +1.0:0 start_document "" +1.0:0 start "<title>" +1.7:7 text "Test" +1.11:11 end "</title>" +1.19:19 text "\n" +3.3:32 text "fooå<a>\n" +4.3:49 text "\n" +5.4:54 text "\nINCLUDE\nSTUFF\n" +8.3:72 text "\n.." +9.2:75 start "<h1>" +9.6:79 text "Test" +9.10:83 end "</h1>" +9.15:88 text "\n" +10.0:89 end_document "" +EOT + +my $doc = "<Tag><![CDATA[This is cdata]]></Tag>"; +my $result = ""; +$p = HTML::Parser->new( + marked_sections => 1, + handlers => { + default => [ sub { $result .= join("",@_); }, "skipped_text,text" ] + } +)->parse($doc)->eof; +is($doc, $result); + +$text = ""; +$p = HTML::Parser->new( + text_h => [sub { $text .= shift }, "dtext"], + marked_sections => 1, +); + +$p->parse("<![CDATA[foo [1]]]>"); +is($text, "foo [1]", "CDATA text ending in square bracket"); + +} # SKIP diff --git a/t/msie-compat.t b/t/msie-compat.t new file mode 100644 index 0000000..a297f1e --- /dev/null +++ b/t/msie-compat.t @@ -0,0 +1,79 @@ +#!perl -w + +use strict; +use HTML::Parser; + +use Test::More tests => 4; + +my $TEXT = ""; +sub h +{ + my($event, $tagname, $text, @attr) = @_; + for ($event, $tagname, $text, @attr) { + if (defined) { + s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; + } + else { + $_ = "<undef>"; + } + } + + $TEXT .= "[$event,$tagname,$text," . join(":", @attr) . "]\n"; +} + +my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text,\@attr"]); +$p->parse("<a>"); +$p->parse("</a f>"); +$p->parse("</a 'foo<>' 'bar>' x>"); +$p->parse("</a \"foo<>\""); +$p->parse(" \"bar>\" x>"); +$p->parse("</ foo bar>"); +$p->parse("</ \"<>\" >"); +$p->parse("<!--comment>text<!--comment><p"); +$p->eof; + +is($TEXT, <<'EOT'); +[start_document,<undef>,,] +[start,a,<a>,] +[end,a,</a f>,] +[end,a,</a 'foo<>' 'bar>' x>,] +[end,a,</a "foo<>" "bar>" x>,] +[comment, foo bar,</ foo bar>,] +[comment, "<>" ,</ "<>" >,] +[comment,comment,<!--comment>,] +[text,<undef>,text,] +[comment,comment,<!--comment>,] +[comment,p,<p,] +[end_document,<undef>,,] +EOT + +$TEXT = ""; +$p->parse("<!comment>"); +$p->eof; + +is($TEXT, <<'EOT'); +[start_document,<undef>,,] +[comment,comment,<!comment>,] +[end_document,<undef>,,] +EOT + +$TEXT = ""; +$p->parse(q(<a name=`foo bar`>)); +$p->eof; + +is($TEXT, <<'EOT'); +[start_document,<undef>,,] +[start,a,<a name=`foo bar`>,name:`foo:bar`:bar`] +[end_document,<undef>,,] +EOT + +$p->backquote(1); +$TEXT = ""; +$p->parse(q(<a name=`foo bar`>)); +$p->eof; + +is($TEXT, <<'EOT'); +[start_document,<undef>,,] +[start,a,<a name=`foo bar`>,name:foo bar] +[end_document,<undef>,,] +EOT diff --git a/t/offset.t b/t/offset.t new file mode 100644 index 0000000..840728d --- /dev/null +++ b/t/offset.t @@ -0,0 +1,58 @@ +use strict; +use HTML::Parser (); +use Test::More tests => 1; + +my $HTML = <<'EOT'; + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html40/strict.dtd"> + +<foo bar baz=3>heisan +</foo> <?process> +<!-- comment --> +<xmp>xmp</xmp> + +EOT + +my $p = HTML::Parser->new(api_version => 3); + +my $sum_len = 0; +my $count = 0; +my $err; + +$p->handler(default => + sub { + my($offset, $length, $offset_end, $line, $col, $text) = @_; + my $copy = $text; + $copy =~ s/\n/\\n/g; + substr($copy, 30) = "..." if length($copy) > 32; + #diag sprintf ">>> %d.%d %s", $line, $col, $copy; + if ($offset != $sum_len) { + diag "offset mismatch $offset vs $sum_len"; + $err++; + } + if ($offset_end != $offset + $length) { + diag "offset_end $offset_end wrong"; + $err++; + } + if ($length != length($text)) { + diag "length mismatch"; + $err++; + } + if (substr($HTML, $offset, $length) ne $text) { + diag "content mismatch"; + $err++; + } + $sum_len += $length; + $count++; + }, + 'offset,length,offset_end,line,column,text'); + +for (split(//, $HTML)) { + $p->parse($_); +} +$p->eof; + +ok($count > 5 && !$err); + + diff --git a/t/options.t b/t/options.t new file mode 100644 index 0000000..ff5f7db --- /dev/null +++ b/t/options.t @@ -0,0 +1,36 @@ +# Test option setting methods + +use Test::More tests => 10; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3, + xml_mode => 1); +my $old; + +$old = $p->boolean_attribute_value("foo"); +ok(!defined $old); + +$old = $p->boolean_attribute_value(); +is($old, "foo"); + +$old = $p->boolean_attribute_value(undef); +is($old, "foo"); +ok(!defined($p->boolean_attribute_value)); + +ok($p->xml_mode(0)); +ok(!$p->xml_mode); + +my $seen_buggy_comment_warning; +$SIG{__WARN__} = + sub { + local $_ = shift; + $seen_buggy_comment_warning++ + if /^netscape_buggy_comment\(\) is deprecated/; + }; + +ok(!$p->strict_comment(1)); +ok($p->strict_comment); +ok(!$p->netscape_buggy_comment); +ok($seen_buggy_comment_warning); diff --git a/t/parsefile.t b/t/parsefile.t new file mode 100644 index 0000000..f373f06 --- /dev/null +++ b/t/parsefile.t @@ -0,0 +1,45 @@ +use Test::More tests => 6; + +my $filename = "file$$.htm"; +die "$filename is already there" if -e $filename; +open(FILE, ">$filename") || die "Can't create $filename: $!"; +print FILE <<'EOT'; close(FILE); +<title>Heisan</title> +EOT + +{ + package MyParser; + require HTML::Parser; + @ISA=qw(HTML::Parser); + + sub start + { + my($self, $tag, $attr) = @_; + Test::More::is($tag, "title"); + } +} + +MyParser->new->parse_file($filename); +open(FILE, $filename) || die; +MyParser->new->parse_file(*FILE); +seek(FILE, 0, 0) || die; +MyParser->new->parse_file(\*FILE); +close(FILE); + +require IO::File; +my $io = IO::File->new($filename) || die; +MyParser->new->parse_file($io); +$io->seek(0, 0) || die; +MyParser->new->parse_file(*$io); + +my $text = ''; +$io->seek(0, 0) || die; +MyParser->new( + start_h => [ sub{ shift->eof; }, "self" ], + text_h => [ sub{ $text = shift; }, "text" ])->parse_file(*$io); +ok(!$text); + +close($io); # needed because of bug in perl +undef($io); + +unlink($filename) or warn "Can't unlink $filename: $!"; diff --git a/t/parser.t b/t/parser.t new file mode 100644 index 0000000..0ce4d95 --- /dev/null +++ b/t/parser.t @@ -0,0 +1,184 @@ +use Test::More tests => 7; + +$HTML = <<'HTML'; + +<!DOCTYPE HTML> + +<body> + +Various entities. The parser must never break them in the middle: + +/ +/ +È +௖ + +å-Å + +<ul> +<li><a href="foo 'bar' baz>" id=33>This is a link</a> +<li><a href='foo "bar" baz> å' id=34>This is another one</a> +</ul> + +<p><div align="center"><img src="http://www.perl.com/perl.gif" +alt="camel"></div> + +<!-- this is +a comment --> and this is not. + +<!-- this is the kind of >comment< -- --> that Netscape hates --> + +< this > was not a tag. <this is/not either> + +</body> + +HTML + +#------------------------------------------------------------------- + +{ + package P; + require HTML::Parser; + @ISA=qw(HTML::Parser); + $OUT=''; + $COUNT=0; + + sub new + { + my $class = shift; + my $self = $class->SUPER::new; + $OUT = ''; + die "Can only have one" if $COUNT++; + $self; + } + + sub DESTROY + { + my $self = shift; + eval { $self->SUPER::DESTROY; }; + $COUNT--; + } + + sub declaration + { + my($self, $decl) = @_; + $OUT .= "[[$decl]]|"; + } + + sub start + { + my($self, $tag, $attr) = @_; + $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr); + $attr = "/$attr" if length $attr; + $OUT .= "<<$tag$attr>>|"; + } + + sub end + { + my($self, $tag) = @_; + $OUT .= ">>$tag<<|"; + } + + sub comment + { + my($self, $comment) = @_; + $OUT .= "##$comment##|"; + } + + sub text + { + my($self, $text) = @_; + #$text =~ s/\n/\\n/g; + #$text =~ s/\t/\\t/g; + #$text =~ s/ /·/g; + $OUT .= "$text|"; + } + + sub result + { + $OUT; + } +} + +for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") { +#for $chunksize (1) { + if ($chunksize =~ /^file/) { + #print "Parsing from $chunksize"; + } else { + #print "Parsing using $chunksize byte chunks"; + } + my $p = P->new; + + if ($chunksize =~ /^file/) { + # First we must create the file + my $tmpfile = "tmp-$$.html"; + my $file = $tmpfile; + die "$file already exists" if -e $file; + open(FILE, ">$file") or die "Can't create $file: $!"; + binmode FILE; + print FILE $HTML; + close(FILE); + + if ($chunksize eq "filehandle") { + require FileHandle; + my $fh = FileHandle->new($file) || die "Can't open $file: $!"; + $file = $fh; + } + + # then we can parse it. + $p->parse_file($file); + close $file if $chunksize eq "filehandle"; + unlink($tmpfile) || warn "Can't unlink $tmpfile: $!"; + } else { + my $copy = $HTML; + while (length $copy) { + my $chunk = substr($copy, 0, $chunksize); + substr($copy, 0, $chunksize) = ''; + $p->parse($chunk); + } + $p->eof; + } + + my $res = $p->result; + my $bad; + + # Then we start looking for things that should not happen + if ($res =~ /\s\|\s/) { + diag "broken space"; + $bad++; + } + for ( + # Make sure entities are not broken + '/', '/', 'È', '௖', '', 'å', 'Å', + + # Some elements that should be produced + "|[[DOCTYPE HTML]]|", + "|## this is\na comment ##|", + "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|", + '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>', + "|>>ul<<|", "|>>body<<|\n\n|", + ) + { + if (index($res, $_) < 0) { + diag "Can't find '$_' in parsed document"; + $bad++; + } + } + + diag $res if $bad || $ENV{PRINT_RESULTS}; + + # And we check that we get the same result all the time + $res =~ s/\|//g; # remove all break marks + if ($last_res && $res ne $last_res) { + diag "The result is not the same as last time"; + $bad++; + } + $last_res = $res; + + unless ($res =~ /Various entities/) { + diag "Some text must be missing"; + $bad++; + } + + ok(!$bad); +} diff --git a/t/plaintext.t b/t/plaintext.t new file mode 100644 index 0000000..9a53a78 --- /dev/null +++ b/t/plaintext.t @@ -0,0 +1,58 @@ +use Test::More tests => 3; + +use strict; +use HTML::Parser; + +my @a; +my $p = HTML::Parser->new(api_version => 3); +$p->handler(default => \@a, '@{event, text, is_cdata}'); +$p->parse(<<EOT)->eof; +<xmp><foo></xmp>x<plaintext><foo> +</plaintext> +foo +EOT + +for (@a) { + $_ = "" unless defined; +} + +my $doc = join(":", @a); + +#diag $doc; + +is($doc, "start_document:::start:<xmp>::text:<foo>:1:end:</xmp>::text:x::start:<plaintext>::text:<foo> +</plaintext> +foo +:1:end_document::"); + +@a = (); +$p->closing_plaintext('yep, emulate gecko'); +$p->parse(<<EOT)->eof; +<plaintext><foo> +</plaintext>foo<b></b> +EOT + +for (@a) { + $_ = "" unless defined; +} + +$doc = join(":", @a); + +#diag $doc; + +is($doc, "start_document:::start:<plaintext>::text:<foo> +:1:end:</plaintext>::text:foo::start:<b>::end:</b>::text: +::end_document::"); + +@a = (); +$p->closing_plaintext('yep, emulate gecko (2)'); +$p->parse(<<EOT)->eof; +<plaintext><foo> +foo<b></b> +EOT + +$doc = join(":", map { defined $_ ? $_ : "" } @a); + +is($doc, "start_document:::start:<plaintext>::text:<foo> +foo<b></b> +:1:end_document::"); @@ -0,0 +1,4 @@ +use Test::More; +eval "use Test::Pod 1.00"; +plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; +all_pod_files_ok(); diff --git a/t/process.t b/t/process.t new file mode 100644 index 0000000..9d27250 --- /dev/null +++ b/t/process.t @@ -0,0 +1,43 @@ +use strict; + +use Test::More tests => 12; + +my $pi; +my $orig; + +use HTML::Parser (); +my $p = HTML::Parser->new(process_h => [sub { $pi = shift; $orig = shift; }, + "token0,text"] + ); + +$p->parse("<a><?foo><a>"); + +is($pi, "foo"); +is($orig, "<?foo>"); + +$p->parse("<a><?><a>"); +is($pi, ""); +is($orig, "<?>"); + +$p->parse("<a><? +foo +><a>"); +is($pi, "\nfoo\n"); +is($orig, "<?\nfoo\n>"); + +for (qw(< a > < ? b a r > < a >)) { + $p->parse($_); +} + +is($pi, "bar"); +is($orig, "<?bar>"); + +$p->xml_mode(1); + +$p->parse("<a><?foo>bar??><a>"); +is($pi, "foo>bar?"); +is($orig, "<?foo>bar??>"); + +$p->parse("<a><??></a>"); +is($pi, ""); +is($orig, "<??>"); diff --git a/t/pullparser.t b/t/pullparser.t new file mode 100644 index 0000000..80a186b --- /dev/null +++ b/t/pullparser.t @@ -0,0 +1,55 @@ +use Test::More tests => 3; + +use HTML::PullParser; + +my $doc = <<'EOT'; +<title>Title</title> +<style> h1 { background: white } +<foo> +</style> +<H1 ID="3">Heading</H1> +<!-- ignore this --> + +This is a text with a <A HREF="http://www.sol.no" name="l1">link</a>. +EOT + +my $p = HTML::PullParser->new(doc => $doc, + start => 'event,tagname,@attr', + end => 'event,tagname', + text => 'event,dtext', + + ignore_elements => [qw(script style)], + unbroken_text => 1, + boolean_attribute_value => 1, + ); + +my $t = $p->get_token; +is($t->[0], "start"); +is($t->[1], "title"); +$p->unget_token($t); + +my @a; +while (my $t = $p->get_token) { + for (@$t) { + s/\s/./g; + } + push(@a, join("|", @$t)); +} + +my $res = join("\n", @a, ""); +#diag $res; +is($res, <<'EOT'); +start|title +text|Title +end|title +text|.. +start|h1|id|3 +text|Heading +end|h1 +text|...This.is.a.text.with.a. +start|a|href|http://www.sol.no|name|l1 +text|link +end|a +text|.. +EOT + diff --git a/t/script.t b/t/script.t new file mode 100644 index 0000000..2a75ccb --- /dev/null +++ b/t/script.t @@ -0,0 +1,41 @@ +#!perl -w + +use strict; +use Test; +plan tests => 1; + +use HTML::Parser; + +my $TEXT = ""; +sub h +{ + my($event, $tagname, $text) = @_; + for ($event, $tagname, $text) { + if (defined) { + s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; + } + else { + $_ = "<undef>"; + } + } + + $TEXT .= "[$event,$tagname,$text]\n"; +} + +my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"], empty_element_tags => 1); +$p->parse(q(<tr><td align="center" height="100"><script src="whatever"/><SCRIPT language="JavaScript1.1">bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');</SCRIPT></td></tr>)); +$p->eof; + +ok($TEXT, <<'EOT'); +[start_document,<undef>,] +[start,tr,<tr>] +[start,td,<td align="center" height="100">] +[start,script,<script src="whatever"/>] +[end,script,] +[start,script,<SCRIPT language="JavaScript1.1">] +[text,<undef>,bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');] +[end,script,</SCRIPT>] +[end,td,</td>] +[end,tr,</tr>] +[end_document,<undef>,] +EOT diff --git a/t/skipped-text.t b/t/skipped-text.t new file mode 100644 index 0000000..bc39915 --- /dev/null +++ b/t/skipped-text.t @@ -0,0 +1,89 @@ +use Test::More tests => 4; + +use strict; +use HTML::Parser; + +my $p = HTML::Parser->new(api_version => 3); + +$p->report_tags("a"); + +my @doc; + +$p->handler(start => \&a_handler, "skipped_text, text"); +$p->handler(end_document => \@doc, '@{skipped_text}'); + +$p->parse(<<EOT)->eof; +<title>hi</title> +<h1><a href="foo">link</a></h1> +and <a foo="">some</a> text. +EOT + +sub a_handler { + push(@doc, shift); + my $text = shift; + push(@doc, uc($text)); +} + + +is(join("", @doc), <<'EOT'); +<title>hi</title> +<h1><A HREF="FOO">link</a></h1> +and <A FOO="">some</a> text. +EOT + +# +# Comment stripper. Interaction with "" handlers. +# +my $doc = <<EOT; +<html>text</html> +<!-- comment --> +and some more <b>text</b>. +EOT +(my $expected = $doc) =~ s/<!--.*?-->//; + +$p = HTML::Parser->new(api_version => 3); +$p->handler(comment => ""); +$p->handler(end_document => sub { + my $stripped = shift; + #diag $stripped; + is($stripped, $expected); + }, "skipped_text"); +for (split(//, $doc)) { + $p->parse($_); +} +$p->eof; + +# +# Interaction with unbroken text +# +my @x; +$p = HTML::Parser->new(api_version => 3, unbroken_text => 1); +$p->handler(text => \@x, '@{"X", skipped_text, text}'); +$p->handler(end => ""); +$p->handler(end_document => \@x, '@{"Y", skipped_text}'); + +$doc = "a a<a>b b</a>c c<x>d d</x>e"; + +for (split(//, $doc)) { + $p->parse($_); +} +$p->eof; + +#diag join(":", @x); +is(join(":", @x), "X::a a:X:<a>:b bc c:X:<x>:d de:Y:"); + +# +# The crash that Chip found +# + +my $skipped; +$p = HTML::Parser->new( + ignore_tags => ["foo"], + start_h => [sub {$skipped = shift}, "skipped_text"], +); + +$p->parse("\x{100}<foo>"); +$p->parse("plain"); +$p->parse("<bar>"); +$p->eof; +is($skipped, "\x{100}<foo>plain"); diff --git a/t/stack-realloc.t b/t/stack-realloc.t new file mode 100644 index 0000000..46c7d35 --- /dev/null +++ b/t/stack-realloc.t @@ -0,0 +1,17 @@ +#!perl -w + +# HTML-Parser 3.33 and older used to core dump on this program because +# of missing SPAGAIN calls in parse() XS code. It was not prepared for +# the stack to get realloced. + +$| = 1; + +use Test::More tests => 1; + +use HTML::Parser; +my $x = HTML::Parser->new(api_version => 3); +my @row; +$x->handler(end => sub { push(@row, (1) x 505); 1 }, "tagname"); +$x->parse("</TD>"); + +pass; diff --git a/t/textarea.t b/t/textarea.t new file mode 100644 index 0000000..120f79b --- /dev/null +++ b/t/textarea.t @@ -0,0 +1,70 @@ +use Test::More tests => 1; + +use strict; +use HTML::Parser; + +my $html = <<'EOT'; +<html> +<title>This is a <nice> title</title> +<!--comment--> +<script language="perl">while (<DATA>) { & }</script> + +<FORM> + +<textarea name="foo" cols=50 rows=10> + +foo +<foo> +<!--comment--> +& +foo +</FORM> + +</textarea> + +</FORM> + +</html> +EOT + +my $dump = ""; +sub tdump { + my @a = @_; + for (@a) { + $_ = "<undef>" unless defined; + s/\n/\\n/g; + } + $dump .= join("|", @a) . "\n"; +} + +my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]); +$p->parse($html)->eof; + +#diag $dump; + +is($dump, <<'EOT'); +start_document||<undef>|<undef> +start|<html>|<undef>|<undef> +text|\n|\n| +start|<title>|<undef>|<undef> +text|This is a <nice> title|This is a <nice> title| +end|</title>|<undef>|<undef> +text|\n|\n| +comment|<!--comment-->|<undef>|<undef> +text|\n|\n| +start|<script language="perl">|<undef>|<undef> +text|while (<DATA>) { & }|while (<DATA>) { & }|1 +end|</script>|<undef>|<undef> +text|\n\n|\n\n| +start|<FORM>|<undef>|<undef> +text|\n\n|\n\n| +start|<textarea name="foo" cols=50 rows=10>|<undef>|<undef> +text|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n| +end|</textarea>|<undef>|<undef> +text|\n\n|\n\n| +end|</FORM>|<undef>|<undef> +text|\n\n|\n\n| +end|</html>|<undef>|<undef> +text|\n|\n| +end_document||<undef>|<undef> +EOT diff --git a/t/threads.t b/t/threads.t new file mode 100644 index 0000000..8da91e9 --- /dev/null +++ b/t/threads.t @@ -0,0 +1,39 @@ +# Verify thread safety. + +use Config; +use Test::More; + +BEGIN { + plan(skip_all => "Not configured for threads") + unless $Config{useithreads} && $] >= 5.008; + plan(tests => 1); +} + +use threads; +use HTML::Parser; + +my $ok=0; + +sub start +{ + my($tag,$attr)=@_; + + $ok += ($tag eq "foo"); + $ok += (defined($attr->{param}) && $attr->{param} eq "bar"); +} + +my $p = HTML::Parser->new + (api_version => 3, + handlers => { + start => [\&start, "tagname,attr"], + }); + +$p->parse("<foo pa"); + +$ok=async { + $p->parse("ram=bar>"); + $ok; +}->join(); + +is($ok,2); + diff --git a/t/tokeparser.t b/t/tokeparser.t new file mode 100644 index 0000000..2084201 --- /dev/null +++ b/t/tokeparser.t @@ -0,0 +1,164 @@ +use Test::More tests => 17; + +use strict; +use HTML::TokeParser; + +# First we create an HTML document to test + +my $file = "ttest$$.htm"; +die "$file already exists" if -e $file; + +open(F, ">$file") or die "Can't create $file: $!"; +print F <<'EOT'; close(F); + +<!--This is a test--> +<html><head><title> + This is the <title> +</title> + + <base href="http://www.perl.com"> +</head> + +<body background="bg.gif"> + + <h1>This is the <b>title</b> again + </h1> + + And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl"> <!--nice isn't it-->Institute</a> + + <br/><? process instruction > + +</body> +</html> + +EOT + +END { unlink($file) || warn "Can't unlink $file: $!"; } + + +my $p; + + +$p = HTML::TokeParser->new($file) || die "Can't open $file: $!"; +ok($p->unbroken_text); +if ($p->get_tag("foo", "title")) { + my $title = $p->get_trimmed_text; + #diag "Title: $title"; + is($title, "This is the <title>"); +} +undef($p); + +# Test with reference to glob +open(F, $file) || die "Can't open $file: $!"; +$p = HTML::TokeParser->new(\*F); +my $scount = 0; +my $ecount = 0; +my $tcount = 0; +my $pcount = 0; +while (my $token = $p->get_token) { + $scount++ if $token->[0] eq "S"; + $ecount++ if $token->[0] eq "E"; + $pcount++ if $token->[0] eq "PI"; +} +undef($p); +close F; + +# Test with glob +open(F, $file) || die "Can't open $file: $!"; +$p = HTML::TokeParser->new(*F); +$tcount++ while $p->get_tag; +undef($p); +close F; + +# Test with plain file name +$p = HTML::TokeParser->new($file) || die; +$tcount++ while $p->get_tag; +undef($p); + +#diag "Number of tokens found: $tcount/2 = $scount + $ecount"; +is($tcount, 34); +is($scount, 10); +is($ecount, 7); +is($pcount, 1); +is($tcount/2, $scount + $ecount); + +ok(!HTML::TokeParser->new("/noT/thEre/$$")); + + +$p = HTML::TokeParser->new($file) || die; +$p->get_tag("a"); +my $atext = $p->get_text; +undef($p); + +is($atext, "Perl\240Institute"); + +# test parsing of embeded document +$p = HTML::TokeParser->new(\<<HTML); +<title>Title</title> +<H1> +Heading +</h1> +HTML + +ok($p->get_tag("h1")); +is($p->get_trimmed_text, "Heading"); +undef($p); + +# test parsing of large embedded documents +my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022; + +#use Time::HiRes qw(time); +my $start = time; +$p = HTML::TokeParser->new(\$doc); +#diag "Construction time: ", time - $start; + +my $count; +while (my $t = $p->get_token) { + $count++ if $t->[0] eq "S"; +} +#diag "Parse time: ", time - $start; + +is($count, 2022); + +$p = HTML::TokeParser->new(\<<'EOT'); +<H1>This is a heading</H1> +This is s<b>o</b>me<hr>text. +<br /> +This is some more text. +<p> +This is even some more. +EOT + +$p->get_tag("/h1"); + +my $t = $p->get_trimmed_text("br", "p"); +is($t, "This is some text."); + +$p->get_tag; + +$t = $p->get_trimmed_text("br", "p"); +is($t,"This is some more text."); + +undef($p); + +$p = HTML::TokeParser->new(\<<'EOT'); +<H1>This is a <b>bold</b> heading</H1> +This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>. +<p> +This is even some more. +EOT + +$p->get_tag("h1"); + +$t = $p->get_phrase; +is($t, "This is a bold heading"); + +$t = $p->get_phrase; +is($t, ""); + +$p->get_tag; + +$t = $p->get_phrase; +is($t, "This is some italic text. This is some more text."); + +undef($p); diff --git a/t/uentities.t b/t/uentities.t new file mode 100644 index 0000000..36d5179 --- /dev/null +++ b/t/uentities.t @@ -0,0 +1,65 @@ +# Test Unicode entities + +use HTML::Entities; + +use Test::More tests => 26; + +SKIP: { +skip "This perl does not support Unicode or Unicode entities not selected", + 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT; + +is(decode_entities("&euro"), "&euro"); +is(decode_entities("€"), "\x{20AC}"); + +is(decode_entities("å"), "å"); +is(decode_entities("å"), "å"); + +is(decode_entities("񺄠"), chr(500000)); + +is(decode_entities("􏿽"), "\x{10FFFD}"); + +is(decode_entities(""), "\x{FFFC}"); + + +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities(""), ""); +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); + +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); + +is(decode_entities("&#ååå࿿"), "&#ååå\x{FFF}"); + +# This might fail when we get more than 64 bit UVs +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); + +my $err; +for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) { + my $a = join("", map chr, $_->[0] .. $_->[1]); + + my $e = encode_entities($a); + my $d = decode_entities($e); + + unless ($d eq $a) { + diag "Wrong decoding in range $_->[0] .. $_->[1]"; + # use Devel::Peek; Dump($a); Dump($d); + $err++; + } +} +ok(!$err); + + +is(decode_entities("��"), chr(0x100085)); + +is(decode_entities("�"), chr(0xFFFD)); + +is(decode_entities("\260’\260"), "\x{b0}\x{2019}\x{b0}"); +} diff --git a/t/unbroken-text.t b/t/unbroken-text.t new file mode 100644 index 0000000..7de85a9 --- /dev/null +++ b/t/unbroken-text.t @@ -0,0 +1,60 @@ +use strict; +use HTML::Parser; + +use Test::More tests => 3; + +my $text = ""; +sub text +{ + my $cdata = shift() ? "CDATA" : "TEXT"; + my($offset, $line, $col, $t) = @_; + $text .= "[$cdata:$offset:$line.$col:$t]"; +} + +sub tag +{ + $text .= shift; +} + +my $p = HTML::Parser->new(unbroken_text => 1, + text_h => [\&text, "is_cdata,offset,line,column,text"], + start_h => [\&tag, "text"], + end_h => [\&tag, "text"], + ); + +$p->parse("foo "); +$p->parse("bar "); +$p->parse("<foo>"); +$p->parse("bar\n"); +$p->parse("</foo>"); +$p->parse("<xmp>xmp</xmp>"); +$p->parse("atend"); + +#diag $text; +is($text, "[TEXT:0:1.0:foo bar ]<foo>[TEXT:13:1.13:bar\n]</foo><xmp>[CDATA:28:2.11:xmp]</xmp>"); + +$text = ""; +$p->eof; + +#diag $text; +is($text, "[TEXT:37:2.20:atend]"); + + +$p = HTML::Parser->new(unbroken_text => 1, + text_h => [\&text, "is_cdata,offset,line,column,text"], + ); + +$text = ""; +$p->parse("foo"); +$p->parse("<foo"); +$p->parse(">bar\n"); +$p->parse("foo<xm"); +$p->parse("p>xmp"); +$p->parse("</xmp"); +$p->parse(">bar"); +$p->eof; + +#diag $text; +is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]"); + + diff --git a/t/unicode-bom.t b/t/unicode-bom.t new file mode 100644 index 0000000..b7398cf --- /dev/null +++ b/t/unicode-bom.t @@ -0,0 +1,63 @@ +#!perl -w + +use strict; +use Test::More tests => 2; +use HTML::Parser; + +SKIP: { +skip "This perl does not support Unicode", 2 if $] < 5.008; + +my @parsed; +my $p = HTML::Parser->new( + api_version => 3, + start_h => [\@parsed, 'tag, attr'], +); + +my @warn; +$SIG{__WARN__} = sub { + push(@warn, $_[0]); +}; + +$p->parse("\xEF\xBB\xBF<head>Hi there</head>"); +$p->eof; + +#use Encode; +$p->parse("\xEF\xBB\xBF<head>Hi there</head>" . chr(0x263A)); +$p->eof; + +$p->parse("\xFF\xFE<head>Hi there</head>"); +$p->eof; + +$p->parse("\xFE\xFF<head>Hi there</head>"); +$p->eof; + +$p->parse("\0\0\xFF\xFE<head>Hi there</head>"); +$p->eof; + +$p->parse("\xFE\xFF\0\0<head>Hi there</head>"); +$p->eof; + +for (@warn) { + s/line (\d+)/line ##/g; +} + +is(join("", @warn), <<EOT); +Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##. +Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##. +Parsing of undecoded UTF-16 at $0 line ##. +Parsing of undecoded UTF-16 at $0 line ##. +Parsing of undecoded UTF-32 at $0 line ##. +Parsing of undecoded UTF-32 at $0 line ##. +EOT + +@warn = (); + +$p = HTML::Parser->new( + api_version => 3, + start_h => [\@parsed, 'tag'], +); + +$p->parse("\xEF\xBB\xBF<head>Hi there</head>"); +$p->eof; +ok(!@warn); +} diff --git a/t/unicode.t b/t/unicode.t new file mode 100644 index 0000000..911c547 --- /dev/null +++ b/t/unicode.t @@ -0,0 +1,198 @@ +#!perl -w + +use strict; +use HTML::Parser; +use Test::More; +BEGIN { + plan skip_all => "This perl does not support Unicode" if $] < 5.008; +} + +plan tests => 105; + +my @warn; +$SIG{__WARN__} = sub { + push(@warn, $_[0]); +}; + +my @parsed; +my $p = HTML::Parser->new( + api_version => 3, + default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'], +); + +my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile ☺</h1>\x{0420}"; +is(length($doc), 46); + +$p->parse($doc)->eof; + +#use Data::Dump; Data::Dump::dump(@parsed); + +is(@parsed, 9); +is($parsed[0][0], "start_document"); + +is($parsed[1][0], "start"); +is($parsed[1][1], "<title>"); +SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") }; +is($parsed[1][3], 0); +is($parsed[1][4], 7); + +is($parsed[2][0], "text"); +is(ord($parsed[2][1]), 0x263A); +is($parsed[2][2], chr(0x263A)); +is($parsed[2][3], 7); +is($parsed[2][4], 1); +is($parsed[2][5], 8); +is($parsed[2][6], 7); + +is($parsed[3][0], "end"); +is($parsed[3][1], "</title>"); +is($parsed[3][3], 8); +is($parsed[3][6], 8); + +is($parsed[4][0], "start"); +is($parsed[4][1], "<h1 id=\x{2600} f>"); +is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0"); +is($parsed[4][8]{id}, "\x{2600}"); + +is($parsed[5][0], "text"); +is($parsed[5][1], "Smile ☺"); +is($parsed[5][2], "Smile \x{263A}"); + +is($parsed[7][0], "text"); +is($parsed[7][1], "\x{0420}"); +is($parsed[7][2], "\x{0420}"); + +is($parsed[8][0], "end_document"); +is($parsed[8][3], length($doc)); +is($parsed[8][5], length($doc)); +is($parsed[8][6], length($doc)); +is(@warn, 0); + +# Try to parse it as an UTF8 encoded string +utf8::encode($doc); +is(length($doc), 51); + +@parsed = (); +$p->parse($doc)->eof; + +#use Data::Dump; Data::Dump::dump(@parsed); + +is(@parsed, 9); +is($parsed[0][0], "start_document"); + +is($parsed[1][0], "start"); +is($parsed[1][1], "<title>"); +SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") }; +is($parsed[1][3], 0); +is($parsed[1][4], 7); + +is($parsed[2][0], "text"); +is(ord($parsed[2][1]), 226); +is($parsed[2][1], "\xE2\x98\xBA"); +is($parsed[2][2], "\xE2\x98\xBA"); +is($parsed[2][3], 7); +is($parsed[2][4], 3); +is($parsed[2][5], 10); +is($parsed[2][6], 7); + +is($parsed[3][0], "end"); +is($parsed[3][1], "</title>"); +is($parsed[3][3], 10); +is($parsed[3][6], 10); + +is($parsed[4][0], "start"); +is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>"); +is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0"); +is($parsed[4][8]{id}, "\xE2\x98\x80"); + +is($parsed[5][0], "text"); +is($parsed[5][1], "Smile ☺"); +is($parsed[5][2], "Smile \x{263A}"); + +is($parsed[8][0], "end_document"); +is($parsed[8][3], length($doc)); +is($parsed[8][5], length($doc)); +is($parsed[8][6], length($doc)); + +is(@warn, 1); +like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/); + +my $file = "test-$$.html"; +open(my $fh, ">:utf8", $file) || die; +print $fh <<EOT; +\x{FEFF} +<title>\x{263A} Love! </title> +<h1 id=♥\x{2665}>♥ Love \x{2665}<h1> +EOT +close($fh) || die; + +@warn = (); +@parsed = (); +$p->parse_file($file); +is(@parsed, "11"); +is($parsed[6][0], "start"); +is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5"); +is($parsed[7][0], "text"); +is($parsed[7][1], "♥ Love \xE2\x99\xA5"); +is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage +is($parsed[10][3], -s $file); +is(@warn, 1); +like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/); + +@warn = (); +@parsed = (); +open($fh, "<:raw:utf8", $file) || die; +$p->parse_file($fh); +is(@parsed, "11"); +is($parsed[6][0], "start"); +is($parsed[6][8]{id}, "\x{2665}\x{2665}"); +is($parsed[7][0], "text"); +is($parsed[7][1], "♥ Love \x{2665}"); +is($parsed[7][2], "\x{2665} Love \x{2665}"); +is($parsed[10][3], (-s $file) - 2 * 4); +is(@warn, 0); + +@warn = (); +@parsed = (); +open($fh, "<:raw", $file) || die; +$p->utf8_mode(1); +$p->parse_file($fh); +is(@parsed, "11"); +is($parsed[6][0], "start"); +is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5"); +is($parsed[7][0], "text"); +is($parsed[7][1], "♥ Love \xE2\x99\xA5"); +is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5"); +is($parsed[10][3], -s $file); +is(@warn, 0); + +unlink($file); + +@parsed = (); +$p->parse(q(<a href="a=1&lang=2×=3">foo</a>))->eof; +is(@parsed, "5"); +is($parsed[1][0], "start"); +is($parsed[1][8]{href}, "a=1&lang=2\xd7=3"); + +ok(!HTML::Entities::_probably_utf8_chunk("")); +ok(!HTML::Entities::_probably_utf8_chunk("f")); +ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5")); +ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o")); +ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2")); +ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99")); +ok(!HTML::Entities::_probably_utf8_chunk("f\xE2")); +ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99")); + +$p = HTML::Parser->new( + api_version => 3, + default_h => [\@parsed, 'event, text, tag, attr'], + attr_encoded => 1, +); + +@warn = (); +@parsed = (); + +$p->parse($doc)->eof; + +ok(!@warn); +is(@parsed, 9); diff --git a/t/xml-mode.t b/t/xml-mode.t new file mode 100644 index 0000000..cdfc5b0 --- /dev/null +++ b/t/xml-mode.t @@ -0,0 +1,112 @@ +use strict; +use Test::More tests => 8; + +use HTML::Parser (); +my $p = HTML::Parser->new(xml_mode => 1, + ); + +my $text = ""; +$p->handler(start => + sub { + my($tag, $attr) = @_; + $text .= "S[$tag"; + for my $k (sort keys %$attr) { + my $v = $attr->{$k}; + $text .= " $k=$v"; + } + $text .= "]"; + }, "tagname,attr"); +$p->handler(end => + sub { + $text .= "E[" . shift() . "]"; + }, "tagname"); +$p->handler(process => + sub { + $text .= "PI[" . shift() . "]"; + }, "token0"); +$p->handler(text => + sub { + $text .= shift; + }, "text"); + +my $xml = <<'EOT'; +<?xml version="1.0"?> +<?IS10744:arch name="html"?><!-- comment --> +<DOC> +<title html="h1">My first architectual document</title> +<author html="address">Geir Ove Gronmo, grove@infotek.no</author> +<para>This is the first paragraph in this document</para> +<para html="p">This is the second paragraph</para> +<para/> +<xmp><foo></foo></xmp> +</DOC> +EOT + +$p->parse($xml)->eof; + +is($text, <<'EOT'); +PI[xml version="1.0"] +PI[IS10744:arch name="html"] +S[DOC] +S[title html=h1]My first architectual documentE[title] +S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author] +S[para]This is the first paragraph in this documentE[para] +S[para html=p]This is the second paragraphE[para] +S[para]E[para] +S[xmp]S[foo]E[foo]E[xmp] +E[DOC] +EOT + +$text = ""; +$p->xml_mode(0); +$p->parse($xml)->eof; + +is($text, <<'EOT'); +PI[xml version="1.0"?] +PI[IS10744:arch name="html"?] +S[doc] +S[title html=h1]My first architectual documentE[title] +S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author] +S[para]This is the first paragraph in this documentE[para] +S[para html=p]This is the second paragraphE[para] +S[para/] +S[xmp]<foo></foo>E[xmp] +E[doc] +EOT + +# Test that we get an empty tag back +$p = HTML::Parser->new(api_version => 3, + xml_mode => 1); + +$p->handler("end" => + sub { + my($tagname, $text) = @_; + is($tagname, "Xyzzy"); + ok(!length($text)); + }, "tagname,text"); +$p->parse("<Xyzzy foo=bar/>and some more")->eof; + +# Test that we get an empty tag back +$p = HTML::Parser->new(api_version => 3, + empty_element_tags => 1); + +$p->handler("end" => + sub { + my($tagname, $text) = @_; + is($tagname, "xyzzy"); + ok(!length($text)); + }, "tagname,text"); +$p->parse("<Xyzzy foo=bar/>and some more")->eof; + +$p = HTML::Parser->new( + api_version => 3, + xml_pic => 1, +); + +$p->handler( + "process" => sub { + my($text, $t0) = @_; + is($text, "<?foo > bar?>"); + is($t0, "foo > bar"); + }, "text, token0"); +$p->parse("<?foo > bar?> and then")->eof; |