diff options
Diffstat (limited to 't/parser.t')
-rw-r--r-- | t/parser.t | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/t/parser.t b/t/parser.t new file mode 100644 index 0000000..0ce4d95 --- /dev/null +++ b/t/parser.t @@ -0,0 +1,184 @@ +use Test::More tests => 7; + +$HTML = <<'HTML'; + +<!DOCTYPE HTML> + +<body> + +Various entities. The parser must never break them in the middle: + +/ +/ +È +௖ + +å-Å + +<ul> +<li><a href="foo 'bar' baz>" id=33>This is a link</a> +<li><a href='foo "bar" baz> å' id=34>This is another one</a> +</ul> + +<p><div align="center"><img src="http://www.perl.com/perl.gif" +alt="camel"></div> + +<!-- this is +a comment --> and this is not. + +<!-- this is the kind of >comment< -- --> that Netscape hates --> + +< this > was not a tag. <this is/not either> + +</body> + +HTML + +#------------------------------------------------------------------- + +{ + package P; + require HTML::Parser; + @ISA=qw(HTML::Parser); + $OUT=''; + $COUNT=0; + + sub new + { + my $class = shift; + my $self = $class->SUPER::new; + $OUT = ''; + die "Can only have one" if $COUNT++; + $self; + } + + sub DESTROY + { + my $self = shift; + eval { $self->SUPER::DESTROY; }; + $COUNT--; + } + + sub declaration + { + my($self, $decl) = @_; + $OUT .= "[[$decl]]|"; + } + + sub start + { + my($self, $tag, $attr) = @_; + $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr); + $attr = "/$attr" if length $attr; + $OUT .= "<<$tag$attr>>|"; + } + + sub end + { + my($self, $tag) = @_; + $OUT .= ">>$tag<<|"; + } + + sub comment + { + my($self, $comment) = @_; + $OUT .= "##$comment##|"; + } + + sub text + { + my($self, $text) = @_; + #$text =~ s/\n/\\n/g; + #$text =~ s/\t/\\t/g; + #$text =~ s/ /·/g; + $OUT .= "$text|"; + } + + sub result + { + $OUT; + } +} + +for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") { +#for $chunksize (1) { + if ($chunksize =~ /^file/) { + #print "Parsing from $chunksize"; + } else { + #print "Parsing using $chunksize byte chunks"; + } + my $p = P->new; + + if ($chunksize =~ /^file/) { + # First we must create the file + my $tmpfile = "tmp-$$.html"; + my $file = $tmpfile; + die "$file already exists" if -e $file; + open(FILE, ">$file") or die "Can't create $file: $!"; + binmode FILE; + print FILE $HTML; + close(FILE); + + if ($chunksize eq "filehandle") { + require FileHandle; + my $fh = FileHandle->new($file) || die "Can't open $file: $!"; + $file = $fh; + } + + # then we can parse it. + $p->parse_file($file); + close $file if $chunksize eq "filehandle"; + unlink($tmpfile) || warn "Can't unlink $tmpfile: $!"; + } else { + my $copy = $HTML; + while (length $copy) { + my $chunk = substr($copy, 0, $chunksize); + substr($copy, 0, $chunksize) = ''; + $p->parse($chunk); + } + $p->eof; + } + + my $res = $p->result; + my $bad; + + # Then we start looking for things that should not happen + if ($res =~ /\s\|\s/) { + diag "broken space"; + $bad++; + } + for ( + # Make sure entities are not broken + '/', '/', 'È', '௖', '', 'å', 'Å', + + # Some elements that should be produced + "|[[DOCTYPE HTML]]|", + "|## this is\na comment ##|", + "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|", + '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>', + "|>>ul<<|", "|>>body<<|\n\n|", + ) + { + if (index($res, $_) < 0) { + diag "Can't find '$_' in parsed document"; + $bad++; + } + } + + diag $res if $bad || $ENV{PRINT_RESULTS}; + + # And we check that we get the same result all the time + $res =~ s/\|//g; # remove all break marks + if ($last_res && $res ne $last_res) { + diag "The result is not the same as last time"; + $bad++; + } + $last_res = $res; + + unless ($res =~ /Various entities/) { + diag "Some text must be missing"; + $bad++; + } + + ok(!$bad); +} |