diff options
Diffstat (limited to 't/tokeparser.t')
-rw-r--r-- | t/tokeparser.t | 164 |
1 files changed, 164 insertions, 0 deletions
# t/tokeparser.t - exercises HTML::TokeParser against a small HTML fixture.
#
# The fixture is written to a temporary file so the parser can be constructed
# from a file name, a glob and a glob reference; later sections parse
# in-memory documents passed as scalar references.

use Test::More tests => 17;

use strict;
use warnings;
use HTML::TokeParser;

# First we create an HTML document to test

my $file = "ttest$$.htm";
my $created = 0;    # set once *we* have created $file; see the END block
die "$file already exists" if -e $file;

open(my $out, '>', $file) or die "Can't create $file: $!";
$created = 1;

# Markup characters inside the title and the non-breaking space in the link
# are written as entities: the assertions below expect the *decoded* text
# ("<title>" and "\240"), and the start-tag count assumes the inner "title"
# is text rather than a tag.
print {$out} <<'EOT';

<!--This is a test-->
<html><head><title>
  This is the &lt;title&gt;
</title>

  <base href="http://www.perl.com">
</head>

<body background="bg.gif">

    <h1>This is the <b>title</b> again
    </h1>

    And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl">&nbsp;<!--nice isn't it-->Institute</a>

    <br/><? process instruction >

</body>
</html>

EOT
close($out) or die "Can't write $file: $!";

# Only remove the fixture if this script created it; otherwise the
# "already exists" die above would delete somebody else's file on exit.
END {
    if ($created) {
        unlink($file) or warn "Can't unlink $file: $!";
    }
}


my $p;


$p = HTML::TokeParser->new($file) or die "Can't open $file: $!";
ok($p->unbroken_text, "unbroken_text is enabled on a new parser");
if ($p->get_tag("foo", "title")) {
    my $title = $p->get_trimmed_text;
    #diag "Title: $title";
    is($title, "This is the <title>", "title text is trimmed and decoded");
}
else {
    # Report the problem here instead of only as a plan mismatch.
    fail("no <title> start tag found in $file");
}
undef($p);

# Counters filled in by the three passes over the fixture below.
my $scount = 0;    # start-tag tokens
my $ecount = 0;    # end-tag tokens
my $tcount = 0;    # tags seen by get_tag (accumulated over two passes)
my $pcount = 0;    # processing-instruction tokens

# Test with reference to glob (the bareword handle is deliberate here)
open(F, '<', $file) or die "Can't open $file: $!";
$p = HTML::TokeParser->new(\*F);
while (my $token = $p->get_token) {
    my $type = $token->[0];
    $scount++ if $type eq "S";
    $ecount++ if $type eq "E";
    $pcount++ if $type eq "PI";
}
undef($p);
close(F);

# Test with glob
open(F, '<', $file) or die "Can't open $file: $!";
$p = HTML::TokeParser->new(*F);
$tcount++ while $p->get_tag;
undef($p);
close(F);

# Test with plain file name
$p = HTML::TokeParser->new($file) or die "Can't open $file: $!";
$tcount++ while $p->get_tag;
undef($p);

#diag "Number of tokens found: $tcount/2 = $scount + $ecount";
is($tcount, 34, "get_tag saw 34 tags over two passes");
is($scount, 10, "10 start tags");
is($ecount, 7,  "7 end tags");
is($pcount, 1,  "1 processing instruction");
is($tcount / 2, $scount + $ecount, "get_tag and get_token agree on the tag count");

ok(!HTML::TokeParser->new("/noT/thEre/$$"), "new() fails for a missing file");


$p = HTML::TokeParser->new($file) or die "Can't open $file: $!";
$p->get_tag("a");
my $atext = $p->get_text;
undef($p);

is($atext, "Perl\240Institute", "get_text uses the img alt text and decodes &nbsp;");

# test parsing of embedded document
$p = HTML::TokeParser->new(\<<'HTML');
<title>Title</title>
<H1>
Heading
</h1>
HTML

ok($p->get_tag("h1"), "get_tag matches an upper-case <H1>");
is($p->get_trimmed_text, "Heading", "trimmed heading text");
undef($p);

# test parsing of large embedded documents
my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022;

$p = HTML::TokeParser->new(\$doc);

my $count = 0;
while (my $token = $p->get_token) {
    $count++ if $token->[0] eq "S";
}

is($count, 2022, "every start tag of the large document is seen");

# get_trimmed_text with explicit end tags
$p = HTML::TokeParser->new(\<<'EOT');
<H1>This is a heading</H1>
This is s<b>o</b>me<hr>text.
<br />
This is some more text.
<p>
This is even some more.
EOT

$p->get_tag("/h1");

my $t = $p->get_trimmed_text("br", "p");
is($t, "This is some text.", "get_trimmed_text stops at <br>");

$p->get_tag;

$t = $p->get_trimmed_text("br", "p");
is($t, "This is some more text.", "get_trimmed_text stops at <p>");

undef($p);

# get_phrase
$p = HTML::TokeParser->new(\<<'EOT');
<H1>This is a <b>bold</b> heading</H1>
This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>.
<p>
This is even some more.
EOT

$p->get_tag("h1");

$t = $p->get_phrase;
is($t, "This is a bold heading", "get_phrase reads through phrase markup");

$t = $p->get_phrase;
is($t, "", "get_phrase returns an empty string when positioned at </h1>");

$p->get_tag;

$t = $p->get_phrase;
is($t, "This is some italic text. This is some more text.",
   "get_phrase reads across <br> and stops before <p>");

undef($p);