use Test::More tests => 17;
use strict;
use HTML::TokeParser;
# First we create an HTML document to test.
#
# NOTE(review): in the copy under review this fixture contained only text and
# no markup, which contradicts the assertions made on it further down
# (10 start tags, 7 end tags, 1 processing instruction, and the link text
# "Perl\240Institute").  The tags below are the minimal markup that satisfies
# those assertions -- please confirm against the upstream fixture.
my $file = "ttest$$.htm";
my $created = 0;    # becomes true once this script has created $file itself
die "$file already exists" if -e $file;
{
    open(my $fh, '>', $file) or die "Can't create $file: $!";
    $created = 1;
    print {$fh} <<'EOT';
<html><head><title>
This is the &lt;title&gt;
</title>
<base href="http://www.perl.com">
</head>
<body>
<h1>This is the <b>title</b> again
</h1>
And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl">&nbsp;Institute</a>
<br><? process instruction >
</body>
</html>
EOT
    # Buffered write errors only surface at close time, so check it.
    close($fh) or die "Can't close $file: $!";
}

# Clean up the fixture, but only if we created it: the "already exists" die
# above must not end up deleting a file that belongs to somebody else.
END { if ($created) { unlink($file) || warn "Can't unlink $file: $!"; } }

# Test with a plain file name; $p is reused by every section below.
my $p;
$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
ok($p->unbroken_text, 'unbroken_text is true on a freshly created parser');
{
    # Always run the assertion, even when no tag is found, so that a missing
    # <title> shows up as a failed test rather than as a silently skipped one.
    my $title;
    $title = $p->get_trimmed_text if $p->get_tag("foo", "title");
    #diag "Title: $title";
    is($title, "This is the <title>",
       'get_trimmed_text returns the decoded, trimmed title text');
}
undef($p);
# Test with reference to glob.
# The bareword handle F is deliberate: this section passes a glob reference
# (\*F) and then a plain glob (*F) to the constructor.  Only the open calls
# are switched to the three-argument form.
open(F, '<', $file) or die "Can't open $file: $!";
$p = HTML::TokeParser->new(\*F);
my $scount = 0;    # tokens of type "S"
my $ecount = 0;    # tokens of type "E"
my $tcount = 0;    # tags returned by get_tag, summed over the two passes below
my $pcount = 0;    # tokens of type "PI"
while (my $token = $p->get_token) {
    my $type = $token->[0];
    $scount++ if $type eq "S";
    $ecount++ if $type eq "E";
    $pcount++ if $type eq "PI";
}
undef($p);
close F;

# Test with glob
open(F, '<', $file) or die "Can't open $file: $!";
$p = HTML::TokeParser->new(*F);
$tcount++ while $p->get_tag;
undef($p);
close F;

# Test with plain file name
$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
$tcount++ while $p->get_tag;
undef($p);

#diag "Number of tokens found: $tcount/2 = $scount + $ecount";
is($tcount, 34, 'get_tag returned 34 tags over the two passes');
is($scount, 10, '10 "S" tokens seen by get_token');
is($ecount, 7, '7 "E" tokens seen by get_token');
is($pcount, 1, '1 "PI" token seen by get_token');
is($tcount/2, $scount + $ecount,
   'one get_tag pass yields as many tags as get_token yields S and E tokens');
# A path that does not exist must make the constructor return false.
ok(!HTML::TokeParser->new("/noT/thEre/$$"),
   'new() returns false for a file that does not exist');

# Text following the first <a> tag.  "\240" is the octal escape for
# character 160, so the expected string has a non-breaking space between
# the two words rather than an ordinary one.
$p = HTML::TokeParser->new($file) || die "Can't open $file: $!";
$p->get_tag("a");
my $atext = $p->get_text;
undef($p);
is($atext, "Perl\240Institute", 'get_text after <a> returns the link text');
# Test parsing of an embedded document (passed as a reference to a string).
#
# NOTE(review): the heredoc opener and the markup of this document were
# damaged in the copy under review (the statement did not compile).  It has
# been restored with the minimal markup the two assertions below require,
# keeping the surviving text and the HTML terminator -- please confirm
# against the upstream test.
$p = HTML::TokeParser->new(\<<'HTML');
<title>Title</title>
<h1>
Heading
</h1>
HTML
ok($p->get_tag("h1"), 'get_tag finds <h1> in a string-reference document');
is($p->get_trimmed_text, "Heading", 'trimmed text following <h1>');
undef($p);
# Test parsing of large embedded documents.
#
# NOTE(review): in the copy under review the repeated chunk contained no tag
# at all, so the "S" counter below could never reach 2022.  One start tag per
# chunk has been restored -- please confirm the exact markup against the
# upstream test.
my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022;
#use Time::HiRes qw(time);
my $start = time;    # only read by the commented-out timing diagnostics
$p = HTML::TokeParser->new(\$doc);
#diag "Construction time: ", time - $start;
my $count = 0;
while (my $token = $p->get_token) {
    $count++ if $token->[0] eq "S";
}
#diag "Parse time: ", time - $start;
is($count, 2022, 'one "S" token per repeated chunk of the large document');
# get_trimmed_text() with explicit stop tags.
#
# NOTE(review): the markup of this document was missing from the copy under
# review.  The tags below are the minimal ones the calls depend on: a </h1>
# to skip to, and a <br> and a <p> to stop at.  The surviving text lines are
# unchanged -- please confirm against the upstream test.
$p = HTML::TokeParser->new(\<<'EOT');
<h1>This is a heading</h1>
This is some
text.
<br>
This is some more text.
<p>
This is even some more.
EOT
$p->get_tag("/h1");
my $t = $p->get_trimmed_text("br", "p");
is($t, "This is some text.", 'trimmed text from </h1> up to <br>');
$p->get_tag;    # step over the tag that stopped the previous read
$t = $p->get_trimmed_text("br", "p");
is($t, "This is some more text.", 'trimmed text from <br> up to <p>');
undef($p);
# get_phrase(): text is collected across phrasal-level tags and stops at the
# first tag that is not phrasal-level.
#
# NOTE(review): the markup of this document was missing from the copy under
# review.  The tags below are a reconstruction that satisfies the three
# assertions: <h1>...</h1> around the heading, <b>/<i> around the words
# "bold"/"italic", a <br> between the two sentences and a <p> before the last
# line.  Please confirm against the upstream test.
$p = HTML::TokeParser->new(\<<'EOT');
<h1>This is a <b>bold</b> heading</h1>
This is some <i>italic</i> text.<br>
This is some more text.
<p>
This is even some more.
EOT
$p->get_tag("h1");
$t = $p->get_phrase;
is($t, "This is a bold heading", 'get_phrase reads through <b> up to </h1>');
$t = $p->get_phrase;
is($t, "", 'get_phrase returns an empty string when stopped at </h1>');
$p->get_tag;    # step over the tag that stopped the previous read
$t = $p->get_phrase;
is($t, "This is some italic text. This is some more text.",
   'get_phrase reads through <i> and <br> up to <p>');
undef($p);