diff options
Diffstat (limited to 'lib/HTML/Filter.pm')
-rw-r--r-- | lib/HTML/Filter.pm | 112 |
1 files changed, 112 insertions, 0 deletions
diff --git a/lib/HTML/Filter.pm b/lib/HTML/Filter.pm new file mode 100644 index 0000000..c5aa16e --- /dev/null +++ b/lib/HTML/Filter.pm @@ -0,0 +1,112 @@ +package HTML::Filter; + +use strict; +use vars qw(@ISA $VERSION); + +require HTML::Parser; +@ISA=qw(HTML::Parser); + +$VERSION = "3.57"; + +sub declaration { $_[0]->output("<!$_[1]>") } +sub process { $_[0]->output($_[2]) } +sub comment { $_[0]->output("<!--$_[1]-->") } +sub start { $_[0]->output($_[4]) } +sub end { $_[0]->output($_[2]) } +sub text { $_[0]->output($_[1]) } + +sub output { print $_[1] } + +1; + +__END__ + +=head1 NAME + +HTML::Filter - Filter HTML text through the parser + +=head1 NOTE + +B<This module is deprecated.> The C<HTML::Parser> now provides the +functionally of C<HTML::Filter> much more efficiently with the the +C<default> handler. + +=head1 SYNOPSIS + + require HTML::Filter; + $p = HTML::Filter->new->parse_file("index.html"); + +=head1 DESCRIPTION + +C<HTML::Filter> is an HTML parser that by default prints the +original text of each HTML element (a slow version of cat(1) basically). +The callback methods may be overridden to modify the filtering for some +HTML elements and you can override output() method which is called to +print the HTML text. + +C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that +the document should be given to the parser by calling the $p->parse() +or $p->parse_file() methods. + +=head1 EXAMPLES + +The first example is a filter that will remove all comments from an +HTML file. This is achieved by simply overriding the comment method +to do nothing. + + package CommentStripper; + require HTML::Filter; + @ISA=qw(HTML::Filter); + sub comment { } # ignore comments + +The second example shows a filter that will remove any E<lt>TABLE>s +found in the HTML file. We specialize the start() and end() methods +to count table tags and then make output not happen when inside a +table. + + package TableStripper; + require HTML::Filter; + @ISA=qw(HTML::Filter); + sub start + { + my $self = shift; + $self->{table_seen}++ if $_[0] eq "table"; + $self->SUPER::start(@_); + } + + sub end + { + my $self = shift; + $self->SUPER::end(@_); + $self->{table_seen}-- if $_[0] eq "table"; + } + + sub output + { + my $self = shift; + unless ($self->{table_seen}) { + $self->SUPER::output(@_); + } + } + +If you want to collect the parsed text internally you might want to do +something like this: + + package FilterIntoString; + require HTML::Filter; + @ISA=qw(HTML::Filter); + sub output { push(@{$_[0]->{fhtml}}, $_[1]) } + sub filtered_html { join("", @{$_[0]->{fhtml}}) } + +=head1 SEE ALSO + +L<HTML::Parser> + +=head1 COPYRIGHT + +Copyright 1997-1999 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut |