summary | refs | log | tree | commit | diff
path: root/eg/htext
diff options
context:
space:
mode:
Diffstat (limited to 'eg/htext')
-rwxr-xr-x  eg/htext  29
1 files changed, 29 insertions, 0 deletions
diff --git a/eg/htext b/eg/htext
new file mode 100755
index 0000000..e4d276d
--- /dev/null
+++ b/eg/htext
@@ -0,0 +1,29 @@
+#!/usr/bin/perl -w
+
+# Extract all plain text from an HTML file
+
+use strict;
+use HTML::Parser 3.00 ();
+
+my %inside;
+
+# Start/end tag handler: add $delta (+1 or -1) to the open count for $name.
+sub tag {
+    my ($name, $delta) = @_;
+    $inside{$name} += $delta;
+    print " ";    # TODO: not every tag should produce a space
+}
+
+# Text handler: print the chunk unless a <script> or <style> element is open.
+sub text {
+    return if grep { $inside{$_} } qw(script style);
+    print $_[0];
+}
+
+# Parse the file named by the first argument: tags go to tag(), text to text().
+HTML::Parser->new(
+    api_version     => 3,
+    handlers        => [ start => [ \&tag, "tagname, '+1'" ], end => [ \&tag, "tagname, '-1'" ],
+                         text  => [ \&text, "dtext" ] ],
+    marked_sections => 1,
+)->parse_file(shift) or die "Can't open file: $!\n";