diff options
-rw-r--r-- | Changes | 1664 | ||||
-rw-r--r-- | MANIFEST | 80 | ||||
-rw-r--r-- | META.json | 53 | ||||
-rw-r--r-- | META.yml | 30 | ||||
-rw-r--r-- | Makefile.PL | 64 | ||||
-rw-r--r-- | Parser.pm | 1240 | ||||
-rw-r--r-- | Parser.xs | 687 | ||||
-rw-r--r-- | README | 65 | ||||
-rw-r--r-- | TODO | 28 | ||||
-rwxr-xr-x | eg/hanchors | 48 | ||||
-rwxr-xr-x | eg/hdump | 23 | ||||
-rwxr-xr-x | eg/hform | 83 | ||||
-rwxr-xr-x | eg/hlc | 20 | ||||
-rwxr-xr-x | eg/hrefsub | 93 | ||||
-rwxr-xr-x | eg/hstrip | 65 | ||||
-rwxr-xr-x | eg/htext | 29 | ||||
-rwxr-xr-x | eg/htextsub | 28 | ||||
-rwxr-xr-x | eg/htitle | 21 | ||||
-rw-r--r-- | hints/solaris.pl | 4 | ||||
-rw-r--r-- | hparser.c | 1902 | ||||
-rw-r--r-- | hparser.h | 132 | ||||
-rw-r--r-- | lib/HTML/Entities.pm | 483 | ||||
-rw-r--r-- | lib/HTML/Filter.pm | 112 | ||||
-rw-r--r-- | lib/HTML/HeadParser.pm | 315 | ||||
-rw-r--r-- | lib/HTML/LinkExtor.pm | 185 | ||||
-rw-r--r-- | lib/HTML/PullParser.pm | 209 | ||||
-rw-r--r-- | lib/HTML/TokeParser.pm | 371 | ||||
-rwxr-xr-x | mkhctype | 57 | ||||
-rwxr-xr-x | mkpfunc | 28 | ||||
-rw-r--r-- | t/api_version.t | 22 | ||||
-rw-r--r-- | t/argspec-bad.t | 40 | ||||
-rw-r--r-- | t/argspec.t | 148 | ||||
-rw-r--r-- | t/argspec2.t | 21 | ||||
-rw-r--r-- | t/attr-encoded.t | 32 | ||||
-rw-r--r-- | t/callback.t | 49 | ||||
-rw-r--r-- | t/case-sensitive.t | 85 | ||||
-rw-r--r-- | t/cases.t | 105 | ||||
-rw-r--r-- | t/comment.t | 24 | ||||
-rw-r--r-- | t/crashme.t | 43 | ||||
-rw-r--r-- | t/declaration.t | 62 | ||||
-rw-r--r-- | t/default.t | 43 | ||||
-rw-r--r-- | t/document.t | 41 | ||||
-rw-r--r-- | t/dtext.t | 72 | ||||
-rw-r--r-- | t/entities.t | 213 | ||||
-rw-r--r-- | t/entities2.t | 57 | ||||
-rw-r--r-- | t/filter-methods.t | 205 | ||||
-rw-r--r-- | t/filter.t | 60 | ||||
-rw-r--r-- | t/handler-eof.t | 54 | ||||
-rw-r--r-- | t/handler.t | 67 | ||||
-rw-r--r-- | t/headparser-http.t | 20 | ||||
-rw-r--r-- | t/headparser.t | 200 | ||||
-rw-r--r-- | t/ignore.t | 27 | ||||
-rw-r--r-- | t/largetags.t | 38 | ||||
-rw-r--r-- | t/linkextor-base.t | 41 | ||||
-rw-r--r-- | t/linkextor-rel.t | 36 | ||||
-rw-r--r-- | t/magic.t | 41 | ||||
-rw-r--r-- | t/marked-sect.t | 121 | ||||
-rw-r--r-- | t/msie-compat.t | 79 | ||||
-rw-r--r-- | t/offset.t | 58 | ||||
-rw-r--r-- | t/options.t | 36 | ||||
-rw-r--r-- | t/parsefile.t | 45 | ||||
-rw-r--r-- | t/parser.t | 184 | ||||
-rw-r--r-- | t/plaintext.t | 58 | ||||
-rw-r--r-- | t/pod.t | 4 | ||||
-rw-r--r-- | t/process.t | 43 | ||||
-rw-r--r-- | t/pullparser.t | 55 | ||||
-rw-r--r-- | t/script.t | 41 | ||||
-rw-r--r-- | t/skipped-text.t | 89 | ||||
-rw-r--r-- | t/stack-realloc.t | 17 | ||||
-rw-r--r-- | t/textarea.t | 70 | ||||
-rw-r--r-- | t/threads.t | 39 | ||||
-rw-r--r-- | t/tokeparser.t | 164 | ||||
-rw-r--r-- | t/uentities.t | 65 | ||||
-rw-r--r-- | t/unbroken-text.t | 60 | ||||
-rw-r--r-- | t/unicode-bom.t | 63 | ||||
-rw-r--r-- | t/unicode.t | 198 | ||||
-rw-r--r-- | t/xml-mode.t | 112 | ||||
-rw-r--r-- | tokenpos.h | 49 | ||||
-rw-r--r-- | typemap | 5 | ||||
-rw-r--r-- | util.c | 311 |
80 files changed, 11931 insertions, 0 deletions
@@ -0,0 +1,1664 @@ +_______________________________________________________________________________ +2013-05-09 Release 3.71 + +Gisle Aas (1): + Transform ':' in headers to '-' [RT#80524] + + +_______________________________________________________________________________ +2013-03-28 Release 3.70 + +François Perrad (1): + Fix for cross-compiling with Buildroot + +Gisle Aas (1): + Comment typo fix + +Yves Orton (1): + Fix Issue #3 / RT #84144: HTML::Entities::decode_entities() needs + to call SV_CHECK_THINKFIRST() before checking READONLY flag + + +_______________________________________________________________________________ +2011-10-15 Release 3.69 + +Gisle Aas (4): + Documentation fix; encode_utf8 mixup [RT#71151] + Make it clearer that there are 2 (actually 3) options for handing "UTF-8 garbage" + Github is the official repo + Can't be bothered to try to fix the failures that occur on perl-5.6 + +Barbie (1): + fix to TokeParser to correctly handle option configuration + +Jon Jensen (1): + Aesthetic change: remove extra ; + +Ville Skyttä (1): + Trim surrounding whitespace from extracted URLs. + + +_______________________________________________________________________________ +2010-09-01 Release 3.68 + +Gisle Aas (1): + Declare the encoding of the POD to be utf8 + + +_______________________________________________________________________________ +2010-08-17 Release 3.67 + +Nicholas Clark (1): + bleadperl 2154eca7 breaks HTML::Parser 3.66 [RT#60368] + + +_______________________________________________________________________________ +2010-07-09 Release 3.66 + +Gisle Aas (1): + Fix entity decoding in utf8_mode for the title header + + +_______________________________________________________________________________ +2010-04-04 Release 3.65 + +Gisle Aas (1): + Eliminate buggy entities_decode_old + +Salvatore Bonaccorso (1): + Fixed endianness typo [RT#50811] + +Ville Skyttä (1): + Documentation fixes. + + +_______________________________________________________________________________ +2009-10-25 Release 3.64 + +Gisle Aas (5): + Convert files to UTF-8 + Don't allow decode_entities() to generate illegal Unicode chars + Copyright 2009 + Remove rendundant (repeated) test + Make parse_file() method use 3-arg open [RT#49434] + + + +_______________________________________________________________________________ +2009-10-22 Release 3.63 + +Gisle Aas (2): + Take more care to prepare the char range for encode_entities [RT#50170] + decode_entities confused by trailing incomplete entity + + + +_______________________________________________________________________________ +2009-08-13 Release 3.62 + +Ville Skyttä (4): + HTTP::Header doc typo fix. + Do not bother tracking style or script, they're ignored. + Bring HTML 5 head elements up to date with WD-html5-20090423. + Improve HeadParser performance. + +Gisle Aas (1): + Doc patch: Make it clearer what the return value from ->parse is + + + +_______________________________________________________________________________ +2009-06-20 Release 3.61 + +Gisle Aas (2): + Test that triggers the crash that Chip fixed + Complete documented list of literal tags + +Chip Salzenberg (1): + Avoid crash (referenced pend_text instead of skipped_text) + +Antonio Radici (1): + Reference HTML::LinkExttor [RT#43164] + + + +_______________________________________________________________________________ +2009-02-09 Release 3.60 + +Ville Skytta (5): + Spelling fixes. + Test multi-value headers. + Documentation improvements. + Do not terminate head parsing on the <object> element (added in HTML 4.0). + Add support for HTML 5 <meta charset> and new HEAD elements. + +Damyan Ivanov (1): + Short description of the htextsub example + +Mike South (1): + Suppress warning when encode_entities is called with undef [RT#27567] + +Zefram (1): + HTML::Parser doesn't compile with perl 5.8.0. + + + +_______________________________________________________________________________ +2008-11-24 Gisle Aas <gisle@ActiveState.com> + + Release 3.59 + + Restore perl-5.6 compatibility for HTML::HeadParser. + + Improved META.yml + + + +_______________________________________________________________________________ +2008-11-17 Gisle Aas <gisle@ActiveState.com> + + Release 3.58 + + Suppress "Parsing of undecoded UTF-8 will give garbage" warning + with attr_encoded [RT#29089] + + HTML::HeadParser: + - Recognize the Unicode BOM in utf8_mode as well [RT#27522] + - Avoid ending up with '/' keys attribute in Link headers. + + + +_______________________________________________________________________________ +2008-11-16 Gisle Aas <gisle@ActiveState.com> + + Release 3.57 + + The <iframe> element content is now parsed in literal mode. + + Parsing of <script> and <style> content ends on the first end tag + even when that tag was in a quoted string. That seems to be the + behaviour of all modern browsers. + + Implement backquote() attribute as requested by Alex Kapranoff. + + Test and documentation tweaks from Alex Kapranoff. + + + +_______________________________________________________________________________ +2007-01-12 Gisle Aas <gisle@ActiveState.com> + + Release 3.56 + + Cloning of parser state for compatibility with threads. + Fixed by Bo Lindbergh <blgl@hagernas.com>. + + Don't require whitespace between declaration tokens. + <http://rt.cpan.org/Ticket/Display.html?id=20864> + + + +_______________________________________________________________________________ +2006-07-10 Gisle Aas <gisle@ActiveState.com> + + Release 3.55 + + Treat <> at the end of document as text. Used to be + reported as a comment. + + Improved Firefox compatibility for bad HTML: + - Unclosed <script>, <style> are now treated as empty tags. + - Unclosed <textarea>, <xmp> and <plaintext> treat rest as text. + - Unclosed <title> closes at next tag. + + Make <!a'b> a comment by itself. + + + +_______________________________________________________________________________ +2006-04-28 Gisle Aas <gisle@ActiveState.com> + + Release 3.54 + + Yaakov Belch discovered yet another issue with <script> parsing. + Enabling of 'empty_element_tags' got the parser confused + if it found such a tag for elements that are normally parsed + in literal mode. Of these <script src="..."/> is the only + one likely to be found in documents. + <http://rt.cpan.org//Ticket/Display.html?id=18965> + + + +_______________________________________________________________________________ +2006-04-27 Gisle Aas <gisle@ActiveState.com> + + Release 3.53 + + When ignore_element was enabled it got confused if the + corresponding tags did not nest properly; the end tag + was treated it as if it was a start tag. + Found and fixed by Yaakov Belch <code@yaakovnet.net>. + <http://rt.cpan.org/Ticket/Display.html?id=18936> + + + +_______________________________________________________________________________ +2006-04-26 Gisle Aas <gisle@ActiveState.com> + + Release 3.52 + + Make sure the 'start_document' fires exactly once for + each document parsed. For earlier releases it did not + fire at all for empty documents and could fire multiple + times if parse was called with empty chunks. + + Documentation tweaks and typo fixes. + + + +_______________________________________________________________________________ +2006-03-22 Gisle Aas <gisle@ActiveState.com> + + Release 3.51 + + Named entities outside the Latin-1 range are now only expanded + when properly terminated with ";". This makes HTML::Parser + compatible with Firefox/Konqueror/MSIE when it comes to how these + entities are expanded in attribute values. Firefox does expand + unterminated non-Latin-1 entities in plain text, so here + HTML::Parser only stays compatible with Konqueror/MSIE. + Fixes <http://rt.cpan.org/Ticket/Display.html?id=17962>. + + Fixed some documentation typos spotted by <william@knowmad.com>. + <http://rt.cpan.org/Ticket/Display.html?id=18062> + + + +_______________________________________________________________________________ +2006-02-14 Gisle Aas <gisle@ActiveState.com> + + Release 3.50 + + The 3.49 release didn't compile with VC++ because it mixed code + and declarations. Fixed by Steve Hay <steve.hay@uk.radan.com>. + + + +_______________________________________________________________________________ +2006-02-08 Gisle Aas <gisle@ActiveState.com> + + Release 3.49 + + Events could sometimes still fire after a handler has signaled eof. + + Marked_sections with text ending in square bracket parsed wrong. + Fix provided by <paul.bijnens@xplanation.com>. + <http://rt.cpan.org/Ticket/Display.html?id=16749> + + + +_______________________________________________________________________________ +2005-12-02 Gisle Aas <gisle@ActiveState.com> + + Release 3.48 + + Enabling empty_element_tags by default for HTML::TokeParser + was a mistake. Reverted that change. + <http://rt.cpan.org/Ticket/Display.html?id=16164> + + When processing a document with "marked_sections => 1", the + skipped text missed the first 3 bytes "<![". + <http://rt.cpan.org/Ticket/Display.html?id=16207> + + + +2005-11-22 Gisle Aas <gisle@ActiveState.com> + + Release 3.47 + + Added empty_element_tags and xml_pic configuration + options. These make it possible to enable these XML + features without enabling the full XML-mode. + + The empty_element_tags is enabled by default for + HTML::TokeParser. + + + +2005-10-24 Gisle Aas <gisle@ActiveState.com> + + Release 3.46 + + Don't try to treat an literal as space. + This breaks Unicode parsing. + <http://rt.cpan.org/Ticket/Display.html?id=15068> + + The unbroken_text option is now on by default + for HTML::TokeParser. + + HTML::Entities::encode will now encode "'" by default. + + Improved report/ignore_tags documentation by + Norbert Kiesel <nkiesel@tbdnetworks.com>. + + Test suite now use Test::More, by + Norbert Kiesel <nkiesel@tbdnetworks.com>. + + Fix HTML::Entities typo spotted by + Stefan Funke <bundy@adm.arcor.net>. + + Faster load time with XSLoader (perl-5.6 or better now required). + + Fixed POD markup errors in some of the modules. + + + +2005-01-06 Gisle Aas <gisle@ActiveState.com> + + Release 3.45 + + Fix stack memory leak caused by missing PUTBACK. Only + code that used $p->parse(\&cb) form was affected. + Fix provided by Gurusamy Sarathy <gsar@sophos.com>. + + + +2004-12-28 Gisle Aas <gisle@ActiveState.com> + + Release 3.44 + + Fix confusion about nested quotes in <script> and <style> text. + + + +2004-12-06 Gisle Aas <gisle@ActiveState.com> + + Release 3.43 + + The SvUTF8 flag was not propagated correctly when replacing + unterminated entities. + + Fixed test failure because of missing binmode on Windows. + + + +2004-12-04 Gisle Aas <gisle@ActiveState.com> + + Release 3.42 + + Avoid sv_catpvn_utf8_upgrade() as that macro was not + available in perl-5.8.0. + Patch by Reed Russell <Russell.Reed@acxiom.com>. + + Add casts to suppress compilation warnings for char/U8 + mismatches. + + HTML::HeadParser will always push new header values. + This make sure we never loose old header values. + + + +2004-11-30 Gisle Aas <gisle@ActiveState.com> + + Release 3.41 + + Fix unresolved symbol error with perl-5.005. + + + +2004-11-29 Gisle Aas <gisle@ActiveState.com> + + Release 3.40 + + Make utf8_mode only available on perl-5.8 or better. It produced + garbage with older versions of perl. + + Emit warning if entities are decoded and something in the first + chunk looks like hi-bit UTF-8. Previously this warning was only + triggered for documents with BOM. + + + +2004-11-23 Gisle Aas <gisle@ActiveState.com> + + Release 3.39_92 + + More documentation of the Unicode issues. Moved around HTML::Parser + documentation a bit. + + New boolean option; $p->utf8_mode to allow parsing of raw UTF-8. + + Documented that HTML::Entities::decode_entities() can take multiple + arguments. + + Unterminated entities are now decoded in text (compatibility + with MSIE misfeature). + + Document HTML::Entities::_decode_entities(); this variation of the + decode_entities() function has been available for a long time, but + have not been documented until now. + + HTML::Entities::_decode_entities() can now be told to try to + expand unterminated entities. + + Simplified Makefile.PL + + + +2004-11-23 Gisle Aas <gisle@ActiveState.com> + + Release 3.39_91 + + The HTML::HeadParser will skip Unicode BOM. Previously it + would consider the <head> section done when it saw the BOM. + + The parser will look for Unicode BOM and give appropriate + warnings if the form found indicate trouble. + + If no matching end tag is found for <script>, <style>, <xmp> + <title>, <textarea> then generate one where the next tag + starts. + + For <script> and <style> recognize quoted strings and don't + consider end element if the corresponding end tag is found + inside such a string. + + + +2004-11-17 Gisle Aas <gisle@ActiveState.com> + + Release 3.39_90 + + The <title> element is now parsed in literal mode, which + means that other tags are not recognized until </title> has + been seen. + + Unicode support for perl-5.8 and better. + + Decoding Unicode entities always enabled; no longer a compile + time option. + + Propagation of UTF8 state on strings. + Patch contributed by John Gardiner Myers <jgmyers@proofpoint.com>. + + Calculate offsets and lengths in chars for Unicode strings. + + Fixed link typo in the HTML::TokeParser documentation. + + + +2004-11-11 Gisle Aas <gisle@ActiveState.com> + + Release 3.38 + + New boolean option; $p->closing_plaintext + Contributed by Alex Kapranoff <alex@kapranoff.ru> + + + +2004-11-10 Gisle Aas <gisle@ActiveState.com> + + Release 3.37 + + Improved handling of HTML encoded surrogate pairs and illegally + encoded Unicode; <http://rt.cpan.org/Ticket/Display.html?id=7785>. + Patch by John Gardiner Myers <jgmyers@proofpoint.com>. + + Avoid generating bad UTF8 strings when decoding entities + representing chars beyond #255 in 8-bit strings. Such bad + UTF8 sometimes made perl-5.8.5 and older segfault. + + Undocument v2 style subclassing in synopsis section. + + Internal cleanup: + + Make 'gcc -Wall' happier. + + Avoid modification of PVs during parsing of attrspec. + Another patch by John Gardiner Myers. + + + +2004-04-01 Gisle Aas <gisle@ActiveState.com> + + Release 3.36 + + Improved MSIE/Mozilla compatibility. If the same attribute + name repeats for a start tag, use the first value instead + of the last. Patch by Nick Duffek <html-parser@duffek.com>. + <http://rt.cpan.org/Ticket/Display.html?id=5472> + + + +2003-12-12 Gisle Aas <gisle@ActiveState.com> + + Release 3.35 + + Documentation fixes by Paul Croome <Paul.Croome@softwareag.com>. + + Removed redundant dSP. + + + +2003-10-27 Gisle Aas <gisle@ActiveState.com> + + Release 3.34 + + Fix segfault that happened when the parse callback caused + the stack to get reallocated. The original bug report was + <http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=217616> + + + +2003-10-14 Gisle Aas <gisle@ActiveState.com> + + Release 3.33 + + Perl 5.005 or better is now required. For some reason we get + a test failure with perl-5.004 and I don't really feel like + debugging that perl any more. Details about this failure can + be found at <http://rt.cpan.org/Ticket/Display.html?id=4065>. + + New HTML::TokeParser method called 'get_phrase'. It returns + all current text while ignoring any phrase-level markup. + + The HTML::TokeParser method 'get_text' now expands skipped + non-phrase-level tags as a single space. + + + +2003-10-10 Gisle Aas <gisle@ActiveState.com> + + Release 3.32 + + If the document parsed ended with some kind of unterminated markup, + then the parser state was not reset properly and this piece of markup + would show up in the beginning of the next document parsed. + <http://rt.cpan.org/Ticket/Display.html?id=3954> + + The get_text and get_trimmed_text methods of HTML::TokeParser can + now take multiple end tags as argument. Patch by <siegmann@tinbergen.nl> + at <http://rt.cpan.org/Ticket/Display.html?id=3166>. + + Various documentation tweaks. + + Included another example program: hdump + + + +2003-08-19 Gisle Aas <gisle@ActiveState.com> + + Release 3.31 + + The -DDEBUGGING fix in 3.30 was not really there :-( + + + +2003-08-17 Gisle Aas <gisle@ActiveState.com> + + Release 3.30 + + The previous release failed to compile on a -DDEBUGGING perl + like the one provided by Redhat 9. + + Got rid of references to perl-5.7. + + Further fixes to avoid warnings from Visual C. + Patch by Steve Hay <steve.hay@uk.radan.com>. + + + +2003-08-14 Gisle Aas <gisle@ActiveState.com> + + Release 3.29 + + Setting xml_mode now implies strict_names also for end tags. + + Avoid warning from Visual C. Patch by <gsar@activestate.com>. + + 64-bit fix from Doug Larrick <doug@ties.org> + http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=195500 + + Try to parse similar to Mozilla/MSIE in certain edge cases. + All these are outside of the official definition of HTML but + HTML spam often tries to take advantage of these. + + - New configuration attribute 'strict_end'. Unless enabled + we will allow end tags to contain extra words or stuff + that look like attributes before the '>'. This means that + tags like these: + + </foo foo="<ignored>"> + </foo ignored> + </foo ">" ignored> + + are now all parsed as a 'foo' end tag instead of text. + Even if the extra stuff looks like attributes they will not + be reported if requested via the 'attr' or 'tokens' argspecs + for the 'end' handler. + + - Parse '</:comment>' and '</ comment>' as comments unless + strict_comment is enabled. Previous versions of the parser + would report these as text. If these comments contain + quoted words prefixed by space or '=' these words can + contain '>' without terminating the comment. + + - Parse '<! "<>" foo>' as comment containing ' "<>" foo'. + Previous versions of the parser would terminate the comment + at the first '>' and report the rest as text. + + - Legacy comment mode: Parse with comments terminated with a + lone '>' if no '-->' is found before eof. + + - Incomplete tag at eof is reported as a 'comment' instead + of 'text' unless strict_comment is enabled. + + + +2003-04-16 Gisle Aas <gisle@ActiveState.com> + + Release 3.28 + + When 'strict_comment' is off (which it is by default) + treat anything that matches <!...> a comment. + + Should now be more efficient on threaded perls. + + + +2003-01-18 Gisle Aas <gisle@ActiveState.com> + + Release 3.27 + + Typo fixes to the documentation. + + HTML::Entities::escape_entities_numeric contributed + by Sean M. Burke <sburke@cpan.org>. + + Included one more example program 'hlc' that show + how to downcase all tags in an HTML file. + + + +2002-03-17 Gisle Aas <gisle@ActiveState.com> + + Release 3.26 + + Avoid core dump in some cases where the callback croaks. + The perl_call_method and perl_call_sv needs G_EVAL flag + to be safe. + + New parser attributes; 'attr_encoded' and 'case_sensitive'. + Contributed by Guy Albertelli II <guy@albertelli.com>. + + HTML::Entities + - don't encode \r by default as suggested by Sean M. Burke. + + HTML::HeadParser + - ignore empty http-equiv + - allow multiple <link> elements. Patch by + Timur I. Bakeyev <timur@gnu.org> + + Avoid warnings from bleadperl on the uentities test. + + + +2001-05-11 Gisle Aas <gisle@ActiveState.com> + + Release 3.25 + + Minor tweaks for build failures on perl5.004_04, perl-5.6.0, + and for macro clash under Windows. + + Improved parsing of <plaintext>... :-) + + + +2001-05-09 Gisle Aas <gisle@ActiveState.com> + + Release 3.24 + + $p->parse(CODE) + + New events: start_document, end_document + + New argspecs: skipped_text, offset_end + + The offset/line/column counters was not properly reset + after eof. + + + +2001-05-01 Gisle Aas <gisle@ActiveState.com> + + Release 3.23 + + If the $p->ignore_elements filter did not work as it should if + handlers for start/end events was not registered. + + + +2001-04-17 Gisle Aas <gisle@ActiveState.com> + + Release 3.22 + + The <textarea> element is now parsed in literal mode, i.e. no other tags + recognized until the </textarea> tag is seen. Unlike other literal elements, + the text content is not 'cdata'. + + The XML ' entity is decoded. It apos-char itself is still encoded as + ' as ' is not really an HTML tag, and not recognized by many HTML + browsers. + + + +2001-04-10 Gisle Aas <gisle@ActiveState.com> + + Release 3.21 + + Fix a memory leak which occurred when using filter methods. + + Avoid a few compiler warnings (DEC C): + - Trailing comma found in enumerator list + - "unsigned char" is not compatible with "const char". + + Doc update. + + + +2001-04-02 Gisle Aas <gisle@ActiveState.com> + + Release 3.20 + + Some minor documentation updates. + + + +2001-03-30 Gisle Aas <gisle@ActiveState.com> + + Release 3.19_94 + + Implemented 'tag', 'line', 'column' argspecs. + + HTML::PullParser doc update. + eg/hform is an example of HTML::PullParser usage. + + + +2001-03-27 Gisle Aas <gisle@ActiveState.com> + + Release 3.19_93 + + Shorten 'report_only_tags' to 'report_tags'. + I think it reads better. + + Bleadperl portability fixes. + + + +2001-03-25 Gisle Aas <gisle@ActiveState.com> + + Release 3.19_92 + + HTML::HeadParser made more efficient by using 'ignore_elements'. + + HTML::LinkExtor made more efficient by using 'report_only_tags'. + + HTML::TokeParser generalized into HTML::PullParser. HTML::PullParser + only support the get_token/unget_token interface of HTML::TokeParser, + but is more flexible because the information that make up an token + is customisable. HTML::TokeParser is made into an HTML::PullParser + subclass. + + + +2001-03-19 Gisle Aas <gisle@ActiveState.com> + + Release 3.19_91 + + Array references can be passed to the filter methods. Makes it easier + to use them as constructor options. + + Example programs updated to use filters. + + Reset ignored_element state on EOF. + + Documentation updates. + + The netscape_buggy_comment() method now generates mandatory warning + about its deprecation. + + + +2001-03-13 Gisle Aas <gisle@ActiveState.com> + + Release 3.19_90 + + This is an developer only release. It contains some new + experimental features. The interface to these might still change. + + Implemented filters to reduce the numbers of callbacks generated: + - $p->ignore_tags() + - $p->report_only_tags() + - $p->ignore_elements() + + New @attr argspec. Less overhead than 'attr' and allow + compatibility with XML::Parser style start events. + + The whole argspec can be wrapped up in @{...} to signal + flattening. Only makes a difference when the target is an + array. + + + +2001-03-09 Gisle Aas <gisle@ActiveState.com> + + Release 3.19 + + Avoid the entity2char global. That should make the module + more thread safe. Patch by Gurusamy Sarathy <gsar@ActiveState.com>. + + + +2001-02-24 Gisle Aas <gisle@ActiveState.com> + + Release 3.18 + + There was a C++ style comment left in util.c. Strict C + compilers do not like that kind of stuff. + + + +2001-02-23 Gisle Aas <gisle@ActiveState.com> + + Release 3.17 + + The 3.16 release broke MULTIPLICITY builds. Fixed. + + + +2001-02-22 Gisle Aas <gisle@ActiveState.com> + + Release 3.16 + + The unbroken_text option now works across ignored tags. + + Fix casting of pointers on some 64 bit platforms. + + Fix decoding of Unicode entities. Only optionally available for + perl-5.7.0 or better. + + Expose internal decode_entities() function at the Perl level. + + Reindented some code. + + + +2000-12-26 Gisle Aas <gisle@ActiveState.com> + + Release 3.15 + + HTML::TokeParser's get_tag() method now takes multiple + tags to match. Hopefully the documentation is also a bit clearer. + + #define PERL_NO_GET_CONTEXT: Should speed up things for thread + enabled versions of perl. + + Quote some more entities that also happens to be perl keywords. + This avoids warnings on perl-5.004. + + Unicode entities only triggered for perl-5.7.0 or higher. + + + +2000-12-03 Gisle Aas <gisle@ActiveState.com> + + Release 3.14 + + If a handler triggered by flushing text at eof called the + eof method then infinite recursion occurred. Fixed. + Bug discovered by Jonathan Stowe <gellyfish@gellyfish.com>. + + Allow <!doctype ...> to be parsed as declaration. + + + +2000-09-17 Gisle Aas <gisle@ActiveState.com> + + Release 3.13 + + Experimental support for decoding of Unicode entities. + + + +2000-09-14 Gisle Aas <gisle@ActiveState.com> + + Release 3.12 + + Some tweaks to get it to compile with "Optimierender Microsoft (R) + 32-Bit C/C++-Compiler, Version 12.00.8168, fuer x86." + Patch by Matthias Waldorf <matthias.waldorf@zoom.de>. + + HTML::Entities documentation spelling patch by + David Dyck <dcd@tc.fluke.com>. + + + +2000-08-22 Gisle Aas <gisle@ActiveState.com> + + Release 3.11 + + HTML::LinkExtor and eg/hrefsub now obtain %linkElements from + the HTML::Tagset module. + + + +2000-06-29 Gisle Aas <gisle@ActiveState.com> + + Release 3.10 + + Avoid core dump when stack gets relocated as the result of + text handler invocation while $p->unbroken_text is enabled. + Needed to refresh the stack pointer. + + + +2000-06-28 Gisle Aas <gisle@ActiveState.com> + + Release 3.09 + + Avoid core dump if somebody clobbers the aliased $self argument of + a handler. + + HTML::TokeParser documentation update suggested by + Paul Makepeace <Paul.Makepeace@realprogrammers.com>. + + + +2000-05-23 Gisle Aas <gisle@ActiveState.com> + + Release 3.08 + + Fix core dump for large start tags. + Bug spotted by Alexander Fraser <green795@hotmail.com> + + Added yet another example program: eg/hanchors + + Typo fix by Jamie McCarthy <jamie@mccarthy.org> + + + +2000-03-20 Gisle Aas <gisle@aas.no> + + Release 3.07 + + Fix perl5.004 builds (was broken in 3.06) + + Declaration parsing mode now only triggers for <!DOCTYPE ...> and + <!ENTITY ...>. Based on patch by la mouton <kero@3sheep.com>. + + + +2000-03-06 Gisle Aas <gisle@aas.no> + + Release 3.06 + + Multi-threading/MULTIPLICITY compilation fix. + Both Doug MacEachern <dougm@pobox.com> and + Matthias Urlichs <smurf@noris.net> provided a patch. + + Avoid some "statement not reached" warnings from picky + compilers. + + Remove final commas in enums as ANSI C does not allow + them and some compilers actually care. + Patch by James Walden <jamesw@ichips.intel.com> + + Added eg/htextsub example program. + + + +2000-01-22 Gisle Aas <gisle@aas.no> + + Release 3.05 + + Implemented $p->unbroken_text option + + Don't parse content of certain HTML elements as CDATA when + xml_mode is enabled. + + Offset was reported with wrong sign for text at end of chunk. + + + +2000-01-15 Gisle Aas <gisle@aas.no> + + Release 3.04 + + Backed out 3.03-patch that checked for legal handler and attribute + names in the HTML::Parser constructor. + + Documentation typo fixed by Michael. + + + +2000-01-14 Gisle Aas <gisle@aas.no> + + Release 3.03 + + We did not get out of comment mode for comments ending with an + odd number of "-" before ">". Patch by la mouton <kero@3sheep.com> + + Documentation patch by Michael. + + + +1999-12-21 Gisle Aas <gisle@aas.no> + + Release 3.02 + + Hide ~-magic IV-pointer to 'struct p_state' behind a reference. + This allow copying of the internal _hparser_xs_state element, and + will make HTML-Tree-0.61 work again. + + Introduced $p->init() which might be useful for subclasses that + only want the initialization part of the constructor. + + Filled out DIAGNOSTICS section of the HTML::Parser POD. + + + +1999-12-19 Gisle Aas <gisle@aas.no> + + Release 3.01 + + Rely on ~-magic instead of a DESTROY method to deallocate + the internal 'struct p_state'. This avoid memory leaks + when people simply wipe of the content of the object hash. + + One of the assertion in hparser.c had opposite logic. This made + the parser fail when compiled with a -DDEBUGGING perl. + + Don't assume any specific order of hash keys in the t/cases.t. + This test failed with some newer development releases of perl. + + + +1999-12-14 Gisle Aas <gisle@aas.no> + + Release 3.00 + + Documentation update (most of it from Michael) + + Minor patch to eg/hstrip so that it use a "" handler + instead of &ignore. + + Test suite patches from Michael + + + +1999-12-13 Gisle Aas <gisle@aas.no> + + Release 2.99_96 + + Patches from Michael: + + - A handler of "" means that the event will be ignored. + More efficient than using 'sub {}' as handler. + + - Don't use a perl hash for looking up argspec keywords. + + - Documentation tweaks. + + + +1999-12-09 Gisle Aas <gisle@aas.no> + + Release 2.99_95 (this is a 3.00 candidate) + + Fixed core dump when "<" was followed by an 8-bit character. + Spotted and test case provided by Doug MacEachern. Doug had + been running HTML-Parser-XS through more that 1 million urls that + had been downloaded via LWP. + + Handlers can now invoke $p->eof to request the parsing to terminate. + HTML::HeadParser has been simplified by taking advantage of this. + Also added a title-extraction example that uses this. + + Michael once again fixed my bad English in the HTML::Parser + documentation. + + netscape_buggy_comment will carp instead of warn + + updated TODO/README + + Documented that HTML::Filter is depreciated. + + Made backslash reserved in literal argspec strings. + + Added several new test scripts. + + + +1999-12-08 Gisle Aas <gisle@aas.no> + + Release 2.99_94 (should almost be a 3.00 candidate) + + Renamed 'cdata_flag' as 'is_cdata'. + + Dropped support for wrapping callback handler and argspec + in an array and passing a reference to $p->handler. It + created ambiguities when you want to pass a array as + handler destination and not update argspec. The wrapping + for constructor arguments are unchanged. + + Reworked the documentation after updates from Michael. + + Simplified internal check_handler(). It should probably simply + be inlined in handler() again. + + Added argspec 'length' and 'undef' + + Fix statement-less label. Fix suggested by Matthew Langford + <langfml@Eng.Auburn.EDU>. + + Added two more example programs: eg/hstrip and eg/htext. + + Various minor patches from Michael. + + + +1999-12-07 Gisle Aas <gisle@aas.no> + + Release 2.99_93 + + Documentation update + + $p->bool_attr_value renamed as $p->boolean_attribute_value + + Internal renaming: attrspec --> argspec + + Introduced internal 'enum argcode' in hparser.c + + Added eg/hrefsub + + + +1999-12-05 Gisle Aas <gisle@aas.no> + + Release 2.99_92 + + More documentation patches from Michael + + Renamed 'token1' as 'token0' as suggested by Michael + + For artificial end tags we now report 'tokens', but not 'tokenpos'. + + Boolean attribute values show up as (0, 0) in 'tokenpos' now. + + If $p->bool_attr_value is set it will influence 'tokens' + + Fix for core dump when parsing <a "> when $p->strict_names(0). + Based on fix by Michael. + + Will av_extend() the tokens/tokenspos arrays. + + New test suite script by Michael: t/attrspec.t + + + +1999-12-04 Gisle Aas <gisle@aas.no> + + Release 2.99_91 + + Implemented attrspec 'offset' + + Documentation patch from Michael + + Some more cleanup/updated TODO + + + +1999-12-03 Gisle Aas <gisle@aas.no> + + Release 2.99_90 (first beta for 3.00) + + Using "realloc" as a parameter name in grow_tokens created + problems for some people. Fix by Paul Schinder <schinder@pobox.com> + + Patch by Michael that makes array handler destinations really work. + + Patch by Michael that make HTML::TokeParser use this. This gave a + a speedup of about 80%. + + Patch by Michael that makes t/cases into a real test. + + Small HTML::Parser documentation patch by Michael. + + Renamed attrspec 'origtext' to 'text' and 'decoded_text' to 'dtext' + + Split up Parser.xs. Moved stuff into hparser.c and util.c + + Dropped html_ prefix from internal parser functions. + + Renamed internal function html_handle() as report_event(). + + + +1999-12-02 Gisle Aas <gisle@aas.no> + + Release 2.99_17 + + HTML::Parser documentation patch from Michael. + + Fix memory leaks in html_handler() + + Patch that makes an array legal as handler destination. + Also from Michael. + + The end of marked sections does not eat successive newline + any more. + + The artificial end event for empty tag in xml_mode did not + report an empty origtext. + + New constructor option: 'api_version' + + + +1999-12-01 Gisle Aas <gisle@aas.no> + + Release 2.99_16 + + Support "event" in argspec. It expands to the name of the + handler (minus "default"). + + Fix core dump for large start tags. The tokens_grow() routine + needed an adjustment. Added test for this; t/largstags.t. + + + +1999-11-30 Gisle Aas <gisle@aas.no> + + Release 2.99_15 + + Major restructuring/simplification of callback interface based on + initial work by Michael. The main news is that you now need to + tell what arguments you want to be provided to your callbacks. + + The following parser options has been eliminated: + + $p->decode_text_entities + $p->keep_case + $p->v2_compat + $p->pass_self + $p->attr_pos + + + +1999-11-26 Gisle Aas <gisle@aas.no> + + Release 2.99_14 + + Documentation update by Michael A. Chase. + + Fix for declaration parsing by Michael A. Chase. + + Workaround for perl5.004_05 bug. Can't return &PL_sv_undef. + + + +1999-11-22 Gisle Aas <gisle@aas.no> + + Release 2.99_13 + + New Parser.pm POD based on initial work by Michael A. Chase. + All new features should now be described. + + $p->callback(start => undef) will not reset the callback. + + $p->xml_mode() did not parse attributes correct because + HCTYPE_NOT_SPACE_EQ_SLASH_GT flag was never set. + + A few more tests. + + + +1999-11-18 Gisle Aas <gisle@aas.no> + + Release 2.99_12 + + Implemented $p->attr_pos attribute. This causes attr positions + within $origtext of the start tag to be reported instead of the + attribute values. The positions are reported as 4 numbers; end of + previous attr, start of this attr, start of attr value, and end of + attr. This should make substr() manipulations of $origtext easy. + + Implemented $p->unbroken_text attribute. This makes sure that + text segments are never broken and given back as separate text + callbacks. It delays text callbacks until some other markup + has been recognized. + + More English corrections by Michael A. Chase. + + HTML::LinkExtor now recognizes even more URI attributes as + suggested by Sean M. Burke <sburke@netadventure.net> + + Completed marked sections support. It is also now a compile + time decision if you want this supported or not. The only + drawback of enabling it should be a possible parsing speed + reduction. I have not measured this yet. + + The keys for callbacks initialized in the constructor are now + suffixed with "_cb". + + Renamed $p->pass_cbdata to $p->pass_self. + + Added magic number to the p_state struct. + + + +1999-11-17 Gisle Aas <gisle@aas.no> + + Release 2.99_11 + + Don't leak $@ modifications from HTML::Parser constructor. + + Included HTML::Parser POD. + + Marked sections almost work. CDATA and RCDATA should work. + + For tags that take us into literal_mode; <script>, <style>, + <xmp>, we did not recognize the end tag unless it was written + in all lower case. + + + +1999-11-16 Gisle Aas <gisle@aas.no> + + Release 2.99_10 + + The mkhctype and mkpfunc scripts were using \z inside RE. This + did not work for perl5.004. Replaced them with plain old + dollar signs. + + + +1999-11-15 Gisle Aas <gisle@aas.no> + + Release 2.99_09 + + Grammar fixes by Michael A. Chase <mchase@ix.netcom.com> + + Some more test suite patches for Win32 by Michael A. Chase + <mchase@ix.netcom.com> + + Implemented $p->strict_names attribute. By default we now + allow almost anything in tag and attribute names. This is much + closer to the behaviour of some popular browsers. This allows us + to parse broken tags like this example from the LWP mailing list: + <IMG ALIGN=MIDDLE SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> + + Introduced some tables in "hctype.h" and "pfunc.h". These + are built by the corresponding "mk..." script. + + + +1999-11-10 Gisle Aas <gisle@aas.no> + + Release 2.99_08 + + Make Parser.xs compile on perl5.004_05 too. + + New callback called 'default'. This will be called for any + document text no other callback shows an interest in. + + Patch by Michael A. Chase <mchase@ix.netcom.com> that should + help clean up files for the test suite on Win32. + + Can now set up various attributes with key/value pairs passed to + the constructor. + + $p->parse_file() will open the file in binmode() + + Pass complete processing instruction tag as second argument + to process callback. + + New boolean attribute v2_compat. This influences how attributes + are reported for start tags. + + HTML::Filter now filters process instructions too. + + Faster HTML::LinkExtor by taking advantage of the new + callback interface. The module now also uses URI.pm (instead + of the old URI::URL) to absolutize URIs. + + Faster HTML::TokeParser by taking advantage of new + accum interface. + + + +1999-11-09 Gisle Aas <gisle@aas.no> + + Release 2.99_07 + + Entities in attribute values are now always expanded. + + If you set the $p->decode_text_entities to a true value, then + you don't have to decode the text yourself. + + In xml_mode we don't report empty element tags as a start tag + with an extra parameter any more. Instead we generate an artificial + end tag. + + 'xml_mode' now implies 'keep_case'. + + The parser now keeps its own copy of the bool_attr_value value. + + Avoid memory leak for text callbacks + + Avoid using ERROR as a goto label. + + Introduced common internal accessor function for all boolean parser + attributes. + + Tweaks to make Parser.xs compile under perl5.004. + + + +1999-11-08 Gisle Aas <gisle@aas.no> + + Release 2.99_06 + + Internal fast decode_entities(). By using it we are able to make + the HTML::Entities::decode function 6 times faster than the old one + implemented in pure Perl. + + $p->bool_attr_value() can be set to influence the value that + boolean attributes will be assigned. The default is to assign + a value identical to the attribute name. + + Process instructions are reported as "PI" in @accum + + $p->xml_mode(1) modifies how processing instructions are terminated + and allows "/>" at the end of start tags. + + Turn off optimizations when compiling with gcc on Solaris. Avoids + what we believe to be a compiler bug. Should probably figure out + which versions of gcc have this bug. + + + +1999-11-05 Gisle Aas <gisle@aas.no> + + Release 2.99_05 + + The previous release did not even compile. I forgot to try 'make test' + before uploading. + + + +1999-11-05 Gisle Aas <gisle@aas.no> + + Release 2.99_04 + + Generalized <XMP>-support to cover all literal parsing. Currently + activated for <script>, <style>, <xmp> and <plaintext>. + + + +1999-11-05 Gisle Aas <gisle@aas.no> + + Release 2.99_03 + + <XMP>-support. + + Allow ":" in tag and attribute names + + Include rest of the HTML::* files from the old HTML::Parser + package. This should make testing easier. + + + +1999-11-04 Gisle Aas <gisle@aas.no> + + Release 2.99_02 + + Implemented keep_case() option. If this attribute is true, then + we don't lowercase tag and attribute names. + + Implemented accum() that takes an array reference. Tokens are + pushed onto this array instead of sent to callbacks. + + Implemented strict_comment(). + + + +1999-11-03 Gisle Aas <gisle@aas.no> + + Release 2.99_01 + + Baseline of XS implementation + + + +1999-11-05 Gisle Aas <gisle@aas.no> + + Release 2.25 + + Allow ":" in attribute names as a workaround for Microsoft Excel + 2000 which generates such files. + + Make deprecate warning if netscape_buggy_comment() method is + used. The method is used in strict_comment(). + + Avoid duplication of parse_file() method in HTML::HeadParser. + + + +1999-10-29 Gisle Aas <gisle@aas.no> + + Release 2.24 + + $p->parse_file() will not close a handle passed to it any more. + If passed a filename that can't be opened it will return undef + instead of raising an exception, and strings like "*STDIN" are not + treated as globs any more. + + HTML::LinkExtor knows about background attribute of <tables>. + Patch by Clinton Wong <clintdw@netcom.com> + + HTML::TokeParser will parse large inline strings much faster now. + The string holding the document must not be changed during parsing. + + + +1999-06-09 Gisle Aas <gisle@aas.no> + + Release 2.23 + + Documentation updates. + + + +1998-12-18 Gisle Aas <aas@sn.no> + + Release 2.22 + + Protect HTML::HeadParser from evil $SIG{__DIE__} hooks. + + + +1998-11-13 Gisle Aas <aas@sn.no> + + Release 2.21 + + HTML::TokeParser can now parse strings directly and does the + right thing if you pass it a GLOB. Based on patch by + Sami Itkonen <si@iki.fi>. + + HTML::Parser now allows space before and after "--" in Netscape + comments. Patch by Peter Orbaek <poe@daimi.au.dk>. + + + +1998-07-08 Gisle Aas <aas@sn.no> + + Release 2.20 + + Added HTML::TokeParser. Check it out! + + + +1998-07-07 Gisle Aas <aas@sn.no> + + Release 2.19 + + Don't end a text chunk with space when we try to avoid breaking up + words. + + + +1998-06-22 Gisle Aas <aas@sn.no> + + Release 2.18 + + HTML::HeadParser->parse_file will now stop parsing when the + <body> starts as it should. + + HTML::LinkExtor more easily subclassable by introducing the + $self->_found_link method. + + + +1998-04-28 Gisle Aas <aas@sn.no> + + Release 2.17 + + Never split words (a sequence of non-space) between two invocations + of $self->text. This is just a simplification of the code that tried + not to break entities. + + HTML::Parser->parse_file now use smaller chunks as already + suggested by the HTML::Parser documentation. + + + +1998-04-02 Gisle Aas <aas@sn.no> + + Release 2.16 + + The HTML::Parser could some times break hex entities (like ) + in the middle. + + Removed remaining forced dependencies on libwww-perl modules. It + means that all tests should now pass, even if libwww-perl was not + installed previously. + + More tests. + + + +1998-04-01 Gisle Aas <aas@sn.no> + + Release 2.14, HTML::* modules unbundled from libwww-perl-5.22. diff --git a/MANIFEST b/MANIFEST new file mode 100644 index 0000000..6b89983 --- /dev/null +++ b/MANIFEST @@ -0,0 +1,80 @@ +Changes History of this package +MANIFEST This file +Makefile.PL Will create 'Makefile' to build this extention +Parser.pm Bootstrap & documentation +Parser.xs XS glue +README The Instructions +TODO Ideas and things still left to do +eg/hanchors Extract all links from a document +eg/hdump Show how a document is parsed +eg/hform Parse <forms> using HTML::PullParser +eg/hlc Downcase tag and attribute names +eg/hrefsub Do substitutions on link attributes +eg/hstrip Stip away certains tags/elements and attributes +eg/htext Leave only the text +eg/htextsub Do substitutions only on the text content +eg/htitle Extract document title +hints/solaris.pl Avoid compiler bug +hparser.c Parser implementation +hparser.h Parser implementation (data structures) +lib/HTML/Entities.pm Encode and decode entities in strings +lib/HTML/Filter.pm HTML::Filter class +lib/HTML/HeadParser.pm HTML::HeadParser class +lib/HTML/LinkExtor.pm HTML::LinkExtor class +lib/HTML/PullParser.pm HTML::PullParser class +lib/HTML/TokeParser.pm HTML::TokeParser class +mkhctype Generates 'hctype.h' +mkpfunc Generates 'pfunc.h' +t/api_version.t Test api_version constructor option +t/argspec-bad.t Test various bad argspec arguments +t/argspec.t Test argspec +t/argspec2.t Test new argspecs @attr, @{...} +t/attr-encoded.t Test attr_encoded option +t/callback.t Use callback to get data +t/case-sensitive.t Test case_sensitive option +t/cases.t Test various interesting cases +t/comment.t Test comment parsing +t/crashme.t Parse random data +t/declaration.t Test declaration parsing +t/default.t Test default handler +t/document.t Test {start,end}_document behaviour +t/dtext.t Test dtext decoding of entities +t/entities.t Test encoding/decoding of entities +t/entities2.t Test _decode_entities() +t/filter-methods.t Test ignore_tags, ignore_elements methods. +t/filter.t Test HTML::Filter +t/handler-eof.t Test invocation of $p->eof in handlers +t/handler.t Test $p->handler method +t/headparser-http.t Test HTML::HeadParser +t/headparser.t Test HTML::HeadParser +t/ignore.t Test elements ignored by handler = '' or 0 +t/largetags.t Test with very large tags +t/linkextor-base.t Test HTML::LinkExtor +t/linkextor-rel.t Test HTML::LinkExtor +t/magic.t Test that checking magic head in p_state works +t/marked-sect.t Test marked section support +t/msie-compat.t Test some MSIE compatibility edge cases +t/offset.t Test attrspec offset +t/options.t Test set/get for various parser options +t/parsefile.t Test the $p->parse_file() method +t/parser.t Test HTML::Parser subclassing +t/pod.t Test pod correctness +t/plaintext.t Test parsing of <plaintext> +t/process.t Test process instruction support +t/pullparser.t Test HTML::PullParser +t/script.t Test parsing of <script> with quoted strings +t/skipped-text.t Test skipped_text argspec +t/stack-realloc.t Test that stack reallocation bug don't come back +t/textarea.t Test handling of <textarea> +t/threads.t Test thread safety +t/tokeparser.t Test HTML::TokeParser +t/uentities.t Test encoding/decoding of Unicode entities +t/unbroken-text.t Test unbroken_text option +t/unicode.t Test parsing of Unicode text +t/unicode-bom.t Test handling of the Unicode BOM character +t/xml-mode.t Test parsing in XML mode +tokenpos.h Dynamically sized token_pos arrays +typemap Convert between HTML::Parser and 'struct p_state' +util.c Some utility functions +META.yml Module YAML meta-data (added by MakeMaker) +META.json Module JSON meta-data (added by MakeMaker) diff --git a/META.json b/META.json new file mode 100644 index 0000000..9dfc448 --- /dev/null +++ b/META.json @@ -0,0 +1,53 @@ +{ + "abstract" : "HTML parser class", + "author" : [ + "Gisle Aas <gisle@activestate.com>" + ], + "dynamic_config" : 1, + "generated_by" : "ExtUtils::MakeMaker version 6.66, CPAN::Meta::Converter version 2.120921", + "license" : [ + "perl_5" + ], + "meta-spec" : { + "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec", + "version" : "2" + }, + "name" : "HTML-Parser", + "no_index" : { + "directory" : [ + "t", + "inc" + ] + }, + "prereqs" : { + "build" : { + "requires" : { + "ExtUtils::MakeMaker" : "0", + "Test::More" : "0" + } + }, + "configure" : { + "requires" : { + "ExtUtils::MakeMaker" : "0" + } + }, + "runtime" : { + "recommends" : { + "HTTP::Headers" : "0" + }, + "requires" : { + "HTML::Tagset" : "3", + "XSLoader" : "0", + "perl" : "5.008" + } + } + }, + "release_status" : "stable", + "resources" : { + "repository" : { + "url" : "http://github.com/gisle/html-parser" + }, + "x_MailingList" : "mailto:libwww@perl.org" + }, + "version" : "3.71" +} diff --git a/META.yml b/META.yml new file mode 100644 index 0000000..dd2496d --- /dev/null +++ b/META.yml @@ -0,0 +1,30 @@ +--- +abstract: 'HTML parser class' +author: + - 'Gisle Aas <gisle@activestate.com>' +build_requires: + ExtUtils::MakeMaker: 0 + Test::More: 0 +configure_requires: + ExtUtils::MakeMaker: 0 +dynamic_config: 1 +generated_by: 'ExtUtils::MakeMaker version 6.66, CPAN::Meta::Converter version 2.120921' +license: perl +meta-spec: + url: http://module-build.sourceforge.net/META-spec-v1.4.html + version: 1.4 +name: HTML-Parser +no_index: + directory: + - t + - inc +recommends: + HTTP::Headers: 0 +requires: + HTML::Tagset: 3 + XSLoader: 0 + perl: 5.008 +resources: + repository: http://github.com/gisle/html-parser + x_MailingList: mailto:libwww@perl.org +version: 3.71 diff --git a/Makefile.PL b/Makefile.PL new file mode 100644 index 0000000..a641658 --- /dev/null +++ b/Makefile.PL @@ -0,0 +1,64 @@ +use strict; +use ExtUtils::MakeMaker; + +WriteMakefile( + NAME => 'HTML::Parser', + VERSION_FROM => 'Parser.pm', + ABSTRACT_FROM => 'Parser.pm', + AUTHOR => 'Gisle Aas <gisle@activestate.com>', + LICENSE => 'perl', + + MIN_PERL_VERSION => 5.008, + PREREQ_PM => { + 'HTML::Tagset' => 3, + 'XSLoader' => 0, + }, + META_MERGE => { + build_requires => { 'Test::More' => 0 }, + recommends => { 'HTTP::Headers' => 0 }, + resources => { + repository => 'http://github.com/gisle/html-parser', + MailingList => 'mailto:libwww@perl.org', + } + }, + + DEFINE => "-DMARKED_SECTION", + H => [ "hparser.h", "hctype.h", "tokenpos.h", "pfunc.h", + "hparser.c", "util.c", + ], + clean => { FILES => 'hctype.h pfunc.h' }, +); + + +sub MY::postamble +{ + ' +pfunc.h : mkpfunc + $(PERLRUN) mkpfunc >pfunc.h + +hctype.h : mkhctype + $(PERLRUN) mkhctype >hctype.h +' +} + +BEGIN { + # compatibility with older versions of MakeMaker + my $developer = -f "MANIFEST.SKIP"; + my %mm_req = ( + LICENCE => 6.31, + META_MERGE => 6.45, + META_ADD => 6.45, + MIN_PERL_VERSION => 6.48, + ); + undef(*WriteMakefile); + *WriteMakefile = sub { + my %arg = @_; + for (keys %mm_req) { + unless (eval { ExtUtils::MakeMaker->VERSION($mm_req{$_}) }) { + warn "$_ $@" if $developer; + delete $arg{$_}; + } + } + ExtUtils::MakeMaker::WriteMakefile(%arg); + }; +} diff --git a/Parser.pm b/Parser.pm new file mode 100644 index 0000000..42dd1a1 --- /dev/null +++ b/Parser.pm @@ -0,0 +1,1240 @@ +package HTML::Parser; + +# Copyright 1996-2009, Gisle Aas. +# Copyright 1999-2000, Michael A. Chase. +# +# This library is free software; you can redistribute it and/or +# modify it under the same terms as Perl itself. + +use strict; +use vars qw($VERSION @ISA); + +$VERSION = "3.71"; + +require HTML::Entities; + +require XSLoader; +XSLoader::load('HTML::Parser', $VERSION); + +sub new +{ + my $class = shift; + my $self = bless {}, $class; + return $self->init(@_); +} + + +sub init +{ + my $self = shift; + $self->_alloc_pstate; + + my %arg = @_; + my $api_version = delete $arg{api_version} || (@_ ? 3 : 2); + if ($api_version >= 4) { + require Carp; + Carp::croak("API version $api_version not supported " . + "by HTML::Parser $VERSION"); + } + + if ($api_version < 3) { + # Set up method callbacks compatible with HTML-Parser-2.xx + $self->handler(text => "text", "self,text,is_cdata"); + $self->handler(end => "end", "self,tagname,text"); + $self->handler(process => "process", "self,token0,text"); + $self->handler(start => "start", + "self,tagname,attr,attrseq,text"); + + $self->handler(comment => + sub { + my($self, $tokens) = @_; + for (@$tokens) { + $self->comment($_); + } + }, "self,tokens"); + + $self->handler(declaration => + sub { + my $self = shift; + $self->declaration(substr($_[0], 2, -1)); + }, "self,text"); + } + + if (my $h = delete $arg{handlers}) { + $h = {@$h} if ref($h) eq "ARRAY"; + while (my($event, $cb) = each %$h) { + $self->handler($event => @$cb); + } + } + + # In the end we try to assume plain attribute or handler + while (my($option, $val) = each %arg) { + if ($option =~ /^(\w+)_h$/) { + $self->handler($1 => @$val); + } + elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) { + require Carp; + Carp::croak("Bad constructor option '$option'"); + } + else { + $self->$option($val); + } + } + + return $self; +} + + +sub parse_file +{ + my($self, $file) = @_; + my $opened; + if (!ref($file) && ref(\$file) ne "GLOB") { + # Assume $file is a filename + local(*F); + open(F, "<", $file) || return undef; + binmode(F); # should we? good for byte counts + $opened++; + $file = *F; + } + my $chunk = ''; + while (read($file, $chunk, 512)) { + $self->parse($chunk) || last; + } + close($file) if $opened; + $self->eof; +} + + +sub netscape_buggy_comment # legacy +{ + my $self = shift; + require Carp; + Carp::carp("netscape_buggy_comment() is deprecated. " . + "Please use the strict_comment() method instead"); + my $old = !$self->strict_comment; + $self->strict_comment(!shift) if @_; + return $old; +} + +# set up method stubs +sub text { } +*start = \&text; +*end = \&text; +*comment = \&text; +*declaration = \&text; +*process = \&text; + +1; + +__END__ + + +=head1 NAME + +HTML::Parser - HTML parser class + +=head1 SYNOPSIS + + use HTML::Parser (); + + # Create parser object + $p = HTML::Parser->new( api_version => 3, + start_h => [\&start, "tagname, attr"], + end_h => [\&end, "tagname"], + marked_sections => 1, + ); + + # Parse document text chunk by chunk + $p->parse($chunk1); + $p->parse($chunk2); + #... + $p->eof; # signal end of document + + # Parse directly from file + $p->parse_file("foo.html"); + # or + open(my $fh, "<:utf8", "foo.html") || die; + $p->parse_file($fh); + +=head1 DESCRIPTION + +Objects of the C<HTML::Parser> class will recognize markup and +separate it from plain text (alias data content) in HTML +documents. As different kinds of markup and text are recognized, the +corresponding event handlers are invoked. + +C<HTML::Parser> is not a generic SGML parser. We have tried to +make it able to deal with the HTML that is actually "out there", and +it normally parses as closely as possible to the way the popular web +browsers do it instead of strictly following one of the many HTML +specifications from W3C. Where there is disagreement, there is often +an option that you can enable to get the official behaviour. + +The document to be parsed may be supplied in arbitrary chunks. This +makes on-the-fly parsing as documents are received from the network +possible. + +If event driven parsing does not feel right for your application, you +might want to use C<HTML::PullParser>. This is an C<HTML::Parser> +subclass that allows a more conventional program structure. + + +=head1 METHODS + +The following method is used to construct a new C<HTML::Parser> object: + +=over + +=item $p = HTML::Parser->new( %options_and_handlers ) + +This class method creates a new C<HTML::Parser> object and +returns it. Key/value argument pairs may be provided to assign event +handlers or initialize parser options. The handlers and parser +options can also be set or modified later by the method calls described below. + +If a top level key is in the form "<event>_h" (e.g., "text_h") then it +assigns a handler to that event, otherwise it initializes a parser +option. The event handler specification value must be an array +reference. Multiple handlers may also be assigned with the 'handlers +=> [%handlers]' option. See examples below. + +If new() is called without any arguments, it will create a parser that +uses callback methods compatible with version 2 of C<HTML::Parser>. +See the section on "version 2 compatibility" below for details. + +The special constructor option 'api_version => 2' can be used to +initialize version 2 callbacks while still setting other options and +handlers. The 'api_version => 3' option can be used if you don't want +to set any options and don't want to fall back to v2 compatible +mode. + +Examples: + + $p = HTML::Parser->new(api_version => 3, + text_h => [ sub {...}, "dtext" ]); + +This creates a new parser object with a text event handler subroutine +that receives the original text with general entities decoded. + + $p = HTML::Parser->new(api_version => 3, + start_h => [ 'my_start', "self,tokens" ]); + +This creates a new parser object with a start event handler method +that receives the $p and the tokens array. + + $p = HTML::Parser->new(api_version => 3, + handlers => { text => [\@array, "event,text"], + comment => [\@array, "event,text"], + }); + +This creates a new parser object that stores the event type and the +original text in @array for text and comment events. + +=back + +The following methods feed the HTML document +to the C<HTML::Parser> object: + +=over + +=item $p->parse( $string ) + +Parse $string as the next chunk of the HTML document. Handlers invoked should +not attempt to modify the $string in-place until $p->parse returns. + +If an invoked event handler aborts parsing by calling $p->eof, then $p->parse() +will return a FALSE value. Otherwise the return value is a reference to the +parser object ($p). + +=item $p->parse( $code_ref ) + +If a code reference is passed as the argument to be parsed, then the +chunks to be parsed are obtained by invoking this function repeatedly. +Parsing continues until the function returns an empty (or undefined) +result. When this happens $p->eof is automatically signaled. + +Parsing will also abort if one of the event handlers calls $p->eof. + +The effect of this is the same as: + + while (1) { + my $chunk = &$code_ref(); + if (!defined($chunk) || !length($chunk)) { + $p->eof; + return $p; + } + $p->parse($chunk) || return undef; + } + +But it is more efficient as this loop runs internally in XS code. + +=item $p->parse_file( $file ) + +Parse text directly from a file. The $file argument can be a +filename, an open file handle, or a reference to an open file +handle. + +If $file contains a filename and the file can't be opened, then the +method returns an undefined value and $! tells why it failed. +Otherwise the return value is a reference to the parser object. + +If a file handle is passed as the $file argument, then the file will +normally be read until EOF, but not closed. + +If an invoked event handler aborts parsing by calling $p->eof, +then $p->parse_file() may not have read the entire file. + +On systems with multi-byte line terminators, the values passed for the +offset and length argspecs may be too low if parse_file() is called on +a file handle that is not in binary mode. + +If a filename is passed in, then parse_file() will open the file in +binary mode. + +=item $p->eof + +Signals the end of the HTML document. Calling the $p->eof method +outside a handler callback will flush any remaining buffered text +(which triggers the C<text> event if there is any remaining text). + +Calling $p->eof inside a handler will terminate parsing at that point +and cause $p->parse to return a FALSE value. This also terminates +parsing by $p->parse_file(). + +After $p->eof has been called, the parse() and parse_file() methods +can be invoked to feed new documents with the parser object. + +The return value from eof() is a reference to the parser object. + +=back + + +Most parser options are controlled by boolean attributes. +Each boolean attribute is enabled by calling the corresponding method +with a TRUE argument and disabled with a FALSE argument. The +attribute value is left unchanged if no argument is given. The return +value from each method is the old attribute value. + +Methods that can be used to get and/or set parser options are: + +=over + +=item $p->attr_encoded + +=item $p->attr_encoded( $bool ) + +By default, the C<attr> and C<@attr> argspecs will have general +entities for attribute values decoded. Enabling this attribute leaves +entities alone. + +=item $p->backquote + +=item $p->backquote( $bool ) + +By default, only ' and " are recognized as quote characters around +attribute values. MSIE also recognizes backquotes for some reason. +Enabling this attribute provides compatibility with this behaviour. + +=item $p->boolean_attribute_value( $val ) + +This method sets the value reported for boolean attributes inside HTML +start tags. By default, the name of the attribute is also used as its +value. This affects the values reported for C<tokens> and C<attr> +argspecs. + +=item $p->case_sensitive + +=item $p->case_sensitive( $bool ) + +By default, tagnames and attribute names are down-cased. Enabling this +attribute leaves them as found in the HTML source document. + +=item $p->closing_plaintext + +=item $p->closing_plaintext( $bool ) + +By default, "plaintext" element can never be closed. Everything up to +the end of the document is parsed in CDATA mode. This historical +behaviour is what at least MSIE does. Enabling this attribute makes +closing "</plaintext>" tag effective and the parsing process will resume +after seeing this tag. This emulates early gecko-based browsers. + +=item $p->empty_element_tags + +=item $p->empty_element_tags( $bool ) + +By default, empty element tags are not recognized as such and the "/" +before ">" is just treated like a normal name character (unless +C<strict_names> is enabled). Enabling this attribute make +C<HTML::Parser> recognize these tags. + +Empty element tags look like start tags, but end with the character +sequence "/>" instead of ">". When recognized by C<HTML::Parser> they +cause an artificial end event in addition to the start event. The +C<text> for the artificial end event will be empty and the C<tokenpos> +array will be undefined even though the the token array will have one +element containing the tag name. + +=item $p->marked_sections + +=item $p->marked_sections( $bool ) + +By default, section markings like <![CDATA[...]]> are treated like +ordinary text. When this attribute is enabled section markings are +honoured. + +There are currently no events associated with the marked section +markup, but the text can be returned as C<skipped_text>. + +=item $p->strict_comment + +=item $p->strict_comment( $bool ) + +By default, comments are terminated by the first occurrence of "-->". +This is the behaviour of most popular browsers (like Mozilla, Opera and +MSIE), but it is not correct according to the official HTML +standard. Officially, you need an even number of "--" tokens before +the closing ">" is recognized and there may not be anything but +whitespace between an even and an odd "--". + +The official behaviour is enabled by enabling this attribute. + +Enabling of 'strict_comment' also disables recognizing these forms as +comments: + + </ comment> + <! comment> + + +=item $p->strict_end + +=item $p->strict_end( $bool ) + +By default, attributes and other junk are allowed to be present on end tags in a +manner that emulates MSIE's behaviour. + +The official behaviour is enabled with this attribute. If enabled, +only whitespace is allowed between the tagname and the final ">". + +=item $p->strict_names + +=item $p->strict_names( $bool ) + +By default, almost anything is allowed in tag and attribute names. +This is the behaviour of most popular browsers and allows us to parse +some broken tags with invalid attribute values like: + + <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> + +By default, "LIST]" is parsed as a boolean attribute, not as +part of the ALT value as was clearly intended. This is also what +Mozilla sees. + +The official behaviour is enabled by enabling this attribute. If +enabled, it will cause the tag above to be reported as text +since "LIST]" is not a legal attribute name. + +=item $p->unbroken_text + +=item $p->unbroken_text( $bool ) + +By default, blocks of text are given to the text handler as soon as +possible (but the parser takes care always to break text at a +boundary between whitespace and non-whitespace so single words and +entities can always be decoded safely). This might create breaks that +make it hard to do transformations on the text. When this attribute is +enabled, blocks of text are always reported in one piece. This will +delay the text event until the following (non-text) event has been +recognized by the parser. + +Note that the C<offset> argspec will give you the offset of the first +segment of text and C<length> is the combined length of the segments. +Since there might be ignored tags in between, these numbers can't be +used to directly index in the original document file. + +=item $p->utf8_mode + +=item $p->utf8_mode( $bool ) + +Enable this option when parsing raw undecoded UTF-8. This tells the +parser that the entities expanded for strings reported by C<attr>, +C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end +up compatible with the surrounding text. + +If C<utf8_mode> is enabled then it is an error to pass strings +containing characters with code above 255 to the parse() method, and +the parse() method will croak if you try. + +Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 +encoded. The character can also be represented by the entity +"♥" or "♥". If we feed the parser: + + $p->parse("\xE2\x99\xA5♥"); + +then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without +C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. +The later string is what you want. + +This option is only available with perl-5.8 or better. + +=item $p->xml_mode + +=item $p->xml_mode( $bool ) + +Enabling this attribute changes the parser to allow some XML +constructs. This enables the behaviour controlled by individually by +the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and +C<xml_pic> attributes and also suppresses special treatment of +elements that are parsed as CDATA for HTML. + +=item $p->xml_pic + +=item $p->xml_pic( $bool ) + +By default, I<processing instructions> are terminated by ">". When +this attribute is enabled, processing instructions are terminated by +"?>" instead. + +=back + +As markup and text is recognized, handlers are invoked. The following +method is used to set up handlers for different events: + +=over + +=item $p->handler( event => \&subroutine, $argspec ) + +=item $p->handler( event => $method_name, $argspec ) + +=item $p->handler( event => \@accum, $argspec ) + +=item $p->handler( event => "" ); + +=item $p->handler( event => undef ); + +=item $p->handler( event ); + +This method assigns a subroutine, method, or array to handle an event. + +Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, +C<process>, C<start_document>, C<end_document> or C<default>. + +The C<\&subroutine> is a reference to a subroutine which is called to handle +the event. + +The C<$method_name> is the name of a method of $p which is called to handle +the event. + +The C<@accum> is an array that will hold the event information as +sub-arrays. + +If the second argument is "", the event is ignored. +If it is undef, the default handler is invoked for the event. + +The C<$argspec> is a string that describes the information to be reported +for the event. Any requested information that does not apply to a +specific event is passed as C<undef>. If argspec is omitted, then it +is left unchanged. + +The return value from $p->handler is the old callback routine or a +reference to the accumulator array. + +Any return values from handler callback routines/methods are always +ignored. A handler callback can request parsing to be aborted by +invoking the $p->eof method. A handler callback is not allowed to +invoke the $p->parse() or $p->parse_file() method. An exception will +be raised if it tries. + +Examples: + + $p->handler(start => "start", 'self, attr, attrseq, text' ); + +This causes the "start" method of object $p to be called for 'start' events. +The callback signature is $p->start(\%attr, \@attr_seq, $text). + + $p->handler(start => \&start, 'attr, attrseq, text' ); + +This causes subroutine start() to be called for 'start' events. +The callback signature is start(\%attr, \@attr_seq, $text). + + $p->handler(start => \@accum, '"S", attr, attrseq, text' ); + +This causes 'start' event information to be saved in @accum. +The array elements will be ['S', \%attr, \@attr_seq, $text]. + + $p->handler(start => ""); + +This causes 'start' events to be ignored. It also suppresses +invocations of any default handler for start events. It is in most +cases equivalent to $p->handler(start => sub {}), but is more +efficient. It is different from the empty-sub-handler in that +C<skipped_text> is not reset by it. + + $p->handler(start => undef); + +This causes no handler to be associated with start events. +If there is a default handler it will be invoked. + +=back + +Filters based on tags can be set up to limit the number of events +reported. The main bottleneck during parsing is often the huge number +of callbacks made from the parser. Applying filters can improve +performance significantly. + +The following methods control filters: + +=over + +=item $p->ignore_elements( @tags ) + +Both the C<start> event and the C<end> event as well as any events that +would be reported in between are suppressed. The ignored elements can +contain nested occurrences of itself. Example: + + $p->ignore_elements(qw(script style)); + +The C<script> and C<style> tags will always nest properly since their +content is parsed in CDATA mode. For most other tags +C<ignore_elements> must be used with caution since HTML is often not +I<well formed>. + +=item $p->ignore_tags( @tags ) + +Any C<start> and C<end> events involving any of the tags given are +suppressed. To reset the filter (i.e. don't suppress any C<start> and +C<end> events), call C<ignore_tags> without an argument. + +=item $p->report_tags( @tags ) + +Any C<start> and C<end> events involving any of the tags I<not> given +are suppressed. To reset the filter (i.e. report all C<start> and +C<end> events), call C<report_tags> without an argument. + +=back + +Internally, the system has two filter lists, one for C<report_tags> +and one for C<ignore_tags>, and both filters are applied. This +effectively gives C<ignore_tags> precedence over C<report_tags>. + +Examples: + + $p->ignore_tags(qw(style)); + $p->report_tags(qw(script style)); + +results in only C<script> events being reported. + +=head2 Argspec + +Argspec is a string containing a comma-separated list that describes +the information reported by the event. The following argspec +identifier names can be used: + +=over + +=item C<attr> + +Attr causes a reference to a hash of attribute name/value pairs to be +passed. + +Boolean attributes' values are either the value set by +$p->boolean_attribute_value, or the attribute name if no value has been +set by $p->boolean_attribute_value. + +This passes undef except for C<start> events. + +Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute +names are forced to lower case. + +General entities are decoded in the attribute values and +one layer of matching quotes enclosing the attribute values is removed. + +The Unicode character set is assumed for entity decoding. + +=item C<@attr> + +Basically the same as C<attr>, but keys and values are passed as +individual arguments and the original sequence of the attributes is +kept. The parameters passed will be the same as the @attr calculated +here: + + @attr = map { $_ => $attr->{$_} } @$attrseq; + +assuming $attr and $attrseq here are the hash and array passed as the +result of C<attr> and C<attrseq> argspecs. + +This passes no values for events besides C<start>. + +=item C<attrseq> + +Attrseq causes a reference to an array of attribute names to be +passed. This can be useful if you want to walk the C<attr> hash in +the original sequence. + +This passes undef except for C<start> events. + +Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute +names are forced to lower case. + +=item C<column> + +Column causes the column number of the start of the event to be passed. +The first column on a line is 0. + +=item C<dtext> + +Dtext causes the decoded text to be passed. General entities are +automatically decoded unless the event was inside a CDATA section or +was between literal start and end tags (C<script>, C<style>, +C<xmp>, C<iframe>, C<title>, C<textarea> and C<plaintext>). + +The Unicode character set is assumed for entity decoding. With Perl +version 5.6 or earlier only the Latin-1 range is supported, and +entities for characters outside the range 0..255 are left unchanged. + +This passes undef except for C<text> events. + +=item C<event> + +Event causes the event name to be passed. + +The event name is one of C<text>, C<start>, C<end>, C<declaration>, +C<comment>, C<process>, C<start_document> or C<end_document>. + +=item C<is_cdata> + +Is_cdata causes a TRUE value to be passed if the event is inside a CDATA +section or between literal start and end tags (C<script>, +C<style>, C<xmp>, C<iframe>, C<title>, C<textarea> and C<plaintext>). + +if the flag is FALSE for a text event, then you should normally +either use C<dtext> or decode the entities yourself before the text is +processed further. + +=item C<length> + +Length causes the number of bytes of the source text of the event to +be passed. + +=item C<line> + +Line causes the line number of the start of the event to be passed. +The first line in the document is 1. Line counting doesn't start +until at least one handler requests this value to be reported. + +=item C<offset> + +Offset causes the byte position in the HTML document of the start of +the event to be passed. The first byte in the document has offset 0. + +=item C<offset_end> + +Offset_end causes the byte position in the HTML document of the end of +the event to be passed. This is the same as C<offset> + C<length>. + +=item C<self> + +Self causes the current object to be passed to the handler. If the +handler is a method, this must be the first element in the argspec. + +An alternative to passing self as an argspec is to register closures +that capture $self by themselves as handlers. Unfortunately this +creates circular references which prevent the HTML::Parser object +from being garbage collected. Using the C<self> argspec avoids this +problem. + +=item C<skipped_text> + +Skipped_text returns the concatenated text of all the events that have +been skipped since the last time an event was reported. Events might +be skipped because no handler is registered for them or because some +filter applies. Skipped text also includes marked section markup, +since there are no events that can catch it. + +If an C<"">-handler is registered for an event, then the text for this +event is not included in C<skipped_text>. Skipped text both before +and after the C<"">-event is included in the next reported +C<skipped_text>. + +=item C<tag> + +Same as C<tagname>, but prefixed with "/" if it belongs to an C<end> +event and "!" for a declaration. The C<tag> does not have any prefix +for C<start> events, and is in this case identical to C<tagname>. + +=item C<tagname> + +This is the element name (or I<generic identifier> in SGML jargon) for +start and end tags. Since HTML is case insensitive, this name is +forced to lower case to ease string matching. + +Since XML is case sensitive, the tagname case is not changed when +C<xml_mode> is enabled. The same happens if the C<case_sensitive> attribute +is set. + +The declaration type of declaration elements is also passed as a tagname, +even if that is a bit strange. +In fact, in the current implementation tagname is +identical to C<token0> except that the name may be forced to lower case. + +=item C<token0> + +Token0 causes the original text of the first token string to be +passed. This should always be the same as $tokens->[0]. + +For C<declaration> events, this is the declaration type. + +For C<start> and C<end> events, this is the tag name. + +For C<process> and non-strict C<comment> events, this is everything +inside the tag. + +This passes undef if there are no tokens in the event. + +=item C<tokenpos> + +Tokenpos causes a reference to an array of token positions to be +passed. For each string that appears in C<tokens>, this array +contains two numbers. The first number is the offset of the start of +the token in the original C<text> and the second number is the length +of the token. + +Boolean attributes in a C<start> event will have (0,0) for the +attribute value offset and length. + +This passes undef if there are no tokens in the event (e.g., C<text>) +and for artificial C<end> events triggered by empty element tags. + +If you are using these offsets and lengths to modify C<text>, you +should either work from right to left, or be very careful to calculate +the changes to the offsets. + +=item C<tokens> + +Tokens causes a reference to an array of token strings to be passed. +The strings are exactly as they were found in the original text, +no decoding or case changes are applied. + +For C<declaration> events, the array contains each word, comment, and +delimited string starting with the declaration type. + +For C<comment> events, this contains each sub-comment. If +$p->strict_comments is disabled, there will be only one sub-comment. + +For C<start> events, this contains the original tag name followed by +the attribute name/value pairs. The values of boolean attributes will +be either the value set by $p->boolean_attribute_value, or the +attribute name if no value has been set by +$p->boolean_attribute_value. + +For C<end> events, this contains the original tag name (always one token). + +For C<process> events, this contains the process instructions (always one +token). + +This passes C<undef> for C<text> events. + +=item C<text> + +Text causes the source text (including markup element delimiters) to be +passed. + +=item C<undef> + +Pass an undefined value. Useful as padding where the same handler +routine is registered for multiple events. + +=item C<'...'> + +A literal string of 0 to 255 characters enclosed +in single (') or double (") quotes is passed as entered. + +=back + +The whole argspec string can be wrapped up in C<'@{...}'> to signal +that the resulting event array should be flattened. This only makes a +difference if an array reference is used as the handler target. +Consider this example: + + $p->handler(text => [], 'text'); + $p->handler(text => [], '@{text}']); + +With two text events; C<"foo">, C<"bar">; then the first example will end +up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in +the handler target array. + + +=head2 Events + +Handlers for the following events can be registered: + +=over + +=item C<comment> + +This event is triggered when a markup comment is recognized. + +Example: + + <!-- This is a comment -- -- So is this --> + +=item C<declaration> + +This event is triggered when a I<markup declaration> is recognized. + +For typical HTML documents, the only declaration you are +likely to find is <!DOCTYPE ...>. + +Example: + + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> + +DTDs inside <!DOCTYPE ...> will confuse HTML::Parser. + +=item C<default> + +This event is triggered for events that do not have a specific +handler. You can set up a handler for this event to catch stuff you +did not want to catch explicitly. + +=item C<end> + +This event is triggered when an end tag is recognized. + +Example: + + </A> + +=item C<end_document> + +This event is triggered when $p->eof is called and after any remaining +text is flushed. There is no document text associated with this event. + +=item C<process> + +This event is triggered when a processing instructions markup is +recognized. + +The format and content of processing instructions are system and +application dependent. + +Examples: + + <? HTML processing instructions > + <? XML processing instructions ?> + +=item C<start> + +This event is triggered when a start tag is recognized. + +Example: + + <A HREF="http://www.perl.com/"> + +=item C<start_document> + +This event is triggered before any other events for a new document. A +handler for it can be used to initialize stuff. There is no document +text associated with this event. + +=item C<text> + +This event is triggered when plain text (characters) is recognized. +The text may contain multiple lines. A sequence of text may be broken +between several text events unless $p->unbroken_text is enabled. + +The parser will make sure that it does not break a word or a sequence +of whitespace between two text events. + +=back + +=head2 Unicode + +C<HTML::Parser> can parse Unicode strings when running under +perl-5.8 or better. If Unicode is passed to $p->parse() then chunks +of Unicode will be reported to the handlers. The offset and length +argspecs will also report their position in terms of characters. + +It is safe to parse raw undecoded UTF-8 if you either avoid decoding +entities and make sure to not use I<argspecs> that do, or enable the +C<utf8_mode> for the parser. Parsing of undecoded UTF-8 might be +useful when parsing from a file where you need the reported offsets +and lengths to match the byte offsets in the file. + +If a filename is passed to $p->parse_file() then the file will be read +in binary mode. This will be fine if the file contains only ASCII or +Latin-1 characters. If the file contains UTF-8 encoded text then care +must be taken when decoding entities as described in the previous +paragraph, but better is to open the file with the UTF-8 layer so that +it is decoded properly: + + open(my $fh, "<:utf8", "index.html") || die "...: $!"; + $p->parse_file($fh); + +If the file contains text encoded in a charset besides ASCII, Latin-1 +or UTF-8 then decoding will always be needed. + +=head1 VERSION 2 COMPATIBILITY + +When an C<HTML::Parser> object is constructed with no arguments, a set +of handlers is automatically provided that is compatible with the old +HTML::Parser version 2 callback methods. + +This is equivalent to the following method calls: + + $p->handler(start => "start", "self, tagname, attr, attrseq, text"); + $p->handler(end => "end", "self, tagname, text"); + $p->handler(text => "text", "self, text, is_cdata"); + $p->handler(process => "process", "self, token0, text"); + $p->handler(comment => + sub { + my($self, $tokens) = @_; + for (@$tokens) {$self->comment($_);}}, + "self, tokens"); + $p->handler(declaration => + sub { + my $self = shift; + $self->declaration(substr($_[0], 2, -1));}, + "self, text"); + +Setting up these handlers can also be requested with the "api_version => +2" constructor option. + +=head1 SUBCLASSING + +The C<HTML::Parser> class is subclassable. Parser objects are plain +hashes and C<HTML::Parser> reserves only hash keys that start with +"_hparser". The parser state can be set up by invoking the init() +method, which takes the same arguments as new(). + +=head1 EXAMPLES + +The first simple example shows how you might strip out comments from +an HTML document. We achieve this by setting up a comment handler that +does nothing and a default handler that will print out anything else: + + use HTML::Parser; + HTML::Parser->new(default_h => [sub { print shift }, 'text'], + comment_h => [""], + )->parse_file(shift || die) || die $!; + +An alternative implementation is: + + use HTML::Parser; + HTML::Parser->new(end_document_h => [sub { print shift }, + 'skipped_text'], + comment_h => [""], + )->parse_file(shift || die) || die $!; + +This will in most cases be much more efficient since only a single +callback will be made. + +The next example prints out the text that is inside the <title> +element of an HTML document. Here we start by setting up a start +handler. When it sees the title start tag it enables a text handler +that prints any text found and an end handler that will terminate +parsing as soon as the title end tag is seen: + + use HTML::Parser (); + + sub start_handler + { + return if shift ne "title"; + my $self = shift; + $self->handler(text => sub { print shift }, "dtext"); + $self->handler(end => sub { shift->eof if shift eq "title"; }, + "tagname,self"); + } + + my $p = HTML::Parser->new(api_version => 3); + $p->handler( start => \&start_handler, "tagname,self"); + $p->parse_file(shift || die) || die $!; + print "\n"; + +More examples are found in the F<eg/> directory of the C<HTML-Parser> +distribution: the program C<hrefsub> shows how you can edit all links +found in a document; the program C<htextsub> shows how to edit the text only; the +program C<hstrip> shows how you can strip out certain tags/elements +and/or attributes; and the program C<htext> show how to obtain the +plain text, but not any script/style content. + +You can browse the F<eg/> directory online from the I<[Browse]> link on +the http://search.cpan.org/~gaas/HTML-Parser/ page. + +=head1 BUGS + +The <style> and <script> sections do not end with the first "</", but +need the complete corresponding end tag. The standard behaviour is +not really practical. + +When the I<strict_comment> option is enabled, we still recognize +comments where there is something other than whitespace between even +and odd "--" markers. + +Once $p->boolean_attribute_value has been set, there is no way to +restore the default behaviour. + +There is currently no way to get both quote characters +into the same literal argspec. + +Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them +to repeat the previous start tag or close the previous start tag +respectively. + +NET tags, e.g. "code/.../" are not recognized. This is SGML +shorthand for "<code>...</code>". + +Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not +recognized. + +=head1 DIAGNOSTICS + +The following messages may be produced by HTML::Parser. The notation +in this listing is the same as used in L<perldiag>: + +=over + +=item Not a reference to a hash + +(F) The object blessed into or subclassed from HTML::Parser is not a +hash as required by the HTML::Parser methods. + +=item Bad signature in parser state object at %p + +(F) The _hparser_xs_state element does not refer to a valid state structure. +Something must have changed the internal value +stored in this hash element, or the memory has been overwritten. + +=item _hparser_xs_state element is not a reference + +(F) The _hparser_xs_state element has been destroyed. + +=item Can't find '_hparser_xs_state' element in HTML::Parser hash + +(F) The _hparser_xs_state element is missing from the parser hash. +It was either deleted, or not created when the object was created. + +=item API version %s not supported by HTML::Parser %s + +(F) The constructor option 'api_version' with an argument greater than +or equal to 4 is reserved for future extensions. + +=item Bad constructor option '%s' + +(F) An unknown constructor option key was passed to the new() or +init() methods. + +=item Parse loop not allowed + +(F) A handler invoked the parse() or parse_file() method. +This is not permitted. + +=item marked sections not supported + +(F) The $p->marked_sections() method was invoked in a HTML::Parser +module that was compiled without support for marked sections. + +=item Unknown boolean attribute (%d) + +(F) Something is wrong with the internal logic that set up aliases for +boolean attributes. + +=item Only code or array references allowed as handler + +(F) The second argument for $p->handler must be either a subroutine +reference, then name of a subroutine or method, or a reference to an +array. + +=item No handler for %s events + +(F) The first argument to $p->handler must be a valid event name; i.e. one +of "start", "end", "text", "process", "declaration" or "comment". + +=item Unrecognized identifier %s in argspec + +(F) The identifier is not a known argspec name. +Use one of the names mentioned in the argspec section above. + +=item Literal string is longer than 255 chars in argspec + +(F) The current implementation limits the length of literals in +an argspec to 255 characters. Make the literal shorter. + +=item Backslash reserved for literal string in argspec + +(F) The backslash character "\" is not allowed in argspec literals. +It is reserved to permit quoting inside a literal in a later version. + +=item Unterminated literal string in argspec + +(F) The terminating quote character for a literal was not found. + +=item Bad argspec (%s) + +(F) Only identifier names, literals, spaces and commas +are allowed in argspecs. + +=item Missing comma separator in argspec + +(F) Identifiers in an argspec must be separated with ",". + +=item Parsing of undecoded UTF-8 will give garbage when decoding entities + +(W) The first chunk parsed appears to contain undecoded UTF-8 and one +or more argspecs that decode entities are used for the callback +handlers. + +The result of decoding will be a mix of encoded and decoded characters +for any entities that expand to characters with code above 127. This +is not a good thing. + +The recommened solution is to apply Encode::decode_utf8() on the data before +feeding it to the $p->parse(). For $p->parse_file() pass a file that has been +opened in ":utf8" mode. + +The alternative solution is to enable the C<utf8_mode> and not decode before +passing strings to $p->parse(). The parser can process raw undecoded UTF-8 +sanely if the C<utf8_mode> is enabled, or if the "attr", "@attr" or "dtext" +argspecs are avoided. + +=item Parsing string decoded with wrong endianness + +(W) The first character in the document is U+FFFE. This is not a +legal Unicode character but a byte swapped BOM. The result of parsing +will likely be garbage. + +=item Parsing of undecoded UTF-32 + +(W) The parser found the Unicode UTF-32 BOM signature at the start +of the document. The result of parsing will likely be garbage. + +=item Parsing of undecoded UTF-16 + +(W) The parser found the Unicode UTF-16 BOM signature at the start of +the document. The result of parsing will likely be garbage. + +=back + +=head1 SEE ALSO + +L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>, +L<HTML::LinkExtor>, L<HTML::Form> + +L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution) + +L<http://www.w3.org/TR/html4/> + +More information about marked sections and processing instructions may +be found at L<http://www.is-thought.co.uk/book/sgml-8.htm>. + +=head1 COPYRIGHT + + Copyright 1996-2008 Gisle Aas. All rights reserved. + Copyright 1999-2000 Michael A. Chase. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut diff --git a/Parser.xs b/Parser.xs new file mode 100644 index 0000000..02d2a5a --- /dev/null +++ b/Parser.xs @@ -0,0 +1,687 @@ +/* + * Copyright 1999-2009, Gisle Aas. + * Copyright 1999-2000, Michael A. Chase. + * + * This library is free software; you can redistribute it and/or + * modify it under the same terms as Perl itself. + */ + + +/* + * Standard XS greeting. + */ +#ifdef __cplusplus +extern "C" { +#endif +#define PERL_NO_GET_CONTEXT /* we want efficiency */ +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" +#ifdef __cplusplus +} +#endif + + + +/* + * Some perl version compatibility gruff. + */ +#include "patchlevel.h" +#if PATCHLEVEL <= 4 /* perl5.004_XX */ + +#ifndef PL_sv_undef + #define PL_sv_undef sv_undef + #define PL_sv_yes sv_yes +#endif + +#ifndef PL_hexdigit + #define PL_hexdigit hexdigit +#endif + +#ifndef ERRSV + #define ERRSV GvSV(errgv) +#endif + +#if (PATCHLEVEL == 4 && SUBVERSION <= 4) +/* The newSVpvn function was introduced in perl5.004_05 */ +static SV * +newSVpvn(char *s, STRLEN len) +{ + register SV *sv = newSV(0); + sv_setpvn(sv,s,len); + return sv; +} +#endif /* not perl5.004_05 */ +#endif /* perl5.004_XX */ + +#ifndef dNOOP + #define dNOOP extern int errno +#endif +#ifndef dTHX + #define dTHX dNOOP + #define pTHX_ + #define aTHX_ +#endif + +#ifndef MEMBER_TO_FPTR + #define MEMBER_TO_FPTR(x) (x) +#endif + +#ifndef INT2PTR + #define INT2PTR(any,d) (any)(d) + #define PTR2IV(p) (IV)(p) +#endif + + +#if PATCHLEVEL > 6 || (PATCHLEVEL == 6 && SUBVERSION > 0) + #define RETHROW croak(Nullch) +#else + #define RETHROW { STRLEN my_na; croak("%s", SvPV(ERRSV, my_na)); } +#endif + +#if PATCHLEVEL < 8 + /* No useable Unicode support */ + /* Make these harmless if present */ + #undef SvUTF8 + #undef SvUTF8_on + #undef SvUTF8_off + #define SvUTF8(sv) 0 + #define SvUTF8_on(sv) 0 + #define SvUTF8_off(sv) 0 +#else + #define UNICODE_HTML_PARSER +#endif + +#ifdef G_WARN_ON + #define DOWARN (PL_dowarn & G_WARN_ON) +#else + #define DOWARN PL_dowarn +#endif + +#ifndef CLONEf_JOIN_IN + #define CLONEf_JOIN_IN 0 +#endif + +/* + * Include stuff. We include .c files instead of linking them, + * so that they don't have to pollute the external dll name space. + */ + +#ifdef EXTERN + #undef EXTERN +#endif + +#define EXTERN static /* Don't pollute */ + +#include "hparser.h" +#include "util.c" +#include "hparser.c" + + +/* + * Support functions for the XS glue + */ + +static SV* +check_handler(pTHX_ SV* h) +{ + SvGETMAGIC(h); + if (SvROK(h)) { + SV* myref = SvRV(h); + if (SvTYPE(myref) == SVt_PVCV) + return newSVsv(h); + if (SvTYPE(myref) == SVt_PVAV) + return SvREFCNT_inc(myref); + croak("Only code or array references allowed as handler"); + } + return SvOK(h) ? newSVsv(h) : 0; +} + + +static PSTATE* +get_pstate_iv(pTHX_ SV* sv) +{ + PSTATE *p; +#if PATCHLEVEL < 8 + p = INT2PTR(PSTATE*, SvIV(sv)); +#else + MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, '~') : NULL; + + if (!mg) + croak("Lost parser state magic"); + p = (PSTATE *)mg->mg_ptr; + if (!p) + croak("Lost parser state magic"); +#endif + if (p->signature != P_SIGNATURE) + croak("Bad signature in parser state object at %p", p); + return p; +} + + +static PSTATE* +get_pstate_hv(pTHX_ SV* sv) /* used by XS typemap */ +{ + HV* hv; + SV** svp; + + sv = SvRV(sv); + if (!sv || SvTYPE(sv) != SVt_PVHV) + croak("Not a reference to a hash"); + hv = (HV*)sv; + svp = hv_fetch(hv, "_hparser_xs_state", 17, 0); + if (svp) { + if (SvROK(*svp)) + return get_pstate_iv(aTHX_ SvRV(*svp)); + else + croak("_hparser_xs_state element is not a reference"); + } + croak("Can't find '_hparser_xs_state' element in HTML::Parser hash"); + return 0; +} + + +static void +free_pstate(pTHX_ PSTATE* pstate) +{ + int i; + SvREFCNT_dec(pstate->buf); + SvREFCNT_dec(pstate->pend_text); + SvREFCNT_dec(pstate->skipped_text); +#ifdef MARKED_SECTION + SvREFCNT_dec(pstate->ms_stack); +#endif + SvREFCNT_dec(pstate->bool_attr_val); + for (i = 0; i < EVENT_COUNT; i++) { + SvREFCNT_dec(pstate->handlers[i].cb); + SvREFCNT_dec(pstate->handlers[i].argspec); + } + + SvREFCNT_dec(pstate->report_tags); + SvREFCNT_dec(pstate->ignore_tags); + SvREFCNT_dec(pstate->ignore_elements); + SvREFCNT_dec(pstate->ignoring_element); + + SvREFCNT_dec(pstate->tmp); + + pstate->signature = 0; + Safefree(pstate); +} + +static int +magic_free_pstate(pTHX_ SV *sv, MAGIC *mg) +{ +#if PATCHLEVEL < 8 + free_pstate(aTHX_ get_pstate_iv(aTHX_ sv)); +#else + free_pstate(aTHX_ (PSTATE *)mg->mg_ptr); +#endif + return 0; +} + +#if defined(USE_ITHREADS) && PATCHLEVEL >= 8 + +static PSTATE * +dup_pstate(pTHX_ PSTATE *pstate, CLONE_PARAMS *params) +{ + PSTATE *pstate2; + int i; + + Newz(56, pstate2, 1, PSTATE); + pstate2->signature = pstate->signature; + + pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params)); + pstate2->offset = pstate->offset; + pstate2->line = pstate->line; + pstate2->column = pstate->column; + pstate2->start_document = pstate->start_document; + pstate2->parsing = pstate->parsing; + pstate2->eof = pstate->eof; + + pstate2->literal_mode = pstate->literal_mode; + pstate2->is_cdata = pstate->is_cdata; + pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end; + pstate2->pending_end_tag = pstate->pending_end_tag; + + pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params)); + pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata; + pstate2->pend_text_offset = pstate->pend_text_offset; + pstate2->pend_text_line = pstate->pend_text_offset; + pstate2->pend_text_column = pstate->pend_text_column; + + pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params)); + +#ifdef MARKED_SECTION + pstate2->ms = pstate->ms; + pstate2->ms_stack = + (AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params)); + pstate2->marked_sections = pstate->marked_sections; +#endif + + pstate2->strict_comment = pstate->strict_comment; + pstate2->strict_names = pstate->strict_names; + pstate2->strict_end = pstate->strict_end; + pstate2->xml_mode = pstate->xml_mode; + pstate2->unbroken_text = pstate->unbroken_text; + pstate2->attr_encoded = pstate->attr_encoded; + pstate2->case_sensitive = pstate->case_sensitive; + pstate2->closing_plaintext = pstate->closing_plaintext; + pstate2->utf8_mode = pstate->utf8_mode; + pstate2->empty_element_tags = pstate->empty_element_tags; + pstate2->xml_pic = pstate->xml_pic; + pstate2->backquote = pstate->backquote; + + pstate2->bool_attr_val = + SvREFCNT_inc(sv_dup(pstate->bool_attr_val, params)); + for (i = 0; i < EVENT_COUNT; i++) { + pstate2->handlers[i].cb = + SvREFCNT_inc(sv_dup(pstate->handlers[i].cb, params)); + pstate2->handlers[i].argspec = + SvREFCNT_inc(sv_dup(pstate->handlers[i].argspec, params)); + } + pstate2->argspec_entity_decode = pstate->argspec_entity_decode; + + pstate2->report_tags = + (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->report_tags, params)); + pstate2->ignore_tags = + (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_tags, params)); + pstate2->ignore_elements = + (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_elements, params)); + + pstate2->ignoring_element = + SvREFCNT_inc(sv_dup(pstate->ignoring_element, params)); + pstate2->ignore_depth = pstate->ignore_depth; + + if (params->flags & CLONEf_JOIN_IN) { + pstate2->entity2char = + perl_get_hv("HTML::Entities::entity2char", TRUE); + } else { + pstate2->entity2char = (HV *)sv_dup((SV *)pstate->entity2char, params); + } + pstate2->tmp = SvREFCNT_inc(sv_dup(pstate->tmp, params)); + + return pstate2; +} + +static int +magic_dup_pstate(pTHX_ MAGIC *mg, CLONE_PARAMS *params) +{ + mg->mg_ptr = (char *)dup_pstate(aTHX_ (PSTATE *)mg->mg_ptr, params); + return 0; +} + +#endif + +MGVTBL vtbl_pstate = +{ + 0, + 0, + 0, + 0, + MEMBER_TO_FPTR(magic_free_pstate), +#if defined(USE_ITHREADS) && PATCHLEVEL >= 8 + 0, + MEMBER_TO_FPTR(magic_dup_pstate), +#endif +}; + + +/* + * XS interface definition. + */ + +MODULE = HTML::Parser PACKAGE = HTML::Parser + +PROTOTYPES: DISABLE + +void +_alloc_pstate(self) + SV* self; + PREINIT: + PSTATE* pstate; + SV* sv; + HV* hv; + MAGIC* mg; + + CODE: + sv = SvRV(self); + if (!sv || SvTYPE(sv) != SVt_PVHV) + croak("Not a reference to a hash"); + hv = (HV*)sv; + + Newz(56, pstate, 1, PSTATE); + pstate->signature = P_SIGNATURE; + pstate->entity2char = perl_get_hv("HTML::Entities::entity2char", TRUE); + pstate->tmp = NEWSV(0, 20); + + sv = newSViv(PTR2IV(pstate)); +#if PATCHLEVEL < 8 + sv_magic(sv, 0, '~', 0, 0); +#else + sv_magic(sv, 0, '~', (char *)pstate, 0); +#endif + mg = mg_find(sv, '~'); + assert(mg); + mg->mg_virtual = &vtbl_pstate; +#if defined(USE_ITHREADS) && PATCHLEVEL >= 8 + mg->mg_flags |= MGf_DUP; +#endif + SvREADONLY_on(sv); + + hv_store(hv, "_hparser_xs_state", 17, newRV_noinc(sv), 0); + +void +parse(self, chunk) + SV* self; + SV* chunk + PREINIT: + PSTATE* p_state = get_pstate_hv(aTHX_ self); + PPCODE: + if (p_state->parsing) + croak("Parse loop not allowed"); + p_state->parsing = 1; + if (SvROK(chunk) && SvTYPE(SvRV(chunk)) == SVt_PVCV) { + SV* generator = chunk; + STRLEN len; + do { + int count; + PUSHMARK(SP); + count = perl_call_sv(generator, G_SCALAR|G_EVAL); + SPAGAIN; + chunk = count ? POPs : 0; + PUTBACK; + + if (SvTRUE(ERRSV)) { + p_state->parsing = 0; + p_state->eof = 0; + RETHROW; + } + + if (chunk && SvOK(chunk)) { + (void)SvPV(chunk, len); /* get length */ + } + else { + len = 0; + } + parse(aTHX_ p_state, len ? chunk : 0, self); + SPAGAIN; + + } while (len && !p_state->eof); + } + else { + parse(aTHX_ p_state, chunk, self); + SPAGAIN; + } + p_state->parsing = 0; + if (p_state->eof) { + p_state->eof = 0; + PUSHs(sv_newmortal()); + } + else { + PUSHs(self); + } + +void +eof(self) + SV* self; + PREINIT: + PSTATE* p_state = get_pstate_hv(aTHX_ self); + PPCODE: + if (p_state->parsing) + p_state->eof = 1; + else { + p_state->parsing = 1; + parse(aTHX_ p_state, 0, self); /* flush */ + p_state->parsing = 0; + } + PUSHs(self); + +SV* +strict_comment(pstate,...) + PSTATE* pstate + ALIAS: + HTML::Parser::strict_comment = 1 + HTML::Parser::strict_names = 2 + HTML::Parser::xml_mode = 3 + HTML::Parser::unbroken_text = 4 + HTML::Parser::marked_sections = 5 + HTML::Parser::attr_encoded = 6 + HTML::Parser::case_sensitive = 7 + HTML::Parser::strict_end = 8 + HTML::Parser::closing_plaintext = 9 + HTML::Parser::utf8_mode = 10 + HTML::Parser::empty_element_tags = 11 + HTML::Parser::xml_pic = 12 + HTML::Parser::backquote = 13 + PREINIT: + bool *attr; + CODE: + switch (ix) { + case 1: attr = &pstate->strict_comment; break; + case 2: attr = &pstate->strict_names; break; + case 3: attr = &pstate->xml_mode; break; + case 4: attr = &pstate->unbroken_text; break; + case 5: +#ifdef MARKED_SECTION + attr = &pstate->marked_sections; break; +#else + croak("marked sections not supported"); break; +#endif + case 6: attr = &pstate->attr_encoded; break; + case 7: attr = &pstate->case_sensitive; break; + case 8: attr = &pstate->strict_end; break; + case 9: attr = &pstate->closing_plaintext; break; +#ifdef UNICODE_HTML_PARSER + case 10: attr = &pstate->utf8_mode; break; +#else + case 10: croak("The utf8_mode does not work with this perl; perl-5.8 or better required"); +#endif + case 11: attr = &pstate->empty_element_tags; break; + case 12: attr = &pstate->xml_pic; break; + case 13: attr = &pstate->backquote; break; + default: + croak("Unknown boolean attribute (%d)", ix); + } + RETVAL = boolSV(*attr); + if (items > 1) + *attr = SvTRUE(ST(1)); + OUTPUT: + RETVAL + +SV* +boolean_attribute_value(pstate,...) + PSTATE* pstate + CODE: + RETVAL = pstate->bool_attr_val ? newSVsv(pstate->bool_attr_val) + : &PL_sv_undef; + if (items > 1) { + SvREFCNT_dec(pstate->bool_attr_val); + pstate->bool_attr_val = newSVsv(ST(1)); + } + OUTPUT: + RETVAL + +void +ignore_tags(pstate,...) + PSTATE* pstate + ALIAS: + HTML::Parser::report_tags = 1 + HTML::Parser::ignore_tags = 2 + HTML::Parser::ignore_elements = 3 + PREINIT: + HV** attr; + int i; + CODE: + switch (ix) { + case 1: attr = &pstate->report_tags; break; + case 2: attr = &pstate->ignore_tags; break; + case 3: attr = &pstate->ignore_elements; break; + default: + croak("Unknown tag-list attribute (%d)", ix); + } + if (GIMME_V != G_VOID) + croak("Can't report tag lists yet"); + + items--; /* pstate */ + if (items) { + if (*attr) + hv_clear(*attr); + else + *attr = newHV(); + + for (i = 0; i < items; i++) { + SV* sv = ST(i+1); + if (SvROK(sv)) { + sv = SvRV(sv); + if (SvTYPE(sv) == SVt_PVAV) { + AV* av = (AV*)sv; + STRLEN j; + STRLEN len = av_len(av) + 1; + for (j = 0; j < len; j++) { + SV**svp = av_fetch(av, j, 0); + if (svp) { + hv_store_ent(*attr, *svp, newSViv(0), 0); + } + } + } + else + croak("Tag list must be plain scalars and arrays"); + } + else { + hv_store_ent(*attr, sv, newSViv(0), 0); + } + } + } + else if (*attr) { + SvREFCNT_dec(*attr); + *attr = 0; + } + +void +handler(pstate, eventname,...) + PSTATE* pstate + SV* eventname + PREINIT: + STRLEN name_len; + char *name = SvPV(eventname, name_len); + int event = -1; + int i; + struct p_handler *h; + PPCODE: + /* map event name string to event_id */ + for (i = 0; i < EVENT_COUNT; i++) { + if (strEQ(name, event_id_str[i])) { + event = i; + break; + } + } + if (event < 0) + croak("No handler for %s events", name); + + h = &pstate->handlers[event]; + + /* set up return value */ + if (h->cb) { + PUSHs((SvTYPE(h->cb) == SVt_PVAV) + ? sv_2mortal(newRV_inc(h->cb)) + : sv_2mortal(newSVsv(h->cb))); + } + else { + PUSHs(&PL_sv_undef); + } + + /* update */ + if (items > 3) { + SvREFCNT_dec(h->argspec); + h->argspec = 0; + h->argspec = argspec_compile(ST(3), pstate); + } + if (items > 2) { + SvREFCNT_dec(h->cb); + h->cb = 0; + h->cb = check_handler(aTHX_ ST(2)); + } + + +MODULE = HTML::Parser PACKAGE = HTML::Entities + +void +decode_entities(...) + PREINIT: + int i; + HV *entity2char = perl_get_hv("HTML::Entities::entity2char", FALSE); + PPCODE: + if (GIMME_V == G_SCALAR && items > 1) + items = 1; + for (i = 0; i < items; i++) { + if (GIMME_V != G_VOID) + ST(i) = sv_2mortal(newSVsv(ST(i))); + else { +#ifdef SV_CHECK_THINKFIRST + SV_CHECK_THINKFIRST(ST(i)); +#endif + if (SvREADONLY(ST(i))) + croak("Can't inline decode readonly string in decode_entities()"); + } + decode_entities(aTHX_ ST(i), entity2char, 0); + } + SP += items; + +void +_decode_entities(string, entities, ...) + SV* string + SV* entities + PREINIT: + HV* entities_hv; + bool expand_prefix = (items > 2) ? SvTRUE(ST(2)) : 0; + CODE: + if (SvOK(entities)) { + if (SvROK(entities) && SvTYPE(SvRV(entities)) == SVt_PVHV) { + entities_hv = (HV*)SvRV(entities); + } + else { + croak("2nd argument must be hash reference"); + } + } + else { + entities_hv = 0; + } +#ifdef SV_CHECK_THINKFIRST + SV_CHECK_THINKFIRST(string); +#endif + if (SvREADONLY(string)) + croak("Can't inline decode readonly string in _decode_entities()"); + decode_entities(aTHX_ string, entities_hv, expand_prefix); + +bool +_probably_utf8_chunk(string) + SV* string + PREINIT: + STRLEN len; + char *s; + CODE: +#ifdef UNICODE_HTML_PARSER + sv_utf8_downgrade(string, 0); + s = SvPV(string, len); + RETVAL = probably_utf8_chunk(aTHX_ s, len); +#else + RETVAL = 0; /* avoid never initialized complains from compiler */ + croak("_probably_utf8_chunk() only works for Unicode enabled perls"); +#endif + OUTPUT: + RETVAL + +int +UNICODE_SUPPORT() + PROTOTYPE: + CODE: +#ifdef UNICODE_HTML_PARSER + RETVAL = 1; +#else + RETVAL = 0; +#endif + OUTPUT: + RETVAL + + +MODULE = HTML::Parser PACKAGE = HTML::Parser @@ -0,0 +1,65 @@ +OVERVIEW + +The HTML-Parser distribution is is a collection of modules that parse +and extract information from HTML documents. The modules present in +this collection are: + + HTML::Parser - The parser base class. It receives arbitrary sized + chunks of the HTML text, recognizes markup elements, and + separates them from the plain text. As different kinds of markup + and text are recognized, the corresponding event handlers are + invoked. + + HTML::Entities - Provides functions to encode and decode text with + embedded HTML <entities>. + + HTML::HeadParser - A lightweight HTML::Parser subclass that extracts + information from the <HEAD> section of an HTML document. + + HTML::LinkExtor - An HTML::Parser subclass that extracts links from + an HTML document. + + HTML::PullParser - An alternative interface to the basic parser + that does not require event driven programming. + + HTML::TokeParser - An HTML::PullParser subclass with fixed + token setup and methods for extracting text. Many simple + parsing needs are probably best attacked with this module. + +In addition take a look at the HTML-Tree package that build on +HTML::Parser to create and extract information from HTML syntax trees +(similar to HTML DOM). + + +PREREQUISITES + +In order to install and use this package you will need Perl version +5.8 or better. The HTML::Tagset module should be installed. + +If you intend to use the HTML::HeadParser you probably want to install +libwww-perl too. + + +INSTALLATION + +Just follow the usual procedure: + + perl Makefile.PL + make + make test + make install + + +REPORTING BUGS + +Bug reports and issues for discussion about these modules can be sent +to the <libwww@perl.org> mailing list. + + +COPYRIGHT + + © 1995-2009 Gisle Aas. All rights reserved. + © 1999-2000 Michael A. Chase. All rights reserved. + +This library is free software; you can redistribute it and/or modify +it under the same terms as Perl itself. @@ -0,0 +1,28 @@ +TODO + - Check how we compare to the HTML5 parsing rules + - limit the length of markup elements that never end. Perhaps by + configurable limits on the length that markup can have and still + be recognized. Report stuff as 'text' when this happens? + - remove 255 char limit on literal argspec strings + - implement backslash escapes in literal argspec string + - <![%app1;[...]]> (parameter entities) + - make literal tags configurable. The current list is hardcoded + to be "script", "style", "title", "iframe", "textarea", "xmp", + and "plaintext". + + +SGML FEATURES WE WILL PROBABLY IGNORE FOREVER + - Empty tags: <> </> (repeat previous start tag) + - <foo<bar> (same as <foo><bar>) + - NET tags <name/.../ + + +MINOR "BUGS" (alias FEATURES) + - no way to clear "boolean_attribute_value". + - <style> and <script> do not end with the first "</". + + +MSIE bug compatibility + - recognize server side includes as comments; <% ... %> + if no matching %> found tread "<% ..." as text + - skip quoted strings when looking for PIC diff --git a/eg/hanchors b/eg/hanchors new file mode 100755 index 0000000..c7693fd --- /dev/null +++ b/eg/hanchors @@ -0,0 +1,48 @@ +#!/usr/bin/perl -w + +# This program will print out all <a href=".."> links in a +# document together with the text that goes with it. +# +# See also HTML::LinkExtor + +use HTML::Parser; + +my $p = HTML::Parser->new(api_version => 3, + start_h => [\&a_start_handler, "self,tagname,attr"], + report_tags => [qw(a img)], + ); +$p->parse_file(shift || die) || die $!; + +sub a_start_handler +{ + my($self, $tag, $attr) = @_; + return unless $tag eq "a"; + return unless exists $attr->{href}; + print "A $attr->{href}\n"; + + $self->handler(text => [], '@{dtext}' ); + $self->handler(start => \&img_handler); + $self->handler(end => \&a_end_handler, "self,tagname"); +} + +sub img_handler +{ + my($self, $tag, $attr) = @_; + return unless $tag eq "img"; + push(@{$self->handler("text")}, $attr->{alt} || "[IMG]"); +} + +sub a_end_handler +{ + my($self, $tag) = @_; + my $text = join("", @{$self->handler("text")}); + $text =~ s/^\s+//; + $text =~ s/\s+$//; + $text =~ s/\s+/ /g; + print "T $text\n"; + + $self->handler("text", undef); + $self->handler("start", \&a_start_handler); + $self->handler("end", undef); +} + diff --git a/eg/hdump b/eg/hdump new file mode 100755 index 0000000..2174584 --- /dev/null +++ b/eg/hdump @@ -0,0 +1,23 @@ +#!/usr/bin/perl -w + +use HTML::Parser (); +use Data::Dump (); + +sub h { + my($event, $line, $column, $text, $tagname, $attr) = @_; + + my @d = (uc(substr($event,0,1)) . " L$line C$column"); + substr($text, 40) = "..." if length($text) > 40; + push(@d, $text); + push(@d, $tagname) if defined $tagname; + push(@d, $attr) if $attr; + + print Data::Dump::dump(@d), "\n"; +} + +my $p = HTML::Parser->new(api_version => 3); +$p->handler(default => \&h, "event, line, column, text, tagname, attr"); + +$p->parse_file(@ARGV ? shift : *STDIN); + + diff --git a/eg/hform b/eg/hform new file mode 100755 index 0000000..d2599ed --- /dev/null +++ b/eg/hform @@ -0,0 +1,83 @@ +#!/usr/bin/perl -w + +# See also HTML::Form module + +use HTML::PullParser (); +use HTML::Entities qw(decode_entities); +use Data::Dump qw(dump); + +my @FORM_TAGS = qw(form input textarea button select option); + +my $p = HTML::PullParser->new(file => shift || "xxx.html", + start => 'tag, attr', + end => 'tag', + text => '@{text}', + report_tags => \@FORM_TAGS, + ) || die "$!"; + +# a little helper function +sub get_text { + my($p, $stop) = @_; + my $text; + while (defined(my $t = $p->get_token)) { + if (ref $t) { + $p->unget_token($t) unless $t->[0] eq $stop; + last; + } + else { + $text .= $t; + } + } + return $text; +} + +my @forms; +while (defined(my $t = $p->get_token)) { + next unless ref $t; # skip text + if ($t->[0] eq "form") { + shift @$t; + push(@forms, $t); + while (defined(my $t = $p->get_token)) { + next unless ref $t; # skip text + last if $t->[0] eq "/form"; + if ($t->[0] eq "select") { + my $sel = $t; + push(@{$forms[-1]}, $t); + while (defined(my $t = $p->get_token)) { + next unless ref $t; # skip text + last if $t->[0] eq "/select"; + #print "select ", dump($t), "\n"; + if ($t->[0] eq "option") { + my $value = $t->[1]->{value}; + my $text = get_text($p, "/option"); + unless (defined $value) { + $value = decode_entities($text); + } + push(@$sel, $value); + } + else { + warn "$t->[0] inside select"; + } + } + } + elsif ($t->[0] =~ /^\/?option$/) { + warn "option tag outside select"; + } + elsif ($t->[0] eq "textarea") { + push(@{$forms[-1]}, $t); + $t->[1]{value} = get_text($p, "/textarea"); + } + elsif ($t->[0] =~ m,^/,) { + warn "stray $t->[0] tag"; + } + else { + push(@{$forms[-1]}, $t); + } + } + } + else { + warn "form tag $t->[0] outside form"; + } +} + +print dump(\@forms), "\n"; @@ -0,0 +1,20 @@ +#!/usr/bin/perl -w + +use strict; +use HTML::Parser (); + +HTML::Parser->new(start_h => [ \&start_lc, "tokenpos, text" ], + end_h => [ sub { print lc shift }, "text" ], + default_h => [ sub { print shift }, "text" ], + ) + ->parse_file(shift) || die "Can't open file: $!\n"; + +sub start_lc { + my($tpos, $text) = @_; + for (my $i = 0; $i < @$tpos; $i += 2) { + next if $i && ($i/2) % 2 == 0; # skip attribute values + $_ = lc $_ for substr($text, $tpos->[$i], $tpos->[$i+1]); + } + print $text; +} + diff --git a/eg/hrefsub b/eg/hrefsub new file mode 100755 index 0000000..fe14159 --- /dev/null +++ b/eg/hrefsub @@ -0,0 +1,93 @@ +#!/usr/bin/perl + +# Perform transformations on link attributes in an HTML document. +# Examples: +# +# $ hrefsub 's/foo/bar/g' index.html +# $ hrefsub '$_=URI->new_abs($_, "http://foo")' index.html +# +# The first argument is a perl expression that might modify $_. +# It is called for each link in the document with $_ set to +# the original value of the link URI. The variables $tag and +# $attr can be used to access the tagname and attributename +# within the tag where the current link is found. +# +# The second argument is the name of a file to process. + +use strict; +use HTML::Parser (); +use URI; + +# Construct a hash of tag names that may have links. +my %link_attr; +{ + # To simplify things, reformat the %HTML::Tagset::linkElements + # hash so that it is always a hash of hashes. + require HTML::Tagset; + while (my($k,$v) = each %HTML::Tagset::linkElements) { + if (ref($v)) { + $v = { map {$_ => 1} @$v }; + } + else { + $v = { $v => 1}; + } + $link_attr{$k} = $v; + } + # Uncomment this to see what HTML::Tagset::linkElements thinks are + # the tags with link attributes + #use Data::Dump; Data::Dump::dump(\%link_attr); exit; +} + +# Create a subroutine named 'edit' to perform the operation +# passed in from the command line. The code should modify $_ +# to change things. +my $code = shift; +my $code = 'sub edit { local $_ = shift; my($attr, $tag) = @_; no strict; ' . + $code . + '; $_; }'; +#print $code; +eval $code; +die $@ if $@; + +# Set up the parser. +my $p = HTML::Parser->new(api_version => 3); + +# The default is to print everything as is. +$p->handler(default => sub { print @_ }, "text"); + +# All links are found in start tags. This handler will evaluate +# &edit for each link attribute found. +$p->handler(start => sub { + my($tagname, $pos, $text) = @_; + if (my $link_attr = $link_attr{$tagname}) { + while (4 <= @$pos) { + # use attribute sets from right to left + # to avoid invalidating the offsets + # when replacing the values + my($k_offset, $k_len, $v_offset, $v_len) = + splice(@$pos, -4); + my $attrname = lc(substr($text, $k_offset, $k_len)); + next unless $link_attr->{$attrname}; + next unless $v_offset; # 0 v_offset means no value + my $v = substr($text, $v_offset, $v_len); + $v =~ s/^([\'\"])(.*)\1$/$2/; + my $new_v = edit($v, $attrname, $tagname); + next if $new_v eq $v; + $new_v =~ s/\"/"/g; # since we quote with "" + substr($text, $v_offset, $v_len) = qq("$new_v"); + } + } + print $text; + }, + "tagname, tokenpos, text"); + +# Parse the file passed in from the command line +my $file = shift || usage(); +$p->parse_file($file) || die "Can't open file $file: $!\n"; + +sub usage +{ + my $progname = $0; + $progname =~ s,^.*/,,; + die "Usage: $progname <perlexpr> <filename>\n"; +} diff --git a/eg/hstrip b/eg/hstrip new file mode 100755 index 0000000..b94df3c --- /dev/null +++ b/eg/hstrip @@ -0,0 +1,65 @@ +#!/usr/bin/perl -w + +# This script cleans up an HTML document + +use strict; +use HTML::Parser (); + +# configure these values +my @ignore_attr = + qw(bgcolor background color face style link alink vlink text + onblur onchange onclick ondblclick onfocus onkeydown onkeyup onload + onmousedown onmousemove onmouseout onmouseover onmouseup + onreset onselect onunload + ); +my @ignore_tags = qw(font big small b i); +my @ignore_elements = qw(script style); + +# make it easier to look up attributes +my %ignore_attr = map { $_ => 1} @ignore_attr; + +sub tag +{ + my($pos, $text) = @_; + if (@$pos >= 4) { + # kill some attributes + my($k_offset, $k_len, $v_offset, $v_len) = @{$pos}[-4 .. -1]; + my $next_attr = $v_offset ? $v_offset + $v_len : $k_offset + $k_len; + my $edited; + while (@$pos >= 4) { + ($k_offset, $k_len, $v_offset, $v_len) = splice @$pos, -4; + if ($ignore_attr{lc substr($text, $k_offset, $k_len)}) { + substr($text, $k_offset, $next_attr - $k_offset) = ""; + $edited++; + } + $next_attr = $k_offset; + } + # if we killed all attributed, kill any extra whitespace too + $text =~ s/^(<\w+)\s+>$/$1>/ if $edited; + } + print $text; +} + +sub decl +{ + my $type = shift; + print shift if $type eq "doctype"; +} + +sub text +{ + print shift; +} + +HTML::Parser->new(api_version => 3, + start_h => [\&tag, "tokenpos, text"], + process_h => ["", ""], + comment_h => ["", ""], + declaration_h => [\&decl, "tagname, text"], + default_h => [\&text, "text"], + + ignore_tags => \@ignore_tags, + ignore_elements => \@ignore_elements, + ) + ->parse_file(shift) || die "Can't open file: $!\n"; + diff --git a/eg/htext b/eg/htext new file mode 100755 index 0000000..e4d276d --- /dev/null +++ b/eg/htext @@ -0,0 +1,29 @@ +#!/usr/bin/perl -w + +# Extract all plain text from an HTML file + +use strict; +use HTML::Parser 3.00 (); + +my %inside; + +sub tag +{ + my($tag, $num) = @_; + $inside{$tag} += $num; + print " "; # not for all tags +} + +sub text +{ + return if $inside{script} || $inside{style}; + print $_[0]; +} + +HTML::Parser->new(api_version => 3, + handlers => [start => [\&tag, "tagname, '+1'"], + end => [\&tag, "tagname, '-1'"], + text => [\&text, "dtext"], + ], + marked_sections => 1, + )->parse_file(shift) || die "Can't open file: $!\n";; diff --git a/eg/htextsub b/eg/htextsub new file mode 100755 index 0000000..5091273 --- /dev/null +++ b/eg/htextsub @@ -0,0 +1,28 @@ +#!/usr/bin/perl -w + +# Shows how to mangle all plain text in an HTML document, using an arbitrary +# Perl expression. Plain text is all text not within a tag declaration, i.e. +# not in <p ...>, but possibly between <p> and </p> + +use strict; +my $code = shift || usage(); +$code = 'sub edit_print { local $_ = shift; ' . $code . '; print }'; +#print $code; +eval $code; +die $@ if $@; + +use HTML::Parser 3.05; +my $p = HTML::Parser->new(unbroken_text => 1, + default_h => [ sub { print @_; }, "text" ], + text_h => [ \&edit_print, "text" ], + ); + +my $file = shift || usage(); +$p->parse_file($file) || die "Can't open file $file: $!\n"; + +sub usage +{ + my $progname = $0; + $progname =~ s,^.*/,,; + die "Usage: $progname <perlexpr> <filename>\n"; +} diff --git a/eg/htitle b/eg/htitle new file mode 100755 index 0000000..38da5d6 --- /dev/null +++ b/eg/htitle @@ -0,0 +1,21 @@ +#!/usr/bin/perl + +# This program will print out the title of an HTML document. + +use strict; +use HTML::Parser (); + +sub title_handler +{ + my $self = shift; + $self->handler(text => sub { print @_ }, "dtext"); + $self->handler(end => "eof", "self"); +} + +my $p = HTML::Parser->new(api_version => 3, + start_h => [\&title_handler, "self"], + report_tags => ['title'], + ); +$p->parse_file(shift || die) || die $!; +print "\n"; + diff --git a/hints/solaris.pl b/hints/solaris.pl new file mode 100644 index 0000000..f6f94f0 --- /dev/null +++ b/hints/solaris.pl @@ -0,0 +1,4 @@ +if ($Config{gccversion}) { + print "Turning off optimizations to avoid compiler bug\n"; + $self->{OPTIMIZE} = " "; +} diff --git a/hparser.c b/hparser.c new file mode 100644 index 0000000..c6d66de --- /dev/null +++ b/hparser.c @@ -0,0 +1,1902 @@ +/* + * Copyright 1999-2009, Gisle Aas + * Copyright 1999-2000, Michael A. Chase + * + * This library is free software; you can redistribute it and/or + * modify it under the same terms as Perl itself. + */ + +#ifndef EXTERN +#define EXTERN extern +#endif + +#include "hctype.h" /* isH...() macros */ +#include "tokenpos.h" /* dTOKEN; PUSH_TOKEN() */ + + +static +struct literal_tag { + int len; + char* str; + int is_cdata; +} +literal_mode_elem[] = +{ + {6, "script", 1}, + {5, "style", 1}, + {3, "xmp", 1}, + {6, "iframe", 1}, + {9, "plaintext", 1}, + {5, "title", 0}, + {8, "textarea", 0}, + {0, 0, 0} +}; + +enum argcode { + ARG_SELF = 1, /* need to avoid '\0' in argspec string */ + ARG_TOKENS, + ARG_TOKENPOS, + ARG_TOKEN0, + ARG_TAGNAME, + ARG_TAG, + ARG_ATTR, + ARG_ATTRARR, + ARG_ATTRSEQ, + ARG_TEXT, + ARG_DTEXT, + ARG_IS_CDATA, + ARG_SKIPPED_TEXT, + ARG_OFFSET, + ARG_OFFSET_END, + ARG_LENGTH, + ARG_LINE, + ARG_COLUMN, + ARG_EVENT, + ARG_UNDEF, + ARG_LITERAL, /* Always keep last */ + + /* extra flags always encoded first */ + ARG_FLAG_FLAT_ARRAY +}; + +char *argname[] = { + /* Must be in the same order as enum argcode */ + "self", /* ARG_SELF */ + "tokens", /* ARG_TOKENS */ + "tokenpos", /* ARG_TOKENPOS */ + "token0", /* ARG_TOKEN0 */ + "tagname", /* ARG_TAGNAME */ + "tag", /* ARG_TAG */ + "attr", /* ARG_ATTR */ + "@attr", /* ARG_ATTRARR */ + "attrseq", /* ARG_ATTRSEQ */ + "text", /* ARG_TEXT */ + "dtext", /* ARG_DTEXT */ + "is_cdata", /* ARG_IS_CDATA */ + "skipped_text", /* ARG_SKIPPED_TEXT */ + "offset", /* ARG_OFFSET */ + "offset_end", /* ARG_OFFSET_END */ + "length", /* ARG_LENGTH */ + "line", /* ARG_LINE */ + "column", /* ARG_COLUMN */ + "event", /* ARG_EVENT */ + "undef", /* ARG_UNDEF */ + /* ARG_LITERAL (not compared) */ + /* ARG_FLAG_FLAT_ARRAY */ +}; + +#define CASE_SENSITIVE(p_state) \ + ((p_state)->xml_mode || (p_state)->case_sensitive) +#define STRICT_NAMES(p_state) \ + ((p_state)->xml_mode || (p_state)->strict_names) +#define ALLOW_EMPTY_TAG(p_state) \ + ((p_state)->xml_mode || (p_state)->empty_element_tags) + +static void flush_pending_text(PSTATE* p_state, SV* self); + +/* + * Parser functions. + * + * parse() - top level entry point. + * deals with text and calls one of its + * subordinate parse_*() routines after + * looking at the first char after "<" + * parse_decl() - deals with declarations <!...> + * parse_comment() - deals with <!-- ... --> + * parse_marked_section - deals with <![ ... [ ... ]]> + * parse_end() - deals with end tags </...> + * parse_start() - deals with start tags <A...> + * parse_process() - deals with process instructions <?...> + * parse_null() - deals with anything else <....> + * + * report_event() - called whenever any of the parse*() routines + * has recongnized something. + */ + +static void +report_event(PSTATE* p_state, + event_id_t event, + char *beg, char *end, U32 utf8, + token_pos_t *tokens, int num_tokens, + SV* self + ) +{ + struct p_handler *h; + dTHX; + dSP; + AV *array; + STRLEN my_na; + char *argspec; + char *s; + STRLEN offset; + STRLEN line; + STRLEN column; + +#ifdef UNICODE_HTML_PARSER + #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b)) +#else + #define CHR_DIST(a,b) ((a) - (b)) +#endif + + /* some events might still fire after a handler has signaled eof + * so suppress them here. + */ + if (p_state->eof) + return; + + /* capture offsets */ + offset = p_state->offset; + line = p_state->line; + column = p_state->column; + +#if 0 + { /* used for debugging at some point */ + char *s = beg; + int i; + + /* print debug output */ + switch(event) { + case E_DECLARATION: printf("DECLARATION"); break; + case E_COMMENT: printf("COMMENT"); break; + case E_START: printf("START"); break; + case E_END: printf("END"); break; + case E_TEXT: printf("TEXT"); break; + case E_PROCESS: printf("PROCESS"); break; + case E_NONE: printf("NONE"); break; + default: printf("EVENT #%d", event); break; + } + + printf(" ["); + while (s < end) { + if (*s == '\n') { + putchar('\\'); putchar('n'); + } + else + putchar(*s); + s++; + } + printf("] %d\n", end - beg); + for (i = 0; i < num_tokens; i++) { + printf(" token %d: %d %d\n", + i, + tokens[i].beg - beg, + tokens[i].end - tokens[i].beg); + } + } +#endif + + if (p_state->pending_end_tag && event != E_TEXT && event != E_COMMENT) { + token_pos_t t; + char dummy; + t.beg = p_state->pending_end_tag; + t.end = p_state->pending_end_tag + strlen(p_state->pending_end_tag); + p_state->pending_end_tag = 0; + report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self); + SPAGAIN; + } + + /* update offsets */ + p_state->offset += CHR_DIST(end, beg); + if (line) { + char *s = beg; + char *nl = NULL; + while (s < end) { + if (*s == '\n') { + p_state->line++; + nl = s; + } + s++; + } + if (nl) + p_state->column = CHR_DIST(end, nl) - 1; + else + p_state->column += CHR_DIST(end, beg); + } + + if (event == E_NONE) + goto IGNORE_EVENT; + +#ifdef MARKED_SECTION + if (p_state->ms == MS_IGNORE) + goto IGNORE_EVENT; +#endif + + /* tag filters */ + if (p_state->ignore_tags || p_state->report_tags || p_state->ignore_elements) { + + if (event == E_START || event == E_END) { + SV* tagname = p_state->tmp; + + assert(num_tokens >= 1); + sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg); + if (utf8) + SvUTF8_on(tagname); + else + SvUTF8_off(tagname); + if (!CASE_SENSITIVE(p_state)) + sv_lower(aTHX_ tagname); + + if (p_state->ignoring_element) { + if (sv_eq(p_state->ignoring_element, tagname)) { + if (event == E_START) + p_state->ignore_depth++; + else if (--p_state->ignore_depth == 0) { + SvREFCNT_dec(p_state->ignoring_element); + p_state->ignoring_element = 0; + } + } + goto IGNORE_EVENT; + } + + if (p_state->ignore_elements && + hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0)) + { + if (event == E_START) { + p_state->ignoring_element = newSVsv(tagname); + p_state->ignore_depth = 1; + } + goto IGNORE_EVENT; + } + + if (p_state->ignore_tags && + hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0)) + { + goto IGNORE_EVENT; + } + if (p_state->report_tags && + !hv_fetch_ent(p_state->report_tags, tagname, 0, 0)) + { + goto IGNORE_EVENT; + } + } + else if (p_state->ignoring_element) { + goto IGNORE_EVENT; + } + } + + h = &p_state->handlers[event]; + if (!h->cb) { + /* event = E_DEFAULT; */ + h = &p_state->handlers[E_DEFAULT]; + if (!h->cb) + goto IGNORE_EVENT; + } + + if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) { + /* FALSE scalar ('' or 0) means IGNORE this event */ + return; + } + + if (p_state->unbroken_text && event == E_TEXT) { + /* should buffer text */ + if (!p_state->pend_text) + p_state->pend_text = newSV(256); + if (SvOK(p_state->pend_text)) { + if (p_state->is_cdata != p_state->pend_text_is_cdata) { + flush_pending_text(p_state, self); + SPAGAIN; + goto INIT_PEND_TEXT; + } + } + else { + INIT_PEND_TEXT: + p_state->pend_text_offset = offset; + p_state->pend_text_line = line; + p_state->pend_text_column = column; + p_state->pend_text_is_cdata = p_state->is_cdata; + sv_setpvn(p_state->pend_text, "", 0); + if (!utf8) + SvUTF8_off(p_state->pend_text); + } +#ifdef UNICODE_HTML_PARSER + if (utf8 && !SvUTF8(p_state->pend_text)) + sv_utf8_upgrade(p_state->pend_text); + if (utf8 || !SvUTF8(p_state->pend_text)) { + sv_catpvn(p_state->pend_text, beg, end - beg); + } + else { + SV *tmp = newSVpvn(beg, end - beg); + sv_utf8_upgrade(tmp); + sv_catsv(p_state->pend_text, tmp); + SvREFCNT_dec(tmp); + } +#else + sv_catpvn(p_state->pend_text, beg, end - beg); +#endif + return; + } + else if (p_state->pend_text && SvOK(p_state->pend_text)) { + flush_pending_text(p_state, self); + SPAGAIN; + } + + /* At this point we have decided to generate an event callback */ + + argspec = h->argspec ? SvPV(h->argspec, my_na) : ""; + + if (SvTYPE(h->cb) == SVt_PVAV) { + + if (*argspec == ARG_FLAG_FLAT_ARRAY) { + argspec++; + array = (AV*)h->cb; + } + else { + /* start sub-array for accumulator array */ + array = newAV(); + } + } + else { + array = 0; + if (*argspec == ARG_FLAG_FLAT_ARRAY) + argspec++; + + /* start argument stack for callback */ + ENTER; + SAVETMPS; + PUSHMARK(SP); + } + + for (s = argspec; *s; s++) { + SV* arg = 0; + int push_arg = 1; + enum argcode argcode = (enum argcode)*s; + + switch( argcode ) { + + case ARG_SELF: + arg = sv_mortalcopy(self); + break; + + case ARG_TOKENS: + if (num_tokens >= 1) { + AV* av = newAV(); + SV* prev_token = &PL_sv_undef; + int i; + av_extend(av, num_tokens); + for (i = 0; i < num_tokens; i++) { + if (tokens[i].beg) { + prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); + if (utf8) + SvUTF8_on(prev_token); + av_push(av, prev_token); + } + else { /* boolean */ + av_push(av, p_state->bool_attr_val + ? newSVsv(p_state->bool_attr_val) + : newSVsv(prev_token)); + } + } + arg = sv_2mortal(newRV_noinc((SV*)av)); + } + break; + + case ARG_TOKENPOS: + if (num_tokens >= 1 && tokens[0].beg >= beg) { + AV* av = newAV(); + int i; + av_extend(av, num_tokens*2); + for (i = 0; i < num_tokens; i++) { + if (tokens[i].beg) { + av_push(av, newSViv(CHR_DIST(tokens[i].beg, beg))); + av_push(av, newSViv(CHR_DIST(tokens[i].end, tokens[i].beg))); + } + else { /* boolean tag value */ + av_push(av, newSViv(0)); + av_push(av, newSViv(0)); + } + } + arg = sv_2mortal(newRV_noinc((SV*)av)); + } + break; + + case ARG_TOKEN0: + case ARG_TAGNAME: + /* fall through */ + + case ARG_TAG: + if (num_tokens >= 1) { + arg = sv_2mortal(newSVpvn(tokens[0].beg, + tokens[0].end - tokens[0].beg)); + if (utf8) + SvUTF8_on(arg); + if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0) + sv_lower(aTHX_ arg); + if (argcode == ARG_TAG && event != E_START) { + char *e_type = "!##/#?#"; + sv_insert(arg, 0, 0, &e_type[event], 1); + } + } + break; + + case ARG_ATTR: + case ARG_ATTRARR: + if (event == E_START) { + HV* hv; + int i; + if (argcode == ARG_ATTR) { + hv = newHV(); + arg = sv_2mortal(newRV_noinc((SV*)hv)); + } + else { +#ifdef __GNUC__ + /* gcc -Wall reports this variable as possibly used uninitialized */ + hv = 0; +#endif + push_arg = 0; /* deal with argument pushing here */ + } + + for (i = 1; i < num_tokens; i += 2) { + SV* attrname = newSVpvn(tokens[i].beg, + tokens[i].end-tokens[i].beg); + SV* attrval; + + if (utf8) + SvUTF8_on(attrname); + if (tokens[i+1].beg) { + char *beg = tokens[i+1].beg; + STRLEN len = tokens[i+1].end - beg; + if (*beg == '"' || *beg == '\'' || (*beg == '`' && p_state->backquote)) { + assert(len >= 2 && *beg == beg[len-1]); + beg++; len -= 2; + } + attrval = newSVpvn(beg, len); + if (utf8) + SvUTF8_on(attrval); + if (!p_state->attr_encoded) { +#ifdef UNICODE_HTML_PARSER + if (p_state->utf8_mode) + sv_utf8_decode(attrval); +#endif + decode_entities(aTHX_ attrval, p_state->entity2char, 0); + if (p_state->utf8_mode) + SvUTF8_off(attrval); + } + } + else { /* boolean */ + if (p_state->bool_attr_val) + attrval = newSVsv(p_state->bool_attr_val); + else + attrval = newSVsv(attrname); + } + + if (!CASE_SENSITIVE(p_state)) + sv_lower(aTHX_ attrname); + + if (argcode == ARG_ATTR) { + if (hv_exists_ent(hv, attrname, 0) || + !hv_store_ent(hv, attrname, attrval, 0)) { + SvREFCNT_dec(attrval); + } + SvREFCNT_dec(attrname); + } + else { /* ARG_ATTRARR */ + if (array) { + av_push(array, attrname); + av_push(array, attrval); + } + else { + XPUSHs(sv_2mortal(attrname)); + XPUSHs(sv_2mortal(attrval)); + } + } + } + } + else if (argcode == ARG_ATTRARR) { + push_arg = 0; + } + break; + + case ARG_ATTRSEQ: /* (v2 compatibility stuff) */ + if (event == E_START) { + AV* av = newAV(); + int i; + for (i = 1; i < num_tokens; i += 2) { + SV* attrname = newSVpvn(tokens[i].beg, + tokens[i].end-tokens[i].beg); + if (utf8) + SvUTF8_on(attrname); + if (!CASE_SENSITIVE(p_state)) + sv_lower(aTHX_ attrname); + av_push(av, attrname); + } + arg = sv_2mortal(newRV_noinc((SV*)av)); + } + break; + + case ARG_TEXT: + arg = sv_2mortal(newSVpvn(beg, end - beg)); + if (utf8) + SvUTF8_on(arg); + break; + + case ARG_DTEXT: + if (event == E_TEXT) { + arg = sv_2mortal(newSVpvn(beg, end - beg)); + if (utf8) + SvUTF8_on(arg); + if (!p_state->is_cdata) { +#ifdef UNICODE_HTML_PARSER + if (p_state->utf8_mode) + sv_utf8_decode(arg); +#endif + decode_entities(aTHX_ arg, p_state->entity2char, 1); + if (p_state->utf8_mode) + SvUTF8_off(arg); + } + } + break; + + case ARG_IS_CDATA: + if (event == E_TEXT) { + arg = boolSV(p_state->is_cdata); + } + break; + + case ARG_SKIPPED_TEXT: + arg = sv_2mortal(p_state->skipped_text); + p_state->skipped_text = newSVpvn("", 0); + break; + + case ARG_OFFSET: + arg = sv_2mortal(newSViv(offset)); + break; + + case ARG_OFFSET_END: + arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg))); + break; + + case ARG_LENGTH: + arg = sv_2mortal(newSViv(CHR_DIST(end, beg))); + break; + + case ARG_LINE: + arg = sv_2mortal(newSViv(line)); + break; + + case ARG_COLUMN: + arg = sv_2mortal(newSViv(column)); + break; + + case ARG_EVENT: + assert(event >= 0 && event < EVENT_COUNT); + arg = sv_2mortal(newSVpv(event_id_str[event], 0)); + break; + + case ARG_LITERAL: + { + int len = (unsigned char)s[1]; + arg = sv_2mortal(newSVpvn(s+2, len)); + if (SvUTF8(h->argspec)) + SvUTF8_on(arg); + s += len + 1; + } + break; + + case ARG_UNDEF: + arg = sv_mortalcopy(&PL_sv_undef); + break; + + default: + arg = sv_2mortal(newSVpvf("Bad argspec %d", *s)); + break; + } + + if (push_arg) { + if (!arg) + arg = sv_mortalcopy(&PL_sv_undef); + + if (array) { + /* have to fix mortality here or add mortality to + * XPUSHs after removing it from the switch cases. + */ + av_push(array, SvREFCNT_inc(arg)); + } + else { + XPUSHs(arg); + } + } + } + + if (array) { + if (array != (AV*)h->cb) + av_push((AV*)h->cb, newRV_noinc((SV*)array)); + } + else { + PUTBACK; + + if ((enum argcode)*argspec == ARG_SELF && !SvROK(h->cb)) { + char *method = SvPV(h->cb, my_na); + perl_call_method(method, G_DISCARD | G_EVAL | G_VOID); + } + else { + perl_call_sv(h->cb, G_DISCARD | G_EVAL | G_VOID); + } + + if (SvTRUE(ERRSV)) { + RETHROW; + } + + FREETMPS; + LEAVE; + } + if (p_state->skipped_text) + SvCUR_set(p_state->skipped_text, 0); + return; + +IGNORE_EVENT: + if (p_state->skipped_text) { + if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text)) + flush_pending_text(p_state, self); +#ifdef UNICODE_HTML_PARSER + if (utf8 && !SvUTF8(p_state->skipped_text)) + sv_utf8_upgrade(p_state->skipped_text); + if (utf8 || !SvUTF8(p_state->skipped_text)) { +#endif + sv_catpvn(p_state->skipped_text, beg, end - beg); +#ifdef UNICODE_HTML_PARSER + } + else { + SV *tmp = newSVpvn(beg, end - beg); + sv_utf8_upgrade(tmp); + sv_catsv(p_state->skipped_text, tmp); + SvREFCNT_dec(tmp); + } +#endif + } +#undef CHR_DIST + return; +} + + +EXTERN SV* +argspec_compile(SV* src, PSTATE* p_state) +{ + dTHX; + SV* argspec = newSVpvn("", 0); + STRLEN len; + char *s = SvPV(src, len); + char *end = s + len; + + if (SvUTF8(src)) + SvUTF8_on(argspec); + + while (isHSPACE(*s)) + s++; + + if (*s == '@') { + /* try to deal with '@{ ... }' wrapping */ + char *tmp = s + 1; + while (isHSPACE(*tmp)) + tmp++; + if (*tmp == '{') { + char c = ARG_FLAG_FLAT_ARRAY; + sv_catpvn(argspec, &c, 1); + tmp++; + while (isHSPACE(*tmp)) + tmp++; + s = tmp; + } + } + while (s < end) { + if (isHNAME_FIRST(*s) || *s == '@') { + char *name = s; + int a = ARG_SELF; + char **arg_name; + + s++; + while (isHNAME_CHAR(*s)) + s++; + + /* check identifier */ + for ( arg_name = argname; a < ARG_LITERAL ; ++a, ++arg_name ) { + if (strnEQ(*arg_name, name, s - name) && + (*arg_name)[s - name] == '\0') + break; + } + if (a < ARG_LITERAL) { + char c = (unsigned char) a; + sv_catpvn(argspec, &c, 1); + + if (a == ARG_LINE || a == ARG_COLUMN) { + if (!p_state->line) + p_state->line = 1; /* enable tracing of line/column */ + } + if (a == ARG_SKIPPED_TEXT) { + if (!p_state->skipped_text) { + p_state->skipped_text = newSVpvn("", 0); + } + } + if (a == ARG_ATTR || a == ARG_ATTRARR) { + if (p_state->argspec_entity_decode != ARG_DTEXT) + p_state->argspec_entity_decode = ARG_ATTR; + } + else if (a == ARG_DTEXT) { + p_state->argspec_entity_decode = ARG_DTEXT; + } + } + else { + croak("Unrecognized identifier %.*s in argspec", s - name, name); + } + } + else if (*s == '"' || *s == '\'') { + char *string_beg = s; + s++; + while (s < end && *s != *string_beg && *s != '\\') + s++; + if (*s == *string_beg) { + /* literal */ + int len = s - string_beg - 1; + unsigned char buf[2]; + if (len > 255) + croak("Literal string is longer than 255 chars in argspec"); + buf[0] = ARG_LITERAL; + buf[1] = len; + sv_catpvn(argspec, (char*)buf, 2); + sv_catpvn(argspec, string_beg+1, len); + s++; + } + else if (*s == '\\') { + croak("Backslash reserved for literal string in argspec"); + } + else { + croak("Unterminated literal string in argspec"); + } + } + else { + croak("Bad argspec (%s)", s); + } + + while (isHSPACE(*s)) + s++; + + if (*s == '}' && SvPVX(argspec)[0] == ARG_FLAG_FLAT_ARRAY) { + /* end of '@{ ... }' */ + s++; + while (isHSPACE(*s)) + s++; + if (s < end) + croak("Bad argspec: stuff after @{...} (%s)", s); + } + + if (s == end) + break; + if (*s != ',') { + croak("Missing comma separator in argspec"); + } + s++; + while (isHSPACE(*s)) + s++; + } + return argspec; +} + + +static void +flush_pending_text(PSTATE* p_state, SV* self) +{ + dTHX; + bool old_unbroken_text = p_state->unbroken_text; + SV* old_pend_text = p_state->pend_text; + bool old_is_cdata = p_state->is_cdata; + STRLEN old_offset = p_state->offset; + STRLEN old_line = p_state->line; + STRLEN old_column = p_state->column; + + assert(p_state->pend_text && SvOK(p_state->pend_text)); + + p_state->unbroken_text = 0; + p_state->pend_text = 0; + p_state->is_cdata = p_state->pend_text_is_cdata; + p_state->offset = p_state->pend_text_offset; + p_state->line = p_state->pend_text_line; + p_state->column = p_state->pend_text_column; + + report_event(p_state, E_TEXT, + SvPVX(old_pend_text), SvEND(old_pend_text), + SvUTF8(old_pend_text), 0, 0, self); + SvOK_off(old_pend_text); + + p_state->unbroken_text = old_unbroken_text; + p_state->pend_text = old_pend_text; + p_state->is_cdata = old_is_cdata; + p_state->offset = old_offset; + p_state->line = old_line; + p_state->column = old_column; +} + +static char* +skip_until_gt(char *beg, char *end) +{ + /* tries to emulate quote skipping behaviour observed in MSIE */ + char *s = beg; + char quote = '\0'; + char prev = ' '; + while (s < end) { + if (!quote && *s == '>') + return s; + if (*s == '"' || *s == '\'') { + if (*s == quote) { + quote = '\0'; /* end of quoted string */ + } + else if (!quote && (prev == ' ' || prev == '=')) { + quote = *s; + } + } + prev = *s++; + } + return end; +} + +static char* +parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) +{ + char *s = beg; + + if (p_state->strict_comment) { + dTOKENS(4); + char *start_com = s; /* also used to signal inside/outside */ + + while (1) { + /* try to locate "--" */ + FIND_DASH_DASH: + /* printf("find_dash_dash: [%s]\n", s); */ + while (s < end && *s != '-' && *s != '>') + s++; + + if (s == end) { + FREE_TOKENS; + return beg; + } + + if (*s == '>') { + s++; + if (start_com) + goto FIND_DASH_DASH; + + /* we are done recognizing all comments, make callbacks */ + report_event(p_state, E_COMMENT, + beg - 4, s, utf8, + tokens, num_tokens, + self); + FREE_TOKENS; + + return s; + } + + s++; + if (s == end) { + FREE_TOKENS; + return beg; + } + + if (*s == '-') { + /* two dashes in a row seen */ + s++; + /* do something */ + if (start_com) { + PUSH_TOKEN(start_com, s-2); + start_com = 0; + } + else { + start_com = s; + } + } + } + } + else if (p_state->no_dash_dash_comment_end) { + token_pos_t token; + token.beg = beg; + /* a lone '>' signals end-of-comment */ + while (s < end && *s != '>') + s++; + token.end = s; + if (s < end) { + s++; + report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); + return s; + } + else { + return beg; + } + } + else { /* non-strict comment */ + token_pos_t token; + token.beg = beg; + /* try to locate /--\s*>/ which signals end-of-comment */ + LOCATE_END: + while (s < end && *s != '-') + s++; + token.end = s; + if (s < end) { + s++; + if (*s == '-') { + s++; + while (isHSPACE(*s)) + s++; + if (*s == '>') { + s++; + /* yup */ + report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); + return s; + } + } + if (s < end) { + s = token.end + 1; + goto LOCATE_END; + } + } + + if (s == end) + return beg; + } + + return 0; +} + + +#ifdef MARKED_SECTION + +static void +marked_section_update(PSTATE* p_state) +{ + dTHX; + /* we look at p_state->ms_stack to determine p_state->ms */ + AV* ms_stack = p_state->ms_stack; + p_state->ms = MS_NONE; + + if (ms_stack) { + int stack_len = av_len(ms_stack); + int stack_idx; + for (stack_idx = 0; stack_idx <= stack_len; stack_idx++) { + SV** svp = av_fetch(ms_stack, stack_idx, 0); + if (svp) { + AV* tokens = (AV*)SvRV(*svp); + int tokens_len = av_len(tokens); + int i; + assert(SvTYPE(tokens) == SVt_PVAV); + for (i = 0; i <= tokens_len; i++) { + SV** svp = av_fetch(tokens, i, 0); + if (svp) { + STRLEN len; + char *token_str = SvPV(*svp, len); + enum marked_section_t token; + if (strEQ(token_str, "include")) + token = MS_INCLUDE; + else if (strEQ(token_str, "rcdata")) + token = MS_RCDATA; + else if (strEQ(token_str, "cdata")) + token = MS_CDATA; + else if (strEQ(token_str, "ignore")) + token = MS_IGNORE; + else + token = MS_NONE; + if (p_state->ms < token) + p_state->ms = token; + } + } + } + } + } + /* printf("MS %d\n", p_state->ms); */ + p_state->is_cdata = (p_state->ms == MS_CDATA); + return; +} + + +static char* +parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) +{ + dTHX; + char *s; + AV* tokens = 0; + + if (!p_state->marked_sections) + return 0; + + assert(beg[0] == '<'); + assert(beg[1] == '!'); + assert(beg[2] == '['); + s = beg + 3; + +FIND_NAMES: + while (isHSPACE(*s)) + s++; + while (isHNAME_FIRST(*s)) { + char *name_start = s; + char *name_end; + SV *name; + s++; + while (isHNAME_CHAR(*s)) + s++; + name_end = s; + while (isHSPACE(*s)) + s++; + if (s == end) + goto PREMATURE; + + if (!tokens) + tokens = newAV(); + name = newSVpvn(name_start, name_end - name_start); + if (utf8) + SvUTF8_on(name); + av_push(tokens, sv_lower(aTHX_ name)); + } + if (*s == '-') { + s++; + if (*s == '-') { + /* comment */ + s++; + while (1) { + while (s < end && *s != '-') + s++; + if (s == end) + goto PREMATURE; + + s++; /* skip first '-' */ + if (*s == '-') { + s++; + /* comment finished */ + goto FIND_NAMES; + } + } + } + else + goto FAIL; + + } + if (*s == '[') { + s++; + /* yup */ + + if (!tokens) { + tokens = newAV(); + av_push(tokens, newSVpvn("include", 7)); + } + + if (!p_state->ms_stack) + p_state->ms_stack = newAV(); + av_push(p_state->ms_stack, newRV_noinc((SV*)tokens)); + marked_section_update(p_state); + report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self); + return s; + } + +FAIL: + SvREFCNT_dec(tokens); + return 0; /* not yet implemented */ + +PREMATURE: + SvREFCNT_dec(tokens); + return beg; +} +#endif + + +static char* +parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) +{ + char *s = beg + 2; + + if (*s == '-') { + /* comment? */ + + char *tmp; + s++; + if (s == end) + return beg; + + if (*s != '-') + goto DECL_FAIL; /* nope, illegal */ + + /* yes, two dashes seen */ + s++; + + tmp = parse_comment(p_state, s, end, utf8, self); + return (tmp == s) ? beg : tmp; + } + +#ifdef MARKED_SECTION + if (*s == '[') { + /* marked section */ + char *tmp; + tmp = parse_marked_section(p_state, beg, end, utf8, self); + if (!tmp) + goto DECL_FAIL; + return tmp; + } +#endif + + if (*s == '>') { + /* make <!> into empty comment <SGML Handbook 36:32> */ + token_pos_t token; + token.beg = s; + token.end = s; + s++; + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); + return s; + } + + if (isALPHA(*s)) { + dTOKENS(8); + char *decl_id = s; + STRLEN decl_id_len; + + s++; + /* declaration */ + while (s < end && isHNAME_CHAR(*s)) + s++; + decl_id_len = s - decl_id; + if (s == end) + goto PREMATURE; + + /* just hardcode a few names as the recognized declarations */ + if (!((decl_id_len == 7 && + strnEQx(decl_id, "DOCTYPE", 7, !CASE_SENSITIVE(p_state))) || + (decl_id_len == 6 && + strnEQx(decl_id, "ENTITY", 6, !CASE_SENSITIVE(p_state))) + ) + ) + { + goto FAIL; + } + + /* first word available */ + PUSH_TOKEN(decl_id, s); + + while (1) { + while (s < end && isHSPACE(*s)) + s++; + + if (s == end) + goto PREMATURE; + + if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) { + char *str_beg = s; + s++; + while (s < end && *s != *str_beg) + s++; + if (s == end) + goto PREMATURE; + s++; + PUSH_TOKEN(str_beg, s); + } + else if (*s == '-') { + /* comment */ + char *com_beg = s; + s++; + if (s == end) + goto PREMATURE; + if (*s != '-') + goto FAIL; + s++; + + while (1) { + while (s < end && *s != '-') + s++; + if (s == end) + goto PREMATURE; + s++; + if (s == end) + goto PREMATURE; + if (*s == '-') { + s++; + PUSH_TOKEN(com_beg, s); + break; + } + } + } + else if (*s != '>') { + /* plain word */ + char *word_beg = s; + s++; + while (s < end && isHNOT_SPACE_GT(*s)) + s++; + if (s == end) + goto PREMATURE; + PUSH_TOKEN(word_beg, s); + } + else { + break; + } + } + + if (s == end) + goto PREMATURE; + if (*s == '>') { + s++; + report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self); + FREE_TOKENS; + return s; + } + + FAIL: + FREE_TOKENS; + goto DECL_FAIL; + + PREMATURE: + FREE_TOKENS; + return beg; + + } + +DECL_FAIL: + if (p_state->strict_comment) + return 0; + + /* consider everything up to the first '>' a comment */ + while (s < end && *s != '>') + s++; + if (s < end) { + token_pos_t token; + token.beg = beg + 2; + token.end = s; + s++; + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); + return s; + } + else { + return beg; + } +} + + +static char* +parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) +{ + char *s = beg; + int empty_tag = 0; + dTOKENS(16); + + hctype_t tag_name_first, tag_name_char; + hctype_t attr_name_first, attr_name_char; + + if (STRICT_NAMES(p_state)) { + tag_name_first = attr_name_first = HCTYPE_NAME_FIRST; + tag_name_char = attr_name_char = HCTYPE_NAME_CHAR; + } + else { + tag_name_first = tag_name_char = HCTYPE_NOT_SPACE_GT; + attr_name_first = HCTYPE_NOT_SPACE_GT; + attr_name_char = HCTYPE_NOT_SPACE_EQ_GT; + } + + s += 2; + + while (s < end && isHCTYPE(*s, tag_name_char)) { + if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { + if ((s + 1) == end) + goto PREMATURE; + if (*(s + 1) == '>') + break; + } + s++; + } + PUSH_TOKEN(beg+1, s); /* tagname */ + + while (isHSPACE(*s)) + s++; + if (s == end) + goto PREMATURE; + + while (isHCTYPE(*s, attr_name_first)) { + /* attribute */ + char *attr_name_beg = s; + char *attr_name_end; + if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { + if ((s + 1) == end) + goto PREMATURE; + if (*(s + 1) == '>') + break; + } + s++; + while (s < end && isHCTYPE(*s, attr_name_char)) { + if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { + if ((s + 1) == end) + goto PREMATURE; + if (*(s + 1) == '>') + break; + } + s++; + } + if (s == end) + goto PREMATURE; + + attr_name_end = s; + PUSH_TOKEN(attr_name_beg, attr_name_end); /* attr name */ + + while (isHSPACE(*s)) + s++; + if (s == end) + goto PREMATURE; + + if (*s == '=') { + /* with a value */ + s++; + while (isHSPACE(*s)) + s++; + if (s == end) + goto PREMATURE; + if (*s == '>') { + /* parse it similar to ="" */ + PUSH_TOKEN(s, s); + break; + } + if (*s == '"' || *s == '\'' || (*s == '`' && p_state->backquote)) { + char *str_beg = s; + s++; + while (s < end && *s != *str_beg) + s++; + if (s == end) + goto PREMATURE; + s++; + PUSH_TOKEN(str_beg, s); + } + else { + char *word_start = s; + while (s < end && isHNOT_SPACE_GT(*s)) { + if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { + if ((s + 1) == end) + goto PREMATURE; + if (*(s + 1) == '>') + break; + } + s++; + } + if (s == end) + goto PREMATURE; + PUSH_TOKEN(word_start, s); + } + while (isHSPACE(*s)) + s++; + if (s == end) + goto PREMATURE; + } + else { + PUSH_TOKEN(0, 0); /* boolean attr value */ + } + } + + if (ALLOW_EMPTY_TAG(p_state) && *s == '/') { + s++; + if (s == end) + goto PREMATURE; + empty_tag = 1; + } + + if (*s == '>') { + s++; + /* done */ + report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self); + if (empty_tag) { + report_event(p_state, E_END, s, s, utf8, tokens, 1, self); + } + else if (!p_state->xml_mode) { + /* find out if this start tag should put us into literal_mode + */ + int i; + int tag_len = tokens[0].end - tokens[0].beg; + + for (i = 0; literal_mode_elem[i].len; i++) { + if (tag_len == literal_mode_elem[i].len) { + /* try to match it */ + char *s = beg + 1; + char *t = literal_mode_elem[i].str; + int len = tag_len; + while (len) { + if (toLOWER(*s) != *t) + break; + s++; + t++; + if (!--len) { + /* found it */ + p_state->literal_mode = literal_mode_elem[i].str; + p_state->is_cdata = literal_mode_elem[i].is_cdata; + /* printf("Found %s\n", p_state->literal_mode); */ + goto END_OF_LITERAL_SEARCH; + } + } + } + } + END_OF_LITERAL_SEARCH: + ; + } + + FREE_TOKENS; + return s; + } + + FREE_TOKENS; + return 0; + +PREMATURE: + FREE_TOKENS; + return beg; +} + + +static char* +parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) +{ + char *s = beg+2; + hctype_t name_first, name_char; + + if (STRICT_NAMES(p_state)) { + name_first = HCTYPE_NAME_FIRST; + name_char = HCTYPE_NAME_CHAR; + } + else { + name_first = name_char = HCTYPE_NOT_SPACE_GT; + } + + if (isHCTYPE(*s, name_first)) { + token_pos_t tagname; + tagname.beg = s; + s++; + while (s < end && isHCTYPE(*s, name_char)) + s++; + tagname.end = s; + + if (p_state->strict_end) { + while (isHSPACE(*s)) + s++; + } + else { + s = skip_until_gt(s, end); + } + if (s < end) { + if (*s == '>') { + s++; + /* a complete end tag has been recognized */ + report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self); + return s; + } + } + else { + return beg; + } + } + else if (!p_state->strict_comment) { + s = skip_until_gt(s, end); + if (s < end) { + token_pos_t token; + token.beg = beg + 2; + token.end = s; + s++; + report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); + return s; + } + else { + return beg; + } + } + return 0; +} + + +static char* +parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) +{ + char *s = beg + 2; /* skip '<?' */ + /* processing instruction */ + token_pos_t token_pos; + token_pos.beg = s; + + while (s < end) { + if (*s == '>') { + token_pos.end = s; + s++; + + if (p_state->xml_mode || p_state->xml_pic) { + /* XML processing instructions are ended by "?>" */ + if (s - beg < 4 || s[-2] != '?') + continue; + token_pos.end = s - 2; + } + + /* a complete processing instruction seen */ + report_event(p_state, E_PROCESS, beg, s, utf8, + &token_pos, 1, self); + return s; + } + s++; + } + return beg; /* could not find end */ +} + + +#ifdef USE_PFUNC +static char* +parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) +{ + return 0; +} + + + +#include "pfunc.h" /* declares the parsefunc[] */ +#endif /* USE_PFUNC */ + +static char* +parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) +{ + char *s = beg; + char *t = beg; + char *new_pos; + + while (!p_state->eof) { + /* + * At the start of this loop we will always be ready for eating text + * or a new tag. We will never be inside some tag. The 't' points + * to where we started and the 's' is advanced as we go. + */ + + while (p_state->literal_mode) { + char *l = p_state->literal_mode; + char *end_text; + + while (s < end && *s != '<') { + s++; + } + + if (s == end) { + s = t; + goto DONE; + } + + end_text = s; + s++; + + /* here we rely on '\0' termination of perl svpv buffers */ + if (*s == '/') { + s++; + while (*l && toLOWER(*s) == *l) { + s++; + l++; + } + + if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) { + /* matched it all */ + token_pos_t end_token; + end_token.beg = end_text + 2; + end_token.end = s; + + while (isHSPACE(*s)) + s++; + if (*s == '>') { + s++; + if (t != end_text) + report_event(p_state, E_TEXT, t, end_text, utf8, + 0, 0, self); + report_event(p_state, E_END, end_text, s, utf8, + &end_token, 1, self); + p_state->literal_mode = 0; + p_state->is_cdata = 0; + t = s; + } + } + } + } + +#ifdef MARKED_SECTION + while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) { + while (s < end && *s != ']') + s++; + if (*s == ']') { + char *end_text = s; + s++; + if (*s == ']' && *(s + 1) == '>') { + s += 2; + /* marked section end */ + if (t != end_text) + report_event(p_state, E_TEXT, t, end_text, utf8, + 0, 0, self); + report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); + t = s; + SvREFCNT_dec(av_pop(p_state->ms_stack)); + marked_section_update(p_state); + continue; + } + } + if (s == end) { + s = t; + goto DONE; + } + } +#endif + + /* first we try to match as much text as possible */ + while (s < end && *s != '<') { +#ifdef MARKED_SECTION + if (p_state->ms && *s == ']') { + char *end_text = s; + s++; + if (*s == ']') { + s++; + if (*s == '>') { + s++; + report_event(p_state, E_TEXT, t, end_text, utf8, + 0, 0, self); + report_event(p_state, E_NONE, end_text, s, utf8, + 0, 0, self); + t = s; + SvREFCNT_dec(av_pop(p_state->ms_stack)); + marked_section_update(p_state); + continue; + } + } + } +#endif + s++; + } + if (s != t) { + if (*s == '<') { + report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self); + t = s; + } + else { + s--; + if (isHSPACE(*s)) { + /* wait with white space at end */ + while (s >= t && isHSPACE(*s)) + s--; + } + else { + /* might be a chopped up entities/words */ + while (s >= t && !isHSPACE(*s)) + s--; + while (s >= t && isHSPACE(*s)) + s--; + } + s++; + if (s != t) + report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self); + break; + } + } + + if (end - s < 3) + break; + + /* next char is known to be '<' and pointed to by 't' as well as 's' */ + s++; + +#ifdef USE_PFUNC + new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self); +#else + if (isHNAME_FIRST(*s)) + new_pos = parse_start(p_state, t, end, utf8, self); + else if (*s == '/') + new_pos = parse_end(p_state, t, end, utf8, self); + else if (*s == '!') + new_pos = parse_decl(p_state, t, end, utf8, self); + else if (*s == '?') + new_pos = parse_process(p_state, t, end, utf8, self); + else + new_pos = 0; +#endif /* USE_PFUNC */ + + if (new_pos) { + if (new_pos == t) { + /* no progress, need more data to know what it is */ + s = t; + break; + } + t = s = new_pos; + } + + /* if we get out here then this was not a conforming tag, so + * treat it is plain text at the top of the loop again (we + * have already skipped past the "<"). + */ + } + +DONE: + return s; + +} + +EXTERN void +parse(pTHX_ + PSTATE* p_state, + SV* chunk, + SV* self) +{ + char *s, *beg, *end; + U32 utf8 = 0; + STRLEN len; + + if (!p_state->start_document) { + char dummy[1]; + report_event(p_state, E_START_DOCUMENT, dummy, dummy, 0, 0, 0, self); + p_state->start_document = 1; + } + + if (!chunk) { + /* eof */ + char empty[1]; + if (p_state->buf && SvOK(p_state->buf)) { + /* flush it */ + s = SvPV(p_state->buf, len); + end = s + len; + utf8 = SvUTF8(p_state->buf); + assert(len); + + while (s < end) { + if (p_state->literal_mode) { + if (strEQ(p_state->literal_mode, "plaintext") || + strEQ(p_state->literal_mode, "xmp") || + strEQ(p_state->literal_mode, "iframe") || + strEQ(p_state->literal_mode, "textarea")) + { + /* rest is considered text */ + break; + } + if (strEQ(p_state->literal_mode, "script") || + strEQ(p_state->literal_mode, "style")) + { + /* effectively make it an empty element */ + token_pos_t t; + char dummy; + t.beg = p_state->literal_mode; + t.end = p_state->literal_mode + strlen(p_state->literal_mode); + report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self); + } + else { + p_state->pending_end_tag = p_state->literal_mode; + } + p_state->literal_mode = 0; + s = parse_buf(aTHX_ p_state, s, end, utf8, self); + continue; + } + + if (!p_state->strict_comment && !p_state->no_dash_dash_comment_end && *s == '<') { + p_state->no_dash_dash_comment_end = 1; + s = parse_buf(aTHX_ p_state, s, end, utf8, self); + continue; + } + + if (!p_state->strict_comment && *s == '<') { + char *s1 = s + 1; + if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') { + /* some kind of unterminated markup. Report rest as as comment */ + token_pos_t token; + token.beg = s + 1; + token.end = end; + report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self); + s = end; + } + } + + break; + } + + if (s < end) { + /* report rest as text */ + report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self); + } + + SvREFCNT_dec(p_state->buf); + p_state->buf = 0; + } + if (p_state->pend_text && SvOK(p_state->pend_text)) + flush_pending_text(p_state, self); + + if (p_state->ignoring_element) { + /* document not balanced */ + SvREFCNT_dec(p_state->ignoring_element); + p_state->ignoring_element = 0; + } + report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self); + + /* reset state */ + p_state->offset = 0; + if (p_state->line) + p_state->line = 1; + p_state->column = 0; + p_state->start_document = 0; + p_state->literal_mode = 0; + p_state->is_cdata = 0; + return; + } + +#ifdef UNICODE_HTML_PARSER + if (p_state->utf8_mode) + sv_utf8_downgrade(chunk, 0); +#endif + + if (p_state->buf && SvOK(p_state->buf)) { + sv_catsv(p_state->buf, chunk); + beg = SvPV(p_state->buf, len); + utf8 = SvUTF8(p_state->buf); + } + else { + beg = SvPV(chunk, len); + utf8 = SvUTF8(chunk); + if (p_state->offset == 0 && DOWARN) { + /* Print warnings if we find unexpected Unicode BOM forms */ +#ifdef UNICODE_HTML_PARSER + if (p_state->argspec_entity_decode && + !(p_state->attr_encoded && p_state->argspec_entity_decode == ARG_ATTR) && + !p_state->utf8_mode && ( + (!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) || + (utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) || + (!utf8 && probably_utf8_chunk(aTHX_ beg, len)) + ) + ) + { + warn("Parsing of undecoded UTF-8 will give garbage when decoding entities"); + } + if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) { + warn("Parsing string decoded with wrong endianness"); + } +#endif + if (!utf8 && len >= 4 && + (strnEQ(beg, "\x00\x00\xFE\xFF", 4) || + strnEQ(beg, "\xFE\xFF\x00\x00", 4)) + ) + { + warn("Parsing of undecoded UTF-32"); + } + else if (!utf8 && len >= 2 && + (strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2)) + ) + { + warn("Parsing of undecoded UTF-16"); + } + } + } + + if (!len) + return; /* nothing to do */ + + end = beg + len; + s = parse_buf(aTHX_ p_state, beg, end, utf8, self); + + if (s == end || p_state->eof) { + if (p_state->buf) { + SvOK_off(p_state->buf); + } + } + else { + /* need to keep rest in buffer */ + if (p_state->buf) { + /* chop off some chars at the beginning */ + if (SvOK(p_state->buf)) { + sv_chop(p_state->buf, s); + } + else { + sv_setpvn(p_state->buf, s, end - s); + if (utf8) + SvUTF8_on(p_state->buf); + else + SvUTF8_off(p_state->buf); + } + } + else { + p_state->buf = newSVpv(s, end - s); + if (utf8) + SvUTF8_on(p_state->buf); + } + } + return; +} diff --git a/hparser.h b/hparser.h new file mode 100644 index 0000000..986e6c5 --- /dev/null +++ b/hparser.h @@ -0,0 +1,132 @@ +/* + * Copyright 1999-2009, Gisle Aas + * Copyright 1999-2000, Michael A. Chase + * + * This library is free software; you can redistribute it and/or + * modify it under the same terms as Perl itself. + */ + +/* + * Declare various structures and constants. The main thing + * is 'struct p_state' that contains various fields to represent + * the state of the parser. + */ + +#ifdef MARKED_SECTION + +enum marked_section_t { + MS_NONE = 0, + MS_INCLUDE, + MS_RCDATA, + MS_CDATA, + MS_IGNORE +}; + +#endif /* MARKED_SECTION */ + + +#define P_SIGNATURE 0x16091964 /* tag struct p_state for safer cast */ + +enum event_id { + E_DECLARATION = 0, + E_COMMENT, + E_START, + E_END, + E_TEXT, + E_PROCESS, + E_START_DOCUMENT, + E_END_DOCUMENT, + E_DEFAULT, + /**/ + EVENT_COUNT, + E_NONE /* used for reporting skipped text (non-events) */ +}; +typedef enum event_id event_id_t; + +/* must match event_id_t above */ +static char* event_id_str[] = { + "declaration", + "comment", + "start", + "end", + "text", + "process", + "start_document", + "end_document", + "default", +}; + +struct p_handler { + SV* cb; + SV* argspec; +}; + +struct p_state { + U32 signature; + + /* state */ + SV* buf; + STRLEN offset; + STRLEN line; + STRLEN column; + bool start_document; + bool parsing; + bool eof; + + /* special parsing modes */ + char* literal_mode; + bool is_cdata; + bool no_dash_dash_comment_end; + char *pending_end_tag; + + /* unbroken_text option needs a buffer of pending text */ + SV* pend_text; + bool pend_text_is_cdata; + STRLEN pend_text_offset; + STRLEN pend_text_line; + STRLEN pend_text_column; + + /* skipped text is accumulated here */ + SV* skipped_text; + +#ifdef MARKED_SECTION + /* marked section support */ + enum marked_section_t ms; + AV* ms_stack; + bool marked_sections; +#endif + + /* various boolean configuration attributes */ + bool strict_comment; + bool strict_names; + bool strict_end; + bool xml_mode; + bool unbroken_text; + bool attr_encoded; + bool case_sensitive; + bool closing_plaintext; + bool utf8_mode; + bool empty_element_tags; + bool xml_pic; + bool backquote; + + /* other configuration stuff */ + SV* bool_attr_val; + struct p_handler handlers[EVENT_COUNT]; + int argspec_entity_decode; + + /* filters */ + HV* report_tags; + HV* ignore_tags; + HV* ignore_elements; + + /* these are set when we are currently inside an element we want to ignore */ + SV* ignoring_element; + int ignore_depth; + + /* cache */ + HV* entity2char; /* %HTML::Entities::entity2char */ + SV* tmp; +}; +typedef struct p_state PSTATE; + diff --git a/lib/HTML/Entities.pm b/lib/HTML/Entities.pm new file mode 100644 index 0000000..ecd8e0d --- /dev/null +++ b/lib/HTML/Entities.pm @@ -0,0 +1,483 @@ +package HTML::Entities; + +=encoding utf8 + +=head1 NAME + +HTML::Entities - Encode or decode strings with HTML entities + +=head1 SYNOPSIS + + use HTML::Entities; + + $a = "Våre norske tegn bør æres"; + decode_entities($a); + encode_entities($a, "\200-\377"); + +For example, this: + + $input = "vis-à -vis Beyoncé's naïve\npapier-mâché résumé"; + print encode_entities($input), "\n" + +Prints this out: + + vis-à-vis Beyoncé's naïve + papier-mâché résumé + +=head1 DESCRIPTION + +This module deals with encoding and decoding of strings with HTML +character entities. The module provides the following functions: + +=over 4 + +=item decode_entities( $string, ... ) + +This routine replaces HTML entities found in the $string with the +corresponding Unicode character. Unrecognized entities are left alone. + +If multiple strings are provided as argument they are each decoded +separately and the same number of strings are returned. + +If called in void context the arguments are decoded in-place. + +This routine is exported by default. + +=item _decode_entities( $string, \%entity2char ) + +=item _decode_entities( $string, \%entity2char, $expand_prefix ) + +This will in-place replace HTML entities in $string. The %entity2char +hash must be provided. Named entities not found in the %entity2char +hash are left alone. Numeric entities are expanded unless their value +overflow. + +The keys in %entity2char are the entity names to be expanded and their +values are what they should expand into. The values do not have to be +single character strings. If a key has ";" as suffix, +then occurrences in $string are only expanded if properly terminated +with ";". Entities without ";" will be expanded regardless of how +they are terminated for compatibility with how common browsers treat +entities in the Latin-1 range. + +If $expand_prefix is TRUE then entities without trailing ";" in +%entity2char will even be expanded as a prefix of a longer +unrecognized name. The longest matching name in %entity2char will be +used. This is mainly present for compatibility with an MSIE +misfeature. + + $string = "foo bar"; + _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1); + print $string; # will print "foo bar" + +This routine is exported by default. + +=item encode_entities( $string ) + +=item encode_entities( $string, $unsafe_chars ) + +This routine replaces unsafe characters in $string with their entity +representation. A second argument can be given to specify which characters to +consider unsafe. The unsafe characters is specified using the regular +expression character class syntax (what you find within brackets in regular +expressions). + +The default set of characters to encode are control chars, high-bit chars, and +the C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> characters. But this, +for example, would encode I<just> the C<< < >>, C<< & >>, C<< > >>, and C<< " +>> characters: + + $encoded = encode_entities($input, '<>&"'); + +and this would only encode non-plain ascii: + + $encoded = encode_entities($input, '^\n\x20-\x25\x27-\x7e'); + +This routine is exported by default. + +=item encode_entities_numeric( $string ) + +=item encode_entities_numeric( $string, $unsafe_chars ) + +This routine works just like encode_entities, except that the replacement +entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For +example, C<encode_entities("r\xF4le")> returns "rôle", but +C<encode_entities_numeric("r\xF4le")> returns "rôle". + +This routine is I<not> exported by default. But you can always +export it with C<use HTML::Entities qw(encode_entities_numeric);> +or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);> + +=back + +All these routines modify the string passed as the first argument, if +called in a void context. In scalar and array contexts, the encoded or +decoded string is returned (without changing the input string). + +If you prefer not to import these routines into your namespace, you can +call them as: + + use HTML::Entities (); + $decoded = HTML::Entities::decode($a); + $encoded = HTML::Entities::encode($a); + $encoded = HTML::Entities::encode_numeric($a); + +The module can also export the %char2entity and the %entity2char +hashes, which contain the mapping from all characters to the +corresponding entities (and vice versa, respectively). + +=head1 COPYRIGHT + +Copyright 1995-2006 Gisle Aas. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + +use strict; +use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); +use vars qw(%entity2char %char2entity); + +require 5.004; +require Exporter; +@ISA = qw(Exporter); + +@EXPORT = qw(encode_entities decode_entities _decode_entities); +@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); + +$VERSION = "3.69"; +sub Version { $VERSION; } + +require HTML::Parser; # for fast XS implemented decode_entities + + +%entity2char = ( + # Some normal chars that have special meaning in SGML context + amp => '&', # ampersand +'gt' => '>', # greater than +'lt' => '<', # less than + quot => '"', # double quote + apos => "'", # single quote + + # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML + AElig => chr(198), # capital AE diphthong (ligature) + Aacute => chr(193), # capital A, acute accent + Acirc => chr(194), # capital A, circumflex accent + Agrave => chr(192), # capital A, grave accent + Aring => chr(197), # capital A, ring + Atilde => chr(195), # capital A, tilde + Auml => chr(196), # capital A, dieresis or umlaut mark + Ccedil => chr(199), # capital C, cedilla + ETH => chr(208), # capital Eth, Icelandic + Eacute => chr(201), # capital E, acute accent + Ecirc => chr(202), # capital E, circumflex accent + Egrave => chr(200), # capital E, grave accent + Euml => chr(203), # capital E, dieresis or umlaut mark + Iacute => chr(205), # capital I, acute accent + Icirc => chr(206), # capital I, circumflex accent + Igrave => chr(204), # capital I, grave accent + Iuml => chr(207), # capital I, dieresis or umlaut mark + Ntilde => chr(209), # capital N, tilde + Oacute => chr(211), # capital O, acute accent + Ocirc => chr(212), # capital O, circumflex accent + Ograve => chr(210), # capital O, grave accent + Oslash => chr(216), # capital O, slash + Otilde => chr(213), # capital O, tilde + Ouml => chr(214), # capital O, dieresis or umlaut mark + THORN => chr(222), # capital THORN, Icelandic + Uacute => chr(218), # capital U, acute accent + Ucirc => chr(219), # capital U, circumflex accent + Ugrave => chr(217), # capital U, grave accent + Uuml => chr(220), # capital U, dieresis or umlaut mark + Yacute => chr(221), # capital Y, acute accent + aacute => chr(225), # small a, acute accent + acirc => chr(226), # small a, circumflex accent + aelig => chr(230), # small ae diphthong (ligature) + agrave => chr(224), # small a, grave accent + aring => chr(229), # small a, ring + atilde => chr(227), # small a, tilde + auml => chr(228), # small a, dieresis or umlaut mark + ccedil => chr(231), # small c, cedilla + eacute => chr(233), # small e, acute accent + ecirc => chr(234), # small e, circumflex accent + egrave => chr(232), # small e, grave accent + eth => chr(240), # small eth, Icelandic + euml => chr(235), # small e, dieresis or umlaut mark + iacute => chr(237), # small i, acute accent + icirc => chr(238), # small i, circumflex accent + igrave => chr(236), # small i, grave accent + iuml => chr(239), # small i, dieresis or umlaut mark + ntilde => chr(241), # small n, tilde + oacute => chr(243), # small o, acute accent + ocirc => chr(244), # small o, circumflex accent + ograve => chr(242), # small o, grave accent + oslash => chr(248), # small o, slash + otilde => chr(245), # small o, tilde + ouml => chr(246), # small o, dieresis or umlaut mark + szlig => chr(223), # small sharp s, German (sz ligature) + thorn => chr(254), # small thorn, Icelandic + uacute => chr(250), # small u, acute accent + ucirc => chr(251), # small u, circumflex accent + ugrave => chr(249), # small u, grave accent + uuml => chr(252), # small u, dieresis or umlaut mark + yacute => chr(253), # small y, acute accent + yuml => chr(255), # small y, dieresis or umlaut mark + + # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) + copy => chr(169), # copyright sign + reg => chr(174), # registered sign + nbsp => chr(160), # non breaking space + + # Additional ISO-8859/1 entities listed in rfc1866 (section 14) + iexcl => chr(161), + cent => chr(162), + pound => chr(163), + curren => chr(164), + yen => chr(165), + brvbar => chr(166), + sect => chr(167), + uml => chr(168), + ordf => chr(170), + laquo => chr(171), +'not' => chr(172), # not is a keyword in perl + shy => chr(173), + macr => chr(175), + deg => chr(176), + plusmn => chr(177), + sup1 => chr(185), + sup2 => chr(178), + sup3 => chr(179), + acute => chr(180), + micro => chr(181), + para => chr(182), + middot => chr(183), + cedil => chr(184), + ordm => chr(186), + raquo => chr(187), + frac14 => chr(188), + frac12 => chr(189), + frac34 => chr(190), + iquest => chr(191), +'times' => chr(215), # times is a keyword in perl + divide => chr(247), + + ( $] > 5.007 ? ( + 'OElig;' => chr(338), + 'oelig;' => chr(339), + 'Scaron;' => chr(352), + 'scaron;' => chr(353), + 'Yuml;' => chr(376), + 'fnof;' => chr(402), + 'circ;' => chr(710), + 'tilde;' => chr(732), + 'Alpha;' => chr(913), + 'Beta;' => chr(914), + 'Gamma;' => chr(915), + 'Delta;' => chr(916), + 'Epsilon;' => chr(917), + 'Zeta;' => chr(918), + 'Eta;' => chr(919), + 'Theta;' => chr(920), + 'Iota;' => chr(921), + 'Kappa;' => chr(922), + 'Lambda;' => chr(923), + 'Mu;' => chr(924), + 'Nu;' => chr(925), + 'Xi;' => chr(926), + 'Omicron;' => chr(927), + 'Pi;' => chr(928), + 'Rho;' => chr(929), + 'Sigma;' => chr(931), + 'Tau;' => chr(932), + 'Upsilon;' => chr(933), + 'Phi;' => chr(934), + 'Chi;' => chr(935), + 'Psi;' => chr(936), + 'Omega;' => chr(937), + 'alpha;' => chr(945), + 'beta;' => chr(946), + 'gamma;' => chr(947), + 'delta;' => chr(948), + 'epsilon;' => chr(949), + 'zeta;' => chr(950), + 'eta;' => chr(951), + 'theta;' => chr(952), + 'iota;' => chr(953), + 'kappa;' => chr(954), + 'lambda;' => chr(955), + 'mu;' => chr(956), + 'nu;' => chr(957), + 'xi;' => chr(958), + 'omicron;' => chr(959), + 'pi;' => chr(960), + 'rho;' => chr(961), + 'sigmaf;' => chr(962), + 'sigma;' => chr(963), + 'tau;' => chr(964), + 'upsilon;' => chr(965), + 'phi;' => chr(966), + 'chi;' => chr(967), + 'psi;' => chr(968), + 'omega;' => chr(969), + 'thetasym;' => chr(977), + 'upsih;' => chr(978), + 'piv;' => chr(982), + 'ensp;' => chr(8194), + 'emsp;' => chr(8195), + 'thinsp;' => chr(8201), + 'zwnj;' => chr(8204), + 'zwj;' => chr(8205), + 'lrm;' => chr(8206), + 'rlm;' => chr(8207), + 'ndash;' => chr(8211), + 'mdash;' => chr(8212), + 'lsquo;' => chr(8216), + 'rsquo;' => chr(8217), + 'sbquo;' => chr(8218), + 'ldquo;' => chr(8220), + 'rdquo;' => chr(8221), + 'bdquo;' => chr(8222), + 'dagger;' => chr(8224), + 'Dagger;' => chr(8225), + 'bull;' => chr(8226), + 'hellip;' => chr(8230), + 'permil;' => chr(8240), + 'prime;' => chr(8242), + 'Prime;' => chr(8243), + 'lsaquo;' => chr(8249), + 'rsaquo;' => chr(8250), + 'oline;' => chr(8254), + 'frasl;' => chr(8260), + 'euro;' => chr(8364), + 'image;' => chr(8465), + 'weierp;' => chr(8472), + 'real;' => chr(8476), + 'trade;' => chr(8482), + 'alefsym;' => chr(8501), + 'larr;' => chr(8592), + 'uarr;' => chr(8593), + 'rarr;' => chr(8594), + 'darr;' => chr(8595), + 'harr;' => chr(8596), + 'crarr;' => chr(8629), + 'lArr;' => chr(8656), + 'uArr;' => chr(8657), + 'rArr;' => chr(8658), + 'dArr;' => chr(8659), + 'hArr;' => chr(8660), + 'forall;' => chr(8704), + 'part;' => chr(8706), + 'exist;' => chr(8707), + 'empty;' => chr(8709), + 'nabla;' => chr(8711), + 'isin;' => chr(8712), + 'notin;' => chr(8713), + 'ni;' => chr(8715), + 'prod;' => chr(8719), + 'sum;' => chr(8721), + 'minus;' => chr(8722), + 'lowast;' => chr(8727), + 'radic;' => chr(8730), + 'prop;' => chr(8733), + 'infin;' => chr(8734), + 'ang;' => chr(8736), + 'and;' => chr(8743), + 'or;' => chr(8744), + 'cap;' => chr(8745), + 'cup;' => chr(8746), + 'int;' => chr(8747), + 'there4;' => chr(8756), + 'sim;' => chr(8764), + 'cong;' => chr(8773), + 'asymp;' => chr(8776), + 'ne;' => chr(8800), + 'equiv;' => chr(8801), + 'le;' => chr(8804), + 'ge;' => chr(8805), + 'sub;' => chr(8834), + 'sup;' => chr(8835), + 'nsub;' => chr(8836), + 'sube;' => chr(8838), + 'supe;' => chr(8839), + 'oplus;' => chr(8853), + 'otimes;' => chr(8855), + 'perp;' => chr(8869), + 'sdot;' => chr(8901), + 'lceil;' => chr(8968), + 'rceil;' => chr(8969), + 'lfloor;' => chr(8970), + 'rfloor;' => chr(8971), + 'lang;' => chr(9001), + 'rang;' => chr(9002), + 'loz;' => chr(9674), + 'spades;' => chr(9824), + 'clubs;' => chr(9827), + 'hearts;' => chr(9829), + 'diams;' => chr(9830), + ) : ()) +); + + +# Make the opposite mapping +while (my($entity, $char) = each(%entity2char)) { + $entity =~ s/;\z//; + $char2entity{$char} = "&$entity;"; +} +delete $char2entity{"'"}; # only one-way decoding + +# Fill in missing entities +for (0 .. 255) { + next if exists $char2entity{chr($_)}; + $char2entity{chr($_)} = "&#$_;"; +} + +my %subst; # compiled encoding regexps + +sub encode_entities +{ + return undef unless defined $_[0]; + my $ref; + if (defined wantarray) { + my $x = $_[0]; + $ref = \$x; # copy + } else { + $ref = \$_[0]; # modify in-place + } + if (defined $_[1] and length $_[1]) { + unless (exists $subst{$_[1]}) { + # Because we can't compile regex we fake it with a cached sub + my $chars = $_[1]; + $chars =~ s,(?<!\\)([]/]),\\$1,g; + $chars =~ s,(?<!\\)\\\z,\\\\,; + my $code = "sub {\$_[0] =~ s/([$chars])/\$char2entity{\$1} || num_entity(\$1)/ge; }"; + $subst{$_[1]} = eval $code; + die( $@ . " while trying to turn range: \"$_[1]\"\n " + . "into code: $code\n " + ) if $@; + } + &{$subst{$_[1]}}($$ref); + } else { + # Encode control chars, high bit chars and '<', '&', '>', ''' and '"' + $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge; + } + $$ref; +} + +sub encode_entities_numeric { + local %char2entity; + return &encode_entities; # a goto &encode_entities wouldn't work +} + + +sub num_entity { + sprintf "&#x%X;", ord($_[0]); +} + +# Set up aliases +*encode = \&encode_entities; +*encode_numeric = \&encode_entities_numeric; +*encode_numerically = \&encode_entities_numeric; +*decode = \&decode_entities; + +1; diff --git a/lib/HTML/Filter.pm b/lib/HTML/Filter.pm new file mode 100644 index 0000000..c5aa16e --- /dev/null +++ b/lib/HTML/Filter.pm @@ -0,0 +1,112 @@ +package HTML::Filter; + +use strict; +use vars qw(@ISA $VERSION); + +require HTML::Parser; +@ISA=qw(HTML::Parser); + +$VERSION = "3.57"; + +sub declaration { $_[0]->output("<!$_[1]>") } +sub process { $_[0]->output($_[2]) } +sub comment { $_[0]->output("<!--$_[1]-->") } +sub start { $_[0]->output($_[4]) } +sub end { $_[0]->output($_[2]) } +sub text { $_[0]->output($_[1]) } + +sub output { print $_[1] } + +1; + +__END__ + +=head1 NAME + +HTML::Filter - Filter HTML text through the parser + +=head1 NOTE + +B<This module is deprecated.> The C<HTML::Parser> now provides the +functionally of C<HTML::Filter> much more efficiently with the the +C<default> handler. + +=head1 SYNOPSIS + + require HTML::Filter; + $p = HTML::Filter->new->parse_file("index.html"); + +=head1 DESCRIPTION + +C<HTML::Filter> is an HTML parser that by default prints the +original text of each HTML element (a slow version of cat(1) basically). +The callback methods may be overridden to modify the filtering for some +HTML elements and you can override output() method which is called to +print the HTML text. + +C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that +the document should be given to the parser by calling the $p->parse() +or $p->parse_file() methods. + +=head1 EXAMPLES + +The first example is a filter that will remove all comments from an +HTML file. This is achieved by simply overriding the comment method +to do nothing. + + package CommentStripper; + require HTML::Filter; + @ISA=qw(HTML::Filter); + sub comment { } # ignore comments + +The second example shows a filter that will remove any E<lt>TABLE>s +found in the HTML file. We specialize the start() and end() methods +to count table tags and then make output not happen when inside a +table. + + package TableStripper; + require HTML::Filter; + @ISA=qw(HTML::Filter); + sub start + { + my $self = shift; + $self->{table_seen}++ if $_[0] eq "table"; + $self->SUPER::start(@_); + } + + sub end + { + my $self = shift; + $self->SUPER::end(@_); + $self->{table_seen}-- if $_[0] eq "table"; + } + + sub output + { + my $self = shift; + unless ($self->{table_seen}) { + $self->SUPER::output(@_); + } + } + +If you want to collect the parsed text internally you might want to do +something like this: + + package FilterIntoString; + require HTML::Filter; + @ISA=qw(HTML::Filter); + sub output { push(@{$_[0]->{fhtml}}, $_[1]) } + sub filtered_html { join("", @{$_[0]->{fhtml}}) } + +=head1 SEE ALSO + +L<HTML::Parser> + +=head1 COPYRIGHT + +Copyright 1997-1999 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut diff --git a/lib/HTML/HeadParser.pm b/lib/HTML/HeadParser.pm new file mode 100644 index 0000000..28e9cac --- /dev/null +++ b/lib/HTML/HeadParser.pm @@ -0,0 +1,315 @@ +package HTML::HeadParser; + +=head1 NAME + +HTML::HeadParser - Parse <HEAD> section of a HTML document + +=head1 SYNOPSIS + + require HTML::HeadParser; + $p = HTML::HeadParser->new; + $p->parse($text) and print "not finished"; + + $p->header('Title') # to access <title>....</title> + $p->header('Content-Base') # to access <base href="http://..."> + $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> + $p->header('X-Meta-Author') # to access <meta name="author" content="..."> + $p->header('X-Meta-Charset') # to access <meta charset="..."> + +=head1 DESCRIPTION + +The C<HTML::HeadParser> is a specialized (and lightweight) +C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> +section of an HTML document. The parse() method +will return a FALSE value as soon as some E<lt>BODY> element or body +text are found, and should not be called again after this. + +Note that the C<HTML::HeadParser> might get confused if raw undecoded +UTF-8 is passed to the parse() method. Make sure the strings are +properly decoded before passing them on. + +The C<HTML::HeadParser> keeps a reference to a header object, and the +parser will update this header object as the various elements of the +E<lt>HEAD> section of the HTML document are recognized. The following +header fields are affected: + +=over 4 + +=item Content-Base: + +The I<Content-Base> header is initialized from the E<lt>base +href="..."> element. + +=item Title: + +The I<Title> header is initialized from the E<lt>title>...E<lt>/title> +element. + +=item Isindex: + +The I<Isindex> header will be added if there is a E<lt>isindex> +element in the E<lt>head>. The header value is initialized from the +I<prompt> attribute if it is present. If no I<prompt> attribute is +given it will have '?' as the value. + +=item X-Meta-Foo: + +All E<lt>meta> elements containing a C<name> attribute will result in +headers using the prefix C<X-Meta-> appended with the value of the +C<name> attribute as the name of the header, and the value of the +C<content> attribute as the pushed header value. + +E<lt>meta> elements containing a C<http-equiv> attribute will result +in headers as in above, but without the C<X-Meta-> prefix in the +header name. + +E<lt>meta> elements containing a C<charset> attribute will result in +an C<X-Meta-Charset> header, using the value of the C<charset> +attribute as the pushed header value. + +The ':' character can't be represented in header field names, so +if the meta element contains this char it's substituted with '-' +before forming the field name. + +=back + +=head1 METHODS + +The following methods (in addition to those provided by the +superclass) are available: + +=over 4 + +=cut + + +require HTML::Parser; +@ISA = qw(HTML::Parser); + +use HTML::Entities (); + +use strict; +use vars qw($VERSION $DEBUG); +#$DEBUG = 1; +$VERSION = "3.71"; + +=item $hp = HTML::HeadParser->new + +=item $hp = HTML::HeadParser->new( $header ) + +The object constructor. The optional $header argument should be a +reference to an object that implement the header() and push_header() +methods as defined by the C<HTTP::Headers> class. Normally it will be +of some class that is a or delegates to the C<HTTP::Headers> class. + +If no $header is given C<HTML::HeadParser> will create an +C<HTTP::Headers> object by itself (initially empty). + +=cut + +sub new +{ + my($class, $header) = @_; + unless ($header) { + require HTTP::Headers; + $header = HTTP::Headers->new; + } + + my $self = $class->SUPER::new(api_version => 3, + start_h => ["start", "self,tagname,attr"], + end_h => ["end", "self,tagname"], + text_h => ["text", "self,text"], + ignore_elements => [qw(script style)], + ); + $self->{'header'} = $header; + $self->{'tag'} = ''; # name of active element that takes textual content + $self->{'text'} = ''; # the accumulated text associated with the element + $self; +} + +=item $hp->header; + +Returns a reference to the header object. + +=item $hp->header( $key ) + +Returns a header value. It is just a shorter way to write +C<$hp-E<gt>header-E<gt>header($key)>. + +=cut + +sub header +{ + my $self = shift; + return $self->{'header'} unless @_; + $self->{'header'}->header(@_); +} + +sub as_string # legacy +{ + my $self = shift; + $self->{'header'}->as_string; +} + +sub flush_text # internal +{ + my $self = shift; + my $tag = $self->{'tag'}; + my $text = $self->{'text'}; + $text =~ s/^\s+//; + $text =~ s/\s+$//; + $text =~ s/\s+/ /g; + print "FLUSH $tag => '$text'\n" if $DEBUG; + if ($tag eq 'title') { + my $decoded; + $decoded = utf8::decode($text) if $self->utf8_mode && defined &utf8::decode; + HTML::Entities::decode($text); + utf8::encode($text) if $decoded; + $self->{'header'}->push_header(Title => $text); + } + $self->{'tag'} = $self->{'text'} = ''; +} + +# This is an quote from the HTML3.2 DTD which shows which elements +# that might be present in a <HEAD>...</HEAD>. Also note that the +# <HEAD> tags themselves might be missing: +# +# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? & +# SCRIPT* & META* & LINK*"> +# +# <!ELEMENT HEAD O O (%head.content)> +# +# From HTML 4.01: +# +# <!ENTITY % head.misc "SCRIPT|STYLE|META|LINK|OBJECT"> +# <!ENTITY % head.content "TITLE & BASE?"> +# <!ELEMENT HEAD O O (%head.content;) +(%head.misc;)> +# +# From HTML 5 as of WD-html5-20090825: +# +# One or more elements of metadata content, [...] +# => base, command, link, meta, noscript, script, style, title + +sub start +{ + my($self, $tag, $attr) = @_; # $attr is reference to a HASH + print "START[$tag]\n" if $DEBUG; + $self->flush_text if $self->{'tag'}; + if ($tag eq 'meta') { + my $key = $attr->{'http-equiv'}; + if (!defined($key) || !length($key)) { + if ($attr->{name}) { + $key = "X-Meta-\u$attr->{name}"; + } elsif ($attr->{charset}) { # HTML 5 <meta charset="..."> + $key = "X-Meta-Charset"; + $self->{header}->push_header($key => $attr->{charset}); + return; + } else { + return; + } + } + $key =~ s/:/-/g; + $self->{'header'}->push_header($key => $attr->{content}); + } elsif ($tag eq 'base') { + return unless exists $attr->{href}; + (my $base = $attr->{href}) =~ s/^\s+//; $base =~ s/\s+$//; # HTML5 + $self->{'header'}->push_header('Content-Base' => $base); + } elsif ($tag eq 'isindex') { + # This is a non-standard header. Perhaps we should just ignore + # this element + $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?'); + } elsif ($tag =~ /^(?:title|noscript|object|command)$/) { + # Just remember tag. Initialize header when we see the end tag. + $self->{'tag'} = $tag; + } elsif ($tag eq 'link') { + return unless exists $attr->{href}; + # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> + my $href = delete($attr->{href}); + $href =~ s/^\s+//; $href =~ s/\s+$//; # HTML5 + my $h_val = "<$href>"; + for (sort keys %{$attr}) { + next if $_ eq "/"; # XHTML junk + $h_val .= qq(; $_="$attr->{$_}"); + } + $self->{'header'}->push_header(Link => $h_val); + } elsif ($tag eq 'head' || $tag eq 'html') { + # ignore + } else { + # stop parsing + $self->eof; + } +} + +sub end +{ + my($self, $tag) = @_; + print "END[$tag]\n" if $DEBUG; + $self->flush_text if $self->{'tag'}; + $self->eof if $tag eq 'head'; +} + +sub text +{ + my($self, $text) = @_; + print "TEXT[$text]\n" if $DEBUG; + unless ($self->{first_chunk}) { + # drop Unicode BOM if found + if ($self->utf8_mode) { + $text =~ s/^\xEF\xBB\xBF//; + } + else { + $text =~ s/^\x{FEFF}//; + } + $self->{first_chunk}++; + } + my $tag = $self->{tag}; + if (!$tag && $text =~ /\S/) { + # Normal text means start of body + $self->eof; + return; + } + return if $tag ne 'title'; + $self->{'text'} .= $text; +} + +BEGIN { + *utf8_mode = sub { 1 } unless HTML::Entities::UNICODE_SUPPORT; +} + +1; + +__END__ + +=back + +=head1 EXAMPLE + + $h = HTTP::Headers->new; + $p = HTML::HeadParser->new($h); + $p->parse(<<EOT); + <title>Stupid example</title> + <base href="http://www.linpro.no/lwp/"> + Normal text starts here. + EOT + undef $p; + print $h->title; # should print "Stupid example" + +=head1 SEE ALSO + +L<HTML::Parser>, L<HTTP::Headers> + +The C<HTTP::Headers> class is distributed as part of the +I<libwww-perl> package. If you don't have that distribution installed +you need to provide the $header argument to the C<HTML::HeadParser> +constructor with your own object that implements the documented +protocol. + +=head1 COPYRIGHT + +Copyright 1996-2001 Gisle Aas. All rights reserved. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + diff --git a/lib/HTML/LinkExtor.pm b/lib/HTML/LinkExtor.pm new file mode 100644 index 0000000..c2f08c6 --- /dev/null +++ b/lib/HTML/LinkExtor.pm @@ -0,0 +1,185 @@ +package HTML::LinkExtor; + +require HTML::Parser; +@ISA = qw(HTML::Parser); +$VERSION = "3.69"; + +=head1 NAME + +HTML::LinkExtor - Extract links from an HTML document + +=head1 SYNOPSIS + + require HTML::LinkExtor; + $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/"); + sub cb { + my($tag, %links) = @_; + print "$tag @{[%links]}\n"; + } + $p->parse_file("index.html"); + +=head1 DESCRIPTION + +I<HTML::LinkExtor> is an HTML parser that extracts links from an +HTML document. The I<HTML::LinkExtor> is a subclass of +I<HTML::Parser>. This means that the document should be given to the +parser by calling the $p->parse() or $p->parse_file() methods. + +=cut + +use strict; +use HTML::Tagset (); + +# legacy (some applications grabs this hash directly) +use vars qw(%LINK_ELEMENT); +*LINK_ELEMENT = \%HTML::Tagset::linkElements; + +=over 4 + +=item $p = HTML::LinkExtor->new + +=item $p = HTML::LinkExtor->new( $callback ) + +=item $p = HTML::LinkExtor->new( $callback, $base ) + +The constructor takes two optional arguments. The first is a reference +to a callback routine. It will be called as links are found. If a +callback is not provided, then links are just accumulated internally +and can be retrieved by calling the $p->links() method. + +The $base argument is an optional base URL used to absolutize all URLs found. +You need to have the I<URI> module installed if you provide $base. + +The callback is called with the lowercase tag name as first argument, +and then all link attributes as separate key/value pairs. All +non-link attributes are removed. + +=cut + +sub new +{ + my($class, $cb, $base) = @_; + my $self = $class->SUPER::new( + start_h => ["_start_tag", "self,tagname,attr"], + report_tags => [keys %HTML::Tagset::linkElements], + ); + $self->{extractlink_cb} = $cb; + if ($base) { + require URI; + $self->{extractlink_base} = URI->new($base); + } + $self; +} + +sub _start_tag +{ + my($self, $tag, $attr) = @_; + + my $base = $self->{extractlink_base}; + my $links = $HTML::Tagset::linkElements{$tag}; + $links = [$links] unless ref $links; + + my @links; + my $a; + for $a (@$links) { + next unless exists $attr->{$a}; + (my $link = $attr->{$a}) =~ s/^\s+//; $link =~ s/\s+$//; # HTML5 + push(@links, $a, $base ? URI->new($link, $base)->abs($base) : $link); + } + return unless @links; + $self->_found_link($tag, @links); +} + +sub _found_link +{ + my $self = shift; + my $cb = $self->{extractlink_cb}; + if ($cb) { + &$cb(@_); + } else { + push(@{$self->{'links'}}, [@_]); + } +} + +=item $p->links + +Returns a list of all links found in the document. The returned +values will be anonymous arrays with the following elements: + + [$tag, $attr => $url1, $attr2 => $url2,...] + +The $p->links method will also truncate the internal link list. This +means that if the method is called twice without any parsing +between them the second call will return an empty list. + +Also note that $p->links will always be empty if a callback routine +was provided when the I<HTML::LinkExtor> was created. + +=cut + +sub links +{ + my $self = shift; + exists($self->{'links'}) ? @{delete $self->{'links'}} : (); +} + +# We override the parse_file() method so that we can clear the links +# before we start a new file. +sub parse_file +{ + my $self = shift; + delete $self->{'links'}; + $self->SUPER::parse_file(@_); +} + +=back + +=head1 EXAMPLE + +This is an example showing how you can extract links from a document +received using LWP: + + use LWP::UserAgent; + use HTML::LinkExtor; + use URI::URL; + + $url = "http://www.perl.org/"; # for instance + $ua = LWP::UserAgent->new; + + # Set up a callback that collect image links + my @imgs = (); + sub callback { + my($tag, %attr) = @_; + return if $tag ne 'img'; # we only look closer at <img ...> + push(@imgs, values %attr); + } + + # Make the parser. Unfortunately, we don't know the base yet + # (it might be different from $url) + $p = HTML::LinkExtor->new(\&callback); + + # Request document and parse it as it arrives + $res = $ua->request(HTTP::Request->new(GET => $url), + sub {$p->parse($_[0])}); + + # Expand all image URLs to absolute ones + my $base = $res->base; + @imgs = map { $_ = url($_, $base)->abs; } @imgs; + + # Print them out + print join("\n", @imgs), "\n"; + +=head1 SEE ALSO + +L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL> + +=head1 COPYRIGHT + +Copyright 1996-2001 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut + +1; diff --git a/lib/HTML/PullParser.pm b/lib/HTML/PullParser.pm new file mode 100644 index 0000000..3083379 --- /dev/null +++ b/lib/HTML/PullParser.pm @@ -0,0 +1,209 @@ +package HTML::PullParser; + +require HTML::Parser; +@ISA=qw(HTML::Parser); +$VERSION = "3.57"; + +use strict; +use Carp (); + +sub new +{ + my($class, %cnf) = @_; + + # Construct argspecs for the various events + my %argspec; + for (qw(start end text declaration comment process default)) { + my $tmp = delete $cnf{$_}; + next unless defined $tmp; + $argspec{$_} = $tmp; + } + Carp::croak("Info not collected for any events") + unless %argspec; + + my $file = delete $cnf{file}; + my $doc = delete $cnf{doc}; + Carp::croak("Can't parse from both 'doc' and 'file' at the same time") + if defined($file) && defined($doc); + Carp::croak("No 'doc' or 'file' given to parse from") + unless defined($file) || defined($doc); + + # Create object + $cnf{api_version} = 3; + my $self = $class->SUPER::new(%cnf); + + my $accum = $self->{pullparser_accum} = []; + while (my($event, $argspec) = each %argspec) { + $self->SUPER::handler($event => $accum, $argspec); + } + + if (defined $doc) { + $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc; + $self->{pullparser_str_pos} = 0; + } + else { + if (!ref($file) && ref(\$file) ne "GLOB") { + require IO::File; + $file = IO::File->new($file, "r") || return; + } + + $self->{pullparser_file} = $file; + } + $self; +} + + +sub handler +{ + Carp::croak("Can't set handlers for HTML::PullParser"); +} + + +sub get_token +{ + my $self = shift; + while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) { + if (my $f = $self->{pullparser_file}) { + # must try to parse more from the file + my $buf; + if (read($f, $buf, 512)) { + $self->parse($buf); + } else { + $self->eof; + $self->{pullparser_eof}++; + delete $self->{pullparser_file}; + } + } + elsif (my $sref = $self->{pullparser_str_ref}) { + # must try to parse more from the scalar + my $pos = $self->{pullparser_str_pos}; + my $chunk = substr($$sref, $pos, 512); + $self->parse($chunk); + $pos += length($chunk); + if ($pos < length($$sref)) { + $self->{pullparser_str_pos} = $pos; + } + else { + $self->eof; + $self->{pullparser_eof}++; + delete $self->{pullparser_str_ref}; + delete $self->{pullparser_str_pos}; + } + } + else { + die; + } + } + shift @{$self->{pullparser_accum}}; +} + + +sub unget_token +{ + my $self = shift; + unshift @{$self->{pullparser_accum}}, @_; + $self; +} + +1; + + +__END__ + +=head1 NAME + +HTML::PullParser - Alternative HTML::Parser interface + +=head1 SYNOPSIS + + use HTML::PullParser; + + $p = HTML::PullParser->new(file => "index.html", + start => 'event, tagname, @attr', + end => 'event, tagname', + ignore_elements => [qw(script style)], + ) || die "Can't open: $!"; + while (my $token = $p->get_token) { + #...do something with $token + } + +=head1 DESCRIPTION + +The HTML::PullParser is an alternative interface to the HTML::Parser class. +It basically turns the HTML::Parser inside out. You associate a file +(or any IO::Handle object or string) with the parser at construction time and +then repeatedly call $parser->get_token to obtain the tags and text +found in the parsed document. + +The following methods are provided: + +=over 4 + +=item $p = HTML::PullParser->new( file => $file, %options ) + +=item $p = HTML::PullParser->new( doc => \$doc, %options ) + +A C<HTML::PullParser> can be made to parse from either a file or a +literal document based on whether the C<file> or C<doc> option is +passed to the parser's constructor. + +The C<file> passed in can either be a file name or a file handle +object. If a file name is passed, and it can't be opened for reading, +then the constructor will return an undefined value and $! will tell +you why it failed. Otherwise the argument is taken to be some object +that the C<HTML::PullParser> can read() from when it needs more data. +The stream will be read() until EOF, but not closed. + +A C<doc> can be passed plain or as a reference +to a scalar. If a reference is passed then the value of this scalar +should not be changed before all tokens have been extracted. + +Next the information to be returned for the different token types must +be set up. This is done by simply associating an argspec (as defined +in L<HTML::Parser>) with the events you have an interest in. For +instance, if you want C<start> tokens to be reported as the string +C<'S'> followed by the tagname and the attributes you might pass an +C<start>-option like this: + + $p = HTML::PullParser->new( + doc => $document_to_parse, + start => '"S", tagname, @attr', + end => '"E", tagname', + ); + +At last other C<HTML::Parser> options, like C<ignore_tags>, and +C<unbroken_text>, can be passed in. Note that you should not use the +I<event>_h options to set up parser handlers. That would confuse the +inner logic of C<HTML::PullParser>. + +=item $token = $p->get_token + +This method will return the next I<token> found in the HTML document, +or C<undef> at the end of the document. The token is returned as an +array reference. The content of this array match the argspec set up +during C<HTML::PullParser> construction. + +=item $p->unget_token( @tokens ) + +If you find out you have read too many tokens you can push them back, +so that they are returned again the next time $p->get_token is called. + +=back + +=head1 EXAMPLES + +The 'eg/hform' script shows how we might parse the form section of +HTML::Documents using HTML::PullParser. + +=head1 SEE ALSO + +L<HTML::Parser>, L<HTML::TokeParser> + +=head1 COPYRIGHT + +Copyright 1998-2001 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut diff --git a/lib/HTML/TokeParser.pm b/lib/HTML/TokeParser.pm new file mode 100644 index 0000000..959b96f --- /dev/null +++ b/lib/HTML/TokeParser.pm @@ -0,0 +1,371 @@ +package HTML::TokeParser; + +require HTML::PullParser; +@ISA=qw(HTML::PullParser); +$VERSION = "3.69"; + +use strict; +use Carp (); +use HTML::Entities qw(decode_entities); +use HTML::Tagset (); + +my %ARGS = +( + start => "'S',tagname,attr,attrseq,text", + end => "'E',tagname,text", + text => "'T',text,is_cdata", + process => "'PI',token0,text", + comment => "'C',text", + declaration => "'D',text", + + # options that default on + unbroken_text => 1, +); + + +sub new +{ + my $class = shift; + my %cnf; + + if (@_ == 1) { + my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; + %cnf = ($type => $_[0]); + } + else { + unshift @_, (ref($_[0]) eq "SCALAR") ? "doc" : "file" if(scalar(@_) % 2 == 1); + %cnf = @_; + } + + my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; + + my $self = $class->SUPER::new(%ARGS, %cnf) || return undef; + + $self->{textify} = $textify; + $self; +} + + +sub get_tag +{ + my $self = shift; + my $token; + while (1) { + $token = $self->get_token || return undef; + my $type = shift @$token; + next unless $type eq "S" || $type eq "E"; + substr($token->[0], 0, 0) = "/" if $type eq "E"; + return $token unless @_; + for (@_) { + return $token if $token->[0] eq $_; + } + } +} + + +sub _textify { + my($self, $token) = @_; + my $tag = $token->[1]; + return undef unless exists $self->{textify}{$tag}; + + my $alt = $self->{textify}{$tag}; + my $text; + if (ref($alt)) { + $text = &$alt(@$token); + } else { + $text = $token->[2]{$alt || "alt"}; + $text = "[\U$tag]" unless defined $text; + } + return $text; +} + + +sub get_text +{ + my $self = shift; + my @text; + while (my $token = $self->get_token) { + my $type = $token->[0]; + if ($type eq "T") { + my $text = $token->[1]; + decode_entities($text) unless $token->[2]; + push(@text, $text); + } elsif ($type =~ /^[SE]$/) { + my $tag = $token->[1]; + if ($type eq "S") { + if (defined(my $text = _textify($self, $token))) { + push(@text, $text); + next; + } + } else { + $tag = "/$tag"; + } + if (!@_ || grep $_ eq $tag, @_) { + $self->unget_token($token); + last; + } + push(@text, " ") + if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]}; + } + } + join("", @text); +} + + +sub get_trimmed_text +{ + my $self = shift; + my $text = $self->get_text(@_); + $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; + $text; +} + +sub get_phrase { + my $self = shift; + my @text; + while (my $token = $self->get_token) { + my $type = $token->[0]; + if ($type eq "T") { + my $text = $token->[1]; + decode_entities($text) unless $token->[2]; + push(@text, $text); + } elsif ($type =~ /^[SE]$/) { + my $tag = $token->[1]; + if ($type eq "S") { + if (defined(my $text = _textify($self, $token))) { + push(@text, $text); + next; + } + } + if (!$HTML::Tagset::isPhraseMarkup{$tag}) { + $self->unget_token($token); + last; + } + push(@text, " ") if $tag eq "br"; + } + } + my $text = join("", @text); + $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; + $text; +} + +1; + + +__END__ + +=head1 NAME + +HTML::TokeParser - Alternative HTML::Parser interface + +=head1 SYNOPSIS + + require HTML::TokeParser; + $p = HTML::TokeParser->new("index.html") || + die "Can't open: $!"; + $p->empty_element_tags(1); # configure its behaviour + + while (my $token = $p->get_token) { + #... + } + +=head1 DESCRIPTION + +The C<HTML::TokeParser> is an alternative interface to the +C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a +predeclared set of token types. If you wish the tokens to be reported +differently you probably want to use the C<HTML::PullParser> directly. + +The following methods are available: + +=over 4 + +=item $p = HTML::TokeParser->new( $filename, %opt ); + +=item $p = HTML::TokeParser->new( $filehandle, %opt ); + +=item $p = HTML::TokeParser->new( \$document, %opt ); + +The object constructor argument is either a file name, a file handle +object, or the complete document to be parsed. Extra options can be +provided as key/value pairs and are processed as documented by the base +classes. + +If the argument is a plain scalar, then it is taken as the name of a +file to be opened and parsed. If the file can't be opened for +reading, then the constructor will return C<undef> and $! will tell +you why it failed. + +If the argument is a reference to a plain scalar, then this scalar is +taken to be the literal document to parse. The value of this +scalar should not be changed before all tokens have been extracted. + +Otherwise the argument is taken to be some object that the +C<HTML::TokeParser> can read() from when it needs more data. Typically +it will be a filehandle of some kind. The stream will be read() until +EOF, but not closed. + +A newly constructed C<HTML::TokeParser> differ from its base classes +by having the C<unbroken_text> attribute enabled by default. See +L<HTML::Parser> for a description of this and other attributes that +influence how the document is parsed. It is often a good idea to enable +C<empty_element_tags> behaviour. + +Note that the parsing result will likely not be valid if raw undecoded +UTF-8 is used as a source. When parsing UTF-8 encoded files turn +on UTF-8 decoding: + + open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!"; + my $p = HTML::TokeParser->new( $fh ); + # ... + +If a $filename is passed to the constructor the file will be opened in +raw mode and the parsing result will only be valid if its content is +Latin-1 or pure ASCII. + +If parsing from an UTF-8 encoded string buffer decode it first: + + utf8::decode($document); + my $p = HTML::TokeParser->new( \$document ); + # ... + +=item $p->get_token + +This method will return the next I<token> found in the HTML document, +or C<undef> at the end of the document. The token is returned as an +array reference. The first element of the array will be a string +denoting the type of this token: "S" for start tag, "E" for end tag, +"T" for text, "C" for comment, "D" for declaration, and "PI" for +process instructions. The rest of the token array depend on the type +like this: + + ["S", $tag, $attr, $attrseq, $text] + ["E", $tag, $text] + ["T", $text, $is_data] + ["C", $text] + ["D", $text] + ["PI", $token0, $text] + +where $attr is a hash reference, $attrseq is an array reference and +the rest are plain scalars. The L<HTML::Parser/Argspec> explains the +details. + +=item $p->unget_token( @tokens ) + +If you find you have read too many tokens you can push them back, +so that they are returned the next time $p->get_token is called. + +=item $p->get_tag + +=item $p->get_tag( @tags ) + +This method returns the next start or end tag (skipping any other +tokens), or C<undef> if there are no more tags in the document. If +one or more arguments are given, then we skip tokens until one of the +specified tag types is found. For example: + + $p->get_tag("font", "/font"); + +will find the next start or end tag for a font-element. + +The tag information is returned as an array reference in the same form +as for $p->get_token above, but the type code (first element) is +missing. A start tag will be returned like this: + + [$tag, $attr, $attrseq, $text] + +The tagname of end tags are prefixed with "/", i.e. end tag is +returned like this: + + ["/$tag", $text] + +=item $p->get_text + +=item $p->get_text( @endtags ) + +This method returns all text found at the current position. It will +return a zero length string if the next token is not text. Any +entities will be converted to their corresponding character. + +If one or more arguments are given, then we return all text occurring +before the first of the specified tags found. For example: + + $p->get_text("p", "br"); + +will return the text up to either a paragraph of linebreak element. + +The text might span tags that should be I<textified>. This is +controlled by the $p->{textify} attribute, which is a hash that +defines how certain tags can be treated as text. If the name of a +start tag matches a key in this hash then this tag is converted to +text. The hash value is used to specify which tag attribute to obtain +the text from. If this tag attribute is missing, then the upper case +name of the tag enclosed in brackets is returned, e.g. "[IMG]". The +hash value can also be a subroutine reference. In this case the +routine is called with the start tag token content as its argument and +the return value is treated as the text. + +The default $p->{textify} value is: + + {img => "alt", applet => "alt"} + +This means that <IMG> and <APPLET> tags are treated as text, and that +the text to substitute can be found in the ALT attribute. + +=item $p->get_trimmed_text + +=item $p->get_trimmed_text( @endtags ) + +Same as $p->get_text above, but will collapse any sequences of white +space to a single space character. Leading and trailing white space is +removed. + +=item $p->get_phrase + +This will return all text found at the current position ignoring any +phrasal-level tags. Text is extracted until the first non +phrasal-level tag. Textification of tags is the same as for +get_text(). This method will collapse white space in the same way as +get_trimmed_text() does. + +The definition of <i>phrasal-level tags</i> is obtained from the +HTML::Tagset module. + +=back + +=head1 EXAMPLES + +This example extracts all links from a document. It will print one +line for each link, containing the URL and the textual description +between the <A>...</A> tags: + + use HTML::TokeParser; + $p = HTML::TokeParser->new(shift||"index.html"); + + while (my $token = $p->get_tag("a")) { + my $url = $token->[1]{href} || "-"; + my $text = $p->get_trimmed_text("/a"); + print "$url\t$text\n"; + } + +This example extract the <TITLE> from the document: + + use HTML::TokeParser; + $p = HTML::TokeParser->new(shift||"index.html"); + if ($p->get_tag("title")) { + my $title = $p->get_trimmed_text; + print "Title: $title\n"; + } + +=head1 SEE ALSO + +L<HTML::PullParser>, L<HTML::Parser> + +=head1 COPYRIGHT + +Copyright 1998-2005 Gisle Aas. + +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. + +=cut diff --git a/mkhctype b/mkhctype new file mode 100755 index 0000000..eeae40d --- /dev/null +++ b/mkhctype @@ -0,0 +1,57 @@ +#!/usr/bin/perl + +($progname = $0) =~ s,.*/,,; + +print "/* This file is autogenerated by $progname */\n"; + +print <<'EOT'; + +#define HCTYPE_SPACE 0x01 +#define HCTYPE_NAME_FIRST 0x02 +#define HCTYPE_NAME_CHAR 0x04 +#define HCTYPE_NOT_SPACE_GT 0x08 +#define HCTYPE_NOT_SPACE_EQ_GT 0x10 +#define HCTYPE_NOT_SPACE_SLASH_GT 0x20 +#define HCTYPE_NOT_SPACE_EQ_SLASH_GT 0x40 + +#define HCTYPE(c) hctype[(unsigned char)(c)] +#define isHCTYPE(c, w) (HCTYPE(c) & (w)) + +#define isHSPACE(c) isHCTYPE(c, HCTYPE_SPACE) +#define isHNAME_FIRST(c) isHCTYPE(c, HCTYPE_NAME_FIRST) +#define isHNAME_CHAR(c) isHCTYPE(c, HCTYPE_NAME_CHAR) +#define isHNOT_SPACE_GT(c) isHCTYPE(c, HCTYPE_NOT_SPACE_GT) + +typedef unsigned char hctype_t; + +EOT + +print "static hctype_t hctype[] = {\n"; + +for my $c (0 .. 255) { + print " " unless $c % 8; + + local $_ = chr($c); + my $v = 0; + if (/^\s$/) { # isSPACE + $v |= 0x1 + } + elsif ($_ ne ">") { + $v |= 0x08; + $v |= 0x10 if $_ ne "="; + $v |= 0x20 if $_ ne "/"; + $v |= 0x40 if $_ ne "="; + } + + if (/^[\w.\-:]$/) { + $v |= 0x4; + $v |= 0x2 unless /^[\d.-]$/; # XML allow /[:_]/ as first char + } + + printf "0x%02x, ", $v; + unless (($c+1) % 8) { + printf " /* %3d - %3d */\n", $c - 7, $c; + } +} +print "};\n"; + @@ -0,0 +1,28 @@ +#!/usr/bin/perl + +($progname = $0) =~ s,.*/,,; + +print "/* This file is autogenerated by $progname */\n"; + +print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self);\n"; +print "static PFUNC parsefunc[] = {\n"; + +for my $c (0..255) { + local $_ = chr($c); + my $func = "null"; + if (/^[A-Za-z]$/) { + $func = "start"; + } + elsif ($_ eq "/") { + $func = "end"; + } + elsif ($_ eq "!") { + $func = "decl"; + } + elsif ($_ eq "?") { + $func = "process"; + } + printf " %-15s /* %3d */\n", "parse_$func,", $c; +} + +print "};\n"; diff --git a/t/api_version.t b/t/api_version.t new file mode 100644 index 0000000..9803121 --- /dev/null +++ b/t/api_version.t @@ -0,0 +1,22 @@ +use Test::More tests => 4; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3); + +ok(!$p->handler("start"), "API version 3"); + +my $failed; +eval { + my $p = HTML::Parser->new(api_version => 4); + $failed++; +}; +like($@, qr/^API version 4 not supported/); +ok(!$failed, "API version 4"); + +$p = HTML::Parser->new(api_version => 2); + +is($p->handler("start"), "start", "API version 2"); + + diff --git a/t/argspec-bad.t b/t/argspec-bad.t new file mode 100644 index 0000000..8c0b199 --- /dev/null +++ b/t/argspec-bad.t @@ -0,0 +1,40 @@ +use Test::More tests => 6; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3); + +eval { + $p->handler(end => "end", q(xyzzy)); +}; +like($@, qr/^Unrecognized identifier xyzzy in argspec/); + + +eval { + $p->handler(end => "end", q(tagname text)); +}; +like($@, qr/^Missing comma separator in argspec/); + + +eval { + $p->handler(end => "end", q(tagname, "text)); +}; +like($@, qr/^Unterminated literal string in argspec/); + + +eval { + $p->handler(end => "end", q(tagname, "t\\t")); +}; +like($@, qr/^Backslash reserved for literal string in argspec/); + +eval { + $p->handler(end => "end", '"' . ("x" x 256) . '"'); +}; +like($@, qr/^Literal string is longer than 255 chars in argspec/); + +$p->handler(end => sub { is(length(shift), 255) }, + '"' . ("x" x 255) . '"'); +$p->parse("</x>"); + + diff --git a/t/argspec.t b/t/argspec.t new file mode 100644 index 0000000..e8aa7a5 --- /dev/null +++ b/t/argspec.t @@ -0,0 +1,148 @@ + +use strict; +require HTML::Parser; + +my $decl = '<!ENTITY nbsp CDATA " " -- no-break space -->'; +my $com1 = '<!-- Comment -->'; +my $com2 = '<!-- Comment -- -- Comment -->'; +my $start = '<a href="foo">'; +my $end = '</a>'; +my $empty = "<IMG SRC='foo'/>"; +my $proc = '<? something completely different ?>'; + +my @argspec = qw( self offset length + event tagname tag token0 + text + is_cdata dtext + tokens + tokenpos + attr + attrseq ); + +my @result = (); +my $p = HTML::Parser -> new(default_h => [\@result, join(',', @argspec)], + strict_comment => 1, xml_mode => 1); + +my @tests = + ( # string, expected results + $decl => [[$p, 0, 52, 'declaration', 'ENTITY', '!ENTITY', 'ENTITY', + '<!ENTITY nbsp CDATA " " -- no-break space -->', + undef, undef, + ['ENTITY', 'nbsp', 'CDATA', '" "', '-- no-break space --'], + [2, 6, 9, 4, 16, 5, 22, 8, 31, 20], + undef, undef ]], + $com1 => [[$p, 0, 16, 'comment', ' Comment ', '# Comment ', ' Comment ', + '<!-- Comment -->', + undef, undef, + [' Comment '], + [4, 9], + undef, undef ]], + $com2 => [[$p, 0, 30, 'comment', ' Comment ', '# Comment ', ' Comment ', + '<!-- Comment -- -- Comment -->', + undef, undef, + [' Comment ', ' Comment '], + [4, 9, 18, 9], + undef, undef ]], + $start => [[$p, 0, 14, 'start', 'a', 'a', 'a', + '<a href="foo">', + undef, undef, + ['a', 'href', '"foo"'], + [1, 1, 3, 4, 8, 5], + {'href', 'foo'}, ['href'] ]], + $end => [[$p, 0, 4, 'end', 'a', '/a', 'a', + '</a>', + undef, undef, + ['a'], + [2, 1], + undef, undef ]], + $empty => [[$p, 0, 16, 'start', 'IMG', 'IMG', 'IMG', + "<IMG SRC='foo'/>", + undef, undef, + ['IMG', 'SRC', "'foo'"], + [1, 3, 5, 3, 9, 5], + {'SRC', 'foo'}, ['SRC'] ], + [$p, 16, 0, 'end', 'IMG', '/IMG', 'IMG', + '', + undef, undef, + ['IMG'], + undef, + undef, undef ], + ], + $proc => [[$p, 0, 36, 'process', ' something completely different ', + '? something completely different ', + ' something completely different ', + '<? something completely different ?>', + undef, undef, + [' something completely different '], + [2, 32], + undef, undef ]], + "$end\n$end" => [[$p, 0, 4, 'end', 'a', '/a', 'a', + '</a>', + undef, undef, + ['a'], + [2, 1], + undef, undef], + [$p, 4, 1, 'text', undef, undef, undef, + "\n", + '', "\n", + undef, + undef, + undef, undef], + [$p, 5, 4, 'end', 'a', '/a', 'a', + '</a>', + undef, undef, + ['a'], + [2, 1], + undef, undef ]], + ); + +use Test::More; +plan tests => @tests / 2; + +sub string_tag { + my (@pieces) = @_; + my $part; + foreach $part ( @pieces ) { + if (!defined $part) { + $part = 'undef'; + } + elsif (!ref $part) { + $part = "'$part'" if $part !~ /^\d+$/; + } + elsif ('ARRAY' eq ref $part ) { + $part = '[' . join(', ', string_tag(@$part)) . ']'; + } + elsif ('HASH' eq ref $part ) { + $part = '{' . join(',', string_tag(%$part)) . '}'; + } + else { + $part = '<' . ref($part) . '>'; + } + } + return join(", ", @pieces ); +} + +my $i = 0; +TEST: +while (@tests) { + my($html, $expected) = splice @tests, 0, 2; + ++$i; + + @result = (); + $p->parse($html)->eof; + + shift(@result) if $result[0][3] eq "start_document"; + pop(@result) if $result[-1][3] eq "end_document"; + + # Compare results for each element expected + foreach (@$expected) { + my $want = string_tag($_); + my $got = string_tag(shift @result); + if ($want ne $got) { + is($want, $got); + next TEST; + } + } + + pass; +} diff --git a/t/argspec2.t b/t/argspec2.t new file mode 100644 index 0000000..6f594b9 --- /dev/null +++ b/t/argspec2.t @@ -0,0 +1,21 @@ +use Test::More tests => 2; + +use strict; +use HTML::Parser; + +my @start; +my @text; + +my $p = HTML::Parser->new(api_version => 3); +$p->handler(start => \@start, '@{tagname, @attr}'); +$p->handler(text => \@text, '@{dtext}'); +$p->parse(<<EOT)->eof; +Hi +<a href="abc">Foo</a><b>:-)</b> +EOT + +is("@start", "a href abc b"); + +is(join("", @text), "Hi\nFoo:-)\n"); + + diff --git a/t/attr-encoded.t b/t/attr-encoded.t new file mode 100644 index 0000000..4d458eb --- /dev/null +++ b/t/attr-encoded.t @@ -0,0 +1,32 @@ +use strict; +use Test::More tests => 2; + +use HTML::Parser (); +my $p = HTML::Parser->new(); +$p->attr_encoded(1); + +my $text = ""; +$p->handler(start => + sub { + my($tag, $attr) = @_; + $text .= "S[$tag"; + for my $k (sort keys %$attr) { + my $v = $attr->{$k}; + $text .= " $k=$v"; + } + $text .= "]"; + }, "tagname,attr"); + +my $html = <<'EOT'; +<tag arg="&<>"> +EOT + +$p->parse($html)->eof; + +is($text, 'S[tag arg=&<>]'); + +$text = ""; +$p->attr_encoded(0); +$p->parse($html)->eof; + +is($text, 'S[tag arg=&<>]'); diff --git a/t/callback.t b/t/callback.t new file mode 100644 index 0000000..7a456cf --- /dev/null +++ b/t/callback.t @@ -0,0 +1,49 @@ +use Test::More tests => 47; + +use strict; +use HTML::Parser; + +my @expected; +my $p = HTML::Parser->new(api_version => 3, + unbroken_text => 1, + default_h => [\@expected, '@{event, text}'], + ); + +my $doc = <<'EOT'; +<title>Hi</title> +<h1>Ho ho</h1> +<--comment-> +EOT + +$p->parse($doc)->eof; +#use Data::Dump; Data::Dump::dump(@expected); + +for my $i (1..length($doc)) { + my @t; + $p->handler(default => \@t); + $p->parse(chunk($doc, $i)); + + # check that we got the same stuff + #diag "X:", join(":", @t); + #diag "Y:", join(":", @expected); + is(join(":", @t), join(":", @expected)); +} + +sub chunk { + my $str = shift; + my $size = shift || 1; + sub { + my $res = substr($str, 0, $size); + #diag "...$res"; + substr($str, 0, $size) = ""; + $res; + } +} + +# Test croking behaviour +$p->handler(default => []); + +eval { + $p->parse(sub { die "Hi" }); +}; +like($@, qr/^Hi/); diff --git a/t/case-sensitive.t b/t/case-sensitive.t new file mode 100644 index 0000000..565b20b --- /dev/null +++ b/t/case-sensitive.t @@ -0,0 +1,85 @@ +use strict; +use Test::More tests => 8; + +use HTML::Parser (); +my $p = HTML::Parser->new(); +$p->case_sensitive(1); + +my $text = ""; +$p->handler(start => + sub { + my($tag, $attr, $attrseq) = @_; + $text .= "S[$tag"; + for my $k (sort keys %$attr) { + my $v = $attr->{$k}; + $text .= " $k=$v"; + } + if (@$attrseq) { $text.=" Order:" ; } + for my $k (@$attrseq) { + $text .= " $k"; + } + $text .= "]"; + }, "tagname,attr,attrseq"); +$p->handler(end => + sub { + my ($tag) = @_; + $text .= "E[$tag]"; + }, "tagname"); + +my $html = <<'EOT'; +<tAg aRg="Value" arg="other value"></tAg> +EOT +my $cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]'; +my $ci = 'S[tag arg=Value Order: arg arg]E[tag]'; + +$p->parse($html)->eof; +is($text, $cs); + +$text = ""; +$p->case_sensitive(0); +$p->parse($html)->eof; +is($text, $ci); + +$text = ""; +$p->case_sensitive(1); +$p->xml_mode(1); +$p->parse($html)->eof; +is($text, $cs); + +$text = ""; +$p->case_sensitive(0); +$p->parse($html)->eof; +is($text, $cs); + +$html = <<'EOT'; +<tAg aRg="Value" arg="other value"></tAg> +<iGnOrE></ignore> +EOT +$p->ignore_tags('ignore'); +$cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]S[iGnOrE]'; +$ci = 'S[tag arg=Value Order: arg arg]E[tag]'; + +$text = ""; +$p->case_sensitive(0); +$p->xml_mode(0); +$p->parse($html)->eof; +is($text, $ci); + +$text = ""; +$p->case_sensitive(1); +$p->xml_mode(0); +$p->parse($html)->eof; +is($text, $cs); + +$text = ""; +$p->case_sensitive(0); +$p->xml_mode(1); +$p->parse($html)->eof; +is($text, $cs); + +$text = ""; +$p->case_sensitive(1); +$p->xml_mode(1); +$p->parse($html)->eof; +is($text, $cs); + diff --git a/t/cases.t b/t/cases.t new file mode 100644 index 0000000..a537279 --- /dev/null +++ b/t/cases.t @@ -0,0 +1,105 @@ +use Test::More; + +require HTML::Parser; + +package P; @ISA = qw(HTML::Parser); + +my @result; +sub start +{ + my($self, $tag, $attr) = @_; + push @result, "START[$tag]"; + for (sort keys %$attr) { + push @result, "\t$_: " . $attr->{$_}; + } + $start++; +} + +sub end +{ + my($self, $tag) = @_; + push @result, "END[$tag]"; + $end++; +} + +sub text +{ + my $self = shift; + push @result, "TEXT[$_[0]]"; + $text++; +} + +sub comment +{ + my $self = shift; + push @result, "COMMENT[$_[0]]"; + $comment++; +} + +sub declaration +{ + my $self = shift; + push @result, "DECLARATION[$_[0]]"; + $declaration++; +} + +package main; + + +@tests = + ( + '<a ">' => ['START[a]', "\t\": \""], + '<a/>' => ['START[a/]',], + '<a />' => ['START[a]', "\t/: /"], + '<a a/>' => ['START[a]', "\ta/: a/"], + '<a a/=/>' => ['START[a]', "\ta/: /"], + '<a x="foo bar">' => ['START[a]', "\tx: foo\xA0bar"], + '<a x="foo bar">' => ['START[a]', "\tx: foo bar"], + '<å >' => ['TEXT[<å]', 'TEXT[ >]'], + '2 < 5' => ['TEXT[2 ]', 'TEXT[<]', 'TEXT[ 5]'], + '2 <5> 2' => ['TEXT[2 ]', 'TEXT[<5>]', 'TEXT[ 2]'], + '2 <a' => ['TEXT[2 ]', 'TEXT[<a]'], + '2 <a> 2' => ['TEXT[2 ]', 'START[a]', 'TEXT[ 2]'], + '2 <a href=foo' => ['TEXT[2 ]', 'TEXT[<a href=foo]'], + "2 <a href='foo bar'> 2" => + ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], + '2 <a href=foo bar> 2' => + ['TEXT[2 ]', 'START[a]', "\tbar: bar", "\thref: foo", 'TEXT[ 2]'], + '2 <a href="foo bar"> 2' => + ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], + '2 <a href="foo\'bar"> 2' => + ['TEXT[2 ]', 'START[a]', "\thref: foo'bar", 'TEXT[ 2]'], + "2 <a href='foo\"bar'> 2" => + ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'], + "2 <a href='foo"bar'> 2" => + ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'], + '2 <a.b> 2' => ['TEXT[2 ]', 'START[a.b]', 'TEXT[ 2]'], + '2 <a.b-12 a.b = 2 a> 2' => + ['TEXT[2 ]', 'START[a.b-12]', "\ta: a", "\ta.b: 2", 'TEXT[ 2]'], + '2 <a_b> 2' => ['TEXT[2 ]', 'START[a_b]', 'TEXT[ 2]'], + '<!ENTITY nbsp CDATA " " -- no-break space -->' => + ['DECLARATION[ENTITY nbsp CDATA " " -- no-break space --]'], + '<!-- comment -->' => ['COMMENT[ comment ]'], + '<!-- comment -- --- comment -->' => + ['COMMENT[ comment ]', 'COMMENT[- comment ]'], + '<!-- comment <!-- not comment --> comment -->' => + ['COMMENT[ comment <!]', 'COMMENT[> comment ]'], + '<!-- <a href="foo"> -->' => ['COMMENT[ <a href="foo"> ]'], + ); + +plan tests => @tests / 2; + +my $i = 0; +TEST: +while (@tests) { + ++$i; + my ($html, $expected) = splice @tests, 0, 2; + @result = (); + + $p = new P; + $p->strict_comment(1); + $p->parse($html)->eof; + + ok(eq_array($expected, \@result)) or diag("Expected: @$expected\n", + "Got: @result\n"); +} diff --git a/t/comment.t b/t/comment.t new file mode 100644 index 0000000..303449e --- /dev/null +++ b/t/comment.t @@ -0,0 +1,24 @@ +use Test::More tests => 1; + +use strict; +use HTML::Parser; + +my $p = HTML::Parser->new(api_version => 3); +my @com; +$p->handler(comment => sub { push(@com, shift) }, "token0"); +$p->handler(default => sub { push(@com, shift() . "[" . shift() . "]") }, "event, text"); + +$p->parse("<foo><><!><!-><!--><!---><!----><!-----><!------>"); +$p->parse("<!--+--"); +$p->parse("\n\n"); +$p->parse(">"); +$p->parse("<!a'b>"); +$p->parse("<!--foo--->"); +$p->parse("<!--foo---->"); +$p->parse("<!--foo----->-->"); +$p->parse("<foo>"); +$p->parse("<!3453><!-3456><!FOO><>"); +$p->eof; + +my $com = join(":", @com); +is($com, "start_document[]:start[<foo>]:text[<>]::-:><!-::-:--:+:a'b:foo-:foo--:foo---:text[-->]:start[<foo>]:3453:-3456:FOO:text[<>]:end_document[]"); diff --git a/t/crashme.t b/t/crashme.t new file mode 100644 index 0000000..1a1e8e4 --- /dev/null +++ b/t/crashme.t @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +# This test will simply run the parser on random junk. + +my $no_tests = shift || 3; +use Test::More; +plan tests => $no_tests; + +use HTML::Parser (); + +my $file = "junk$$.html"; +die if -e $file; + +for (1..$no_tests) { + + open(JUNK, ">$file") || die; + for (1 .. rand(5000)) { + for (1 .. rand(200)) { + print JUNK pack("N", rand(2**32)); + } + print JUNK ("<", "&", ">")[rand(3)]; # make these a bit more likely + } + close(JUNK); + + #diag "Parse @{[-s $file]} bytes of junk"; + + HTML::Parser->new->parse_file($file); + pass(); + + #print_mem(); +} + +unlink($file); + + +sub print_mem +{ + # this probably only works on Linux + open(STAT, "/proc/self/status") || return; + while (<STAT>) { + diag $_ if /^VmSize/; + } +} diff --git a/t/declaration.t b/t/declaration.t new file mode 100644 index 0000000..17de561 --- /dev/null +++ b/t/declaration.t @@ -0,0 +1,62 @@ +use Test::More tests => 2; + +use HTML::Parser; +my $res = ""; + +sub decl +{ + my $t = shift; + $res .= "[" . join("\n", map "<$_>", @$t) . "]"; +} + +sub text +{ + $res .= shift; +} + +my $p = HTML::Parser->new(declaration_h => [\&decl, "tokens"], + default_h => [\&text, "text"], + ); + +$p->parse(<<EOT)->eof; +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" --<comment>-- + "http://www.w3.org/TR/html40/strict.dtd"> + +<!ENTITY foo "<!-- foo -->"> +<!Entity foo "<!-- foo -->"> + +<!row --> foo +EOT + +is($res, <<EOT); +[<DOCTYPE> +<HTML> +<PUBLIC> +<"-//W3C//DTD HTML 4.01//EN"> +<--<comment>--> +<"http://www.w3.org/TR/html40/strict.dtd">] + +[<ENTITY> +<foo> +<"<!-- foo -->">] +[<Entity> +<foo> +<"<!-- foo -->">] + +<!row --> foo +EOT + +$res = ""; +$p->parse(<<EOT)->eof; +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[]> +EOT +is($res, <<EOT); +[<DOCTYPE> +<html> +<PUBLIC> +<"-//W3C//DTD XHTML 1.0 Strict//EN"> +<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<[]>] +EOT + diff --git a/t/default.t b/t/default.t new file mode 100644 index 0000000..4b5ed79 --- /dev/null +++ b/t/default.t @@ -0,0 +1,43 @@ +use strict; +use Test::More tests => 3; + +my $text = ""; +use HTML::Parser (); +my $p = HTML::Parser->new(default_h => [sub { $text .= shift }, "text"], + ); + +my $html = <<'EOT'; + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html40/strict.dtd"> + +<title>foo</title> +<!-- comment <a> --> +<?process instruction> + +EOT + +$p->parse($html)->eof; + +is($text, $html); + +$text = ""; +$p->handler(start => sub { }, ""); +$p->handler(declaration => sub { }, ""); +$p->parse($html)->eof; + +my $html2; +$html2 = $html; +$html2 =~ s/<title>//; +$html2 =~ s/<!DOCTYPE[^>]*>//; + +is($text, $html2); + +$text = ""; +$p->handler(start => undef); +$p->parse($html)->eof; + +$html2 = $html; +$html2 =~ s/<!DOCTYPE[^>]*>//; + +is($text, $html2); diff --git a/t/document.t b/t/document.t new file mode 100644 index 0000000..6696939 --- /dev/null +++ b/t/document.t @@ -0,0 +1,41 @@ +#!perl -w + +use Test; +plan tests => 6; + + +use HTML::Parser; +use File::Spec; + +my $events; +my $p = HTML::Parser->new(default_h => [sub { $events .= "$_[0]\n";}, "event"]); + +$events = ""; +$p->eof; +ok($events, "start_document\nend_document\n"); + +$events = ""; +$p->parse_file(File::Spec->devnull); +ok($events, "start_document\nend_document\n"); + +$events = ""; +$p->parse(""); +$p->eof; +ok($events, "start_document\nend_document\n"); + +$events = ""; +$p->parse(""); +$p->parse(""); +$p->eof; +ok($events, "start_document\nend_document\n"); + +$events = ""; +$p->parse(""); +$p->parse("<a>"); +$p->eof; +ok($events, "start_document\nstart\nend_document\n"); + +$events = ""; +$p->parse("<a> "); +$p->eof; +ok($events, "start_document\nstart\ntext\nend_document\n"); diff --git a/t/dtext.t b/t/dtext.t new file mode 100644 index 0000000..883c61f --- /dev/null +++ b/t/dtext.t @@ -0,0 +1,72 @@ +#!perl -w + +use strict; +use Test::More tests => 2; + +use HTML::Parser (); + +my $dtext = ""; +my $text = ""; + +sub append +{ + $dtext .= shift; + $text .= shift; +} + +my $p = HTML::Parser->new(text_h => [\&append, "dtext, text"], + default_h => [\&append, "text, text" ], + ); + +my $doc = <<'EOT'; +<title>å</title> +<a href="fooå">ååAA<A>AA</a> +<?å> +foo bar +foo bar +&xyzzy +&xyzzy; +<!-- � --> + +ÿ +ÿ +ÿG +<!-- Ā --> +� +� +& +&# +&#x +<xmp>å</xmp> +<script>å</script> +<ScRIPT>å</scRIPT> +<skript>å</script> +EOT + +$p->parse($doc)->eof; + +is($text, $doc); +is($dtext, <<"EOT"); +<title>å</title> +<a href="fooå">ååAA<A>AA</a> +<?å> +foo\240bar +foo\240bar +&xyzzy +&xyzzy; +<!-- � --> +\1 +\377 +\377 +\377G +<!-- Ā --> +� +� +& +&# +&#x +<xmp>å</xmp> +<script>å</script> +<ScRIPT>å</scRIPT> +<skript>å</script> +EOT diff --git a/t/entities.t b/t/entities.t new file mode 100644 index 0000000..f12d2fd --- /dev/null +++ b/t/entities.t @@ -0,0 +1,213 @@ +use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric); + +use Test::More tests => 20; + +$a = "Våre norske tegn bør æres"; + +decode_entities($a); + +is($a, "Våre norske tegn bør æres"); + +encode_entities($a); + +is($a, "Våre norske tegn bør æres"); + +decode_entities($a); +encode_entities_numeric($a); + +is($a, "Våre norske tegn bør æres"); + +$a = "<&>\"'"; +is(encode_entities($a), "<&>"'"); +is(encode_entities_numeric($a), "<&>"'"); + +$a = "abcdef"; +is(encode_entities($a, 'a-c'), "abcdef"); + +$a = "[24/7]\\"; +is(encode_entities($a, '/'), "[24/7]\\"); +is(encode_entities($a, '\\/'), "[24/7]\\"); +is(encode_entities($a, '\\'), "[24/7]\"); +is(encode_entities($a, ']\\'), "[24/7]\"); + +# See how well it does against rfc1866... +$ent = $plain = ""; +while (<DATA>) { + next unless /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/; + $ent .= "&$1;"; + $plain .= chr($2); +} + +$a = $ent; +decode_entities($a); +is($a, $plain); + +# Try decoding when the ";" are left out +$a = $ent, +$a =~ s/;//g; +decode_entities($a); +is($a, $plain); + + +$a = $plain; +encode_entities($a); +is($a, $ent); + +{ #RT #84144 - https://rt.cpan.org/Public/Bug/Display.html?id=84144 + + my %hash= ( + "Våre norske tegn bør æres" => "Våre norske tegn bør æres" + ); + + my ($got, $eval_ok); + $eval_ok= eval { $got= decode_entities((keys %hash)[0]); 1 }; + is( $eval_ok, 1, "decode_entitites() when processing a key as input"); + is( $got, (values %hash)[0], "decode_entities() decodes a key properly"); +} + +# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au> +# Subject: HTML entities problem with 5.11 +# To: libwww-perl@ics.uci.edu +# Date: Fri, 05 Sep 1997 16:56:55 +1000 +# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU> +# +# Hi. I've got a problem that has surfaced with the changes to +# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening +# in the process of encoding then decoding special entities. Eg, what goes +# in as "abc&def&ghi" comes out as "abc&def;&ghi;". + +is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;"); + +# Decoding of ' +is(decode_entities("'"), "'"); +is(encode_entities("'", "'"), "'"); + +is(decode_entities("Attention Homeοωnөrs...1ѕt Tімe Eνөг"), + "Attention Home\x{3BF}\x{3C9}n\x{4E9}rs...1\x{455}t T\x{456}\x{43C}e E\x{3BD}\x{4E9}\x{433}"); +is(decode_entities("{&amp;&amp;& also Яœ}"), + "{&&& also \x{42F}\x{153}}"); + +__END__ +# Quoted from rfc1866.txt + +14. Proposed Entities + + The HTML DTD references the "Added Latin 1" entity set, which only + supplies named entities for a subset of the non-ASCII characters in + [ISO-8859-1], namely the accented characters. The following entities + should be supported so that all ISO 8859-1 characters may only be + referenced symbolically. The names for these entities are taken from + the appendixes of [SGML]. + + <!ENTITY nbsp CDATA " " -- no-break space --> + <!ENTITY iexcl CDATA "¡" -- inverted exclamation mark --> + <!ENTITY cent CDATA "¢" -- cent sign --> + <!ENTITY pound CDATA "£" -- pound sterling sign --> + <!ENTITY curren CDATA "¤" -- general currency sign --> + <!ENTITY yen CDATA "¥" -- yen sign --> + <!ENTITY brvbar CDATA "¦" -- broken (vertical) bar --> + <!ENTITY sect CDATA "§" -- section sign --> + <!ENTITY uml CDATA "¨" -- umlaut (dieresis) --> + <!ENTITY copy CDATA "©" -- copyright sign --> + <!ENTITY ordf CDATA "ª" -- ordinal indicator, feminine --> + <!ENTITY laquo CDATA "«" -- angle quotation mark, left --> + <!ENTITY not CDATA "¬" -- not sign --> + <!ENTITY shy CDATA "­" -- soft hyphen --> + <!ENTITY reg CDATA "®" -- registered sign --> + <!ENTITY macr CDATA "¯" -- macron --> + <!ENTITY deg CDATA "°" -- degree sign --> + <!ENTITY plusmn CDATA "±" -- plus-or-minus sign --> + <!ENTITY sup2 CDATA "²" -- superscript two --> + <!ENTITY sup3 CDATA "³" -- superscript three --> + <!ENTITY acute CDATA "´" -- acute accent --> + <!ENTITY micro CDATA "µ" -- micro sign --> + <!ENTITY para CDATA "¶" -- pilcrow (paragraph sign) --> + <!ENTITY middot CDATA "·" -- middle dot --> + <!ENTITY cedil CDATA "¸" -- cedilla --> + <!ENTITY sup1 CDATA "¹" -- superscript one --> + <!ENTITY ordm CDATA "º" -- ordinal indicator, masculine --> + <!ENTITY raquo CDATA "»" -- angle quotation mark, right --> + <!ENTITY frac14 CDATA "¼" -- fraction one-quarter --> + <!ENTITY frac12 CDATA "½" -- fraction one-half --> + <!ENTITY frac34 CDATA "¾" -- fraction three-quarters --> + <!ENTITY iquest CDATA "¿" -- inverted question mark --> + <!ENTITY Agrave CDATA "À" -- capital A, grave accent --> + <!ENTITY Aacute CDATA "Á" -- capital A, acute accent --> + <!ENTITY Acirc CDATA "Â" -- capital A, circumflex accent --> + + + +Berners-Lee & Connolly Standards Track [Page 75] + +RFC 1866 Hypertext Markup Language - 2.0 November 1995 + + + <!ENTITY Atilde CDATA "Ã" -- capital A, tilde --> + <!ENTITY Auml CDATA "Ä" -- capital A, dieresis or umlaut mark --> + <!ENTITY Aring CDATA "Å" -- capital A, ring --> + <!ENTITY AElig CDATA "Æ" -- capital AE diphthong (ligature) --> + <!ENTITY Ccedil CDATA "Ç" -- capital C, cedilla --> + <!ENTITY Egrave CDATA "È" -- capital E, grave accent --> + <!ENTITY Eacute CDATA "É" -- capital E, acute accent --> + <!ENTITY Ecirc CDATA "Ê" -- capital E, circumflex accent --> + <!ENTITY Euml CDATA "Ë" -- capital E, dieresis or umlaut mark --> + <!ENTITY Igrave CDATA "Ì" -- capital I, grave accent --> + <!ENTITY Iacute CDATA "Í" -- capital I, acute accent --> + <!ENTITY Icirc CDATA "Î" -- capital I, circumflex accent --> + <!ENTITY Iuml CDATA "Ï" -- capital I, dieresis or umlaut mark --> + <!ENTITY ETH CDATA "Ð" -- capital Eth, Icelandic --> + <!ENTITY Ntilde CDATA "Ñ" -- capital N, tilde --> + <!ENTITY Ograve CDATA "Ò" -- capital O, grave accent --> + <!ENTITY Oacute CDATA "Ó" -- capital O, acute accent --> + <!ENTITY Ocirc CDATA "Ô" -- capital O, circumflex accent --> + <!ENTITY Otilde CDATA "Õ" -- capital O, tilde --> + <!ENTITY Ouml CDATA "Ö" -- capital O, dieresis or umlaut mark --> + <!ENTITY times CDATA "×" -- multiply sign --> + <!ENTITY Oslash CDATA "Ø" -- capital O, slash --> + <!ENTITY Ugrave CDATA "Ù" -- capital U, grave accent --> + <!ENTITY Uacute CDATA "Ú" -- capital U, acute accent --> + <!ENTITY Ucirc CDATA "Û" -- capital U, circumflex accent --> + <!ENTITY Uuml CDATA "Ü" -- capital U, dieresis or umlaut mark --> + <!ENTITY Yacute CDATA "Ý" -- capital Y, acute accent --> + <!ENTITY THORN CDATA "Þ" -- capital THORN, Icelandic --> + <!ENTITY szlig CDATA "ß" -- small sharp s, German (sz ligature) --> + <!ENTITY agrave CDATA "à" -- small a, grave accent --> + <!ENTITY aacute CDATA "á" -- small a, acute accent --> + <!ENTITY acirc CDATA "â" -- small a, circumflex accent --> + <!ENTITY atilde CDATA "ã" -- small a, tilde --> + <!ENTITY auml CDATA "ä" -- small a, dieresis or umlaut mark --> + <!ENTITY aring CDATA "å" -- small a, ring --> + <!ENTITY aelig CDATA "æ" -- small ae diphthong (ligature) --> + <!ENTITY ccedil CDATA "ç" -- small c, cedilla --> + <!ENTITY egrave CDATA "è" -- small e, grave accent --> + <!ENTITY eacute CDATA "é" -- small e, acute accent --> + <!ENTITY ecirc CDATA "ê" -- small e, circumflex accent --> + <!ENTITY euml CDATA "ë" -- small e, dieresis or umlaut mark --> + <!ENTITY igrave CDATA "ì" -- small i, grave accent --> + <!ENTITY iacute CDATA "í" -- small i, acute accent --> + <!ENTITY icirc CDATA "î" -- small i, circumflex accent --> + <!ENTITY iuml CDATA "ï" -- small i, dieresis or umlaut mark --> + <!ENTITY eth CDATA "ð" -- small eth, Icelandic --> + <!ENTITY ntilde CDATA "ñ" -- small n, tilde --> + <!ENTITY ograve CDATA "ò" -- small o, grave accent --> + + + +Berners-Lee & Connolly Standards Track [Page 76] + +RFC 1866 Hypertext Markup Language - 2.0 November 1995 + + + <!ENTITY oacute CDATA "ó" -- small o, acute accent --> + <!ENTITY ocirc CDATA "ô" -- small o, circumflex accent --> + <!ENTITY otilde CDATA "õ" -- small o, tilde --> + <!ENTITY ouml CDATA "ö" -- small o, dieresis or umlaut mark --> + <!ENTITY divide CDATA "÷" -- divide sign --> + <!ENTITY oslash CDATA "ø" -- small o, slash --> + <!ENTITY ugrave CDATA "ù" -- small u, grave accent --> + <!ENTITY uacute CDATA "ú" -- small u, acute accent --> + <!ENTITY ucirc CDATA "û" -- small u, circumflex accent --> + <!ENTITY uuml CDATA "ü" -- small u, dieresis or umlaut mark --> + <!ENTITY yacute CDATA "ý" -- small y, acute accent --> + <!ENTITY thorn CDATA "þ" -- small thorn, Icelandic --> + <!ENTITY yuml CDATA "ÿ" -- small y, dieresis or umlaut mark --> diff --git a/t/entities2.t b/t/entities2.t new file mode 100644 index 0000000..537ac78 --- /dev/null +++ b/t/entities2.t @@ -0,0 +1,57 @@ +#!perl -w + +use strict; +use Test::More tests => 9; + +use HTML::Entities qw(_decode_entities); + +eval { + _decode_entities("<", undef); +}; +like($@, qr/^(?:Can't inline decode readonly string|Modification of a read-only value attempted)/); + +eval { + my $a = ""; + _decode_entities($a, $a); +}; +like($@, qr/^2nd argument must be hash reference/); + +eval { + my $a = ""; + _decode_entities($a, []); +}; +like($@, qr/^2nd argument must be hash reference/); + +$a = "<"; +_decode_entities($a, undef); +is($a, "<"); + +_decode_entities($a, { "lt" => "<" }); +is($a, "<"); + +my $x = "x" x 20; + +my $err; +for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") { + my $a = $_; + $a =~ s/:/&a;/g; + my $b = $_; + $b =~ s/:/$x/g; + _decode_entities($a, { "a" => $x }); + if ($a ne $b) { + diag "Something went wrong with '$_'"; + $err++; + } +} +ok(!$err); + +$a = "foo bar"; +_decode_entities($a, \%HTML::Entities::entity2char); +is($a, "foo\xA0bar"); + +$a = "foo bar"; +_decode_entities($a, \%HTML::Entities::entity2char); +is($a, "foo bar"); + +_decode_entities($a, \%HTML::Entities::entity2char, 1); +is($a, "foo\xA0bar"); diff --git a/t/filter-methods.t b/t/filter-methods.t new file mode 100644 index 0000000..9eccaf1 --- /dev/null +++ b/t/filter-methods.t @@ -0,0 +1,205 @@ +#!/usr/bin/perl -w + +use Test::More tests => 12; +use strict; + +use HTML::Parser; + +my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]); +$p->ignore_elements("script"); +$p->unbroken_text(1); + +$p->handler(default => [], "event, text"); +$p->parse(<<"EOT")->eof; +<html><head><title>foo</title><Script language="Perl"> + while (<B>) { + # ... + } +</Script><body> +This is an <i>italic</i> and <b>bold</b> text. +</body> +</html> +EOT + +my $t = join("||", map join("|", @$_), @{$p->handler("default")}); +#diag $t; + +is($t, "start_document|||start|<html>||start|<head>||start|<title>||text|foo||end|</title>||start|<body>||text| +This is an italic and bold text. +||end|</body>||text| +||end|</html>||text| +||end_document|", 'ignore_elements'); + + +#------------------------------------------------------ + +$p = HTML::Parser->new(api_version => 3); +$p->report_tags("a"); +$p->handler(start => sub { + my($tagname, %attr) = @_; + ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start'); + }, 'tagname, @attr'); +$p->handler(end => sub { + my $tagname = shift; + is($tagname, "a", 'report_tags end'); + }, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> + +This is <a href="#a">very nice</a> example. + +EOT + + +#------------------------------------------------------ + +my @tags; +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(a em)); +$p->ignore_tags(qw(em)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> + +This is <em>yet another</em> <a href="#a">very nice</a> example. + +EOT +is(join('|', @tags), 'a', 'report_tags followed by ignore_tags'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1)); +$p->report_tags(); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1|h2', 'reset report_tags filter'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1 h2)); +$p->ignore_tags(qw(h2)); +$p->report_tags(qw(h1 h2)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1 h2)); +$p->ignore_tags(qw(h2)); +$p->report_tags(); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->report_tags(qw(h1 h2)); +$p->report_tags(qw(h3)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> +<h3>Next example</h3> + +EOT +is(join('|', @tags), 'h3', 'report_tags replaces filter'); + + +#------------------------------------------------------ + + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->ignore_tags(qw(h1 h2)); +$p->ignore_tags(qw(h3)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> +<h3>Next example</h3> + +EOT +is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->ignore_tags(qw(h2)); +$p->ignore_tags(); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter'); + + +#------------------------------------------------------ + +@tags = (); +$p = HTML::Parser->new(api_version => 3); +$p->ignore_tags(qw(h2)); +$p->report_tags(qw(h1 h2)); +$p->handler(end => sub {push @tags, @_;}, 'tagname'); + +$p->parse(<<EOT)->eof; + +<h1>Next example</h1> +<h2>Next example</h2> + +EOT +is(join('|', @tags), 'h1', 'ignore_tags before report_tags'); +#------------------------------------------------------ + +$p = HTML::Parser->new(api_version => 3); +$p->ignore_elements("script"); +my $res=""; +$p->handler(default=> sub {$res.=$_[0];}, 'text'); +$p->parse(<<'EOT')->eof; +A <script> B </script> C </script> D <script> E </script> F +EOT +is($res,"A C D F\n","ignore </script> without <script> correctly"); diff --git a/t/filter.t b/t/filter.t new file mode 100644 index 0000000..3b18f9e --- /dev/null +++ b/t/filter.t @@ -0,0 +1,60 @@ +use Test::More tests => 3; + +my $HTML = <<EOT; + +<!DOCTYPE HTML> +<!-- comment +<h1>Foo</h1> +--> + +<H1 +>Bar</H1 +> + +<Table><tr><td>1<td>2<td>3 +<tr> +</table> + +<?process> + +EOT + +use HTML::Filter; +use SelectSaver; + +my $tmpfile = "test-$$.htm"; +die "$tmpfile already exists" if -e $tmpfile; + +open(HTML, ">$tmpfile") or die "$!"; + +{ + my $save = new SelectSaver(HTML); + HTML::Filter->new->parse($HTML)->eof; +} +close(HTML); + +open(HTML, $tmpfile) or die "$!"; +local($/) = undef; +my $FILTERED = <HTML>; +close(HTML); + +#print $FILTERED; +is($FILTERED, $HTML); + +{ + package MyFilter; + @ISA=qw(HTML::Filter); + sub comment {} + sub output { push(@{$_[0]->{fhtml}}, $_[1]) } + sub filtered_html { join("", @{$_[0]->{fhtml}}) } +} + +my $f2 = MyFilter->new->parse_file($tmpfile)->filtered_html; +unlink($tmpfile) or warn "Can't unlink $tmpfile: $!"; + +#diag $f2; + +unlike($f2, qr/Foo/); +like($f2, qr/Bar/); + + diff --git a/t/handler-eof.t b/t/handler-eof.t new file mode 100644 index 0000000..39419dc --- /dev/null +++ b/t/handler-eof.t @@ -0,0 +1,54 @@ +use Test::More tests => 6; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3); + +$p->handler(start => sub { my $attr = shift; is($attr->{testno}, 1) }, + "attr"); +$p->handler(end => sub { shift->eof }, "self"); +my $text; +$p->handler(text => sub { $text = shift }, "text"); + +is($p->parse("<foo testno=1>"), $p); + +$text = ''; +ok(!$p->parse("</foo><foo testno=999>")); +ok(!$text); + +$p->handler(end => sub { $p->parse("foo"); }, ""); +eval { + $p->parse("</foo>"); +}; +like($@, qr/Parse loop not allowed/); + +# We used to get into an infinite loop if the eof triggered +# handler called ->eof + +use HTML::Parser; +$p = HTML::Parser->new(api_version => 3); + +my $i; +$p->handler("default" => + sub { + my $p=shift; + #++$i; diag "$i @_"; + $p->eof; + }, "self, event"); +$p->parse("Foo"); +$p->eof; + +# We used to sometimes trigger events after a handler signaled eof +my $title=''; +$p = HTML::Parser->new(api_version => 3,); +$p->handler(start=> \&title_handler, 'tagname, self'); +$p->parse("<head><title>foo</title>\n</head>"); +is($title, "foo"); + +sub title_handler { + return if shift ne 'title'; + my $self = shift; + $self->handler(text => sub { $title .= shift}, 'dtext'); + $self->handler(end => sub { shift->eof if shift eq 'title' }, 'tagname, self'); +} diff --git a/t/handler.t b/t/handler.t new file mode 100644 index 0000000..8d7bbc5 --- /dev/null +++ b/t/handler.t @@ -0,0 +1,67 @@ +# Test handler method + +use Test::More tests => 11; + +my $testno; + +use HTML::Parser; +{ + package MyParser; + use vars qw(@ISA); + @ISA=(HTML::Parser); + + sub foo + { + Test::More::is($_[1]{testno}, Test::More->builder->current_test + 1); + } + + sub bar + { + Test::More::is($_[1], Test::More->builder->current_test + 1); + } +} + +$p = MyParser->new(api_version => 3); + +eval { + $p->handler(foo => "foo", "foo"); +}; + +like($@, qr/^No handler for foo events/); + +eval { + $p->handler(start => "foo", "foo"); +}; +like($@, qr/^Unrecognized identifier foo in argspec/); + +my $h = $p->handler(start => "foo", "self,tagname"); +ok(!defined($h)); + +$x = \substr("xfoo", 1); +$p->handler(start => $$x, "self,attr"); +$p->parse("<a testno=4>"); + +$p->handler(start => \&MyParser::foo, "self,attr"); +$p->parse("<a testno=5>"); + +$p->handler(start => "foo"); +$p->parse("<a testno=6>"); + +$p->handler(start => "bar", "self,'7'"); +$p->parse("<a>"); + +eval { + $p->handler(start => {}, "self"); +}; +like($@, qr/^Only code or array references allowed as handler/); + +$a = []; +$p->handler(start => $a); +$h = $p->handler("start"); +is($p->handler("start", "foo"), $a); + +is($p->handler("start", \&MyParser::foo, ""), "foo"); + +is($p->handler("start"), \&MyParser::foo); + + diff --git a/t/headparser-http.t b/t/headparser-http.t new file mode 100644 index 0000000..b722c64 --- /dev/null +++ b/t/headparser-http.t @@ -0,0 +1,20 @@ +use Test::More tests => 1; + +eval { + require HTML::HeadParser; + $p = HTML::HeadParser->new; +}; + +SKIP: { +skip $@, 1 if $@ =~ /^Can't locate HTTP/; + +$p = HTML::HeadParser->new($h); +$p->parse(<<EOT); +<title>Stupid example</title> +<base href="http://www.sn.no/libwww-perl/"> +Normal text starts here. +EOT +$h = $p->header; +undef $p; +is($h->title, "Stupid example"); +} diff --git a/t/headparser.t b/t/headparser.t new file mode 100644 index 0000000..1b4b810 --- /dev/null +++ b/t/headparser.t @@ -0,0 +1,200 @@ +#!perl -w + +use strict; +use Test::More tests => 17; + +{ package H; + sub new { bless {}, shift; } + + sub header { + my $self = shift; + my $key = uc(shift); + die if $key =~ /:/; + my $old = $self->{$key}; + if (@_) { $self->{$key} = shift; } + $old; + } + + sub push_header { + my($self, $k, $v) = @_; + $k = uc($k); + die if $k =~ /:/; + if (exists $self->{$k}) { + $self->{$k} = [ $self->{$k} ] unless ref $self->{$k}; + push(@{$self->{$k}}, $v); + } else { + $self->{$k} = $v; + } + } + + sub as_string { + my $self = shift; + my $str = ""; + for (sort keys %$self) { + if (ref($self->{$_})) { + my $v; + for $v (@{$self->{$_}}) { + $str .= "$_: $v\n"; + } + } else { + $str .= "$_: $self->{$_}\n"; + } + } + $str; + } +} + + +my $HTML = <<'EOT'; + +<title>Å være eller å ikke være</title> +<meta http-equiv="Expires" content="Soon"> +<meta http-equiv="Foo" content="Bar"> +<meta name='twitter:card' content='photo' /> +<link href="mailto:gisle@aas.no" rev=made title="Gisle Aas"> + +<script> + + ignore this + +</script> +<noscript> ... and this </noscript> + +<object classid="foo"> + +<base href="http://www.sn.no"> +<meta name="Keywords" content="test, test, test,..."> +<meta name="Keywords" content="more"> +<meta charset="ISO-8859-1"><!-- HTML 5 --> + +Dette er vanlig tekst. Denne teksten definerer også slutten på +<head> delen av dokumentet. + +<style> + + ignore this too + +</style> + +<isindex> + +Dette er også vanlig tekst som ikke skal blir parset i det hele tatt. + +EOT + +$| = 1; + +#$HTML::HeadParser::DEBUG = 1; +require HTML::HeadParser; +my $p = HTML::HeadParser->new( H->new ); + +if ($p->parse($HTML)) { + fail("Need more data which should not happen"); +} else { + #diag $p->as_string; + pass(); +} + +like($p->header('Title'), qr/Å være eller å ikke være/); +is($p->header('Expires'), 'Soon'); +is($p->header('Content-Base'), 'http://www.sn.no'); +is_deeply($p->header('X-Meta-Keywords'), ['test, test, test,...', 'more']); +is($p->header('X-Meta-Charset'), 'ISO-8859-1'); +is($p->header('X-Meta-Twitter-Card'), 'photo'); +like($p->header('Link'), qr/<mailto:gisle\@aas.no>/); + +# This header should not be present because the head ended +ok(!$p->header('Isindex')); + + +# Try feeding one char at a time +my $expected = $p->as_string; +my $nl = 1; +$p = HTML::HeadParser->new(H->new); +while ($HTML =~ /(.)/sg) { + #print STDERR '#' if $nl; + #print STDERR $1; + $nl = $1 eq "\n"; + $p->parse($1) or last; +} +is($p->as_string, $expected); + + +# Try reading it from a file +my $file = "hptest$$.html"; +die "$file already exists" if -e $file; + +open(FILE, ">$file") or die "Can't create $file: $!"; +binmode(FILE); +print FILE $HTML; +print FILE "<p>This is more content...</p>\n" x 2000; +print FILE "<title>Buuuh!</title>\n" x 200; +close FILE or die "Can't close $file: $!"; + +$p = HTML::HeadParser->new(H->new); +$p->parse_file($file); +unlink($file) or warn "Can't unlink $file: $!"; + +is($p->header("Title"), "Å være eller å ikke være"); + + +# We got into an infinite loop on data without tags and no EOL. +# This was actually a HTML::Parser bug. +open(FILE, ">$file") or die "Can't create $file: $!"; +print FILE "Foo"; +close(FILE); + +$p = HTML::HeadParser->new(H->new); +$p->parse_file($file); +unlink($file) or warn "Can't unlink $file: $!"; + +ok(!$p->as_string); + +SKIP: { + skip "Need Unicode support", 5 if $] < 5.008; + + # Test that the Unicode BOM does not confuse us? + $p = HTML::HeadParser->new(H->new); + ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>")); + $p->eof; + + is($p->header("title"), "Hi <foo>"); + + $p = HTML::HeadParser->new(H->new); + $p->utf8_mode(1); + $p->parse(<<"EOT"); # example from http://rt.cpan.org/Ticket/Display.html?id=27522 +\xEF\xBB\xBF<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html> + <head> + <title> +Parkinson's disease</title> + <meta name="Keywords" content="brain,disease,dopamine,drug,levodopa,parkinson,patients,symptoms,,Medications, Medications"> + </meta> + \t +\t<link href="../../css/ummAdam.css" rel="stylesheet" type="text/css" /> +\t<link rel="stylesheet" rev="stylesheet" href="../../css/ummprint.css" media="print" /> +\t +\t </head> + <body> +EOT + $p->eof; + + is($p->header("title"), "Parkinson's disease"); + is($p->header("link")->[0], '<../../css/ummAdam.css>; rel="stylesheet"; type="text/css"'); + + $p = HTML::HeadParser->new(H->new); + $p->utf8_mode(1); + $p->parse(<<"EOT"); # example from http://www.mjw.com.pl/ +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\r +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="pl" lang="pl"> \r +\r +<head profile="http://gmpg.org/xfn/11">\r +<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\r +\r +<title> ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa – MJW</title>\r +<link rel="shortcut icon" href="favicon.ico" type="image/x-icon" />\r + +EOT + $p->eof; + is($p->header("title"), "ko\xC5\x84c\xC3\xB3wki kolekcji, outlet, hurtownia odzie\xC5\xBCy Warszawa \xE2\x80\x93 MJW"); +} diff --git a/t/ignore.t b/t/ignore.t new file mode 100644 index 0000000..008739e --- /dev/null +++ b/t/ignore.t @@ -0,0 +1,27 @@ + +use Test::More tests => 4; + +use strict; +use HTML::Parser (); + +my $html = '<A href="foo">text</A>'; + +my $text = ''; +my $p = HTML::Parser->new(default_h => [sub {$text .= shift;}, 'text']); +$p->parse($html)->eof; +is($text, $html); + +$text = ''; +$p->handler(start => ""); +$p->parse($html)->eof; +is($text, 'text</A>'); + +$text = ''; +$p->handler(end => 0); +$p->parse($html)->eof; +is($text, 'text'); + +$text = ''; +$p->handler(start => undef); +$p->parse($html)->eof; +is($text, '<A href="foo">text'); diff --git a/t/largetags.t b/t/largetags.t new file mode 100644 index 0000000..a9ed3ff --- /dev/null +++ b/t/largetags.t @@ -0,0 +1,38 @@ +# Exercise the tokenpos buffer allocation routines by feeding it +# very large tags. + +use Test::More tests => 2; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3); + +$p->handler("start" => + sub { + my $tp = shift; + #diag int(@$tp), " - ", join(", ", @$tp); + is(@$tp, 2 + 26 * 6 * 4); + }, "tokenpos"); + +$p->handler("declaration" => + sub { + my $t = shift; + #diag int(@$t), " - @$t"; + is(@$t, 26 * 6 * 2 + 1); + }, "tokens"); + +$p->parse("<a "); +for ("aa" .. "fz") { + $p->parse("$_=1 "); +} +$p->parse(">"); + +$p->parse("<!DOCTYPE "); +for ("aa" .. "fz") { + $p->parse("$_ -- $_ -- "); +} +$p->parse(">"); +$p->eof; +exit; + diff --git a/t/linkextor-base.t b/t/linkextor-base.t new file mode 100644 index 0000000..7ef8f02 --- /dev/null +++ b/t/linkextor-base.t @@ -0,0 +1,41 @@ +# This test that HTML::LinkExtor really absolutize links correctly +# when a base URL is given to the constructor. + +use Test::More tests => 5; +require HTML::LinkExtor; + +SKIP: { +eval { + require URI; +}; +skip $@, 5 if $@; + +# Try with base URL and the $p->links interface. +$p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html"); +$p->parse(<<HTML)->eof; +<head> +<base href="http://www.sn.no/"> +</head> +<body background="http://www.sn.no/sn.gif"> + +This is <A HREF="link.html">link</a> and an <img SRC="img.jpg" +lowsrc="img.gif" alt="Image">. +HTML + +@p = $p->links; + +# There should be 4 links in the document +is(@p, 4); + +for (@p) { + ($t, %attr) = @$_ if $_->[0] eq 'img'; +} + +is($t, 'img'); + +is(delete $attr{src}, "http://www.sn.no/foo/img.jpg"); + +is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif"); + +ok(!scalar(keys %attr)); # there should be no more attributes +} diff --git a/t/linkextor-rel.t b/t/linkextor-rel.t new file mode 100644 index 0000000..1190a96 --- /dev/null +++ b/t/linkextor-rel.t @@ -0,0 +1,36 @@ +use Test::More tests => 4; + +require HTML::LinkExtor; + +$HTML = <<HTML; +<head> +<base href="http://www.sn.no/"> +</head> +<body background="http://www.sn.no/sn.gif"> + +This is <A HREF="link.html">link</a> and an <img SRC="img.jpg" +lowsrc="img.gif" alt="Image">. +HTML + + +# Try the callback interface +$links = ""; +$p = HTML::LinkExtor->new( + sub { + my($tag, %links) = @_; + #diag "$tag @{[%links]}"; + $links .= "$tag @{[%links]}\n"; + }); + +$p->parse($HTML); $p->eof; + +ok($links =~ m|^base href http://www\.sn\.no/$|m); +ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m); +ok($links =~ m|^a href link\.html$|m); + +# Used to be problems when using the links method on a document with +# no links it it. This is a test to prove that it works. +$p = new HTML::LinkExtor; +$p->parse("this is a document with no links"); $p->eof; +@a = $p->links; +is(@a, 0); diff --git a/t/magic.t b/t/magic.t new file mode 100644 index 0000000..366f275 --- /dev/null +++ b/t/magic.t @@ -0,0 +1,41 @@ +# Check that the magic signature at the top of struct p_state works and that we +# catch modifications to _hparser_xs_state gracefully + +use Test::More tests => 5; + +use HTML::Parser; + +$p = HTML::Parser->new(api_version => 3); + +$p->xml_mode(1); + +# We should not be able to simply modify this stuff +eval { + ${$p->{_hparser_xs_state}} += 4; +}; +like($@, qr/^Modification of a read-only value attempted/); + + +my $x = delete $p->{_hparser_xs_state}; + +eval { + $p->xml_mode(1); +}; +like($@, qr/^Can't find '_hparser_xs_state'/); + +$p->{_hparser_xs_state} = \($$x + 16); + +eval { + $p->xml_mode(1); +}; +like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/); + +$p->{_hparser_xs_state} = 33; +eval { + $p->xml_mode(1); +}; +like($@, qr/^_hparser_xs_state element is not a reference/); + +$p->{_hparser_xs_state} = $x; + +ok($p->xml_mode(0)); diff --git a/t/marked-sect.t b/t/marked-sect.t new file mode 100644 index 0000000..6a63478 --- /dev/null +++ b/t/marked-sect.t @@ -0,0 +1,121 @@ +#!/usr/bin/perl -w + +use strict; +my $tag; +my $text; + +use HTML::Parser (); +my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"], + text_h => [sub { $text .= shift }, "dtext"], + ); + + +use Test::More tests => 14; + +SKIP: { +eval { + $p->marked_sections(1); +}; +skip $@, 14 if $@; + +$p->parse("<![[foo]]>"); +is($text, "foo"); + +$p->parse("<![TEMP INCLUDE[bar]]>"); +is($text, "foobar"); + +$p->parse("<![ INCLUDE -- IGNORE -- [foo<![IGNORE[bar]]>]]>\n<br>"); +is($text, "foobarfoo\n"); + +$text = ""; +$p->parse("<![ CDATA [<foo"); +$p->parse("<![IGNORE[bar]]>,bar>]]><br>"); +is($text, "<foo<![IGNORE[bar,bar>]]>"); + +$text = ""; +$p->parse("<![ RCDATA [å<a>]]><![CDATA[å<a>]]>å<a><br>"); +is($text, "å<a>å<a>å"); +is($tag, "br"); + +$text = ""; +$p->parse("<![INCLUDE RCDATA CDATA IGNORE [fooå<a>]]><br>"); +is($text, ""); + +$text = ""; +$p->parse("<![INCLUDE RCDATA CDATA [fooå<a>]]><br>"); +is($text, "fooå<a>"); + +$text = ""; +$p->parse("<![INCLUDE RCDATA [fooå<a>]]><br>"); +is($text, "fooå<a>"); + +$text = ""; +$p->parse("<![INCLUDE [fooå<a>]]><br>"); +is($text, "fooå"); + +$text = ""; +$p->parse("<![[fooå<a>]]><br>"); +is($text, "fooå"); + +# offsets/line/column numbers +$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"], + marked_sections => 1, + ); +$p->parse(<<'EOT')->eof; +<title>Test</title> +<![CDATA + [fooå<a> +]]> +<![[ +INCLUDE +STUFF +]]> + <h1>Test</h1> +EOT + +my @x; +sub x { + my($line, $col, $offset, $event, $text) = @_; + $text =~ s/\n/\\n/g; + $text =~ s/ /./g; + push(@x, "$line.$col:$offset $event \"$text\"\n"); +} + +#diag @x; +is(join("", @x), <<'EOT'); +1.0:0 start_document "" +1.0:0 start "<title>" +1.7:7 text "Test" +1.11:11 end "</title>" +1.19:19 text "\n" +3.3:32 text "fooå<a>\n" +4.3:49 text "\n" +5.4:54 text "\nINCLUDE\nSTUFF\n" +8.3:72 text "\n.." +9.2:75 start "<h1>" +9.6:79 text "Test" +9.10:83 end "</h1>" +9.15:88 text "\n" +10.0:89 end_document "" +EOT + +my $doc = "<Tag><![CDATA[This is cdata]]></Tag>"; +my $result = ""; +$p = HTML::Parser->new( + marked_sections => 1, + handlers => { + default => [ sub { $result .= join("",@_); }, "skipped_text,text" ] + } +)->parse($doc)->eof; +is($doc, $result); + +$text = ""; +$p = HTML::Parser->new( + text_h => [sub { $text .= shift }, "dtext"], + marked_sections => 1, +); + +$p->parse("<![CDATA[foo [1]]]>"); +is($text, "foo [1]", "CDATA text ending in square bracket"); + +} # SKIP diff --git a/t/msie-compat.t b/t/msie-compat.t new file mode 100644 index 0000000..a297f1e --- /dev/null +++ b/t/msie-compat.t @@ -0,0 +1,79 @@ +#!perl -w + +use strict; +use HTML::Parser; + +use Test::More tests => 4; + +my $TEXT = ""; +sub h +{ + my($event, $tagname, $text, @attr) = @_; + for ($event, $tagname, $text, @attr) { + if (defined) { + s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; + } + else { + $_ = "<undef>"; + } + } + + $TEXT .= "[$event,$tagname,$text," . join(":", @attr) . "]\n"; +} + +my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text,\@attr"]); +$p->parse("<a>"); +$p->parse("</a f>"); +$p->parse("</a 'foo<>' 'bar>' x>"); +$p->parse("</a \"foo<>\""); +$p->parse(" \"bar>\" x>"); +$p->parse("</ foo bar>"); +$p->parse("</ \"<>\" >"); +$p->parse("<!--comment>text<!--comment><p"); +$p->eof; + +is($TEXT, <<'EOT'); +[start_document,<undef>,,] +[start,a,<a>,] +[end,a,</a f>,] +[end,a,</a 'foo<>' 'bar>' x>,] +[end,a,</a "foo<>" "bar>" x>,] +[comment, foo bar,</ foo bar>,] +[comment, "<>" ,</ "<>" >,] +[comment,comment,<!--comment>,] +[text,<undef>,text,] +[comment,comment,<!--comment>,] +[comment,p,<p,] +[end_document,<undef>,,] +EOT + +$TEXT = ""; +$p->parse("<!comment>"); +$p->eof; + +is($TEXT, <<'EOT'); +[start_document,<undef>,,] +[comment,comment,<!comment>,] +[end_document,<undef>,,] +EOT + +$TEXT = ""; +$p->parse(q(<a name=`foo bar`>)); +$p->eof; + +is($TEXT, <<'EOT'); +[start_document,<undef>,,] +[start,a,<a name=`foo bar`>,name:`foo:bar`:bar`] +[end_document,<undef>,,] +EOT + +$p->backquote(1); +$TEXT = ""; +$p->parse(q(<a name=`foo bar`>)); +$p->eof; + +is($TEXT, <<'EOT'); +[start_document,<undef>,,] +[start,a,<a name=`foo bar`>,name:foo bar] +[end_document,<undef>,,] +EOT diff --git a/t/offset.t b/t/offset.t new file mode 100644 index 0000000..840728d --- /dev/null +++ b/t/offset.t @@ -0,0 +1,58 @@ +use strict; +use HTML::Parser (); +use Test::More tests => 1; + +my $HTML = <<'EOT'; + +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html40/strict.dtd"> + +<foo bar baz=3>heisan +</foo> <?process> +<!-- comment --> +<xmp>xmp</xmp> + +EOT + +my $p = HTML::Parser->new(api_version => 3); + +my $sum_len = 0; +my $count = 0; +my $err; + +$p->handler(default => + sub { + my($offset, $length, $offset_end, $line, $col, $text) = @_; + my $copy = $text; + $copy =~ s/\n/\\n/g; + substr($copy, 30) = "..." if length($copy) > 32; + #diag sprintf ">>> %d.%d %s", $line, $col, $copy; + if ($offset != $sum_len) { + diag "offset mismatch $offset vs $sum_len"; + $err++; + } + if ($offset_end != $offset + $length) { + diag "offset_end $offset_end wrong"; + $err++; + } + if ($length != length($text)) { + diag "length mismatch"; + $err++; + } + if (substr($HTML, $offset, $length) ne $text) { + diag "content mismatch"; + $err++; + } + $sum_len += $length; + $count++; + }, + 'offset,length,offset_end,line,column,text'); + +for (split(//, $HTML)) { + $p->parse($_); +} +$p->eof; + +ok($count > 5 && !$err); + + diff --git a/t/options.t b/t/options.t new file mode 100644 index 0000000..ff5f7db --- /dev/null +++ b/t/options.t @@ -0,0 +1,36 @@ +# Test option setting methods + +use Test::More tests => 10; + +use strict; +use HTML::Parser (); + +my $p = HTML::Parser->new(api_version => 3, + xml_mode => 1); +my $old; + +$old = $p->boolean_attribute_value("foo"); +ok(!defined $old); + +$old = $p->boolean_attribute_value(); +is($old, "foo"); + +$old = $p->boolean_attribute_value(undef); +is($old, "foo"); +ok(!defined($p->boolean_attribute_value)); + +ok($p->xml_mode(0)); +ok(!$p->xml_mode); + +my $seen_buggy_comment_warning; +$SIG{__WARN__} = + sub { + local $_ = shift; + $seen_buggy_comment_warning++ + if /^netscape_buggy_comment\(\) is deprecated/; + }; + +ok(!$p->strict_comment(1)); +ok($p->strict_comment); +ok(!$p->netscape_buggy_comment); +ok($seen_buggy_comment_warning); diff --git a/t/parsefile.t b/t/parsefile.t new file mode 100644 index 0000000..f373f06 --- /dev/null +++ b/t/parsefile.t @@ -0,0 +1,45 @@ +use Test::More tests => 6; + +my $filename = "file$$.htm"; +die "$filename is already there" if -e $filename; +open(FILE, ">$filename") || die "Can't create $filename: $!"; +print FILE <<'EOT'; close(FILE); +<title>Heisan</title> +EOT + +{ + package MyParser; + require HTML::Parser; + @ISA=qw(HTML::Parser); + + sub start + { + my($self, $tag, $attr) = @_; + Test::More::is($tag, "title"); + } +} + +MyParser->new->parse_file($filename); +open(FILE, $filename) || die; +MyParser->new->parse_file(*FILE); +seek(FILE, 0, 0) || die; +MyParser->new->parse_file(\*FILE); +close(FILE); + +require IO::File; +my $io = IO::File->new($filename) || die; +MyParser->new->parse_file($io); +$io->seek(0, 0) || die; +MyParser->new->parse_file(*$io); + +my $text = ''; +$io->seek(0, 0) || die; +MyParser->new( + start_h => [ sub{ shift->eof; }, "self" ], + text_h => [ sub{ $text = shift; }, "text" ])->parse_file(*$io); +ok(!$text); + +close($io); # needed because of bug in perl +undef($io); + +unlink($filename) or warn "Can't unlink $filename: $!"; diff --git a/t/parser.t b/t/parser.t new file mode 100644 index 0000000..0ce4d95 --- /dev/null +++ b/t/parser.t @@ -0,0 +1,184 @@ +use Test::More tests => 7; + +$HTML = <<'HTML'; + +<!DOCTYPE HTML> + +<body> + +Various entities. The parser must never break them in the middle: + +/ +/ +È +௖ + +å-Å + +<ul> +<li><a href="foo 'bar' baz>" id=33>This is a link</a> +<li><a href='foo "bar" baz> å' id=34>This is another one</a> +</ul> + +<p><div align="center"><img src="http://www.perl.com/perl.gif" +alt="camel"></div> + +<!-- this is +a comment --> and this is not. + +<!-- this is the kind of >comment< -- --> that Netscape hates --> + +< this > was not a tag. <this is/not either> + +</body> + +HTML + +#------------------------------------------------------------------- + +{ + package P; + require HTML::Parser; + @ISA=qw(HTML::Parser); + $OUT=''; + $COUNT=0; + + sub new + { + my $class = shift; + my $self = $class->SUPER::new; + $OUT = ''; + die "Can only have one" if $COUNT++; + $self; + } + + sub DESTROY + { + my $self = shift; + eval { $self->SUPER::DESTROY; }; + $COUNT--; + } + + sub declaration + { + my($self, $decl) = @_; + $OUT .= "[[$decl]]|"; + } + + sub start + { + my($self, $tag, $attr) = @_; + $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr); + $attr = "/$attr" if length $attr; + $OUT .= "<<$tag$attr>>|"; + } + + sub end + { + my($self, $tag) = @_; + $OUT .= ">>$tag<<|"; + } + + sub comment + { + my($self, $comment) = @_; + $OUT .= "##$comment##|"; + } + + sub text + { + my($self, $text) = @_; + #$text =~ s/\n/\\n/g; + #$text =~ s/\t/\\t/g; + #$text =~ s/ /·/g; + $OUT .= "$text|"; + } + + sub result + { + $OUT; + } +} + +for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") { +#for $chunksize (1) { + if ($chunksize =~ /^file/) { + #print "Parsing from $chunksize"; + } else { + #print "Parsing using $chunksize byte chunks"; + } + my $p = P->new; + + if ($chunksize =~ /^file/) { + # First we must create the file + my $tmpfile = "tmp-$$.html"; + my $file = $tmpfile; + die "$file already exists" if -e $file; + open(FILE, ">$file") or die "Can't create $file: $!"; + binmode FILE; + print FILE $HTML; + close(FILE); + + if ($chunksize eq "filehandle") { + require FileHandle; + my $fh = FileHandle->new($file) || die "Can't open $file: $!"; + $file = $fh; + } + + # then we can parse it. + $p->parse_file($file); + close $file if $chunksize eq "filehandle"; + unlink($tmpfile) || warn "Can't unlink $tmpfile: $!"; + } else { + my $copy = $HTML; + while (length $copy) { + my $chunk = substr($copy, 0, $chunksize); + substr($copy, 0, $chunksize) = ''; + $p->parse($chunk); + } + $p->eof; + } + + my $res = $p->result; + my $bad; + + # Then we start looking for things that should not happen + if ($res =~ /\s\|\s/) { + diag "broken space"; + $bad++; + } + for ( + # Make sure entities are not broken + '/', '/', 'È', '௖', '', 'å', 'Å', + + # Some elements that should be produced + "|[[DOCTYPE HTML]]|", + "|## this is\na comment ##|", + "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|", + '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>', + "|>>ul<<|", "|>>body<<|\n\n|", + ) + { + if (index($res, $_) < 0) { + diag "Can't find '$_' in parsed document"; + $bad++; + } + } + + diag $res if $bad || $ENV{PRINT_RESULTS}; + + # And we check that we get the same result all the time + $res =~ s/\|//g; # remove all break marks + if ($last_res && $res ne $last_res) { + diag "The result is not the same as last time"; + $bad++; + } + $last_res = $res; + + unless ($res =~ /Various entities/) { + diag "Some text must be missing"; + $bad++; + } + + ok(!$bad); +} diff --git a/t/plaintext.t b/t/plaintext.t new file mode 100644 index 0000000..9a53a78 --- /dev/null +++ b/t/plaintext.t @@ -0,0 +1,58 @@ +use Test::More tests => 3; + +use strict; +use HTML::Parser; + +my @a; +my $p = HTML::Parser->new(api_version => 3); +$p->handler(default => \@a, '@{event, text, is_cdata}'); +$p->parse(<<EOT)->eof; +<xmp><foo></xmp>x<plaintext><foo> +</plaintext> +foo +EOT + +for (@a) { + $_ = "" unless defined; +} + +my $doc = join(":", @a); + +#diag $doc; + +is($doc, "start_document:::start:<xmp>::text:<foo>:1:end:</xmp>::text:x::start:<plaintext>::text:<foo> +</plaintext> +foo +:1:end_document::"); + +@a = (); +$p->closing_plaintext('yep, emulate gecko'); +$p->parse(<<EOT)->eof; +<plaintext><foo> +</plaintext>foo<b></b> +EOT + +for (@a) { + $_ = "" unless defined; +} + +$doc = join(":", @a); + +#diag $doc; + +is($doc, "start_document:::start:<plaintext>::text:<foo> +:1:end:</plaintext>::text:foo::start:<b>::end:</b>::text: +::end_document::"); + +@a = (); +$p->closing_plaintext('yep, emulate gecko (2)'); +$p->parse(<<EOT)->eof; +<plaintext><foo> +foo<b></b> +EOT + +$doc = join(":", map { defined $_ ? $_ : "" } @a); + +is($doc, "start_document:::start:<plaintext>::text:<foo> +foo<b></b> +:1:end_document::"); @@ -0,0 +1,4 @@ +use Test::More; +eval "use Test::Pod 1.00"; +plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; +all_pod_files_ok(); diff --git a/t/process.t b/t/process.t new file mode 100644 index 0000000..9d27250 --- /dev/null +++ b/t/process.t @@ -0,0 +1,43 @@ +use strict; + +use Test::More tests => 12; + +my $pi; +my $orig; + +use HTML::Parser (); +my $p = HTML::Parser->new(process_h => [sub { $pi = shift; $orig = shift; }, + "token0,text"] + ); + +$p->parse("<a><?foo><a>"); + +is($pi, "foo"); +is($orig, "<?foo>"); + +$p->parse("<a><?><a>"); +is($pi, ""); +is($orig, "<?>"); + +$p->parse("<a><? +foo +><a>"); +is($pi, "\nfoo\n"); +is($orig, "<?\nfoo\n>"); + +for (qw(< a > < ? b a r > < a >)) { + $p->parse($_); +} + +is($pi, "bar"); +is($orig, "<?bar>"); + +$p->xml_mode(1); + +$p->parse("<a><?foo>bar??><a>"); +is($pi, "foo>bar?"); +is($orig, "<?foo>bar??>"); + +$p->parse("<a><??></a>"); +is($pi, ""); +is($orig, "<??>"); diff --git a/t/pullparser.t b/t/pullparser.t new file mode 100644 index 0000000..80a186b --- /dev/null +++ b/t/pullparser.t @@ -0,0 +1,55 @@ +use Test::More tests => 3; + +use HTML::PullParser; + +my $doc = <<'EOT'; +<title>Title</title> +<style> h1 { background: white } +<foo> +</style> +<H1 ID="3">Heading</H1> +<!-- ignore this --> + +This is a text with a <A HREF="http://www.sol.no" name="l1">link</a>. +EOT + +my $p = HTML::PullParser->new(doc => $doc, + start => 'event,tagname,@attr', + end => 'event,tagname', + text => 'event,dtext', + + ignore_elements => [qw(script style)], + unbroken_text => 1, + boolean_attribute_value => 1, + ); + +my $t = $p->get_token; +is($t->[0], "start"); +is($t->[1], "title"); +$p->unget_token($t); + +my @a; +while (my $t = $p->get_token) { + for (@$t) { + s/\s/./g; + } + push(@a, join("|", @$t)); +} + +my $res = join("\n", @a, ""); +#diag $res; +is($res, <<'EOT'); +start|title +text|Title +end|title +text|.. +start|h1|id|3 +text|Heading +end|h1 +text|...This.is.a.text.with.a. +start|a|href|http://www.sol.no|name|l1 +text|link +end|a +text|.. +EOT + diff --git a/t/script.t b/t/script.t new file mode 100644 index 0000000..2a75ccb --- /dev/null +++ b/t/script.t @@ -0,0 +1,41 @@ +#!perl -w + +use strict; +use Test; +plan tests => 1; + +use HTML::Parser; + +my $TEXT = ""; +sub h +{ + my($event, $tagname, $text) = @_; + for ($event, $tagname, $text) { + if (defined) { + s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; + } + else { + $_ = "<undef>"; + } + } + + $TEXT .= "[$event,$tagname,$text]\n"; +} + +my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"], empty_element_tags => 1); +$p->parse(q(<tr><td align="center" height="100"><script src="whatever"/><SCRIPT language="JavaScript1.1">bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');</SCRIPT></td></tr>)); +$p->eof; + +ok($TEXT, <<'EOT'); +[start_document,<undef>,] +[start,tr,<tr>] +[start,td,<td align="center" height="100">] +[start,script,<script src="whatever"/>] +[end,script,] +[start,script,<SCRIPT language="JavaScript1.1">] +[text,<undef>,bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');] +[end,script,</SCRIPT>] +[end,td,</td>] +[end,tr,</tr>] +[end_document,<undef>,] +EOT diff --git a/t/skipped-text.t b/t/skipped-text.t new file mode 100644 index 0000000..bc39915 --- /dev/null +++ b/t/skipped-text.t @@ -0,0 +1,89 @@ +use Test::More tests => 4; + +use strict; +use HTML::Parser; + +my $p = HTML::Parser->new(api_version => 3); + +$p->report_tags("a"); + +my @doc; + +$p->handler(start => \&a_handler, "skipped_text, text"); +$p->handler(end_document => \@doc, '@{skipped_text}'); + +$p->parse(<<EOT)->eof; +<title>hi</title> +<h1><a href="foo">link</a></h1> +and <a foo="">some</a> text. +EOT + +sub a_handler { + push(@doc, shift); + my $text = shift; + push(@doc, uc($text)); +} + + +is(join("", @doc), <<'EOT'); +<title>hi</title> +<h1><A HREF="FOO">link</a></h1> +and <A FOO="">some</a> text. +EOT + +# +# Comment stripper. Interaction with "" handlers. +# +my $doc = <<EOT; +<html>text</html> +<!-- comment --> +and some more <b>text</b>. +EOT +(my $expected = $doc) =~ s/<!--.*?-->//; + +$p = HTML::Parser->new(api_version => 3); +$p->handler(comment => ""); +$p->handler(end_document => sub { + my $stripped = shift; + #diag $stripped; + is($stripped, $expected); + }, "skipped_text"); +for (split(//, $doc)) { + $p->parse($_); +} +$p->eof; + +# +# Interaction with unbroken text +# +my @x; +$p = HTML::Parser->new(api_version => 3, unbroken_text => 1); +$p->handler(text => \@x, '@{"X", skipped_text, text}'); +$p->handler(end => ""); +$p->handler(end_document => \@x, '@{"Y", skipped_text}'); + +$doc = "a a<a>b b</a>c c<x>d d</x>e"; + +for (split(//, $doc)) { + $p->parse($_); +} +$p->eof; + +#diag join(":", @x); +is(join(":", @x), "X::a a:X:<a>:b bc c:X:<x>:d de:Y:"); + +# +# The crash that Chip found +# + +my $skipped; +$p = HTML::Parser->new( + ignore_tags => ["foo"], + start_h => [sub {$skipped = shift}, "skipped_text"], +); + +$p->parse("\x{100}<foo>"); +$p->parse("plain"); +$p->parse("<bar>"); +$p->eof; +is($skipped, "\x{100}<foo>plain"); diff --git a/t/stack-realloc.t b/t/stack-realloc.t new file mode 100644 index 0000000..46c7d35 --- /dev/null +++ b/t/stack-realloc.t @@ -0,0 +1,17 @@ +#!perl -w + +# HTML-Parser 3.33 and older used to core dump on this program because +# of missing SPAGAIN calls in parse() XS code. It was not prepared for +# the stack to get realloced. + +$| = 1; + +use Test::More tests => 1; + +use HTML::Parser; +my $x = HTML::Parser->new(api_version => 3); +my @row; +$x->handler(end => sub { push(@row, (1) x 505); 1 }, "tagname"); +$x->parse("</TD>"); + +pass; diff --git a/t/textarea.t b/t/textarea.t new file mode 100644 index 0000000..120f79b --- /dev/null +++ b/t/textarea.t @@ -0,0 +1,70 @@ +use Test::More tests => 1; + +use strict; +use HTML::Parser; + +my $html = <<'EOT'; +<html> +<title>This is a <nice> title</title> +<!--comment--> +<script language="perl">while (<DATA>) { & }</script> + +<FORM> + +<textarea name="foo" cols=50 rows=10> + +foo +<foo> +<!--comment--> +& +foo +</FORM> + +</textarea> + +</FORM> + +</html> +EOT + +my $dump = ""; +sub tdump { + my @a = @_; + for (@a) { + $_ = "<undef>" unless defined; + s/\n/\\n/g; + } + $dump .= join("|", @a) . "\n"; +} + +my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]); +$p->parse($html)->eof; + +#diag $dump; + +is($dump, <<'EOT'); +start_document||<undef>|<undef> +start|<html>|<undef>|<undef> +text|\n|\n| +start|<title>|<undef>|<undef> +text|This is a <nice> title|This is a <nice> title| +end|</title>|<undef>|<undef> +text|\n|\n| +comment|<!--comment-->|<undef>|<undef> +text|\n|\n| +start|<script language="perl">|<undef>|<undef> +text|while (<DATA>) { & }|while (<DATA>) { & }|1 +end|</script>|<undef>|<undef> +text|\n\n|\n\n| +start|<FORM>|<undef>|<undef> +text|\n\n|\n\n| +start|<textarea name="foo" cols=50 rows=10>|<undef>|<undef> +text|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n| +end|</textarea>|<undef>|<undef> +text|\n\n|\n\n| +end|</FORM>|<undef>|<undef> +text|\n\n|\n\n| +end|</html>|<undef>|<undef> +text|\n|\n| +end_document||<undef>|<undef> +EOT diff --git a/t/threads.t b/t/threads.t new file mode 100644 index 0000000..8da91e9 --- /dev/null +++ b/t/threads.t @@ -0,0 +1,39 @@ +# Verify thread safety. + +use Config; +use Test::More; + +BEGIN { + plan(skip_all => "Not configured for threads") + unless $Config{useithreads} && $] >= 5.008; + plan(tests => 1); +} + +use threads; +use HTML::Parser; + +my $ok=0; + +sub start +{ + my($tag,$attr)=@_; + + $ok += ($tag eq "foo"); + $ok += (defined($attr->{param}) && $attr->{param} eq "bar"); +} + +my $p = HTML::Parser->new + (api_version => 3, + handlers => { + start => [\&start, "tagname,attr"], + }); + +$p->parse("<foo pa"); + +$ok=async { + $p->parse("ram=bar>"); + $ok; +}->join(); + +is($ok,2); + diff --git a/t/tokeparser.t b/t/tokeparser.t new file mode 100644 index 0000000..2084201 --- /dev/null +++ b/t/tokeparser.t @@ -0,0 +1,164 @@ +use Test::More tests => 17; + +use strict; +use HTML::TokeParser; + +# First we create an HTML document to test + +my $file = "ttest$$.htm"; +die "$file already exists" if -e $file; + +open(F, ">$file") or die "Can't create $file: $!"; +print F <<'EOT'; close(F); + +<!--This is a test--> +<html><head><title> + This is the <title> +</title> + + <base href="http://www.perl.com"> +</head> + +<body background="bg.gif"> + + <h1>This is the <b>title</b> again + </h1> + + And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl"> <!--nice isn't it-->Institute</a> + + <br/><? process instruction > + +</body> +</html> + +EOT + +END { unlink($file) || warn "Can't unlink $file: $!"; } + + +my $p; + + +$p = HTML::TokeParser->new($file) || die "Can't open $file: $!"; +ok($p->unbroken_text); +if ($p->get_tag("foo", "title")) { + my $title = $p->get_trimmed_text; + #diag "Title: $title"; + is($title, "This is the <title>"); +} +undef($p); + +# Test with reference to glob +open(F, $file) || die "Can't open $file: $!"; +$p = HTML::TokeParser->new(\*F); +my $scount = 0; +my $ecount = 0; +my $tcount = 0; +my $pcount = 0; +while (my $token = $p->get_token) { + $scount++ if $token->[0] eq "S"; + $ecount++ if $token->[0] eq "E"; + $pcount++ if $token->[0] eq "PI"; +} +undef($p); +close F; + +# Test with glob +open(F, $file) || die "Can't open $file: $!"; +$p = HTML::TokeParser->new(*F); +$tcount++ while $p->get_tag; +undef($p); +close F; + +# Test with plain file name +$p = HTML::TokeParser->new($file) || die; +$tcount++ while $p->get_tag; +undef($p); + +#diag "Number of tokens found: $tcount/2 = $scount + $ecount"; +is($tcount, 34); +is($scount, 10); +is($ecount, 7); +is($pcount, 1); +is($tcount/2, $scount + $ecount); + +ok(!HTML::TokeParser->new("/noT/thEre/$$")); + + +$p = HTML::TokeParser->new($file) || die; +$p->get_tag("a"); +my $atext = $p->get_text; +undef($p); + +is($atext, "Perl\240Institute"); + +# test parsing of embeded document +$p = HTML::TokeParser->new(\<<HTML); +<title>Title</title> +<H1> +Heading +</h1> +HTML + +ok($p->get_tag("h1")); +is($p->get_trimmed_text, "Heading"); +undef($p); + +# test parsing of large embedded documents +my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022; + +#use Time::HiRes qw(time); +my $start = time; +$p = HTML::TokeParser->new(\$doc); +#diag "Construction time: ", time - $start; + +my $count; +while (my $t = $p->get_token) { + $count++ if $t->[0] eq "S"; +} +#diag "Parse time: ", time - $start; + +is($count, 2022); + +$p = HTML::TokeParser->new(\<<'EOT'); +<H1>This is a heading</H1> +This is s<b>o</b>me<hr>text. +<br /> +This is some more text. +<p> +This is even some more. +EOT + +$p->get_tag("/h1"); + +my $t = $p->get_trimmed_text("br", "p"); +is($t, "This is some text."); + +$p->get_tag; + +$t = $p->get_trimmed_text("br", "p"); +is($t,"This is some more text."); + +undef($p); + +$p = HTML::TokeParser->new(\<<'EOT'); +<H1>This is a <b>bold</b> heading</H1> +This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>. +<p> +This is even some more. +EOT + +$p->get_tag("h1"); + +$t = $p->get_phrase; +is($t, "This is a bold heading"); + +$t = $p->get_phrase; +is($t, ""); + +$p->get_tag; + +$t = $p->get_phrase; +is($t, "This is some italic text. This is some more text."); + +undef($p); diff --git a/t/uentities.t b/t/uentities.t new file mode 100644 index 0000000..36d5179 --- /dev/null +++ b/t/uentities.t @@ -0,0 +1,65 @@ +# Test Unicode entities + +use HTML::Entities; + +use Test::More tests => 26; + +SKIP: { +skip "This perl does not support Unicode or Unicode entities not selected", + 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT; + +is(decode_entities("&euro"), "&euro"); +is(decode_entities("€"), "\x{20AC}"); + +is(decode_entities("å"), "å"); +is(decode_entities("å"), "å"); + +is(decode_entities("񺄠"), chr(500000)); + +is(decode_entities("􏿽"), "\x{10FFFD}"); + +is(decode_entities(""), "\x{FFFC}"); + + +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities(""), ""); +is(decode_entities(""), "\x{FFFD}"); +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); + +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); + +is(decode_entities("&#ååå࿿"), "&#ååå\x{FFF}"); + +# This might fail when we get more than 64 bit UVs +is(decode_entities("�"), "�"); +is(decode_entities("�"), "�"); + +my $err; +for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) { + my $a = join("", map chr, $_->[0] .. $_->[1]); + + my $e = encode_entities($a); + my $d = decode_entities($e); + + unless ($d eq $a) { + diag "Wrong decoding in range $_->[0] .. $_->[1]"; + # use Devel::Peek; Dump($a); Dump($d); + $err++; + } +} +ok(!$err); + + +is(decode_entities("��"), chr(0x100085)); + +is(decode_entities("�"), chr(0xFFFD)); + +is(decode_entities("\260’\260"), "\x{b0}\x{2019}\x{b0}"); +} diff --git a/t/unbroken-text.t b/t/unbroken-text.t new file mode 100644 index 0000000..7de85a9 --- /dev/null +++ b/t/unbroken-text.t @@ -0,0 +1,60 @@ +use strict; +use HTML::Parser; + +use Test::More tests => 3; + +my $text = ""; +sub text +{ + my $cdata = shift() ? "CDATA" : "TEXT"; + my($offset, $line, $col, $t) = @_; + $text .= "[$cdata:$offset:$line.$col:$t]"; +} + +sub tag +{ + $text .= shift; +} + +my $p = HTML::Parser->new(unbroken_text => 1, + text_h => [\&text, "is_cdata,offset,line,column,text"], + start_h => [\&tag, "text"], + end_h => [\&tag, "text"], + ); + +$p->parse("foo "); +$p->parse("bar "); +$p->parse("<foo>"); +$p->parse("bar\n"); +$p->parse("</foo>"); +$p->parse("<xmp>xmp</xmp>"); +$p->parse("atend"); + +#diag $text; +is($text, "[TEXT:0:1.0:foo bar ]<foo>[TEXT:13:1.13:bar\n]</foo><xmp>[CDATA:28:2.11:xmp]</xmp>"); + +$text = ""; +$p->eof; + +#diag $text; +is($text, "[TEXT:37:2.20:atend]"); + + +$p = HTML::Parser->new(unbroken_text => 1, + text_h => [\&text, "is_cdata,offset,line,column,text"], + ); + +$text = ""; +$p->parse("foo"); +$p->parse("<foo"); +$p->parse(">bar\n"); +$p->parse("foo<xm"); +$p->parse("p>xmp"); +$p->parse("</xmp"); +$p->parse(">bar"); +$p->eof; + +#diag $text; +is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]"); + + diff --git a/t/unicode-bom.t b/t/unicode-bom.t new file mode 100644 index 0000000..b7398cf --- /dev/null +++ b/t/unicode-bom.t @@ -0,0 +1,63 @@ +#!perl -w + +use strict; +use Test::More tests => 2; +use HTML::Parser; + +SKIP: { +skip "This perl does not support Unicode", 2 if $] < 5.008; + +my @parsed; +my $p = HTML::Parser->new( + api_version => 3, + start_h => [\@parsed, 'tag, attr'], +); + +my @warn; +$SIG{__WARN__} = sub { + push(@warn, $_[0]); +}; + +$p->parse("\xEF\xBB\xBF<head>Hi there</head>"); +$p->eof; + +#use Encode; +$p->parse("\xEF\xBB\xBF<head>Hi there</head>" . chr(0x263A)); +$p->eof; + +$p->parse("\xFF\xFE<head>Hi there</head>"); +$p->eof; + +$p->parse("\xFE\xFF<head>Hi there</head>"); +$p->eof; + +$p->parse("\0\0\xFF\xFE<head>Hi there</head>"); +$p->eof; + +$p->parse("\xFE\xFF\0\0<head>Hi there</head>"); +$p->eof; + +for (@warn) { + s/line (\d+)/line ##/g; +} + +is(join("", @warn), <<EOT); +Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##. +Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line ##. +Parsing of undecoded UTF-16 at $0 line ##. +Parsing of undecoded UTF-16 at $0 line ##. +Parsing of undecoded UTF-32 at $0 line ##. +Parsing of undecoded UTF-32 at $0 line ##. +EOT + +@warn = (); + +$p = HTML::Parser->new( + api_version => 3, + start_h => [\@parsed, 'tag'], +); + +$p->parse("\xEF\xBB\xBF<head>Hi there</head>"); +$p->eof; +ok(!@warn); +} diff --git a/t/unicode.t b/t/unicode.t new file mode 100644 index 0000000..911c547 --- /dev/null +++ b/t/unicode.t @@ -0,0 +1,198 @@ +#!perl -w + +use strict; +use HTML::Parser; +use Test::More; +BEGIN { + plan skip_all => "This perl does not support Unicode" if $] < 5.008; +} + +plan tests => 105; + +my @warn; +$SIG{__WARN__} = sub { + push(@warn, $_[0]); +}; + +my @parsed; +my $p = HTML::Parser->new( + api_version => 3, + default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'], +); + +my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile ☺</h1>\x{0420}"; +is(length($doc), 46); + +$p->parse($doc)->eof; + +#use Data::Dump; Data::Dump::dump(@parsed); + +is(@parsed, 9); +is($parsed[0][0], "start_document"); + +is($parsed[1][0], "start"); +is($parsed[1][1], "<title>"); +SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") }; +is($parsed[1][3], 0); +is($parsed[1][4], 7); + +is($parsed[2][0], "text"); +is(ord($parsed[2][1]), 0x263A); +is($parsed[2][2], chr(0x263A)); +is($parsed[2][3], 7); +is($parsed[2][4], 1); +is($parsed[2][5], 8); +is($parsed[2][6], 7); + +is($parsed[3][0], "end"); +is($parsed[3][1], "</title>"); +is($parsed[3][3], 8); +is($parsed[3][6], 8); + +is($parsed[4][0], "start"); +is($parsed[4][1], "<h1 id=\x{2600} f>"); +is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0"); +is($parsed[4][8]{id}, "\x{2600}"); + +is($parsed[5][0], "text"); +is($parsed[5][1], "Smile ☺"); +is($parsed[5][2], "Smile \x{263A}"); + +is($parsed[7][0], "text"); +is($parsed[7][1], "\x{0420}"); +is($parsed[7][2], "\x{0420}"); + +is($parsed[8][0], "end_document"); +is($parsed[8][3], length($doc)); +is($parsed[8][5], length($doc)); +is($parsed[8][6], length($doc)); +is(@warn, 0); + +# Try to parse it as an UTF8 encoded string +utf8::encode($doc); +is(length($doc), 51); + +@parsed = (); +$p->parse($doc)->eof; + +#use Data::Dump; Data::Dump::dump(@parsed); + +is(@parsed, 9); +is($parsed[0][0], "start_document"); + +is($parsed[1][0], "start"); +is($parsed[1][1], "<title>"); +SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") }; +is($parsed[1][3], 0); +is($parsed[1][4], 7); + +is($parsed[2][0], "text"); +is(ord($parsed[2][1]), 226); +is($parsed[2][1], "\xE2\x98\xBA"); +is($parsed[2][2], "\xE2\x98\xBA"); +is($parsed[2][3], 7); +is($parsed[2][4], 3); +is($parsed[2][5], 10); +is($parsed[2][6], 7); + +is($parsed[3][0], "end"); +is($parsed[3][1], "</title>"); +is($parsed[3][3], 10); +is($parsed[3][6], 10); + +is($parsed[4][0], "start"); +is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>"); +is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0"); +is($parsed[4][8]{id}, "\xE2\x98\x80"); + +is($parsed[5][0], "text"); +is($parsed[5][1], "Smile ☺"); +is($parsed[5][2], "Smile \x{263A}"); + +is($parsed[8][0], "end_document"); +is($parsed[8][3], length($doc)); +is($parsed[8][5], length($doc)); +is($parsed[8][6], length($doc)); + +is(@warn, 1); +like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/); + +my $file = "test-$$.html"; +open(my $fh, ">:utf8", $file) || die; +print $fh <<EOT; +\x{FEFF} +<title>\x{263A} Love! </title> +<h1 id=♥\x{2665}>♥ Love \x{2665}<h1> +EOT +close($fh) || die; + +@warn = (); +@parsed = (); +$p->parse_file($file); +is(@parsed, "11"); +is($parsed[6][0], "start"); +is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5"); +is($parsed[7][0], "text"); +is($parsed[7][1], "♥ Love \xE2\x99\xA5"); +is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage +is($parsed[10][3], -s $file); +is(@warn, 1); +like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/); + +@warn = (); +@parsed = (); +open($fh, "<:raw:utf8", $file) || die; +$p->parse_file($fh); +is(@parsed, "11"); +is($parsed[6][0], "start"); +is($parsed[6][8]{id}, "\x{2665}\x{2665}"); +is($parsed[7][0], "text"); +is($parsed[7][1], "♥ Love \x{2665}"); +is($parsed[7][2], "\x{2665} Love \x{2665}"); +is($parsed[10][3], (-s $file) - 2 * 4); +is(@warn, 0); + +@warn = (); +@parsed = (); +open($fh, "<:raw", $file) || die; +$p->utf8_mode(1); +$p->parse_file($fh); +is(@parsed, "11"); +is($parsed[6][0], "start"); +is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5"); +is($parsed[7][0], "text"); +is($parsed[7][1], "♥ Love \xE2\x99\xA5"); +is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5"); +is($parsed[10][3], -s $file); +is(@warn, 0); + +unlink($file); + +@parsed = (); +$p->parse(q(<a href="a=1&lang=2×=3">foo</a>))->eof; +is(@parsed, "5"); +is($parsed[1][0], "start"); +is($parsed[1][8]{href}, "a=1&lang=2\xd7=3"); + +ok(!HTML::Entities::_probably_utf8_chunk("")); +ok(!HTML::Entities::_probably_utf8_chunk("f")); +ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5")); +ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o")); +ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2")); +ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99")); +ok(!HTML::Entities::_probably_utf8_chunk("f\xE2")); +ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99")); + +$p = HTML::Parser->new( + api_version => 3, + default_h => [\@parsed, 'event, text, tag, attr'], + attr_encoded => 1, +); + +@warn = (); +@parsed = (); + +$p->parse($doc)->eof; + +ok(!@warn); +is(@parsed, 9); diff --git a/t/xml-mode.t b/t/xml-mode.t new file mode 100644 index 0000000..cdfc5b0 --- /dev/null +++ b/t/xml-mode.t @@ -0,0 +1,112 @@ +use strict; +use Test::More tests => 8; + +use HTML::Parser (); +my $p = HTML::Parser->new(xml_mode => 1, + ); + +my $text = ""; +$p->handler(start => + sub { + my($tag, $attr) = @_; + $text .= "S[$tag"; + for my $k (sort keys %$attr) { + my $v = $attr->{$k}; + $text .= " $k=$v"; + } + $text .= "]"; + }, "tagname,attr"); +$p->handler(end => + sub { + $text .= "E[" . shift() . "]"; + }, "tagname"); +$p->handler(process => + sub { + $text .= "PI[" . shift() . "]"; + }, "token0"); +$p->handler(text => + sub { + $text .= shift; + }, "text"); + +my $xml = <<'EOT'; +<?xml version="1.0"?> +<?IS10744:arch name="html"?><!-- comment --> +<DOC> +<title html="h1">My first architectual document</title> +<author html="address">Geir Ove Gronmo, grove@infotek.no</author> +<para>This is the first paragraph in this document</para> +<para html="p">This is the second paragraph</para> +<para/> +<xmp><foo></foo></xmp> +</DOC> +EOT + +$p->parse($xml)->eof; + +is($text, <<'EOT'); +PI[xml version="1.0"] +PI[IS10744:arch name="html"] +S[DOC] +S[title html=h1]My first architectual documentE[title] +S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author] +S[para]This is the first paragraph in this documentE[para] +S[para html=p]This is the second paragraphE[para] +S[para]E[para] +S[xmp]S[foo]E[foo]E[xmp] +E[DOC] +EOT + +$text = ""; +$p->xml_mode(0); +$p->parse($xml)->eof; + +is($text, <<'EOT'); +PI[xml version="1.0"?] +PI[IS10744:arch name="html"?] +S[doc] +S[title html=h1]My first architectual documentE[title] +S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author] +S[para]This is the first paragraph in this documentE[para] +S[para html=p]This is the second paragraphE[para] +S[para/] +S[xmp]<foo></foo>E[xmp] +E[doc] +EOT + +# Test that we get an empty tag back +$p = HTML::Parser->new(api_version => 3, + xml_mode => 1); + +$p->handler("end" => + sub { + my($tagname, $text) = @_; + is($tagname, "Xyzzy"); + ok(!length($text)); + }, "tagname,text"); +$p->parse("<Xyzzy foo=bar/>and some more")->eof; + +# Test that we get an empty tag back +$p = HTML::Parser->new(api_version => 3, + empty_element_tags => 1); + +$p->handler("end" => + sub { + my($tagname, $text) = @_; + is($tagname, "xyzzy"); + ok(!length($text)); + }, "tagname,text"); +$p->parse("<Xyzzy foo=bar/>and some more")->eof; + +$p = HTML::Parser->new( + api_version => 3, + xml_pic => 1, +); + +$p->handler( + "process" => sub { + my($text, $t0) = @_; + is($text, "<?foo > bar?>"); + is($t0, "foo > bar"); + }, "text, token0"); +$p->parse("<?foo > bar?> and then")->eof; diff --git a/tokenpos.h b/tokenpos.h new file mode 100644 index 0000000..aa971bf --- /dev/null +++ b/tokenpos.h @@ -0,0 +1,49 @@ +struct token_pos +{ + char *beg; + char *end; +}; +typedef struct token_pos token_pos_t; + +#define dTOKENS(init_lim) \ + token_pos_t token_buf[init_lim]; \ + int token_lim = init_lim; \ + token_pos_t *tokens = token_buf; \ + int num_tokens = 0 + +#define PUSH_TOKEN(p_beg, p_end) \ + STMT_START { \ + ++num_tokens; \ + if (num_tokens == token_lim) \ + tokens_grow(&tokens, &token_lim, (bool)(tokens != token_buf)); \ + tokens[num_tokens-1].beg = p_beg; \ + tokens[num_tokens-1].end = p_end; \ + } STMT_END + +#define FREE_TOKENS \ + STMT_START { \ + if (tokens != token_buf) \ + Safefree(tokens); \ + } STMT_END + +static void +tokens_grow(token_pos_t **token_ptr, int *token_lim_ptr, bool tokens_on_heap) +{ + int new_lim = *token_lim_ptr; + if (new_lim < 4) + new_lim = 4; + new_lim *= 2; + + if (tokens_on_heap) { + Renew(*token_ptr, new_lim, token_pos_t); + } + else { + token_pos_t *new_tokens; + int i; + New(57, new_tokens, new_lim, token_pos_t); + for (i = 0; i < *token_lim_ptr; i++) + new_tokens[i] = (*token_ptr)[i]; + *token_ptr = new_tokens; + } + *token_lim_ptr = new_lim; +} @@ -0,0 +1,5 @@ +PSTATE* T_PSTATE + +INPUT +T_PSTATE + $var = get_pstate_hv(aTHX_ $arg) @@ -0,0 +1,311 @@ +/* + * Copyright 1999-2009, Gisle Aas. + * + * This library is free software; you can redistribute it and/or + * modify it under the same terms as Perl itself. + */ + +#ifndef EXTERN +#define EXTERN extern +#endif + + +EXTERN SV* +sv_lower(pTHX_ SV* sv) +{ + STRLEN len; + char *s = SvPV_force(sv, len); + for (; len--; s++) + *s = toLOWER(*s); + return sv; +} + +EXTERN int +strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case) +{ + while (n--) { + if (ignore_case) { + if (toLOWER(*s1) != toLOWER(*s2)) + return 0; + } + else { + if (*s1 != *s2) + return 0; + } + s1++; + s2++; + } + return 1; +} + +static void +grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e) +{ + /* + SvPVX ---> AAAAAA...BBBBBB + ^ ^ ^ + t s e + */ + STRLEN t_offset = *t - SvPVX(sv); + STRLEN s_offset = *s - SvPVX(sv); + STRLEN e_offset = *e - SvPVX(sv); + + SvGROW(sv, e_offset + grow + 1); + + *t = SvPVX(sv) + t_offset; + *s = SvPVX(sv) + s_offset; + *e = SvPVX(sv) + e_offset; + + Move(*s, *s+grow, *e - *s, char); + *s += grow; + *e += grow; +} + +EXTERN SV* +decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) +{ + STRLEN len; + char *s = SvPV_force(sv, len); + char *t = s; + char *end = s + len; + char *ent_start; + + char *repl; + STRLEN repl_len; +#ifdef UNICODE_HTML_PARSER + char buf[UTF8_MAXLEN]; + int repl_utf8; + int high_surrogate = 0; +#else + char buf[1]; +#endif + +#if defined(__GNUC__) && defined(UNICODE_HTML_PARSER) + /* gcc -Wall reports this variable as possibly used uninitialized */ + repl_utf8 = 0; +#endif + + while (s < end) { + assert(t <= s); + + if ((*t++ = *s++) != '&') + continue; + + ent_start = s; + repl = 0; + + if (s < end && *s == '#') { + UV num = 0; + int ok = 0; + s++; + if (s < end && (*s == 'x' || *s == 'X')) { + s++; + while (s < end) { + char *tmp = strchr(PL_hexdigit, *s); + if (!tmp) + break; + num = num << 4 | ((tmp - PL_hexdigit) & 15); + if (num > 0x10FFFF) { + /* overflow */ + ok = 0; + break; + } + s++; + ok = 1; + } + } + else { + while (s < end && isDIGIT(*s)) { + num = num * 10 + (*s - '0'); + if (num > 0x10FFFF) { + /* overflow */ + ok = 0; + break; + } + s++; + ok = 1; + } + } + if (num && ok) { +#ifdef UNICODE_HTML_PARSER + if (!SvUTF8(sv) && num <= 255) { + buf[0] = (char) num; + repl = buf; + repl_len = 1; + repl_utf8 = 0; + } + else if (num == 0xFFFE || num == 0xFFFF) { + /* illegal */ + } + else { + char *tmp; + if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */ + if (high_surrogate != 0) { + t -= 3; /* Back up past 0xFFFD */ + num = ((high_surrogate - 0xD800) << 10) + + (num - 0xDC00) + 0x10000; + high_surrogate = 0; + } else { + num = 0xFFFD; + } + } + else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */ + high_surrogate = num; + num = 0xFFFD; + } + else { + high_surrogate = 0; + /* otherwise invalid? */ + if ((num >= 0xFDD0 && num <= 0xFDEF) || + ((num & 0xFFFE) == 0xFFFE) || + num > 0x10FFFF) + { + num = 0xFFFD; + } + } + + tmp = (char*)uvuni_to_utf8((U8*)buf, num); + repl = buf; + repl_len = tmp - buf; + repl_utf8 = 1; + } +#else + if (num <= 255) { + buf[0] = (char) num & 0xFF; + repl = buf; + repl_len = 1; + } +#endif + } + } + else { + char *ent_name = s; + while (s < end && isALNUM(*s)) + s++; + if (ent_name != s && entity2char) { + SV** svp; + if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) || + (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0))) + ) + { + repl = SvPV(*svp, repl_len); +#ifdef UNICODE_HTML_PARSER + repl_utf8 = SvUTF8(*svp); +#endif + } + else if (expand_prefix) { + char *ss = s - 1; + while (ss > ent_name) { + svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0); + if (svp) { + repl = SvPV(*svp, repl_len); +#ifdef UNICODE_HTML_PARSER + repl_utf8 = SvUTF8(*svp); +#endif + s = ss; + break; + } + ss--; + } + } + } +#ifdef UNICODE_HTML_PARSER + high_surrogate = 0; +#endif + } + + if (repl) { + char *repl_allocated = 0; + if (s < end && *s == ';') + s++; + t--; /* '&' already copied, undo it */ + +#ifdef UNICODE_HTML_PARSER + if (*s != '&') { + high_surrogate = 0; + } + + if (!SvUTF8(sv) && repl_utf8) { + /* need to upgrade sv before we continue */ + STRLEN before_gap_len = t - SvPVX(sv); + char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len); + STRLEN after_gap_len = end - s; + char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len); + + sv_setpvn(sv, before_gap, before_gap_len); + sv_catpvn(sv, after_gap, after_gap_len); + SvUTF8_on(sv); + + Safefree(before_gap); + Safefree(after_gap); + + s = t = SvPVX(sv) + before_gap_len; + end = SvPVX(sv) + before_gap_len + after_gap_len; + } + else if (SvUTF8(sv) && !repl_utf8) { + repl = (char*)bytes_to_utf8((U8*)repl, &repl_len); + repl_allocated = repl; + } +#endif + + if (t + repl_len > s) { + /* need to grow the string */ + grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end); + } + + /* copy replacement string into string */ + while (repl_len--) + *t++ = *repl++; + + if (repl_allocated) + Safefree(repl_allocated); + } + else { + while (ent_start < s) + *t++ = *ent_start++; + } + } + + *t = '\0'; + SvCUR_set(sv, t - SvPVX(sv)); + + return sv; +} + +#ifdef UNICODE_HTML_PARSER +static bool +has_hibit(char *s, char *e) +{ + while (s < e) { + U8 ch = *s++; + if (!UTF8_IS_INVARIANT(ch)) { + return 1; + } + } + return 0; +} + + +EXTERN bool +probably_utf8_chunk(pTHX_ char *s, STRLEN len) +{ + char *e = s + len; + STRLEN clen; + + /* ignore partial utf8 char at end of buffer */ + while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1))) + e--; + if (s < e && UTF8_IS_START((U8)*(e - 1))) + e--; + clen = len - (e - s); + if (clen && UTF8SKIP(e) == clen) { + /* all promised continuation bytes are present */ + e = s + len; + } + + if (!has_hibit(s, e)) + return 0; + + return is_utf8_string((U8*)s, e - s); +} +#endif |