diff options
Diffstat (limited to 'ext')
65 files changed, 0 insertions, 9384 deletions
diff --git a/ext/HTML/Parser/Makefile.PL b/ext/HTML/Parser/Makefile.PL deleted file mode 100644 index 79081f7985..0000000000 --- a/ext/HTML/Parser/Makefile.PL +++ /dev/null @@ -1,30 +0,0 @@ -require 5.006; -use strict; -use ExtUtils::MakeMaker; - -WriteMakefile( - NAME => 'HTML::Parser', - VERSION_FROM => 'Parser.pm', - H => [ "hparser.h", "hctype.h", "tokenpos.h", "pfunc.h", - "hparser.c", "util.c", - ], - PREREQ_PM => { - 'HTML::Tagset' => 3, - 'Test::More' => 0, # only needed to run 'make test' - }, - DEFINE => "-DMARKED_SECTION", - dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, - clean => { FILES => 'hctype.h pfunc.h' }, -); - - -sub MY::postamble -{ - ' -pfunc.h : mkpfunc - $(PERL) mkpfunc >pfunc.h - -hctype.h : mkhctype - $(PERL) mkhctype >hctype.h -' -} diff --git a/ext/HTML/Parser/Parser.pm b/ext/HTML/Parser/Parser.pm deleted file mode 100644 index 72d5a9841f..0000000000 --- a/ext/HTML/Parser/Parser.pm +++ /dev/null @@ -1,1233 +0,0 @@ -package HTML::Parser; - -# Copyright 1996-2007, Gisle Aas. -# Copyright 1999-2000, Michael A. Chase. -# -# This library is free software; you can redistribute it and/or -# modify it under the same terms as Perl itself. - -use strict; -use vars qw($VERSION @ISA); - -$VERSION = '3.56'; # $Date: 2007/01/12 09:18:31 $ - -require HTML::Entities; - -require XSLoader; -XSLoader::load('HTML::Parser', $VERSION); - -sub new -{ - my $class = shift; - my $self = bless {}, $class; - return $self->init(@_); -} - - -sub init -{ - my $self = shift; - $self->_alloc_pstate; - - my %arg = @_; - my $api_version = delete $arg{api_version} || (@_ ? 3 : 2); - if ($api_version >= 4) { - require Carp; - Carp::croak("API version $api_version not supported " . - "by HTML::Parser $VERSION"); - } - - if ($api_version < 3) { - # Set up method callbacks compatible with HTML-Parser-2.xx - $self->handler(text => "text", "self,text,is_cdata"); - $self->handler(end => "end", "self,tagname,text"); - $self->handler(process => "process", "self,token0,text"); - $self->handler(start => "start", - "self,tagname,attr,attrseq,text"); - - $self->handler(comment => - sub { - my($self, $tokens) = @_; - for (@$tokens) { - $self->comment($_); - } - }, "self,tokens"); - - $self->handler(declaration => - sub { - my $self = shift; - $self->declaration(substr($_[0], 2, -1)); - }, "self,text"); - } - - if (my $h = delete $arg{handlers}) { - $h = {@$h} if ref($h) eq "ARRAY"; - while (my($event, $cb) = each %$h) { - $self->handler($event => @$cb); - } - } - - # In the end we try to assume plain attribute or handler - while (my($option, $val) = each %arg) { - if ($option =~ /^(\w+)_h$/) { - $self->handler($1 => @$val); - } - elsif ($option =~ /^(text|start|end|process|declaration|comment)$/) { - require Carp; - Carp::croak("Bad constructor option '$option'"); - } - else { - $self->$option($val); - } - } - - return $self; -} - - -sub parse_file -{ - my($self, $file) = @_; - my $opened; - if (!ref($file) && ref(\$file) ne "GLOB") { - # Assume $file is a filename - local(*F); - open(F, $file) || return undef; - binmode(F); # should we? good for byte counts - $opened++; - $file = *F; - } - my $chunk = ''; - while (read($file, $chunk, 512)) { - $self->parse($chunk) || last; - } - close($file) if $opened; - $self->eof; -} - - -sub netscape_buggy_comment # legacy -{ - my $self = shift; - require Carp; - Carp::carp("netscape_buggy_comment() is deprecated. " . - "Please use the strict_comment() method instead"); - my $old = !$self->strict_comment; - $self->strict_comment(!shift) if @_; - return $old; -} - -# set up method stubs -sub text { } -*start = \&text; -*end = \&text; -*comment = \&text; -*declaration = \&text; -*process = \&text; - -1; - -__END__ - - -=head1 NAME - -HTML::Parser - HTML parser class - -=head1 SYNOPSIS - - use HTML::Parser (); - - # Create parser object - $p = HTML::Parser->new( api_version => 3, - start_h => [\&start, "tagname, attr"], - end_h => [\&end, "tagname"], - marked_sections => 1, - ); - - # Parse document text chunk by chunk - $p->parse($chunk1); - $p->parse($chunk2); - #... - $p->eof; # signal end of document - - # Parse directly from file - $p->parse_file("foo.html"); - # or - open(my $fh, "<:utf8", "foo.html") || die; - $p->parse_file($fh); - -=head1 DESCRIPTION - -Objects of the C<HTML::Parser> class will recognize markup and -separate it from plain text (alias data content) in HTML -documents. As different kinds of markup and text are recognized, the -corresponding event handlers are invoked. - -C<HTML::Parser> is not a generic SGML parser. We have tried to -make it able to deal with the HTML that is actually "out there", and -it normally parses as closely as possible to the way the popular web -browsers do it instead of strictly following one of the many HTML -specifications from W3C. Where there is disagreement, there is often -an option that you can enable to get the official behaviour. - -The document to be parsed may be supplied in arbitrary chunks. This -makes on-the-fly parsing as documents are received from the network -possible. - -If event driven parsing does not feel right for your application, you -might want to use C<HTML::PullParser>. This is an C<HTML::Parser> -subclass that allows a more conventional program structure. - - -=head1 METHODS - -The following method is used to construct a new C<HTML::Parser> object: - -=over - -=item $p = HTML::Parser->new( %options_and_handlers ) - -This class method creates a new C<HTML::Parser> object and -returns it. Key/value argument pairs may be provided to assign event -handlers or initialize parser options. The handlers and parser -options can also be set or modified later by the method calls described below. - -If a top level key is in the form "<event>_h" (e.g., "text_h") then it -assigns a handler to that event, otherwise it initializes a parser -option. The event handler specification value must be an array -reference. Multiple handlers may also be assigned with the 'handlers -=> [%handlers]' option. See examples below. - -If new() is called without any arguments, it will create a parser that -uses callback methods compatible with version 2 of C<HTML::Parser>. -See the section on "version 2 compatibility" below for details. - -The special constructor option 'api_version => 2' can be used to -initialize version 2 callbacks while still setting other options and -handlers. The 'api_version => 3' option can be used if you don't want -to set any options and don't want to fall back to v2 compatible -mode. - -Examples: - - $p = HTML::Parser->new(api_version => 3, - text_h => [ sub {...}, "dtext" ]); - -This creates a new parser object with a text event handler subroutine -that receives the original text with general entities decoded. - - $p = HTML::Parser->new(api_version => 3, - start_h => [ 'my_start', "self,tokens" ]); - -This creates a new parser object with a start event handler method -that receives the $p and the tokens array. - - $p = HTML::Parser->new(api_version => 3, - handlers => { text => [\@array, "event,text"], - comment => [\@array, "event,text"], - }); - -This creates a new parser object that stores the event type and the -original text in @array for text and comment events. - -=back - -The following methods feed the HTML document -to the C<HTML::Parser> object: - -=over - -=item $p->parse( $string ) - -Parse $string as the next chunk of the HTML document. The return -value is normally a reference to the parser object (i.e. $p). -Handlers invoked should not attempt to modify the $string in-place until -$p->parse returns. - -If an invoked event handler aborts parsing by calling $p->eof, then -$p->parse() will return a FALSE value. - -=item $p->parse( $code_ref ) - -If a code reference is passed as the argument to be parsed, then the -chunks to be parsed are obtained by invoking this function repeatedly. -Parsing continues until the function returns an empty (or undefined) -result. When this happens $p->eof is automatically signaled. - -Parsing will also abort if one of the event handlers calls $p->eof. - -The effect of this is the same as: - - while (1) { - my $chunk = &$code_ref(); - if (!defined($chunk) || !length($chunk)) { - $p->eof; - return $p; - } - $p->parse($chunk) || return undef; - } - -But it is more efficient as this loop runs internally in XS code. - -=item $p->parse_file( $file ) - -Parse text directly from a file. The $file argument can be a -filename, an open file handle, or a reference to an open file -handle. - -If $file contains a filename and the file can't be opened, then the -method returns an undefined value and $! tells why it failed. -Otherwise the return value is a reference to the parser object. - -If a file handle is passed as the $file argument, then the file will -normally be read until EOF, but not closed. - -If an invoked event handler aborts parsing by calling $p->eof, -then $p->parse_file() may not have read the entire file. - -On systems with multi-byte line terminators, the values passed for the -offset and length argspecs may be too low if parse_file() is called on -a file handle that is not in binary mode. - -If a filename is passed in, then parse_file() will open the file in -binary mode. - -=item $p->eof - -Signals the end of the HTML document. Calling the $p->eof method -outside a handler callback will flush any remaining buffered text -(which triggers the C<text> event if there is any remaining text). - -Calling $p->eof inside a handler will terminate parsing at that point -and cause $p->parse to return a FALSE value. This also terminates -parsing by $p->parse_file(). - -After $p->eof has been called, the parse() and parse_file() methods -can be invoked to feed new documents with the parser object. - -The return value from eof() is a reference to the parser object. - -=back - - -Most parser options are controlled by boolean attributes. -Each boolean attribute is enabled by calling the corresponding method -with a TRUE argument and disabled with a FALSE argument. The -attribute value is left unchanged if no argument is given. The return -value from each method is the old attribute value. - -Methods that can be used to get and/or set parser options are: - -=over - -=item $p->attr_encoded - -=item $p->attr_encoded( $bool ) - -By default, the C<attr> and C<@attr> argspecs will have general -entities for attribute values decoded. Enabling this attribute leaves -entities alone. - -=item $p->boolean_attribute_value( $val ) - -This method sets the value reported for boolean attributes inside HTML -start tags. By default, the name of the attribute is also used as its -value. This affects the values reported for C<tokens> and C<attr> -argspecs. - -=item $p->case_sensitive - -=item $p->case_sensitive( $bool ) - -By default, tagnames and attribute names are down-cased. Enabling this -attribute leaves them as found in the HTML source document. - -=item $p->closing_plaintext - -=item $p->closing_plaintext( $bool ) - -By default, "plaintext" element can never be closed. Everything up to -the end of the document is parsed in CDATA mode. This historical -behaviour is what at least MSIE does. Enabling this attribute makes -closing "</plaintext>" tag effective and the parsing process will resume -after seeing this tag. This emulates gecko-based browsers. - -=item $p->empty_element_tags - -=item $p->empty_element_tags( $bool ) - -By default, empty element tags are not recognized as such and the "/" -before ">" is just treated like a normal name character (unless -C<strict_names> is enabled). Enabling this attribute make -C<HTML::Parser> recognize these tags. - -Empty element tags look like start tags, but end with the character -sequence "/>" instead of ">". When recognized by C<HTML::Parser> they -cause an artificial end event in addition to the start event. The -C<text> for the artificial end event will be empty and the C<tokenpos> -array will be undefined even though the the token array will have one -element containing the tag name. - -=item $p->marked_sections - -=item $p->marked_sections( $bool ) - -By default, section markings like <![CDATA[...]]> are treated like -ordinary text. When this attribute is enabled section markings are -honoured. - -There are currently no events associated with the marked section -markup, but the text can be returned as C<skipped_text>. - -=item $p->strict_comment - -=item $p->strict_comment( $bool ) - -By default, comments are terminated by the first occurrence of "-->". -This is the behaviour of most popular browsers (like Mozilla, Opera and -MSIE), but it is not correct according to the official HTML -standard. Officially, you need an even number of "--" tokens before -the closing ">" is recognized and there may not be anything but -whitespace between an even and an odd "--". - -The official behaviour is enabled by enabling this attribute. - -Enabling of 'strict_comment' also disables recognizing these forms as -comments: - - </ comment> - <! comment> - - -=item $p->strict_end - -=item $p->strict_end( $bool ) - -By default, attributes and other junk are allowed to be present on end tags in a -manner that emulates MSIE's behaviour. - -The official behaviour is enabled with this attribute. If enabled, -only whitespace is allowed between the tagname and the final ">". - -=item $p->strict_names - -=item $p->strict_names( $bool ) - -By default, almost anything is allowed in tag and attribute names. -This is the behaviour of most popular browsers and allows us to parse -some broken tags with invalid attribute values like: - - <IMG SRC=newprevlstGr.gif ALT=[PREV LIST] BORDER=0> - -By default, "LIST]" is parsed as a boolean attribute, not as -part of the ALT value as was clearly intended. This is also what -Mozilla sees. - -The official behaviour is enabled by enabling this attribute. If -enabled, it will cause the tag above to be reported as text -since "LIST]" is not a legal attribute name. - -=item $p->unbroken_text - -=item $p->unbroken_text( $bool ) - -By default, blocks of text are given to the text handler as soon as -possible (but the parser takes care always to break text at a -boundary between whitespace and non-whitespace so single words and -entities can always be decoded safely). This might create breaks that -make it hard to do transformations on the text. When this attribute is -enabled, blocks of text are always reported in one piece. This will -delay the text event until the following (non-text) event has been -recognized by the parser. - -Note that the C<offset> argspec will give you the offset of the first -segment of text and C<length> is the combined length of the segments. -Since there might be ignored tags in between, these numbers can't be -used to directly index in the original document file. - -=item $p->utf8_mode - -=item $p->utf8_mode( $bool ) - -Enable this option when parsing raw undecoded UTF-8. This tells the -parser that the entities expanded for strings reported by C<attr>, -C<@attr> and C<dtext> should be expanded as decoded UTF-8 so they end -up compatible with the surrounding text. - -If C<utf8_mode> is enabled then it is an error to pass strings -containing characters with code above 255 to the parse() method, and -the parse() method will croak if you try. - -Example: The Unicode character "\x{2665}" is "\xE2\x99\xA5" when UTF-8 -encoded. The character can also be represented by the entity -"♥" or "♥". If we feed the parser: - - $p->parse("\xE2\x99\xA5♥"); - -then C<dtext> will be reported as "\xE2\x99\xA5\x{2665}" without -C<utf8_mode> enabled, but as "\xE2\x99\xA5\xE2\x99\xA5" when enabled. -The later string is what you want. - -This option is only available with perl-5.8 or better. - -=item $p->xml_mode - -=item $p->xml_mode( $bool ) - -Enabling this attribute changes the parser to allow some XML -constructs. This enables the behaviour controlled by individually by -the C<case_sensitive>, C<empty_element_tags>, C<strict_names> and -C<xml_pic> attributes and also suppresses special treatment of -elements that are parsed as CDATA for HTML. - -=item $p->xml_pic - -=item $p->xml_pic( $bool ) - -By default, I<processing instructions> are terminated by ">". When -this attribute is enabled, processing instructions are terminated by -"?>" instead. - -=back - -As markup and text is recognized, handlers are invoked. The following -method is used to set up handlers for different events: - -=over - -=item $p->handler( event => \&subroutine, $argspec ) - -=item $p->handler( event => $method_name, $argspec ) - -=item $p->handler( event => \@accum, $argspec ) - -=item $p->handler( event => "" ); - -=item $p->handler( event => undef ); - -=item $p->handler( event ); - -This method assigns a subroutine, method, or array to handle an event. - -Event is one of C<text>, C<start>, C<end>, C<declaration>, C<comment>, -C<process>, C<start_document>, C<end_document> or C<default>. - -The C<\&subroutine> is a reference to a subroutine which is called to handle -the event. - -The C<$method_name> is the name of a method of $p which is called to handle -the event. - -The C<@accum> is an array that will hold the event information as -sub-arrays. - -If the second argument is "", the event is ignored. -If it is undef, the default handler is invoked for the event. - -The C<$argspec> is a string that describes the information to be reported -for the event. Any requested information that does not apply to a -specific event is passed as C<undef>. If argspec is omitted, then it -is left unchanged. - -The return value from $p->handler is the old callback routine or a -reference to the accumulator array. - -Any return values from handler callback routines/methods are always -ignored. A handler callback can request parsing to be aborted by -invoking the $p->eof method. A handler callback is not allowed to -invoke the $p->parse() or $p->parse_file() method. An exception will -be raised if it tries. - -Examples: - - $p->handler(start => "start", 'self, attr, attrseq, text' ); - -This causes the "start" method of object $p to be called for 'start' events. -The callback signature is $p->start(\%attr, \@attr_seq, $text). - - $p->handler(start => \&start, 'attr, attrseq, text' ); - -This causes subroutine start() to be called for 'start' events. -The callback signature is start(\%attr, \@attr_seq, $text). - - $p->handler(start => \@accum, '"S", attr, attrseq, text' ); - -This causes 'start' event information to be saved in @accum. -The array elements will be ['S', \%attr, \@attr_seq, $text]. - - $p->handler(start => ""); - -This causes 'start' events to be ignored. It also suppresses -invocations of any default handler for start events. It is in most -cases equivalent to $p->handler(start => sub {}), but is more -efficient. It is different from the empty-sub-handler in that -C<skipped_text> is not reset by it. - - $p->handler(start => undef); - -This causes no handler to be associated with start events. -If there is a default handler it will be invoked. - -=back - -Filters based on tags can be set up to limit the number of events -reported. The main bottleneck during parsing is often the huge number -of callbacks made from the parser. Applying filters can improve -performance significantly. - -The following methods control filters: - -=over - -=item $p->ignore_elements( @tags ) - -Both the C<start> event and the C<end> event as well as any events that -would be reported in between are suppressed. The ignored elements can -contain nested occurrences of itself. Example: - - $p->ignore_elements(qw(script style)); - -The C<script> and C<style> tags will always nest properly since their -content is parsed in CDATA mode. For most other tags -C<ignore_elements> must be used with caution since HTML is often not -I<well formed>. - -=item $p->ignore_tags( @tags ) - -Any C<start> and C<end> events involving any of the tags given are -suppressed. To reset the filter (i.e. don't suppress any C<start> and -C<end> events), call C<ignore_tags> without an argument. - -=item $p->report_tags( @tags ) - -Any C<start> and C<end> events involving any of the tags I<not> given -are suppressed. To reset the filter (i.e. report all C<start> and -C<end> events), call C<report_tags> without an argument. - -=back - -Internally, the system has two filter lists, one for C<report_tags> -and one for C<ignore_tags>, and both filters are applied. This -effectively gives C<ignore_tags> precedence over C<report_tags>. - -Examples: - - $p->ignore_tags(qw(style)); - $p->report_tags(qw(script style)); - -results in only C<script> events being reported. - -=head2 Argspec - -Argspec is a string containing a comma-separated list that describes -the information reported by the event. The following argspec -identifier names can be used: - -=over - -=item C<attr> - -Attr causes a reference to a hash of attribute name/value pairs to be -passed. - -Boolean attributes' values are either the value set by -$p->boolean_attribute_value, or the attribute name if no value has been -set by $p->boolean_attribute_value. - -This passes undef except for C<start> events. - -Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute -names are forced to lower case. - -General entities are decoded in the attribute values and -one layer of matching quotes enclosing the attribute values is removed. - -The Unicode character set is assumed for entity decoding. With Perl -version 5.6 or earlier only the Latin-1 range is supported, and -entities for characters outside the range 0..255 are left unchanged. - -=item C<@attr> - -Basically the same as C<attr>, but keys and values are passed as -individual arguments and the original sequence of the attributes is -kept. The parameters passed will be the same as the @attr calculated -here: - - @attr = map { $_ => $attr->{$_} } @$attrseq; - -assuming $attr and $attrseq here are the hash and array passed as the -result of C<attr> and C<attrseq> argspecs. - -This passes no values for events besides C<start>. - -=item C<attrseq> - -Attrseq causes a reference to an array of attribute names to be -passed. This can be useful if you want to walk the C<attr> hash in -the original sequence. - -This passes undef except for C<start> events. - -Unless C<xml_mode> or C<case_sensitive> is enabled, the attribute -names are forced to lower case. - -=item C<column> - -Column causes the column number of the start of the event to be passed. -The first column on a line is 0. - -=item C<dtext> - -Dtext causes the decoded text to be passed. General entities are -automatically decoded unless the event was inside a CDATA section or -was between literal start and end tags (C<script>, C<style>, -C<xmp>, and C<plaintext>). - -The Unicode character set is assumed for entity decoding. With Perl -version 5.6 or earlier only the Latin-1 range is supported, and -entities for characters outside the range 0..255 are left unchanged. - -This passes undef except for C<text> events. - -=item C<event> - -Event causes the event name to be passed. - -The event name is one of C<text>, C<start>, C<end>, C<declaration>, -C<comment>, C<process>, C<start_document> or C<end_document>. - -=item C<is_cdata> - -Is_cdata causes a TRUE value to be passed if the event is inside a CDATA -section or between literal start and end tags (C<script>, -C<style>, C<xmp>, and C<plaintext>). - -if the flag is FALSE for a text event, then you should normally -either use C<dtext> or decode the entities yourself before the text is -processed further. - -=item C<length> - -Length causes the number of bytes of the source text of the event to -be passed. - -=item C<line> - -Line causes the line number of the start of the event to be passed. -The first line in the document is 1. Line counting doesn't start -until at least one handler requests this value to be reported. - -=item C<offset> - -Offset causes the byte position in the HTML document of the start of -the event to be passed. The first byte in the document has offset 0. - -=item C<offset_end> - -Offset_end causes the byte position in the HTML document of the end of -the event to be passed. This is the same as C<offset> + C<length>. - -=item C<self> - -Self causes the current object to be passed to the handler. If the -handler is a method, this must be the first element in the argspec. - -An alternative to passing self as an argspec is to register closures -that capture $self by themselves as handlers. Unfortunately this -creates circular references which prevent the HTML::Parser object -from being garbage collected. Using the C<self> argspec avoids this -problem. - -=item C<skipped_text> - -Skipped_text returns the concatenated text of all the events that have -been skipped since the last time an event was reported. Events might -be skipped because no handler is registered for them or because some -filter applies. Skipped text also includes marked section markup, -since there are no events that can catch it. - -If an C<"">-handler is registered for an event, then the text for this -event is not included in C<skipped_text>. Skipped text both before -and after the C<"">-event is included in the next reported -C<skipped_text>. - -=item C<tag> - -Same as C<tagname>, but prefixed with "/" if it belongs to an C<end> -event and "!" for a declaration. The C<tag> does not have any prefix -for C<start> events, and is in this case identical to C<tagname>. - -=item C<tagname> - -This is the element name (or I<generic identifier> in SGML jargon) for -start and end tags. Since HTML is case insensitive, this name is -forced to lower case to ease string matching. - -Since XML is case sensitive, the tagname case is not changed when -C<xml_mode> is enabled. The same happens if the C<case_sensitive> attribute -is set. - -The declaration type of declaration elements is also passed as a tagname, -even if that is a bit strange. -In fact, in the current implementation tagname is -identical to C<token0> except that the name may be forced to lower case. - -=item C<token0> - -Token0 causes the original text of the first token string to be -passed. This should always be the same as $tokens->[0]. - -For C<declaration> events, this is the declaration type. - -For C<start> and C<end> events, this is the tag name. - -For C<process> and non-strict C<comment> events, this is everything -inside the tag. - -This passes undef if there are no tokens in the event. - -=item C<tokenpos> - -Tokenpos causes a reference to an array of token positions to be -passed. For each string that appears in C<tokens>, this array -contains two numbers. The first number is the offset of the start of -the token in the original C<text> and the second number is the length -of the token. - -Boolean attributes in a C<start> event will have (0,0) for the -attribute value offset and length. - -This passes undef if there are no tokens in the event (e.g., C<text>) -and for artificial C<end> events triggered by empty element tags. - -If you are using these offsets and lengths to modify C<text>, you -should either work from right to left, or be very careful to calculate -the changes to the offsets. - -=item C<tokens> - -Tokens causes a reference to an array of token strings to be passed. -The strings are exactly as they were found in the original text, -no decoding or case changes are applied. - -For C<declaration> events, the array contains each word, comment, and -delimited string starting with the declaration type. - -For C<comment> events, this contains each sub-comment. If -$p->strict_comments is disabled, there will be only one sub-comment. - -For C<start> events, this contains the original tag name followed by -the attribute name/value pairs. The values of boolean attributes will -be either the value set by $p->boolean_attribute_value, or the -attribute name if no value has been set by -$p->boolean_attribute_value. - -For C<end> events, this contains the original tag name (always one token). - -For C<process> events, this contains the process instructions (always one -token). - -This passes C<undef> for C<text> events. - -=item C<text> - -Text causes the source text (including markup element delimiters) to be -passed. - -=item C<undef> - -Pass an undefined value. Useful as padding where the same handler -routine is registered for multiple events. - -=item C<'...'> - -A literal string of 0 to 255 characters enclosed -in single (') or double (") quotes is passed as entered. - -=back - -The whole argspec string can be wrapped up in C<'@{...}'> to signal -that the resulting event array should be flattened. This only makes a -difference if an array reference is used as the handler target. -Consider this example: - - $p->handler(text => [], 'text'); - $p->handler(text => [], '@{text}']); - -With two text events; C<"foo">, C<"bar">; then the first example will end -up with [["foo"], ["bar"]] and the second with ["foo", "bar"] in -the handler target array. - - -=head2 Events - -Handlers for the following events can be registered: - -=over - -=item C<comment> - -This event is triggered when a markup comment is recognized. - -Example: - - <!-- This is a comment -- -- So is this --> - -=item C<declaration> - -This event is triggered when a I<markup declaration> is recognized. - -For typical HTML documents, the only declaration you are -likely to find is <!DOCTYPE ...>. - -Example: - - <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" - "http://www.w3.org/TR/html40/strict.dtd"> - -DTDs inside <!DOCTYPE ...> will confuse HTML::Parser. - -=item C<default> - -This event is triggered for events that do not have a specific -handler. You can set up a handler for this event to catch stuff you -did not want to catch explicitly. - -=item C<end> - -This event is triggered when an end tag is recognized. - -Example: - - </A> - -=item C<end_document> - -This event is triggered when $p->eof is called and after any remaining -text is flushed. There is no document text associated with this event. - -=item C<process> - -This event is triggered when a processing instructions markup is -recognized. - -The format and content of processing instructions are system and -application dependent. - -Examples: - - <? HTML processing instructions > - <? XML processing instructions ?> - -=item C<start> - -This event is triggered when a start tag is recognized. - -Example: - - <A HREF="http://www.perl.com/"> - -=item C<start_document> - -This event is triggered before any other events for a new document. A -handler for it can be used to initialize stuff. There is no document -text associated with this event. - -=item C<text> - -This event is triggered when plain text (characters) is recognized. -The text may contain multiple lines. A sequence of text may be broken -between several text events unless $p->unbroken_text is enabled. - -The parser will make sure that it does not break a word or a sequence -of whitespace between two text events. - -=back - -=head2 Unicode - -The C<HTML::Parser> can parse Unicode strings when running under -perl-5.8 or better. If Unicode is passed to $p->parse() then chunks -of Unicode will be reported to the handlers. The offset and length -argspecs will also report their position in terms of characters. - -It is safe to parse raw undecoded UTF-8 if you either avoid decoding -entities and make sure to not use I<argspecs> that do, or enable the -C<utf8_mode> for the parser. Parsing of undecoded UTF-8 might be -useful when parsing from a file where you need the reported offsets -and lengths to match the byte offsets in the file. - -If a filename is passed to $p->parse_file() then the file will be read -in binary mode. This will be fine if the file contains only ASCII or -Latin-1 characters. If the file contains UTF-8 encoded text then care -must be taken when decoding entities as described in the previous -paragraph, but better is to open the file with the UTF-8 layer so that -it is decoded properly: - - open(my $fh, "<:utf8", "index.html") || die "...: $!"; - $p->parse_file($fh); - -If the file contains text encoded in a charset besides ASCII, Latin-1 -or UTF-8 then decoding will always be needed. - -=head1 VERSION 2 COMPATIBILITY - -When an C<HTML::Parser> object is constructed with no arguments, a set -of handlers is automatically provided that is compatible with the old -HTML::Parser version 2 callback methods. - -This is equivalent to the following method calls: - - $p->handler(start => "start", "self, tagname, attr, attrseq, text"); - $p->handler(end => "end", "self, tagname, text"); - $p->handler(text => "text", "self, text, is_cdata"); - $p->handler(process => "process", "self, token0, text"); - $p->handler(comment => - sub { - my($self, $tokens) = @_; - for (@$tokens) {$self->comment($_);}}, - "self, tokens"); - $p->handler(declaration => - sub { - my $self = shift; - $self->declaration(substr($_[0], 2, -1));}, - "self, text"); - -Setting up these handlers can also be requested with the "api_version => -2" constructor option. - -=head1 SUBCLASSING - -The C<HTML::Parser> class is subclassable. Parser objects are plain -hashes and C<HTML::Parser> reserves only hash keys that start with -"_hparser". The parser state can be set up by invoking the init() -method, which takes the same arguments as new(). - -=head1 EXAMPLES - -The first simple example shows how you might strip out comments from -an HTML document. We achieve this by setting up a comment handler that -does nothing and a default handler that will print out anything else: - - use HTML::Parser; - HTML::Parser->new(default_h => [sub { print shift }, 'text'], - comment_h => [""], - )->parse_file(shift || die) || die $!; - -An alternative implementation is: - - use HTML::Parser; - HTML::Parser->new(end_document_h => [sub { print shift }, - 'skipped_text'], - comment_h => [""], - )->parse_file(shift || die) || die $!; - -This will in most cases be much more efficient since only a single -callback will be made. - -The next example prints out the text that is inside the <title> -element of an HTML document. Here we start by setting up a start -handler. When it sees the title start tag it enables a text handler -that prints any text found and an end handler that will terminate -parsing as soon as the title end tag is seen: - - use HTML::Parser (); - - sub start_handler - { - return if shift ne "title"; - my $self = shift; - $self->handler(text => sub { print shift }, "dtext"); - $self->handler(end => sub { shift->eof if shift eq "title"; }, - "tagname,self"); - } - - my $p = HTML::Parser->new(api_version => 3); - $p->handler( start => \&start_handler, "tagname,self"); - $p->parse_file(shift || die) || die $!; - print "\n"; - -More examples are found in the F<eg/> directory of the C<HTML-Parser> -distribution: the program C<hrefsub> shows how you can edit all links -found in a document; the program C<htextsub> shows how to edit the text only; the -program C<hstrip> shows how you can strip out certain tags/elements -and/or attributes; and the program C<htext> show how to obtain the -plain text, but not any script/style content. - -You can browse the F<eg/> directory online from the I<[Browse]> link on -the http://search.cpan.org/~gaas/HTML-Parser/ page. - -=head1 BUGS - -The <style> and <script> sections do not end with the first "</", but -need the complete corresponding end tag. The standard behaviour is -not really practical. - -When the I<strict_comment> option is enabled, we still recognize -comments where there is something other than whitespace between even -and odd "--" markers. - -Once $p->boolean_attribute_value has been set, there is no way to -restore the default behaviour. - -There is currently no way to get both quote characters -into the same literal argspec. - -Empty tags, e.g. "<>" and "</>", are not recognized. SGML allows them -to repeat the previous start tag or close the previous start tag -respectively. - -NET tags, e.g. "code/.../" are not recognized. This is SGML -shorthand for "<code>...</code>". - -Unclosed start or end tags, e.g. "<tt<b>...</b</tt>" are not -recognized. - -=head1 DIAGNOSTICS - -The following messages may be produced by HTML::Parser. The notation -in this listing is the same as used in L<perldiag>: - -=over - -=item Not a reference to a hash - -(F) The object blessed into or subclassed from HTML::Parser is not a -hash as required by the HTML::Parser methods. - -=item Bad signature in parser state object at %p - -(F) The _hparser_xs_state element does not refer to a valid state structure. -Something must have changed the internal value -stored in this hash element, or the memory has been overwritten. - -=item _hparser_xs_state element is not a reference - -(F) The _hparser_xs_state element has been destroyed. - -=item Can't find '_hparser_xs_state' element in HTML::Parser hash - -(F) The _hparser_xs_state element is missing from the parser hash. -It was either deleted, or not created when the object was created. - -=item API version %s not supported by HTML::Parser %s - -(F) The constructor option 'api_version' with an argument greater than -or equal to 4 is reserved for future extensions. - -=item Bad constructor option '%s' - -(F) An unknown constructor option key was passed to the new() or -init() methods. - -=item Parse loop not allowed - -(F) A handler invoked the parse() or parse_file() method. -This is not permitted. - -=item marked sections not supported - -(F) The $p->marked_sections() method was invoked in a HTML::Parser -module that was compiled without support for marked sections. - -=item Unknown boolean attribute (%d) - -(F) Something is wrong with the internal logic that set up aliases for -boolean attributes. - -=item Only code or array references allowed as handler - -(F) The second argument for $p->handler must be either a subroutine -reference, then name of a subroutine or method, or a reference to an -array. - -=item No handler for %s events - -(F) The first argument to $p->handler must be a valid event name; i.e. one -of "start", "end", "text", "process", "declaration" or "comment". - -=item Unrecognized identifier %s in argspec - -(F) The identifier is not a known argspec name. -Use one of the names mentioned in the argspec section above. - -=item Literal string is longer than 255 chars in argspec - -(F) The current implementation limits the length of literals in -an argspec to 255 characters. Make the literal shorter. - -=item Backslash reserved for literal string in argspec - -(F) The backslash character "\" is not allowed in argspec literals. -It is reserved to permit quoting inside a literal in a later version. - -=item Unterminated literal string in argspec - -(F) The terminating quote character for a literal was not found. - -=item Bad argspec (%s) - -(F) Only identifier names, literals, spaces and commas -are allowed in argspecs. - -=item Missing comma separator in argspec - -(F) Identifiers in an argspec must be separated with ",". - -=item Parsing of undecoded UTF-8 will give garbage when decoding entities - -(W) The first chunk parsed appears to contain undecoded UTF-8 and one -or more argspecs that decode entities are used for the callback -handlers. - -The result of decoding will be a mix of encoded and decoded characters -for any entities that expand to characters with code above 127. This -is not a good thing. - -The solution is to use the Encode::encode_utf8() on the data before -feeding it to the $p->parse(). For $p->parse_file() pass a file that -has been opened in ":utf8" mode. - -The parser can process raw undecoded UTF-8 sanely if the C<utf8_mode> -is enabled or if the "attr", "@attr" or "dtext" argspecs is avoided. - -=item Parsing string decoded with wrong endianess - -(W) The first character in the document is U+FFFE. This is not a -legal Unicode character but a byte swapped BOM. The result of parsing -will likely be garbage. - -=item Parsing of undecoded UTF-32 - -(W) The parser found the Unicode UTF-32 BOM signature at the start -of the document. The result of parsing will likely be garbage. - -=item Parsing of undecoded UTF-16 - -(W) The parser found the Unicode UTF-16 BOM signature at the start of -the document. The result of parsing will likely be garbage. - -=back - -=head1 SEE ALSO - -L<HTML::Entities>, L<HTML::PullParser>, L<HTML::TokeParser>, L<HTML::HeadParser>, -L<HTML::LinkExtor>, L<HTML::Form> - -L<HTML::TreeBuilder> (part of the I<HTML-Tree> distribution) - -http://www.w3.org/TR/html4 - -More information about marked sections and processing instructions may -be found at C<http://www.sgml.u-net.com/book/sgml-8.htm>. - -=head1 COPYRIGHT - - Copyright 1996-2007 Gisle Aas. All rights reserved. - Copyright 1999-2000 Michael A. Chase. All rights reserved. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut diff --git a/ext/HTML/Parser/Parser.xs b/ext/HTML/Parser/Parser.xs deleted file mode 100644 index a173eb65b0..0000000000 --- a/ext/HTML/Parser/Parser.xs +++ /dev/null @@ -1,672 +0,0 @@ -/* $Id: Parser.xs,v 2.137 2007/01/12 10:18:39 gisle Exp $ - * - * Copyright 1999-2005, Gisle Aas. - * Copyright 1999-2000, Michael A. Chase. - * - * This library is free software; you can redistribute it and/or - * modify it under the same terms as Perl itself. - */ - - -/* - * Standard XS greeting. - */ -#ifdef __cplusplus -extern "C" { -#endif -#define PERL_NO_GET_CONTEXT /* we want efficiency */ -#include "EXTERN.h" -#include "perl.h" -#include "XSUB.h" -#ifdef __cplusplus -} -#endif - - - -/* - * Some perl version compatibility gruff. - */ -#include "patchlevel.h" -#if PATCHLEVEL <= 4 /* perl5.004_XX */ - -#ifndef PL_sv_undef - #define PL_sv_undef sv_undef - #define PL_sv_yes sv_yes -#endif - -#ifndef PL_hexdigit - #define PL_hexdigit hexdigit -#endif - -#ifndef ERRSV - #define ERRSV GvSV(errgv) -#endif - -#if (PATCHLEVEL == 4 && SUBVERSION <= 4) -/* The newSVpvn function was introduced in perl5.004_05 */ -static SV * -newSVpvn(char *s, STRLEN len) -{ - register SV *sv = newSV(0); - sv_setpvn(sv,s,len); - return sv; -} -#endif /* not perl5.004_05 */ -#endif /* perl5.004_XX */ - -#ifndef dNOOP - #define dNOOP extern int errno -#endif -#ifndef dTHX - #define dTHX dNOOP - #define pTHX_ - #define aTHX_ -#endif - -#ifndef MEMBER_TO_FPTR - #define MEMBER_TO_FPTR(x) (x) -#endif - -#ifndef INT2PTR - #define INT2PTR(any,d) (any)(d) - #define PTR2IV(p) (IV)(p) -#endif - - -#if PATCHLEVEL > 6 || (PATCHLEVEL == 6 && SUBVERSION > 0) - #define RETHROW croak(Nullch) -#else - #define RETHROW { STRLEN my_na; croak("%s", SvPV(ERRSV, my_na)); } -#endif - -#if PATCHLEVEL < 8 - /* No useable Unicode support */ - /* Make these harmless if present */ - #undef SvUTF8 - #undef SvUTF8_on - #undef SvUTF8_off - #define SvUTF8(sv) 0 - #define SvUTF8_on(sv) 0 - #define SvUTF8_off(sv) 0 -#else - #define UNICODE_HTML_PARSER -#endif - -#ifdef G_WARN_ON - #define DOWARN (PL_dowarn & G_WARN_ON) -#else - #define DOWARN PL_dowarn -#endif - -/* - * Include stuff. We include .c files instead of linking them, - * so that they don't have to pollute the external dll name space. - */ - -#ifdef EXTERN - #undef EXTERN -#endif - -#define EXTERN static /* Don't pollute */ - -#include "hparser.h" -#include "util.c" -#include "hparser.c" - - -/* - * Support functions for the XS glue - */ - -static SV* -check_handler(pTHX_ SV* h) -{ - if (SvROK(h)) { - SV* myref = SvRV(h); - if (SvTYPE(myref) == SVt_PVCV) - return newSVsv(h); - if (SvTYPE(myref) == SVt_PVAV) - return SvREFCNT_inc(myref); - croak("Only code or array references allowed as handler"); - } - return SvOK(h) ? newSVsv(h) : 0; -} - - -static PSTATE* -get_pstate_iv(pTHX_ SV* sv) -{ - PSTATE *p; -#if PATCHLEVEL < 8 - p = INT2PTR(PSTATE*, SvIV(sv)); -#else - MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, '~') : NULL; - - if (!mg) - croak("Lost parser state magic"); - p = (PSTATE *)mg->mg_ptr; - if (!p) - croak("Lost parser state magic"); -#endif - if (p->signature != P_SIGNATURE) - croak("Bad signature in parser state object at %p", p); - return p; -} - - -static PSTATE* -get_pstate_hv(pTHX_ SV* sv) /* used by XS typemap */ -{ - HV* hv; - SV** svp; - - sv = SvRV(sv); - if (!sv || SvTYPE(sv) != SVt_PVHV) - croak("Not a reference to a hash"); - hv = (HV*)sv; - svp = hv_fetch(hv, "_hparser_xs_state", 17, 0); - if (svp) { - if (SvROK(*svp)) - return get_pstate_iv(aTHX_ SvRV(*svp)); - else - croak("_hparser_xs_state element is not a reference"); - } - croak("Can't find '_hparser_xs_state' element in HTML::Parser hash"); - return 0; -} - - -static void -free_pstate(pTHX_ PSTATE* pstate) -{ - int i; - SvREFCNT_dec(pstate->buf); - SvREFCNT_dec(pstate->pend_text); - SvREFCNT_dec(pstate->skipped_text); -#ifdef MARKED_SECTION - SvREFCNT_dec(pstate->ms_stack); -#endif - SvREFCNT_dec(pstate->bool_attr_val); - for (i = 0; i < EVENT_COUNT; i++) { - SvREFCNT_dec(pstate->handlers[i].cb); - SvREFCNT_dec(pstate->handlers[i].argspec); - } - - SvREFCNT_dec(pstate->report_tags); - SvREFCNT_dec(pstate->ignore_tags); - SvREFCNT_dec(pstate->ignore_elements); - SvREFCNT_dec(pstate->ignoring_element); - - SvREFCNT_dec(pstate->tmp); - - pstate->signature = 0; - Safefree(pstate); -} - -static int -magic_free_pstate(pTHX_ SV *sv, MAGIC *mg) -{ -#if PATCHLEVEL < 8 - free_pstate(aTHX_ get_pstate_iv(aTHX_ sv)); -#else - free_pstate(aTHX_ (PSTATE *)mg->mg_ptr); -#endif - return 0; -} - -#if defined(USE_ITHREADS) && PATCHLEVEL >= 8 - -static PSTATE * -dup_pstate(pTHX_ PSTATE *pstate, CLONE_PARAMS *params) -{ - PSTATE *pstate2; - int i; - - Newz(56, pstate2, 1, PSTATE); - pstate2->signature = pstate->signature; - - pstate2->buf = SvREFCNT_inc(sv_dup(pstate->buf, params)); - pstate2->offset = pstate->offset; - pstate2->line = pstate->line; - pstate2->column = pstate->column; - pstate2->start_document = pstate->start_document; - pstate2->parsing = pstate->parsing; - pstate2->eof = pstate->eof; - - pstate2->literal_mode = pstate->literal_mode; - pstate2->is_cdata = pstate->is_cdata; - pstate2->no_dash_dash_comment_end = pstate->no_dash_dash_comment_end; - pstate2->pending_end_tag = pstate->pending_end_tag; - - pstate2->pend_text = SvREFCNT_inc(sv_dup(pstate->pend_text, params)); - pstate2->pend_text_is_cdata = pstate->pend_text_is_cdata; - pstate2->pend_text_offset = pstate->pend_text_offset; - pstate2->pend_text_line = pstate->pend_text_offset; - pstate2->pend_text_column = pstate->pend_text_column; - - pstate2->skipped_text = SvREFCNT_inc(sv_dup(pstate->skipped_text, params)); - -#ifdef MARKED_SECTION - pstate2->ms = pstate->ms; - pstate2->ms_stack = - (AV *)SvREFCNT_inc(sv_dup((SV *)pstate->ms_stack, params)); - pstate2->marked_sections = pstate->marked_sections; -#endif - - pstate2->strict_comment = pstate->strict_comment; - pstate2->strict_names = pstate->strict_names; - pstate2->strict_end = pstate->strict_end; - pstate2->xml_mode = pstate->xml_mode; - pstate2->unbroken_text = pstate->unbroken_text; - pstate2->attr_encoded = pstate->attr_encoded; - pstate2->case_sensitive = pstate->case_sensitive; - pstate2->closing_plaintext = pstate->closing_plaintext; - pstate2->utf8_mode = pstate->utf8_mode; - pstate2->empty_element_tags = pstate->empty_element_tags; - pstate2->xml_pic = pstate->xml_pic; - - pstate2->bool_attr_val = - SvREFCNT_inc(sv_dup(pstate->bool_attr_val, params)); - for (i = 0; i < EVENT_COUNT; i++) { - pstate2->handlers[i].cb = - SvREFCNT_inc(sv_dup(pstate->handlers[i].cb, params)); - pstate2->handlers[i].argspec = - SvREFCNT_inc(sv_dup(pstate->handlers[i].argspec, params)); - } - pstate2->argspec_entity_decode = pstate->argspec_entity_decode; - - pstate2->report_tags = - (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->report_tags, params)); - pstate2->ignore_tags = - (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_tags, params)); - pstate2->ignore_elements = - (HV *)SvREFCNT_inc(sv_dup((SV *)pstate->ignore_elements, params)); - - pstate2->ignoring_element = - SvREFCNT_inc(sv_dup(pstate->ignoring_element, params)); - pstate2->ignore_depth = pstate->ignore_depth; - - if (params->flags & CLONEf_JOIN_IN) { - pstate2->entity2char = - perl_get_hv("HTML::Entities::entity2char", TRUE); - } else { - pstate2->entity2char = (HV *)sv_dup((SV *)pstate->entity2char, params); - } - pstate2->tmp = SvREFCNT_inc(sv_dup(pstate->tmp, params)); - - return pstate2; -} - -static int -magic_dup_pstate(pTHX_ MAGIC *mg, CLONE_PARAMS *params) -{ - mg->mg_ptr = (char *)dup_pstate(aTHX_ (PSTATE *)mg->mg_ptr, params); - return 0; -} - -#endif - -MGVTBL vtbl_pstate = -{ - 0, - 0, - 0, - 0, - MEMBER_TO_FPTR(magic_free_pstate), -#if defined(USE_ITHREADS) && PATCHLEVEL >= 8 - 0, - MEMBER_TO_FPTR(magic_dup_pstate), -#endif -}; - - -/* - * XS interface definition. - */ - -MODULE = HTML::Parser PACKAGE = HTML::Parser - -PROTOTYPES: DISABLE - -void -_alloc_pstate(self) - SV* self; - PREINIT: - PSTATE* pstate; - SV* sv; - HV* hv; - MAGIC* mg; - - CODE: - sv = SvRV(self); - if (!sv || SvTYPE(sv) != SVt_PVHV) - croak("Not a reference to a hash"); - hv = (HV*)sv; - - Newz(56, pstate, 1, PSTATE); - pstate->signature = P_SIGNATURE; - pstate->entity2char = perl_get_hv("HTML::Entities::entity2char", TRUE); - pstate->tmp = NEWSV(0, 20); - - sv = newSViv(PTR2IV(pstate)); -#if PATCHLEVEL < 8 - sv_magic(sv, 0, '~', 0, 0); -#else - sv_magic(sv, 0, '~', (char *)pstate, 0); -#endif - mg = mg_find(sv, '~'); - assert(mg); - mg->mg_virtual = &vtbl_pstate; -#if defined(USE_ITHREADS) && PATCHLEVEL >= 8 - mg->mg_flags |= MGf_DUP; -#endif - SvREADONLY_on(sv); - - hv_store(hv, "_hparser_xs_state", 17, newRV_noinc(sv), 0); - -void -parse(self, chunk) - SV* self; - SV* chunk - PREINIT: - PSTATE* p_state = get_pstate_hv(aTHX_ self); - PPCODE: - if (p_state->parsing) - croak("Parse loop not allowed"); - p_state->parsing = 1; - if (SvROK(chunk) && SvTYPE(SvRV(chunk)) == SVt_PVCV) { - SV* generator = chunk; - STRLEN len; - do { - int count; - PUSHMARK(SP); - count = perl_call_sv(generator, G_SCALAR|G_EVAL); - SPAGAIN; - chunk = count ? POPs : 0; - PUTBACK; - - if (SvTRUE(ERRSV)) { - p_state->parsing = 0; - p_state->eof = 0; - RETHROW; - } - - if (chunk && SvOK(chunk)) { - (void)SvPV(chunk, len); /* get length */ - } - else { - len = 0; - } - parse(aTHX_ p_state, len ? chunk : 0, self); - SPAGAIN; - - } while (len && !p_state->eof); - } - else { - parse(aTHX_ p_state, chunk, self); - SPAGAIN; - } - p_state->parsing = 0; - if (p_state->eof) { - p_state->eof = 0; - PUSHs(sv_newmortal()); - } - else { - PUSHs(self); - } - -void -eof(self) - SV* self; - PREINIT: - PSTATE* p_state = get_pstate_hv(aTHX_ self); - PPCODE: - if (p_state->parsing) - p_state->eof = 1; - else { - p_state->parsing = 1; - parse(aTHX_ p_state, 0, self); /* flush */ - p_state->parsing = 0; - } - PUSHs(self); - -SV* -strict_comment(pstate,...) - PSTATE* pstate - ALIAS: - HTML::Parser::strict_comment = 1 - HTML::Parser::strict_names = 2 - HTML::Parser::xml_mode = 3 - HTML::Parser::unbroken_text = 4 - HTML::Parser::marked_sections = 5 - HTML::Parser::attr_encoded = 6 - HTML::Parser::case_sensitive = 7 - HTML::Parser::strict_end = 8 - HTML::Parser::closing_plaintext = 9 - HTML::Parser::utf8_mode = 10 - HTML::Parser::empty_element_tags = 11 - HTML::Parser::xml_pic = 12 - PREINIT: - bool *attr; - CODE: - switch (ix) { - case 1: attr = &pstate->strict_comment; break; - case 2: attr = &pstate->strict_names; break; - case 3: attr = &pstate->xml_mode; break; - case 4: attr = &pstate->unbroken_text; break; - case 5: -#ifdef MARKED_SECTION - attr = &pstate->marked_sections; break; -#else - croak("marked sections not supported"); break; -#endif - case 6: attr = &pstate->attr_encoded; break; - case 7: attr = &pstate->case_sensitive; break; - case 8: attr = &pstate->strict_end; break; - case 9: attr = &pstate->closing_plaintext; break; -#ifdef UNICODE_HTML_PARSER - case 10: attr = &pstate->utf8_mode; break; -#else - case 10: croak("The utf8_mode does not work with this perl; perl-5.8 or better required"); -#endif - case 11: attr = &pstate->empty_element_tags; break; - case 12: attr = &pstate->xml_pic; break; - default: - croak("Unknown boolean attribute (%d)", ix); - } - RETVAL = boolSV(*attr); - if (items > 1) - *attr = SvTRUE(ST(1)); - OUTPUT: - RETVAL - -SV* -boolean_attribute_value(pstate,...) - PSTATE* pstate - CODE: - RETVAL = pstate->bool_attr_val ? newSVsv(pstate->bool_attr_val) - : &PL_sv_undef; - if (items > 1) { - SvREFCNT_dec(pstate->bool_attr_val); - pstate->bool_attr_val = newSVsv(ST(1)); - } - OUTPUT: - RETVAL - -void -ignore_tags(pstate,...) - PSTATE* pstate - ALIAS: - HTML::Parser::report_tags = 1 - HTML::Parser::ignore_tags = 2 - HTML::Parser::ignore_elements = 3 - PREINIT: - HV** attr; - int i; - CODE: - switch (ix) { - case 1: attr = &pstate->report_tags; break; - case 2: attr = &pstate->ignore_tags; break; - case 3: attr = &pstate->ignore_elements; break; - default: - croak("Unknown tag-list attribute (%d)", ix); - } - if (GIMME_V != G_VOID) - croak("Can't report tag lists yet"); - - items--; /* pstate */ - if (items) { - if (*attr) - hv_clear(*attr); - else - *attr = newHV(); - - for (i = 0; i < items; i++) { - SV* sv = ST(i+1); - if (SvROK(sv)) { - sv = SvRV(sv); - if (SvTYPE(sv) == SVt_PVAV) { - AV* av = (AV*)sv; - STRLEN j; - STRLEN len = av_len(av) + 1; - for (j = 0; j < len; j++) { - SV**svp = av_fetch(av, j, 0); - if (svp) { - hv_store_ent(*attr, *svp, newSViv(0), 0); - } - } - } - else - croak("Tag list must be plain scalars and arrays"); - } - else { - hv_store_ent(*attr, sv, newSViv(0), 0); - } - } - } - else if (*attr) { - SvREFCNT_dec(*attr); - *attr = 0; - } - -void -handler(pstate, eventname,...) - PSTATE* pstate - SV* eventname - PREINIT: - STRLEN name_len; - char *name = SvPV(eventname, name_len); - int event = -1; - int i; - struct p_handler *h; - PPCODE: - /* map event name string to event_id */ - for (i = 0; i < EVENT_COUNT; i++) { - if (strEQ(name, event_id_str[i])) { - event = i; - break; - } - } - if (event < 0) - croak("No handler for %s events", name); - - h = &pstate->handlers[event]; - - /* set up return value */ - if (h->cb) { - PUSHs((SvTYPE(h->cb) == SVt_PVAV) - ? sv_2mortal(newRV_inc(h->cb)) - : sv_2mortal(newSVsv(h->cb))); - } - else { - PUSHs(&PL_sv_undef); - } - - /* update */ - if (items > 3) { - SvREFCNT_dec(h->argspec); - h->argspec = 0; - h->argspec = argspec_compile(ST(3), pstate); - } - if (items > 2) { - SvREFCNT_dec(h->cb); - h->cb = 0; - h->cb = check_handler(aTHX_ ST(2)); - } - - -MODULE = HTML::Parser PACKAGE = HTML::Entities - -void -decode_entities(...) - PREINIT: - int i; - HV *entity2char = perl_get_hv("HTML::Entities::entity2char", FALSE); - PPCODE: - if (GIMME_V == G_SCALAR && items > 1) - items = 1; - for (i = 0; i < items; i++) { - if (GIMME_V != G_VOID) - ST(i) = sv_2mortal(newSVsv(ST(i))); - else if (SvREADONLY(ST(i))) - croak("Can't inline decode readonly string"); - decode_entities(aTHX_ ST(i), entity2char, 0); - } - SP += items; - -void -_decode_entities(string, entities, ...) - SV* string - SV* entities - PREINIT: - HV* entities_hv; - bool expand_prefix = (items > 2) ? SvTRUE(ST(2)) : 0; - CODE: - if (SvOK(entities)) { - if (SvROK(entities) && SvTYPE(SvRV(entities)) == SVt_PVHV) { - entities_hv = (HV*)SvRV(entities); - } - else { - croak("2nd argument must be hash reference"); - } - } - else { - entities_hv = 0; - } - if (SvREADONLY(string)) - croak("Can't inline decode readonly string"); - decode_entities(aTHX_ string, entities_hv, expand_prefix); - -bool -_probably_utf8_chunk(string) - SV* string - PREINIT: - STRLEN len; - char *s; - CODE: -#ifdef UNICODE_HTML_PARSER - sv_utf8_downgrade(string, 0); - s = SvPV(string, len); - RETVAL = probably_utf8_chunk(aTHX_ s, len); -#else - RETVAL = 0; /* avoid never initialized complains from compiler */ - croak("_probably_utf8_chunk() only works for Unicode enabled perls"); -#endif - OUTPUT: - RETVAL - -int -UNICODE_SUPPORT() - PROTOTYPE: - CODE: -#ifdef UNICODE_HTML_PARSER - RETVAL = 1; -#else - RETVAL = 0; -#endif - OUTPUT: - RETVAL - - -MODULE = HTML::Parser PACKAGE = HTML::Parser diff --git a/ext/HTML/Parser/hints/solaris.pl b/ext/HTML/Parser/hints/solaris.pl deleted file mode 100644 index f6f94f0aca..0000000000 --- a/ext/HTML/Parser/hints/solaris.pl +++ /dev/null @@ -1,4 +0,0 @@ -if ($Config{gccversion}) { - print "Turning off optimizations to avoid compiler bug\n"; - $self->{OPTIMIZE} = " "; -} diff --git a/ext/HTML/Parser/hparser.c b/ext/HTML/Parser/hparser.c deleted file mode 100644 index e5887c331f..0000000000 --- a/ext/HTML/Parser/hparser.c +++ /dev/null @@ -1,1916 +0,0 @@ -/* $Id: hparser.c,v 2.134 2007/01/12 10:54:06 gisle Exp $ - * - * Copyright 1999-2007, Gisle Aas - * Copyright 1999-2000, Michael A. Chase - * - * This library is free software; you can redistribute it and/or - * modify it under the same terms as Perl itself. - */ - -#ifndef EXTERN -#define EXTERN extern -#endif - -#include "hctype.h" /* isH...() macros */ -#include "tokenpos.h" /* dTOKEN; PUSH_TOKEN() */ - - -static -struct literal_tag { - int len; - char* str; - int is_cdata; -} -literal_mode_elem[] = -{ - {6, "script", 1}, - {5, "style", 1}, - {3, "xmp", 1}, - {9, "plaintext", 1}, - {5, "title", 0}, - {8, "textarea", 0}, - {0, 0, 0} -}; - -enum argcode { - ARG_SELF = 1, /* need to avoid '\0' in argspec string */ - ARG_TOKENS, - ARG_TOKENPOS, - ARG_TOKEN0, - ARG_TAGNAME, - ARG_TAG, - ARG_ATTR, - ARG_ATTRARR, - ARG_ATTRSEQ, - ARG_TEXT, - ARG_DTEXT, - ARG_IS_CDATA, - ARG_SKIPPED_TEXT, - ARG_OFFSET, - ARG_OFFSET_END, - ARG_LENGTH, - ARG_LINE, - ARG_COLUMN, - ARG_EVENT, - ARG_UNDEF, - ARG_LITERAL, /* Always keep last */ - - /* extra flags always encoded first */ - ARG_FLAG_FLAT_ARRAY -}; - -char *argname[] = { - /* Must be in the same order as enum argcode */ - "self", /* ARG_SELF */ - "tokens", /* ARG_TOKENS */ - "tokenpos", /* ARG_TOKENPOS */ - "token0", /* ARG_TOKEN0 */ - "tagname", /* ARG_TAGNAME */ - "tag", /* ARG_TAG */ - "attr", /* ARG_ATTR */ - "@attr", /* ARG_ATTRARR */ - "attrseq", /* ARG_ATTRSEQ */ - "text", /* ARG_TEXT */ - "dtext", /* ARG_DTEXT */ - "is_cdata", /* ARG_IS_CDATA */ - "skipped_text", /* ARG_SKIPPED_TEXT */ - "offset", /* ARG_OFFSET */ - "offset_end", /* ARG_OFFSET_END */ - "length", /* ARG_LENGTH */ - "line", /* ARG_LINE */ - "column", /* ARG_COLUMN */ - "event", /* ARG_EVENT */ - "undef", /* ARG_UNDEF */ - /* ARG_LITERAL (not compared) */ - /* ARG_FLAG_FLAT_ARRAY */ -}; - -#define CASE_SENSITIVE(p_state) \ - ((p_state)->xml_mode || (p_state)->case_sensitive) -#define STRICT_NAMES(p_state) \ - ((p_state)->xml_mode || (p_state)->strict_names) -#define ALLOW_EMPTY_TAG(p_state) \ - ((p_state)->xml_mode || (p_state)->empty_element_tags) - -static void flush_pending_text(PSTATE* p_state, SV* self); - -/* - * Parser functions. - * - * parse() - top level entry point. - * deals with text and calls one of its - * subordinate parse_*() routines after - * looking at the first char after "<" - * parse_decl() - deals with declarations <!...> - * parse_comment() - deals with <!-- ... --> - * parse_marked_section - deals with <![ ... [ ... ]]> - * parse_end() - deals with end tags </...> - * parse_start() - deals with start tags <A...> - * parse_process() - deals with process instructions <?...> - * parse_null() - deals with anything else <....> - * - * report_event() - called whenever any of the parse*() routines - * has recongnized something. - */ - -static void -report_event(PSTATE* p_state, - event_id_t event, - char *beg, char *end, U32 utf8, - token_pos_t *tokens, int num_tokens, - SV* self - ) -{ - struct p_handler *h; - dTHX; - dSP; - AV *array; - STRLEN my_na; - char *argspec; - char *s; - STRLEN offset; - STRLEN line; - STRLEN column; - -#ifdef UNICODE_HTML_PARSER - #define CHR_DIST(a,b) (utf8 ? utf8_distance((U8*)(a),(U8*)(b)) : (a) - (b)) -#else - #define CHR_DIST(a,b) ((a) - (b)) -#endif - - /* some events might still fire after a handler has signaled eof - * so suppress them here. - */ - if (p_state->eof) - return; - - /* capture offsets */ - offset = p_state->offset; - line = p_state->line; - column = p_state->column; - -#if 0 - { /* used for debugging at some point */ - char *s = beg; - int i; - - /* print debug output */ - switch(event) { - case E_DECLARATION: printf("DECLARATION"); break; - case E_COMMENT: printf("COMMENT"); break; - case E_START: printf("START"); break; - case E_END: printf("END"); break; - case E_TEXT: printf("TEXT"); break; - case E_PROCESS: printf("PROCESS"); break; - case E_NONE: printf("NONE"); break; - default: printf("EVENT #%d", event); break; - } - - printf(" ["); - while (s < end) { - if (*s == '\n') { - putchar('\\'); putchar('n'); - } - else - putchar(*s); - s++; - } - printf("] %d\n", end - beg); - for (i = 0; i < num_tokens; i++) { - printf(" token %d: %d %d\n", - i, - tokens[i].beg - beg, - tokens[i].end - tokens[i].beg); - } - } -#endif - - if (p_state->pending_end_tag && event != E_TEXT && event != E_COMMENT) { - token_pos_t t; - char dummy; - t.beg = p_state->pending_end_tag; - t.end = p_state->pending_end_tag + strlen(p_state->pending_end_tag); - p_state->pending_end_tag = 0; - report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self); - SPAGAIN; - } - - /* update offsets */ - p_state->offset += CHR_DIST(end, beg); - if (line) { - char *s = beg; - char *nl = NULL; - while (s < end) { - if (*s == '\n') { - p_state->line++; - nl = s; - } - s++; - } - if (nl) - p_state->column = CHR_DIST(end, nl) - 1; - else - p_state->column += CHR_DIST(end, beg); - } - - if (event == E_NONE) - goto IGNORE_EVENT; - -#ifdef MARKED_SECTION - if (p_state->ms == MS_IGNORE) - goto IGNORE_EVENT; -#endif - - /* tag filters */ - if (p_state->ignore_tags || p_state->report_tags || p_state->ignore_elements) { - - if (event == E_START || event == E_END) { - SV* tagname = p_state->tmp; - - assert(num_tokens >= 1); - sv_setpvn(tagname, tokens[0].beg, tokens[0].end - tokens[0].beg); - if (utf8) - SvUTF8_on(tagname); - else - SvUTF8_off(tagname); - if (!CASE_SENSITIVE(p_state)) - sv_lower(aTHX_ tagname); - - if (p_state->ignoring_element) { - if (sv_eq(p_state->ignoring_element, tagname)) { - if (event == E_START) - p_state->ignore_depth++; - else if (--p_state->ignore_depth == 0) { - SvREFCNT_dec(p_state->ignoring_element); - p_state->ignoring_element = 0; - } - } - goto IGNORE_EVENT; - } - - if (p_state->ignore_elements && - hv_fetch_ent(p_state->ignore_elements, tagname, 0, 0)) - { - if (event == E_START) { - p_state->ignoring_element = newSVsv(tagname); - p_state->ignore_depth = 1; - } - goto IGNORE_EVENT; - } - - if (p_state->ignore_tags && - hv_fetch_ent(p_state->ignore_tags, tagname, 0, 0)) - { - goto IGNORE_EVENT; - } - if (p_state->report_tags && - !hv_fetch_ent(p_state->report_tags, tagname, 0, 0)) - { - goto IGNORE_EVENT; - } - } - else if (p_state->ignoring_element) { - goto IGNORE_EVENT; - } - } - - h = &p_state->handlers[event]; - if (!h->cb) { - /* event = E_DEFAULT; */ - h = &p_state->handlers[E_DEFAULT]; - if (!h->cb) - goto IGNORE_EVENT; - } - - if (SvTYPE(h->cb) != SVt_PVAV && !SvTRUE(h->cb)) { - /* FALSE scalar ('' or 0) means IGNORE this event */ - return; - } - - if (p_state->unbroken_text && event == E_TEXT) { - /* should buffer text */ - if (!p_state->pend_text) - p_state->pend_text = newSV(256); - if (SvOK(p_state->pend_text)) { - if (p_state->is_cdata != p_state->pend_text_is_cdata) { - flush_pending_text(p_state, self); - SPAGAIN; - goto INIT_PEND_TEXT; - } - } - else { - INIT_PEND_TEXT: - p_state->pend_text_offset = offset; - p_state->pend_text_line = line; - p_state->pend_text_column = column; - p_state->pend_text_is_cdata = p_state->is_cdata; - sv_setpvn(p_state->pend_text, "", 0); - if (!utf8) - SvUTF8_off(p_state->pend_text); - } -#ifdef UNICODE_HTML_PARSER - if (utf8 && !SvUTF8(p_state->pend_text)) - sv_utf8_upgrade(p_state->pend_text); - if (utf8 || !SvUTF8(p_state->pend_text)) { - sv_catpvn(p_state->pend_text, beg, end - beg); - } - else { - SV *tmp = newSVpvn(beg, end - beg); - sv_utf8_upgrade(tmp); - sv_catsv(p_state->pend_text, tmp); - SvREFCNT_dec(tmp); - } -#else - sv_catpvn(p_state->pend_text, beg, end - beg); -#endif - return; - } - else if (p_state->pend_text && SvOK(p_state->pend_text)) { - flush_pending_text(p_state, self); - SPAGAIN; - } - - /* At this point we have decided to generate an event callback */ - - argspec = h->argspec ? SvPV(h->argspec, my_na) : ""; - - if (SvTYPE(h->cb) == SVt_PVAV) { - - if (*argspec == ARG_FLAG_FLAT_ARRAY) { - argspec++; - array = (AV*)h->cb; - } - else { - /* start sub-array for accumulator array */ - array = newAV(); - } - } - else { - array = 0; - if (*argspec == ARG_FLAG_FLAT_ARRAY) - argspec++; - - /* start argument stack for callback */ - ENTER; - SAVETMPS; - PUSHMARK(SP); - } - - for (s = argspec; *s; s++) { - SV* arg = 0; - int push_arg = 1; - enum argcode argcode = (enum argcode)*s; - - switch( argcode ) { - - case ARG_SELF: - arg = sv_mortalcopy(self); - break; - - case ARG_TOKENS: - if (num_tokens >= 1) { - AV* av = newAV(); - SV* prev_token = &PL_sv_undef; - int i; - av_extend(av, num_tokens); - for (i = 0; i < num_tokens; i++) { - if (tokens[i].beg) { - prev_token = newSVpvn(tokens[i].beg, tokens[i].end-tokens[i].beg); - if (utf8) - SvUTF8_on(prev_token); - av_push(av, prev_token); - } - else { /* boolean */ - av_push(av, p_state->bool_attr_val - ? newSVsv(p_state->bool_attr_val) - : newSVsv(prev_token)); - } - } - arg = sv_2mortal(newRV_noinc((SV*)av)); - } - break; - - case ARG_TOKENPOS: - if (num_tokens >= 1 && tokens[0].beg >= beg) { - AV* av = newAV(); - int i; - av_extend(av, num_tokens*2); - for (i = 0; i < num_tokens; i++) { - if (tokens[i].beg) { - av_push(av, newSViv(CHR_DIST(tokens[i].beg, beg))); - av_push(av, newSViv(CHR_DIST(tokens[i].end, tokens[i].beg))); - } - else { /* boolean tag value */ - av_push(av, newSViv(0)); - av_push(av, newSViv(0)); - } - } - arg = sv_2mortal(newRV_noinc((SV*)av)); - } - break; - - case ARG_TOKEN0: - case ARG_TAGNAME: - /* fall through */ - - case ARG_TAG: - if (num_tokens >= 1) { - arg = sv_2mortal(newSVpvn(tokens[0].beg, - tokens[0].end - tokens[0].beg)); - if (utf8) - SvUTF8_on(arg); - if (!CASE_SENSITIVE(p_state) && argcode != ARG_TOKEN0) - sv_lower(aTHX_ arg); - if (argcode == ARG_TAG && event != E_START) { - char *e_type = "!##/#?#"; - sv_insert(arg, 0, 0, &e_type[event], 1); - } - } - break; - - case ARG_ATTR: - case ARG_ATTRARR: - if (event == E_START) { - HV* hv; - int i; - if (argcode == ARG_ATTR) { - hv = newHV(); - arg = sv_2mortal(newRV_noinc((SV*)hv)); - } - else { -#ifdef __GNUC__ - /* gcc -Wall reports this variable as possibly used uninitialized */ - hv = 0; -#endif - push_arg = 0; /* deal with argument pushing here */ - } - - for (i = 1; i < num_tokens; i += 2) { - SV* attrname = newSVpvn(tokens[i].beg, - tokens[i].end-tokens[i].beg); - SV* attrval; - - if (utf8) - SvUTF8_on(attrname); - if (tokens[i+1].beg) { - char *beg = tokens[i+1].beg; - STRLEN len = tokens[i+1].end - beg; - if (*beg == '"' || *beg == '\'') { - assert(len >= 2 && *beg == beg[len-1]); - beg++; len -= 2; - } - attrval = newSVpvn(beg, len); - if (utf8) - SvUTF8_on(attrval); - if (!p_state->attr_encoded) { -#ifdef UNICODE_HTML_PARSER - if (p_state->utf8_mode) - sv_utf8_decode(attrval); -#endif - decode_entities(aTHX_ attrval, p_state->entity2char, 0); - if (p_state->utf8_mode) - SvUTF8_off(attrval); - } - } - else { /* boolean */ - if (p_state->bool_attr_val) - attrval = newSVsv(p_state->bool_attr_val); - else - attrval = newSVsv(attrname); - } - - if (!CASE_SENSITIVE(p_state)) - sv_lower(aTHX_ attrname); - - if (argcode == ARG_ATTR) { - if (hv_exists_ent(hv, attrname, 0) || - !hv_store_ent(hv, attrname, attrval, 0)) { - SvREFCNT_dec(attrval); - } - SvREFCNT_dec(attrname); - } - else { /* ARG_ATTRARR */ - if (array) { - av_push(array, attrname); - av_push(array, attrval); - } - else { - XPUSHs(sv_2mortal(attrname)); - XPUSHs(sv_2mortal(attrval)); - } - } - } - } - else if (argcode == ARG_ATTRARR) { - push_arg = 0; - } - break; - - case ARG_ATTRSEQ: /* (v2 compatibility stuff) */ - if (event == E_START) { - AV* av = newAV(); - int i; - for (i = 1; i < num_tokens; i += 2) { - SV* attrname = newSVpvn(tokens[i].beg, - tokens[i].end-tokens[i].beg); - if (utf8) - SvUTF8_on(attrname); - if (!CASE_SENSITIVE(p_state)) - sv_lower(aTHX_ attrname); - av_push(av, attrname); - } - arg = sv_2mortal(newRV_noinc((SV*)av)); - } - break; - - case ARG_TEXT: - arg = sv_2mortal(newSVpvn(beg, end - beg)); - if (utf8) - SvUTF8_on(arg); - break; - - case ARG_DTEXT: - if (event == E_TEXT) { - arg = sv_2mortal(newSVpvn(beg, end - beg)); - if (utf8) - SvUTF8_on(arg); - if (!p_state->is_cdata) { -#ifdef UNICODE_HTML_PARSER - if (p_state->utf8_mode) - sv_utf8_decode(arg); -#endif - decode_entities(aTHX_ arg, p_state->entity2char, 1); - if (p_state->utf8_mode) - SvUTF8_off(arg); - } - } - break; - - case ARG_IS_CDATA: - if (event == E_TEXT) { - arg = boolSV(p_state->is_cdata); - } - break; - - case ARG_SKIPPED_TEXT: - arg = sv_2mortal(p_state->skipped_text); - p_state->skipped_text = newSVpvn("", 0); - break; - - case ARG_OFFSET: - arg = sv_2mortal(newSViv(offset)); - break; - - case ARG_OFFSET_END: - arg = sv_2mortal(newSViv(offset + CHR_DIST(end, beg))); - break; - - case ARG_LENGTH: - arg = sv_2mortal(newSViv(CHR_DIST(end, beg))); - break; - - case ARG_LINE: - arg = sv_2mortal(newSViv(line)); - break; - - case ARG_COLUMN: - arg = sv_2mortal(newSViv(column)); - break; - - case ARG_EVENT: - assert(event >= 0 && event < EVENT_COUNT); - arg = sv_2mortal(newSVpv(event_id_str[event], 0)); - break; - - case ARG_LITERAL: - { - int len = (unsigned char)s[1]; - arg = sv_2mortal(newSVpvn(s+2, len)); - if (SvUTF8(h->argspec)) - SvUTF8_on(arg); - s += len + 1; - } - break; - - case ARG_UNDEF: - arg = sv_mortalcopy(&PL_sv_undef); - break; - - default: - arg = sv_2mortal(newSVpvf("Bad argspec %d", *s)); - break; - } - - if (push_arg) { - if (!arg) - arg = sv_mortalcopy(&PL_sv_undef); - - if (array) { - /* have to fix mortality here or add mortality to - * XPUSHs after removing it from the switch cases. - */ - av_push(array, SvREFCNT_inc(arg)); - } - else { - XPUSHs(arg); - } - } - } - - if (array) { - if (array != (AV*)h->cb) - av_push((AV*)h->cb, newRV_noinc((SV*)array)); - } - else { - PUTBACK; - - if ((enum argcode)*argspec == ARG_SELF && !SvROK(h->cb)) { - char *method = SvPV(h->cb, my_na); - perl_call_method(method, G_DISCARD | G_EVAL | G_VOID); - } - else { - perl_call_sv(h->cb, G_DISCARD | G_EVAL | G_VOID); - } - - if (SvTRUE(ERRSV)) { - RETHROW; - } - - FREETMPS; - LEAVE; - } - if (p_state->skipped_text) - SvCUR_set(p_state->skipped_text, 0); - return; - -IGNORE_EVENT: - if (p_state->skipped_text) { - if (event != E_TEXT && p_state->pend_text && SvOK(p_state->pend_text)) - flush_pending_text(p_state, self); -#ifdef UNICODE_HTML_PARSER - if (utf8 && !SvUTF8(p_state->skipped_text)) - sv_utf8_upgrade(p_state->skipped_text); - if (utf8 || !SvUTF8(p_state->skipped_text)) { -#endif - sv_catpvn(p_state->skipped_text, beg, end - beg); -#ifdef UNICODE_HTML_PARSER - } - else { - SV *tmp = newSVpvn(beg, end - beg); - sv_utf8_upgrade(tmp); - sv_catsv(p_state->pend_text, tmp); - SvREFCNT_dec(tmp); - } -#endif - } -#undef CHR_DIST - return; -} - - -EXTERN SV* -argspec_compile(SV* src, PSTATE* p_state) -{ - dTHX; - SV* argspec = newSVpvn("", 0); - STRLEN len; - char *s = SvPV(src, len); - char *end = s + len; - - if (SvUTF8(src)) - SvUTF8_on(argspec); - - while (isHSPACE(*s)) - s++; - - if (*s == '@') { - /* try to deal with '@{ ... }' wrapping */ - char *tmp = s + 1; - while (isHSPACE(*tmp)) - tmp++; - if (*tmp == '{') { - char c = ARG_FLAG_FLAT_ARRAY; - sv_catpvn(argspec, &c, 1); - tmp++; - while (isHSPACE(*tmp)) - tmp++; - s = tmp; - } - } - while (s < end) { - if (isHNAME_FIRST(*s) || *s == '@') { - char *name = s; - int a = ARG_SELF; - char **arg_name; - - s++; - while (isHNAME_CHAR(*s)) - s++; - - /* check identifier */ - for ( arg_name = argname; a < ARG_LITERAL ; ++a, ++arg_name ) { - if (strnEQ(*arg_name, name, s - name) && - (*arg_name)[s - name] == '\0') - break; - } - if (a < ARG_LITERAL) { - char c = (unsigned char) a; - sv_catpvn(argspec, &c, 1); - - if (a == ARG_LINE || a == ARG_COLUMN) { - if (!p_state->line) - p_state->line = 1; /* enable tracing of line/column */ - } - if (a == ARG_SKIPPED_TEXT) { - if (!p_state->skipped_text) { - p_state->skipped_text = newSVpvn("", 0); - } - } - if (a == ARG_ATTR || a == ARG_ATTRARR || a == ARG_DTEXT) { - p_state->argspec_entity_decode++; - } - } - else { - croak("Unrecognized identifier %.*s in argspec", s - name, name); - } - } - else if (*s == '"' || *s == '\'') { - char *string_beg = s; - s++; - while (s < end && *s != *string_beg && *s != '\\') - s++; - if (*s == *string_beg) { - /* literal */ - int len = s - string_beg - 1; - unsigned char buf[2]; - if (len > 255) - croak("Literal string is longer than 255 chars in argspec"); - buf[0] = ARG_LITERAL; - buf[1] = len; - sv_catpvn(argspec, (char*)buf, 2); - sv_catpvn(argspec, string_beg+1, len); - s++; - } - else if (*s == '\\') { - croak("Backslash reserved for literal string in argspec"); - } - else { - croak("Unterminated literal string in argspec"); - } - } - else { - croak("Bad argspec (%s)", s); - } - - while (isHSPACE(*s)) - s++; - - if (*s == '}' && SvPVX(argspec)[0] == ARG_FLAG_FLAT_ARRAY) { - /* end of '@{ ... }' */ - s++; - while (isHSPACE(*s)) - s++; - if (s < end) - croak("Bad argspec: stuff after @{...} (%s)", s); - } - - if (s == end) - break; - if (*s != ',') { - croak("Missing comma separator in argspec"); - } - s++; - while (isHSPACE(*s)) - s++; - } - return argspec; -} - - -static void -flush_pending_text(PSTATE* p_state, SV* self) -{ - dTHX; - bool old_unbroken_text = p_state->unbroken_text; - SV* old_pend_text = p_state->pend_text; - bool old_is_cdata = p_state->is_cdata; - STRLEN old_offset = p_state->offset; - STRLEN old_line = p_state->line; - STRLEN old_column = p_state->column; - - assert(p_state->pend_text && SvOK(p_state->pend_text)); - - p_state->unbroken_text = 0; - p_state->pend_text = 0; - p_state->is_cdata = p_state->pend_text_is_cdata; - p_state->offset = p_state->pend_text_offset; - p_state->line = p_state->pend_text_line; - p_state->column = p_state->pend_text_column; - - report_event(p_state, E_TEXT, - SvPVX(old_pend_text), SvEND(old_pend_text), - SvUTF8(old_pend_text), 0, 0, self); - SvOK_off(old_pend_text); - - p_state->unbroken_text = old_unbroken_text; - p_state->pend_text = old_pend_text; - p_state->is_cdata = old_is_cdata; - p_state->offset = old_offset; - p_state->line = old_line; - p_state->column = old_column; -} - -static char* -skip_until_gt(char *beg, char *end) -{ - /* tries to emulate quote skipping behaviour observed in MSIE */ - char *s = beg; - char quote = '\0'; - char prev = ' '; - while (s < end) { - if (!quote && *s == '>') - return s; - if (*s == '"' || *s == '\'') { - if (*s == quote) { - quote = '\0'; /* end of quoted string */ - } - else if (!quote && (prev == ' ' || prev == '=')) { - quote = *s; - } - } - prev = *s++; - } - return end; -} - -static char* -parse_comment(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) -{ - char *s = beg; - - if (p_state->strict_comment) { - dTOKENS(4); - char *start_com = s; /* also used to signal inside/outside */ - - while (1) { - /* try to locate "--" */ - FIND_DASH_DASH: - /* printf("find_dash_dash: [%s]\n", s); */ - while (s < end && *s != '-' && *s != '>') - s++; - - if (s == end) { - FREE_TOKENS; - return beg; - } - - if (*s == '>') { - s++; - if (start_com) - goto FIND_DASH_DASH; - - /* we are done recognizing all comments, make callbacks */ - report_event(p_state, E_COMMENT, - beg - 4, s, utf8, - tokens, num_tokens, - self); - FREE_TOKENS; - - return s; - } - - s++; - if (s == end) { - FREE_TOKENS; - return beg; - } - - if (*s == '-') { - /* two dashes in a row seen */ - s++; - /* do something */ - if (start_com) { - PUSH_TOKEN(start_com, s-2); - start_com = 0; - } - else { - start_com = s; - } - } - } - } - else if (p_state->no_dash_dash_comment_end) { - token_pos_t token; - token.beg = beg; - /* a lone '>' signals end-of-comment */ - while (s < end && *s != '>') - s++; - token.end = s; - if (s < end) { - s++; - report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); - return s; - } - else { - return beg; - } - } - else { /* non-strict comment */ - token_pos_t token; - token.beg = beg; - /* try to locate /--\s*>/ which signals end-of-comment */ - LOCATE_END: - while (s < end && *s != '-') - s++; - token.end = s; - if (s < end) { - s++; - if (*s == '-') { - s++; - while (isHSPACE(*s)) - s++; - if (*s == '>') { - s++; - /* yup */ - report_event(p_state, E_COMMENT, beg-4, s, utf8, &token, 1, self); - return s; - } - } - if (s < end) { - s = token.end + 1; - goto LOCATE_END; - } - } - - if (s == end) - return beg; - } - - return 0; -} - - -#ifdef MARKED_SECTION - -static void -marked_section_update(PSTATE* p_state) -{ - dTHX; - /* we look at p_state->ms_stack to determine p_state->ms */ - AV* ms_stack = p_state->ms_stack; - p_state->ms = MS_NONE; - - if (ms_stack) { - int stack_len = av_len(ms_stack); - int stack_idx; - for (stack_idx = 0; stack_idx <= stack_len; stack_idx++) { - SV** svp = av_fetch(ms_stack, stack_idx, 0); - if (svp) { - AV* tokens = (AV*)SvRV(*svp); - int tokens_len = av_len(tokens); - int i; - assert(SvTYPE(tokens) == SVt_PVAV); - for (i = 0; i <= tokens_len; i++) { - SV** svp = av_fetch(tokens, i, 0); - if (svp) { - STRLEN len; - char *token_str = SvPV(*svp, len); - enum marked_section_t token; - if (strEQ(token_str, "include")) - token = MS_INCLUDE; - else if (strEQ(token_str, "rcdata")) - token = MS_RCDATA; - else if (strEQ(token_str, "cdata")) - token = MS_CDATA; - else if (strEQ(token_str, "ignore")) - token = MS_IGNORE; - else - token = MS_NONE; - if (p_state->ms < token) - p_state->ms = token; - } - } - } - } - } - /* printf("MS %d\n", p_state->ms); */ - p_state->is_cdata = (p_state->ms == MS_CDATA); - return; -} - - -static char* -parse_marked_section(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) -{ - dTHX; - char *s; - AV* tokens = 0; - - if (!p_state->marked_sections) - return 0; - - assert(beg[0] == '<'); - assert(beg[1] == '!'); - assert(beg[2] == '['); - s = beg + 3; - -FIND_NAMES: - while (isHSPACE(*s)) - s++; - while (isHNAME_FIRST(*s)) { - char *name_start = s; - char *name_end; - SV *name; - s++; - while (isHNAME_CHAR(*s)) - s++; - name_end = s; - while (isHSPACE(*s)) - s++; - if (s == end) - goto PREMATURE; - - if (!tokens) - tokens = newAV(); - name = newSVpvn(name_start, name_end - name_start); - if (utf8) - SvUTF8_on(name); - av_push(tokens, sv_lower(aTHX_ name)); - } - if (*s == '-') { - s++; - if (*s == '-') { - /* comment */ - s++; - while (1) { - while (s < end && *s != '-') - s++; - if (s == end) - goto PREMATURE; - - s++; /* skip first '-' */ - if (*s == '-') { - s++; - /* comment finished */ - goto FIND_NAMES; - } - } - } - else - goto FAIL; - - } - if (*s == '[') { - s++; - /* yup */ - - if (!tokens) { - tokens = newAV(); - av_push(tokens, newSVpvn("include", 7)); - } - - if (!p_state->ms_stack) - p_state->ms_stack = newAV(); - av_push(p_state->ms_stack, newRV_noinc((SV*)tokens)); - marked_section_update(p_state); - report_event(p_state, E_NONE, beg, s, utf8, 0, 0, self); - return s; - } - -FAIL: - SvREFCNT_dec(tokens); - return 0; /* not yet implemented */ - -PREMATURE: - SvREFCNT_dec(tokens); - return beg; -} -#endif - - -static char* -parse_decl(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) -{ - char *s = beg + 2; - - if (*s == '-') { - /* comment? */ - - char *tmp; - s++; - if (s == end) - return beg; - - if (*s != '-') - goto DECL_FAIL; /* nope, illegal */ - - /* yes, two dashes seen */ - s++; - - tmp = parse_comment(p_state, s, end, utf8, self); - return (tmp == s) ? beg : tmp; - } - -#ifdef MARKED_SECTION - if (*s == '[') { - /* marked section */ - char *tmp; - tmp = parse_marked_section(p_state, beg, end, utf8, self); - if (!tmp) - goto DECL_FAIL; - return tmp; - } -#endif - - if (*s == '>') { - /* make <!> into empty comment <SGML Handbook 36:32> */ - token_pos_t token; - token.beg = s; - token.end = s; - s++; - report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); - return s; - } - - if (isALPHA(*s)) { - dTOKENS(8); - char *decl_id = s; - STRLEN decl_id_len; - - s++; - /* declaration */ - while (s < end && isHNAME_CHAR(*s)) - s++; - decl_id_len = s - decl_id; - if (s == end) - goto PREMATURE; - - /* just hardcode a few names as the recognized declarations */ - if (!((decl_id_len == 7 && - strnEQx(decl_id, "DOCTYPE", 7, !CASE_SENSITIVE(p_state))) || - (decl_id_len == 6 && - strnEQx(decl_id, "ENTITY", 6, !CASE_SENSITIVE(p_state))) - ) - ) - { - goto FAIL; - } - - /* first word available */ - PUSH_TOKEN(decl_id, s); - - while (1) { - while (s < end && isHSPACE(*s)) - s++; - - if (s == end) - goto PREMATURE; - - if (*s == '"' || *s == '\'') { - char *str_beg = s; - s++; - while (s < end && *s != *str_beg) - s++; - if (s == end) - goto PREMATURE; - s++; - PUSH_TOKEN(str_beg, s); - } - else if (*s == '-') { - /* comment */ - char *com_beg = s; - s++; - if (s == end) - goto PREMATURE; - if (*s != '-') - goto FAIL; - s++; - - while (1) { - while (s < end && *s != '-') - s++; - if (s == end) - goto PREMATURE; - s++; - if (s == end) - goto PREMATURE; - if (*s == '-') { - s++; - PUSH_TOKEN(com_beg, s); - break; - } - } - } - else if (*s != '>') { - /* plain word */ - char *word_beg = s; - s++; - while (s < end && isHNOT_SPACE_GT(*s)) - s++; - if (s == end) - goto PREMATURE; - PUSH_TOKEN(word_beg, s); - } - else { - break; - } - } - - if (s == end) - goto PREMATURE; - if (*s == '>') { - s++; - report_event(p_state, E_DECLARATION, beg, s, utf8, tokens, num_tokens, self); - FREE_TOKENS; - return s; - } - - FAIL: - FREE_TOKENS; - goto DECL_FAIL; - - PREMATURE: - FREE_TOKENS; - return beg; - - } - -DECL_FAIL: - if (p_state->strict_comment) - return 0; - - /* consider everything up to the first '>' a comment */ - while (s < end && *s != '>') - s++; - if (s < end) { - token_pos_t token; - token.beg = beg + 2; - token.end = s; - s++; - report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); - return s; - } - else { - return beg; - } -} - - -static char* -parse_start(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) -{ - char *s = beg; - int empty_tag = 0; - dTOKENS(16); - - hctype_t tag_name_first, tag_name_char; - hctype_t attr_name_first, attr_name_char; - - if (STRICT_NAMES(p_state)) { - tag_name_first = attr_name_first = HCTYPE_NAME_FIRST; - tag_name_char = attr_name_char = HCTYPE_NAME_CHAR; - } - else { - tag_name_first = tag_name_char = HCTYPE_NOT_SPACE_GT; - attr_name_first = HCTYPE_NOT_SPACE_GT; - attr_name_char = HCTYPE_NOT_SPACE_EQ_GT; - } - - s += 2; - - while (s < end && isHCTYPE(*s, tag_name_char)) { - if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { - if ((s + 1) == end) - goto PREMATURE; - if (*(s + 1) == '>') - break; - } - s++; - } - PUSH_TOKEN(beg+1, s); /* tagname */ - - while (isHSPACE(*s)) - s++; - if (s == end) - goto PREMATURE; - - while (isHCTYPE(*s, attr_name_first)) { - /* attribute */ - char *attr_name_beg = s; - char *attr_name_end; - if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { - if ((s + 1) == end) - goto PREMATURE; - if (*(s + 1) == '>') - break; - } - s++; - while (s < end && isHCTYPE(*s, attr_name_char)) { - if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { - if ((s + 1) == end) - goto PREMATURE; - if (*(s + 1) == '>') - break; - } - s++; - } - if (s == end) - goto PREMATURE; - - attr_name_end = s; - PUSH_TOKEN(attr_name_beg, attr_name_end); /* attr name */ - - while (isHSPACE(*s)) - s++; - if (s == end) - goto PREMATURE; - - if (*s == '=') { - /* with a value */ - s++; - while (isHSPACE(*s)) - s++; - if (s == end) - goto PREMATURE; - if (*s == '>') { - /* parse it similar to ="" */ - PUSH_TOKEN(s, s); - break; - } - if (*s == '"' || *s == '\'') { - char *str_beg = s; - s++; - while (s < end && *s != *str_beg) - s++; - if (s == end) - goto PREMATURE; - s++; - PUSH_TOKEN(str_beg, s); - } - else { - char *word_start = s; - while (s < end && isHNOT_SPACE_GT(*s)) { - if (*s == '/' && ALLOW_EMPTY_TAG(p_state)) { - if ((s + 1) == end) - goto PREMATURE; - if (*(s + 1) == '>') - break; - } - s++; - } - if (s == end) - goto PREMATURE; - PUSH_TOKEN(word_start, s); - } - while (isHSPACE(*s)) - s++; - if (s == end) - goto PREMATURE; - } - else { - PUSH_TOKEN(0, 0); /* boolean attr value */ - } - } - - if (ALLOW_EMPTY_TAG(p_state) && *s == '/') { - s++; - if (s == end) - goto PREMATURE; - empty_tag = 1; - } - - if (*s == '>') { - s++; - /* done */ - report_event(p_state, E_START, beg, s, utf8, tokens, num_tokens, self); - if (empty_tag) { - report_event(p_state, E_END, s, s, utf8, tokens, 1, self); - } - else if (!p_state->xml_mode) { - /* find out if this start tag should put us into literal_mode - */ - int i; - int tag_len = tokens[0].end - tokens[0].beg; - - for (i = 0; literal_mode_elem[i].len; i++) { - if (tag_len == literal_mode_elem[i].len) { - /* try to match it */ - char *s = beg + 1; - char *t = literal_mode_elem[i].str; - int len = tag_len; - while (len) { - if (toLOWER(*s) != *t) - break; - s++; - t++; - if (!--len) { - /* found it */ - p_state->literal_mode = literal_mode_elem[i].str; - p_state->is_cdata = literal_mode_elem[i].is_cdata; - /* printf("Found %s\n", p_state->literal_mode); */ - goto END_OF_LITERAL_SEARCH; - } - } - } - } - END_OF_LITERAL_SEARCH: - ; - } - - FREE_TOKENS; - return s; - } - - FREE_TOKENS; - return 0; - -PREMATURE: - FREE_TOKENS; - return beg; -} - - -static char* -parse_end(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) -{ - char *s = beg+2; - hctype_t name_first, name_char; - - if (STRICT_NAMES(p_state)) { - name_first = HCTYPE_NAME_FIRST; - name_char = HCTYPE_NAME_CHAR; - } - else { - name_first = name_char = HCTYPE_NOT_SPACE_GT; - } - - if (isHCTYPE(*s, name_first)) { - token_pos_t tagname; - tagname.beg = s; - s++; - while (s < end && isHCTYPE(*s, name_char)) - s++; - tagname.end = s; - - if (p_state->strict_end) { - while (isHSPACE(*s)) - s++; - } - else { - s = skip_until_gt(s, end); - } - if (s < end) { - if (*s == '>') { - s++; - /* a complete end tag has been recognized */ - report_event(p_state, E_END, beg, s, utf8, &tagname, 1, self); - return s; - } - } - else { - return beg; - } - } - else if (!p_state->strict_comment) { - s = skip_until_gt(s, end); - if (s < end) { - token_pos_t token; - token.beg = beg + 2; - token.end = s; - s++; - report_event(p_state, E_COMMENT, beg, s, utf8, &token, 1, self); - return s; - } - else { - return beg; - } - } - return 0; -} - - -static char* -parse_process(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) -{ - char *s = beg + 2; /* skip '<?' */ - /* processing instruction */ - token_pos_t token_pos; - token_pos.beg = s; - - while (s < end) { - if (*s == '>') { - token_pos.end = s; - s++; - - if (p_state->xml_mode || p_state->xml_pic) { - /* XML processing instructions are ended by "?>" */ - if (s - beg < 4 || s[-2] != '?') - continue; - token_pos.end = s - 2; - } - - /* a complete processing instruction seen */ - report_event(p_state, E_PROCESS, beg, s, utf8, - &token_pos, 1, self); - return s; - } - s++; - } - return beg; /* could not fix end */ -} - - -#ifdef USE_PFUNC -static char* -parse_null(PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) -{ - return 0; -} - - - -#include "pfunc.h" /* declares the parsefunc[] */ -#endif /* USE_PFUNC */ - -static char* -parse_buf(pTHX_ PSTATE* p_state, char *beg, char *end, U32 utf8, SV* self) -{ - char *s = beg; - char *t = beg; - char *new_pos; - - while (!p_state->eof) { - /* - * At the start of this loop we will always be ready for eating text - * or a new tag. We will never be inside some tag. The 't' points - * to where we started and the 's' is advanced as we go. - */ - - while (p_state->literal_mode) { - char *l = p_state->literal_mode; - bool skip_quoted_end = (strEQ(l, "script") || strEQ(l, "style")); - char inside_quote = 0; - bool escape_next = 0; - char *end_text; - - while (s < end) { - if (*s == '<' && !inside_quote) - break; - if (skip_quoted_end) { - if (escape_next) { - escape_next = 0; - } - else { - if (*s == '\\') - escape_next = 1; - else if (inside_quote && *s == inside_quote) - inside_quote = 0; - else if (*s == '\r' || *s == '\n') - inside_quote = 0; - else if (!inside_quote && (*s == '"' || *s == '\'')) - inside_quote = *s; - } - } - s++; - } - - if (s == end) { - s = t; - goto DONE; - } - - end_text = s; - s++; - - /* here we rely on '\0' termination of perl svpv buffers */ - if (*s == '/') { - s++; - while (*l && toLOWER(*s) == *l) { - s++; - l++; - } - - if (!*l && (strNE(p_state->literal_mode, "plaintext") || p_state->closing_plaintext)) { - /* matched it all */ - token_pos_t end_token; - end_token.beg = end_text + 2; - end_token.end = s; - - while (isHSPACE(*s)) - s++; - if (*s == '>') { - s++; - if (t != end_text) - report_event(p_state, E_TEXT, t, end_text, utf8, - 0, 0, self); - report_event(p_state, E_END, end_text, s, utf8, - &end_token, 1, self); - p_state->literal_mode = 0; - p_state->is_cdata = 0; - t = s; - } - } - } - } - -#ifdef MARKED_SECTION - while (p_state->ms == MS_CDATA || p_state->ms == MS_RCDATA) { - while (s < end && *s != ']') - s++; - if (*s == ']') { - char *end_text = s; - s++; - if (*s == ']' && *(s + 1) == '>') { - s += 2; - /* marked section end */ - if (t != end_text) - report_event(p_state, E_TEXT, t, end_text, utf8, - 0, 0, self); - report_event(p_state, E_NONE, end_text, s, utf8, 0, 0, self); - t = s; - SvREFCNT_dec(av_pop(p_state->ms_stack)); - marked_section_update(p_state); - continue; - } - } - if (s == end) { - s = t; - goto DONE; - } - } -#endif - - /* first we try to match as much text as possible */ - while (s < end && *s != '<') { -#ifdef MARKED_SECTION - if (p_state->ms && *s == ']') { - char *end_text = s; - s++; - if (*s == ']') { - s++; - if (*s == '>') { - s++; - report_event(p_state, E_TEXT, t, end_text, utf8, - 0, 0, self); - report_event(p_state, E_NONE, end_text, s, utf8, - 0, 0, self); - t = s; - SvREFCNT_dec(av_pop(p_state->ms_stack)); - marked_section_update(p_state); - continue; - } - } - } -#endif - s++; - } - if (s != t) { - if (*s == '<') { - report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self); - t = s; - } - else { - s--; - if (isHSPACE(*s)) { - /* wait with white space at end */ - while (s >= t && isHSPACE(*s)) - s--; - } - else { - /* might be a chopped up entities/words */ - while (s >= t && !isHSPACE(*s)) - s--; - while (s >= t && isHSPACE(*s)) - s--; - } - s++; - if (s != t) - report_event(p_state, E_TEXT, t, s, utf8, 0, 0, self); - break; - } - } - - if (end - s < 3) - break; - - /* next char is known to be '<' and pointed to by 't' as well as 's' */ - s++; - -#ifdef USE_PFUNC - new_pos = parsefunc[(unsigned char)*s](p_state, t, end, utf8, self); -#else - if (isHNAME_FIRST(*s)) - new_pos = parse_start(p_state, t, end, utf8, self); - else if (*s == '/') - new_pos = parse_end(p_state, t, end, utf8, self); - else if (*s == '!') - new_pos = parse_decl(p_state, t, end, utf8, self); - else if (*s == '?') - new_pos = parse_process(p_state, t, end, utf8, self); - else - new_pos = 0; -#endif /* USE_PFUNC */ - - if (new_pos) { - if (new_pos == t) { - /* no progress, need more data to know what it is */ - s = t; - break; - } - t = s = new_pos; - } - - /* if we get out here then this was not a conforming tag, so - * treat it is plain text at the top of the loop again (we - * have already skipped past the "<"). - */ - } - -DONE: - return s; - -} - -EXTERN void -parse(pTHX_ - PSTATE* p_state, - SV* chunk, - SV* self) -{ - char *s, *beg, *end; - U32 utf8 = 0; - STRLEN len; - - if (!p_state->start_document) { - char dummy[1]; - report_event(p_state, E_START_DOCUMENT, dummy, dummy, 0, 0, 0, self); - p_state->start_document = 1; - } - - if (!chunk) { - /* eof */ - char empty[1]; - if (p_state->buf && SvOK(p_state->buf)) { - /* flush it */ - s = SvPV(p_state->buf, len); - end = s + len; - utf8 = SvUTF8(p_state->buf); - assert(len); - - while (s < end) { - if (p_state->literal_mode) { - if (strEQ(p_state->literal_mode, "plaintext") || - strEQ(p_state->literal_mode, "xmp") || - strEQ(p_state->literal_mode, "textarea")) - { - /* rest is considered text */ - break; - } - if (strEQ(p_state->literal_mode, "script") || - strEQ(p_state->literal_mode, "style")) - { - /* effectively make it an empty element */ - token_pos_t t; - char dummy; - t.beg = p_state->literal_mode; - t.end = p_state->literal_mode + strlen(p_state->literal_mode); - report_event(p_state, E_END, &dummy, &dummy, 0, &t, 1, self); - } - else { - p_state->pending_end_tag = p_state->literal_mode; - } - p_state->literal_mode = 0; - s = parse_buf(aTHX_ p_state, s, end, utf8, self); - continue; - } - - if (!p_state->strict_comment && !p_state->no_dash_dash_comment_end && *s == '<') { - p_state->no_dash_dash_comment_end = 1; - s = parse_buf(aTHX_ p_state, s, end, utf8, self); - continue; - } - - if (!p_state->strict_comment && *s == '<') { - char *s1 = s + 1; - if (s1 == end || isHNAME_FIRST(*s1) || *s1 == '/' || *s1 == '!' || *s1 == '?') { - /* some kind of unterminated markup. Report rest as as comment */ - token_pos_t token; - token.beg = s + 1; - token.end = end; - report_event(p_state, E_COMMENT, s, end, utf8, &token, 1, self); - s = end; - } - } - - break; - } - - if (s < end) { - /* report rest as text */ - report_event(p_state, E_TEXT, s, end, utf8, 0, 0, self); - } - - SvREFCNT_dec(p_state->buf); - p_state->buf = 0; - } - if (p_state->pend_text && SvOK(p_state->pend_text)) - flush_pending_text(p_state, self); - - if (p_state->ignoring_element) { - /* document not balanced */ - SvREFCNT_dec(p_state->ignoring_element); - p_state->ignoring_element = 0; - } - report_event(p_state, E_END_DOCUMENT, empty, empty, 0, 0, 0, self); - - /* reset state */ - p_state->offset = 0; - if (p_state->line) - p_state->line = 1; - p_state->column = 0; - p_state->start_document = 0; - p_state->literal_mode = 0; - p_state->is_cdata = 0; - return; - } - -#ifdef UNICODE_HTML_PARSER - if (p_state->utf8_mode) - sv_utf8_downgrade(chunk, 0); -#endif - - if (p_state->buf && SvOK(p_state->buf)) { - sv_catsv(p_state->buf, chunk); - beg = SvPV(p_state->buf, len); - utf8 = SvUTF8(p_state->buf); - } - else { - beg = SvPV(chunk, len); - utf8 = SvUTF8(chunk); - if (p_state->offset == 0 && DOWARN) { - /* Print warnings if we find unexpected Unicode BOM forms */ -#ifdef UNICODE_HTML_PARSER - if (p_state->argspec_entity_decode && - !p_state->utf8_mode && ( - (!utf8 && len >= 3 && strnEQ(beg, "\xEF\xBB\xBF", 3)) || - (utf8 && len >= 6 && strnEQ(beg, "\xC3\xAF\xC2\xBB\xC2\xBF", 6)) || - (!utf8 && probably_utf8_chunk(aTHX_ beg, len)) - ) - ) - { - warn("Parsing of undecoded UTF-8 will give garbage when decoding entities"); - } - if (utf8 && len >= 2 && strnEQ(beg, "\xFF\xFE", 2)) { - warn("Parsing string decoded with wrong endianess"); - } -#endif - if (!utf8 && len >= 4 && - (strnEQ(beg, "\x00\x00\xFE\xFF", 4) || - strnEQ(beg, "\xFE\xFF\x00\x00", 4)) - ) - { - warn("Parsing of undecoded UTF-32"); - } - else if (!utf8 && len >= 2 && - (strnEQ(beg, "\xFE\xFF", 2) || strnEQ(beg, "\xFF\xFE", 2)) - ) - { - warn("Parsing of undecoded UTF-16"); - } - } - } - - if (!len) - return; /* nothing to do */ - - end = beg + len; - s = parse_buf(aTHX_ p_state, beg, end, utf8, self); - - if (s == end || p_state->eof) { - if (p_state->buf) { - SvOK_off(p_state->buf); - } - } - else { - /* need to keep rest in buffer */ - if (p_state->buf) { - /* chop off some chars at the beginning */ - if (SvOK(p_state->buf)) { - sv_chop(p_state->buf, s); - } - else { - sv_setpvn(p_state->buf, s, end - s); - if (utf8) - SvUTF8_on(p_state->buf); - else - SvUTF8_off(p_state->buf); - } - } - else { - p_state->buf = newSVpv(s, end - s); - if (utf8) - SvUTF8_on(p_state->buf); - } - } - return; -} diff --git a/ext/HTML/Parser/hparser.h b/ext/HTML/Parser/hparser.h deleted file mode 100644 index e7d1ee823b..0000000000 --- a/ext/HTML/Parser/hparser.h +++ /dev/null @@ -1,132 +0,0 @@ -/* $Id: hparser.h,v 2.34 2006/04/26 07:01:10 gisle Exp $ - * - * Copyright 1999-2005, Gisle Aas - * Copyright 1999-2000, Michael A. Chase - * - * This library is free software; you can redistribute it and/or - * modify it under the same terms as Perl itself. - */ - -/* - * Declare various structures and constants. The main thing - * is 'struct p_state' that contains various fields to represent - * the state of the parser. - */ - -#ifdef MARKED_SECTION - -enum marked_section_t { - MS_NONE = 0, - MS_INCLUDE, - MS_RCDATA, - MS_CDATA, - MS_IGNORE -}; - -#endif /* MARKED_SECTION */ - - -#define P_SIGNATURE 0x16091964 /* tag struct p_state for safer cast */ - -enum event_id { - E_DECLARATION = 0, - E_COMMENT, - E_START, - E_END, - E_TEXT, - E_PROCESS, - E_START_DOCUMENT, - E_END_DOCUMENT, - E_DEFAULT, - /**/ - EVENT_COUNT, - E_NONE /* used for reporting skipped text (non-events) */ -}; -typedef enum event_id event_id_t; - -/* must match event_id_t above */ -static char* event_id_str[] = { - "declaration", - "comment", - "start", - "end", - "text", - "process", - "start_document", - "end_document", - "default", -}; - -struct p_handler { - SV* cb; - SV* argspec; -}; - -struct p_state { - U32 signature; - - /* state */ - SV* buf; - STRLEN offset; - STRLEN line; - STRLEN column; - bool start_document; - bool parsing; - bool eof; - - /* special parsing modes */ - char* literal_mode; - bool is_cdata; - bool no_dash_dash_comment_end; - char *pending_end_tag; - - /* unbroken_text option needs a buffer of pending text */ - SV* pend_text; - bool pend_text_is_cdata; - STRLEN pend_text_offset; - STRLEN pend_text_line; - STRLEN pend_text_column; - - /* skipped text is accumulated here */ - SV* skipped_text; - -#ifdef MARKED_SECTION - /* marked section support */ - enum marked_section_t ms; - AV* ms_stack; - bool marked_sections; -#endif - - /* various boolean configuration attributes */ - bool strict_comment; - bool strict_names; - bool strict_end; - bool xml_mode; - bool unbroken_text; - bool attr_encoded; - bool case_sensitive; - bool closing_plaintext; - bool utf8_mode; - bool empty_element_tags; - bool xml_pic; - - /* other configuration stuff */ - SV* bool_attr_val; - struct p_handler handlers[EVENT_COUNT]; - bool argspec_entity_decode; - - /* filters */ - HV* report_tags; - HV* ignore_tags; - HV* ignore_elements; - - /* these are set when we are currently inside an element we want to ignore */ - SV* ignoring_element; - int ignore_depth; - - /* cache */ - HV* entity2char; /* %HTML::Entities::entity2char */ - SV* tmp; -}; -typedef struct p_state PSTATE; - diff --git a/ext/HTML/Parser/lib/HTML/Entities.pm b/ext/HTML/Parser/lib/HTML/Entities.pm deleted file mode 100644 index 1e7dfc1f06..0000000000 --- a/ext/HTML/Parser/lib/HTML/Entities.pm +++ /dev/null @@ -1,491 +0,0 @@ -package HTML::Entities; - -# $Id: Entities.pm,v 1.35 2006/03/22 09:15:23 gisle Exp $ - -=head1 NAME - -HTML::Entities - Encode or decode strings with HTML entities - -=head1 SYNOPSIS - - use HTML::Entities; - - $a = "Våre norske tegn bør æres"; - decode_entities($a); - encode_entities($a, "\200-\377"); - -For example, this: - - $input = "vis-à-vis Beyoncé's naïve\npapier-mâché résumé"; - print encode_entities($input), "\n" - -Prints this out: - - vis-à-vis Beyoncé's naïve - papier-mâché résumé - -=head1 DESCRIPTION - -This module deals with encoding and decoding of strings with HTML -character entities. The module provides the following functions: - -=over 4 - -=item decode_entities( $string, ... ) - -This routine replaces HTML entities found in the $string with the -corresponding Unicode character. Under perl 5.6 and earlier only -characters in the Latin-1 range are replaced. Unrecognized -entities are left alone. - -If multiple strings are provided as argument they are each decoded -separately and the same number of strings are returned. - -If called in void context the arguments are decoded in-place. - -This routine is exported by default. - -=item _decode_entities( $string, \%entity2char ) - -=item _decode_entities( $string, \%entity2char, $expand_prefix ) - -This will in-place replace HTML entities in $string. The %entity2char -hash must be provided. Named entities not found in the %entity2char -hash are left alone. Numeric entities are expanded unless their value -overflow. - -The keys in %entity2char are the entity names to be expanded and their -values are what they should expand into. The values do not have to be -single character strings. If a key has ";" as suffix, -then occurrences in $string are only expanded if properly terminated -with ";". Entities without ";" will be expanded regardless of how -they are terminated for compatiblity with how common browsers treat -entities in the Latin-1 range. - -If $expand_prefix is TRUE then entities without trailing ";" in -%entity2char will even be expanded as a prefix of a longer -unrecognized name. The longest matching name in %entity2char will be -used. This is mainly present for compatibility with an MSIE -misfeature. - - $string = "foo bar"; - _decode_entities($string, { nb => "@", nbsp => "\xA0" }, 1); - print $string; # will print "foo bar" - -This routine is exported by default. - -=item encode_entities( $string ) - -=item encode_entities( $string, $unsafe_chars ) - -This routine replaces unsafe characters in $string with their entity -representation. A second argument can be given to specify which -characters to consider unsafe (i.e., which to escape). The default set -of characters to encode are control chars, high-bit chars, and the -C<< < >>, C<< & >>, C<< > >>, C<< ' >> and C<< " >> -characters. But this, for example, would encode I<just> the -C<< < >>, C<< & >>, C<< > >>, and C<< " >> characters: - - $encoded = encode_entities($input, '<>&"'); - -This routine is exported by default. - -=item encode_entities_numeric( $string ) - -=item encode_entities_numeric( $string, $unsafe_chars ) - -This routine works just like encode_entities, except that the replacement -entities are always C<&#xI<hexnum>;> and never C<&I<entname>;>. For -example, C<encode_entities("r\xF4le")> returns "rôle", but -C<encode_entities_numeric("r\xF4le")> returns "rôle". - -This routine is I<not> exported by default. But you can always -export it with C<use HTML::Entities qw(encode_entities_numeric);> -or even C<use HTML::Entities qw(:DEFAULT encode_entities_numeric);> - -=back - -All these routines modify the string passed as the first argument, if -called in a void context. In scalar and array contexts, the encoded or -decoded string is returned (without changing the input string). - -If you prefer not to import these routines into your namespace, you can -call them as: - - use HTML::Entities (); - $decoded = HTML::Entities::decode($a); - $encoded = HTML::Entities::encode($a); - $encoded = HTML::Entities::encode_numeric($a); - -The module can also export the %char2entity and the %entity2char -hashes, which contain the mapping from all characters to the -corresponding entities (and vice versa, respectively). - -=head1 COPYRIGHT - -Copyright 1995-2006 Gisle Aas. All rights reserved. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut - -use strict; -use vars qw(@ISA @EXPORT @EXPORT_OK $VERSION); -use vars qw(%entity2char %char2entity); - -require 5.004; -require Exporter; -@ISA = qw(Exporter); - -@EXPORT = qw(encode_entities decode_entities _decode_entities); -@EXPORT_OK = qw(%entity2char %char2entity encode_entities_numeric); - -$VERSION = sprintf("%d.%02d", q$Revision: 1.35 $ =~ /(\d+)\.(\d+)/); -sub Version { $VERSION; } - -require HTML::Parser; # for fast XS implemented decode_entities - - -%entity2char = ( - # Some normal chars that have special meaning in SGML context - amp => '&', # ampersand -'gt' => '>', # greater than -'lt' => '<', # less than - quot => '"', # double quote - apos => "'", # single quote - - # PUBLIC ISO 8879-1986//ENTITIES Added Latin 1//EN//HTML - AElig => chr(198), # capital AE diphthong (ligature) - Aacute => chr(193), # capital A, acute accent - Acirc => chr(194), # capital A, circumflex accent - Agrave => chr(192), # capital A, grave accent - Aring => chr(197), # capital A, ring - Atilde => chr(195), # capital A, tilde - Auml => chr(196), # capital A, dieresis or umlaut mark - Ccedil => chr(199), # capital C, cedilla - ETH => chr(208), # capital Eth, Icelandic - Eacute => chr(201), # capital E, acute accent - Ecirc => chr(202), # capital E, circumflex accent - Egrave => chr(200), # capital E, grave accent - Euml => chr(203), # capital E, dieresis or umlaut mark - Iacute => chr(205), # capital I, acute accent - Icirc => chr(206), # capital I, circumflex accent - Igrave => chr(204), # capital I, grave accent - Iuml => chr(207), # capital I, dieresis or umlaut mark - Ntilde => chr(209), # capital N, tilde - Oacute => chr(211), # capital O, acute accent - Ocirc => chr(212), # capital O, circumflex accent - Ograve => chr(210), # capital O, grave accent - Oslash => chr(216), # capital O, slash - Otilde => chr(213), # capital O, tilde - Ouml => chr(214), # capital O, dieresis or umlaut mark - THORN => chr(222), # capital THORN, Icelandic - Uacute => chr(218), # capital U, acute accent - Ucirc => chr(219), # capital U, circumflex accent - Ugrave => chr(217), # capital U, grave accent - Uuml => chr(220), # capital U, dieresis or umlaut mark - Yacute => chr(221), # capital Y, acute accent - aacute => chr(225), # small a, acute accent - acirc => chr(226), # small a, circumflex accent - aelig => chr(230), # small ae diphthong (ligature) - agrave => chr(224), # small a, grave accent - aring => chr(229), # small a, ring - atilde => chr(227), # small a, tilde - auml => chr(228), # small a, dieresis or umlaut mark - ccedil => chr(231), # small c, cedilla - eacute => chr(233), # small e, acute accent - ecirc => chr(234), # small e, circumflex accent - egrave => chr(232), # small e, grave accent - eth => chr(240), # small eth, Icelandic - euml => chr(235), # small e, dieresis or umlaut mark - iacute => chr(237), # small i, acute accent - icirc => chr(238), # small i, circumflex accent - igrave => chr(236), # small i, grave accent - iuml => chr(239), # small i, dieresis or umlaut mark - ntilde => chr(241), # small n, tilde - oacute => chr(243), # small o, acute accent - ocirc => chr(244), # small o, circumflex accent - ograve => chr(242), # small o, grave accent - oslash => chr(248), # small o, slash - otilde => chr(245), # small o, tilde - ouml => chr(246), # small o, dieresis or umlaut mark - szlig => chr(223), # small sharp s, German (sz ligature) - thorn => chr(254), # small thorn, Icelandic - uacute => chr(250), # small u, acute accent - ucirc => chr(251), # small u, circumflex accent - ugrave => chr(249), # small u, grave accent - uuml => chr(252), # small u, dieresis or umlaut mark - yacute => chr(253), # small y, acute accent - yuml => chr(255), # small y, dieresis or umlaut mark - - # Some extra Latin 1 chars that are listed in the HTML3.2 draft (21-May-96) - copy => chr(169), # copyright sign - reg => chr(174), # registered sign - nbsp => chr(160), # non breaking space - - # Additional ISO-8859/1 entities listed in rfc1866 (section 14) - iexcl => chr(161), - cent => chr(162), - pound => chr(163), - curren => chr(164), - yen => chr(165), - brvbar => chr(166), - sect => chr(167), - uml => chr(168), - ordf => chr(170), - laquo => chr(171), -'not' => chr(172), # not is a keyword in perl - shy => chr(173), - macr => chr(175), - deg => chr(176), - plusmn => chr(177), - sup1 => chr(185), - sup2 => chr(178), - sup3 => chr(179), - acute => chr(180), - micro => chr(181), - para => chr(182), - middot => chr(183), - cedil => chr(184), - ordm => chr(186), - raquo => chr(187), - frac14 => chr(188), - frac12 => chr(189), - frac34 => chr(190), - iquest => chr(191), -'times' => chr(215), # times is a keyword in perl - divide => chr(247), - - ( $] > 5.007 ? ( - 'OElig;' => chr(338), - 'oelig;' => chr(339), - 'Scaron;' => chr(352), - 'scaron;' => chr(353), - 'Yuml;' => chr(376), - 'fnof;' => chr(402), - 'circ;' => chr(710), - 'tilde;' => chr(732), - 'Alpha;' => chr(913), - 'Beta;' => chr(914), - 'Gamma;' => chr(915), - 'Delta;' => chr(916), - 'Epsilon;' => chr(917), - 'Zeta;' => chr(918), - 'Eta;' => chr(919), - 'Theta;' => chr(920), - 'Iota;' => chr(921), - 'Kappa;' => chr(922), - 'Lambda;' => chr(923), - 'Mu;' => chr(924), - 'Nu;' => chr(925), - 'Xi;' => chr(926), - 'Omicron;' => chr(927), - 'Pi;' => chr(928), - 'Rho;' => chr(929), - 'Sigma;' => chr(931), - 'Tau;' => chr(932), - 'Upsilon;' => chr(933), - 'Phi;' => chr(934), - 'Chi;' => chr(935), - 'Psi;' => chr(936), - 'Omega;' => chr(937), - 'alpha;' => chr(945), - 'beta;' => chr(946), - 'gamma;' => chr(947), - 'delta;' => chr(948), - 'epsilon;' => chr(949), - 'zeta;' => chr(950), - 'eta;' => chr(951), - 'theta;' => chr(952), - 'iota;' => chr(953), - 'kappa;' => chr(954), - 'lambda;' => chr(955), - 'mu;' => chr(956), - 'nu;' => chr(957), - 'xi;' => chr(958), - 'omicron;' => chr(959), - 'pi;' => chr(960), - 'rho;' => chr(961), - 'sigmaf;' => chr(962), - 'sigma;' => chr(963), - 'tau;' => chr(964), - 'upsilon;' => chr(965), - 'phi;' => chr(966), - 'chi;' => chr(967), - 'psi;' => chr(968), - 'omega;' => chr(969), - 'thetasym;' => chr(977), - 'upsih;' => chr(978), - 'piv;' => chr(982), - 'ensp;' => chr(8194), - 'emsp;' => chr(8195), - 'thinsp;' => chr(8201), - 'zwnj;' => chr(8204), - 'zwj;' => chr(8205), - 'lrm;' => chr(8206), - 'rlm;' => chr(8207), - 'ndash;' => chr(8211), - 'mdash;' => chr(8212), - 'lsquo;' => chr(8216), - 'rsquo;' => chr(8217), - 'sbquo;' => chr(8218), - 'ldquo;' => chr(8220), - 'rdquo;' => chr(8221), - 'bdquo;' => chr(8222), - 'dagger;' => chr(8224), - 'Dagger;' => chr(8225), - 'bull;' => chr(8226), - 'hellip;' => chr(8230), - 'permil;' => chr(8240), - 'prime;' => chr(8242), - 'Prime;' => chr(8243), - 'lsaquo;' => chr(8249), - 'rsaquo;' => chr(8250), - 'oline;' => chr(8254), - 'frasl;' => chr(8260), - 'euro;' => chr(8364), - 'image;' => chr(8465), - 'weierp;' => chr(8472), - 'real;' => chr(8476), - 'trade;' => chr(8482), - 'alefsym;' => chr(8501), - 'larr;' => chr(8592), - 'uarr;' => chr(8593), - 'rarr;' => chr(8594), - 'darr;' => chr(8595), - 'harr;' => chr(8596), - 'crarr;' => chr(8629), - 'lArr;' => chr(8656), - 'uArr;' => chr(8657), - 'rArr;' => chr(8658), - 'dArr;' => chr(8659), - 'hArr;' => chr(8660), - 'forall;' => chr(8704), - 'part;' => chr(8706), - 'exist;' => chr(8707), - 'empty;' => chr(8709), - 'nabla;' => chr(8711), - 'isin;' => chr(8712), - 'notin;' => chr(8713), - 'ni;' => chr(8715), - 'prod;' => chr(8719), - 'sum;' => chr(8721), - 'minus;' => chr(8722), - 'lowast;' => chr(8727), - 'radic;' => chr(8730), - 'prop;' => chr(8733), - 'infin;' => chr(8734), - 'ang;' => chr(8736), - 'and;' => chr(8743), - 'or;' => chr(8744), - 'cap;' => chr(8745), - 'cup;' => chr(8746), - 'int;' => chr(8747), - 'there4;' => chr(8756), - 'sim;' => chr(8764), - 'cong;' => chr(8773), - 'asymp;' => chr(8776), - 'ne;' => chr(8800), - 'equiv;' => chr(8801), - 'le;' => chr(8804), - 'ge;' => chr(8805), - 'sub;' => chr(8834), - 'sup;' => chr(8835), - 'nsub;' => chr(8836), - 'sube;' => chr(8838), - 'supe;' => chr(8839), - 'oplus;' => chr(8853), - 'otimes;' => chr(8855), - 'perp;' => chr(8869), - 'sdot;' => chr(8901), - 'lceil;' => chr(8968), - 'rceil;' => chr(8969), - 'lfloor;' => chr(8970), - 'rfloor;' => chr(8971), - 'lang;' => chr(9001), - 'rang;' => chr(9002), - 'loz;' => chr(9674), - 'spades;' => chr(9824), - 'clubs;' => chr(9827), - 'hearts;' => chr(9829), - 'diams;' => chr(9830), - ) : ()) -); - - -# Make the opposite mapping -while (my($entity, $char) = each(%entity2char)) { - $entity =~ s/;\z//; - $char2entity{$char} = "&$entity;"; -} -delete $char2entity{"'"}; # only one-way decoding - -# Fill in missing entities -for (0 .. 255) { - next if exists $char2entity{chr($_)}; - $char2entity{chr($_)} = "&#$_;"; -} - -my %subst; # compiled encoding regexps - -sub decode_entities_old -{ - my $array; - if (defined wantarray) { - $array = [@_]; # copy - } else { - $array = \@_; # modify in-place - } - my $c; - for (@$array) { - s/(&\#(\d+);?)/$2 < 256 ? chr($2) : $1/eg; - s/(&\#[xX]([0-9a-fA-F]+);?)/$c = hex($2); $c < 256 ? chr($c) : $1/eg; - s/(&(\w+);?)/$entity2char{$2} || $1/eg; - } - wantarray ? @$array : $array->[0]; -} - -sub encode_entities -{ - my $ref; - if (defined wantarray) { - my $x = $_[0]; - $ref = \$x; # copy - } else { - $ref = \$_[0]; # modify in-place - } - if (defined $_[1] and length $_[1]) { - unless (exists $subst{$_[1]}) { - # Because we can't compile regex we fake it with a cached sub - my $code = "sub {\$_[0] =~ s/([$_[1]])/\$char2entity{\$1} || num_entity(\$1)/ge; }"; - $subst{$_[1]} = eval $code; - die( $@ . " while trying to turn range: \"$_[1]\"\n " - . "into code: $code\n " - ) if $@; - } - &{$subst{$_[1]}}($$ref); - } else { - # Encode control chars, high bit chars and '<', '&', '>', ''' and '"' - $$ref =~ s/([^\n\r\t !\#\$%\(-;=?-~])/$char2entity{$1} || num_entity($1)/ge; - } - $$ref; -} - -sub encode_entities_numeric { - local %char2entity; - return &encode_entities; # a goto &encode_entities wouldn't work -} - - -sub num_entity { - sprintf "&#x%X;", ord($_[0]); -} - -# Set up aliases -*encode = \&encode_entities; -*encode_numeric = \&encode_entities_numeric; -*encode_numerically = \&encode_entities_numeric; -*decode = \&decode_entities; - -1; diff --git a/ext/HTML/Parser/lib/HTML/Filter.pm b/ext/HTML/Parser/lib/HTML/Filter.pm deleted file mode 100644 index 21fafac621..0000000000 --- a/ext/HTML/Parser/lib/HTML/Filter.pm +++ /dev/null @@ -1,112 +0,0 @@ -package HTML::Filter; - -use strict; -use vars qw(@ISA $VERSION); - -require HTML::Parser; -@ISA=qw(HTML::Parser); - -$VERSION = sprintf("%d.%02d", q$Revision: 2.11 $ =~ /(\d+)\.(\d+)/); - -sub declaration { $_[0]->output("<!$_[1]>") } -sub process { $_[0]->output($_[2]) } -sub comment { $_[0]->output("<!--$_[1]-->") } -sub start { $_[0]->output($_[4]) } -sub end { $_[0]->output($_[2]) } -sub text { $_[0]->output($_[1]) } - -sub output { print $_[1] } - -1; - -__END__ - -=head1 NAME - -HTML::Filter - Filter HTML text through the parser - -=head1 NOTE - -B<This module is deprecated.> The C<HTML::Parser> now provides the -functionally of C<HTML::Filter> much more efficiently with the the -C<default> handler. - -=head1 SYNOPSIS - - require HTML::Filter; - $p = HTML::Filter->new->parse_file("index.html"); - -=head1 DESCRIPTION - -C<HTML::Filter> is an HTML parser that by default prints the -original text of each HTML element (a slow version of cat(1) basically). -The callback methods may be overridden to modify the filtering for some -HTML elements and you can override output() method which is called to -print the HTML text. - -C<HTML::Filter> is a subclass of C<HTML::Parser>. This means that -the document should be given to the parser by calling the $p->parse() -or $p->parse_file() methods. - -=head1 EXAMPLES - -The first example is a filter that will remove all comments from an -HTML file. This is achieved by simply overriding the comment method -to do nothing. - - package CommentStripper; - require HTML::Filter; - @ISA=qw(HTML::Filter); - sub comment { } # ignore comments - -The second example shows a filter that will remove any E<lt>TABLE>s -found in the HTML file. We specialize the start() and end() methods -to count table tags and then make output not happen when inside a -table. - - package TableStripper; - require HTML::Filter; - @ISA=qw(HTML::Filter); - sub start - { - my $self = shift; - $self->{table_seen}++ if $_[0] eq "table"; - $self->SUPER::start(@_); - } - - sub end - { - my $self = shift; - $self->SUPER::end(@_); - $self->{table_seen}-- if $_[0] eq "table"; - } - - sub output - { - my $self = shift; - unless ($self->{table_seen}) { - $self->SUPER::output(@_); - } - } - -If you want to collect the parsed text internally you might want to do -something like this: - - package FilterIntoString; - require HTML::Filter; - @ISA=qw(HTML::Filter); - sub output { push(@{$_[0]->{fhtml}}, $_[1]) } - sub filtered_html { join("", @{$_[0]->{fhtml}}) } - -=head1 SEE ALSO - -L<HTML::Parser> - -=head1 COPYRIGHT - -Copyright 1997-1999 Gisle Aas. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut diff --git a/ext/HTML/Parser/lib/HTML/HeadParser.pm b/ext/HTML/Parser/lib/HTML/HeadParser.pm deleted file mode 100644 index a8974f832b..0000000000 --- a/ext/HTML/Parser/lib/HTML/HeadParser.pm +++ /dev/null @@ -1,259 +0,0 @@ -package HTML::HeadParser; - -=head1 NAME - -HTML::HeadParser - Parse <HEAD> section of a HTML document - -=head1 SYNOPSIS - - require HTML::HeadParser; - $p = HTML::HeadParser->new; - $p->parse($text) and print "not finished"; - - $p->header('Title') # to access <title>....</title> - $p->header('Content-Base') # to access <base href="http://..."> - $p->header('Foo') # to access <meta http-equiv="Foo" content="..."> - -=head1 DESCRIPTION - -The C<HTML::HeadParser> is a specialized (and lightweight) -C<HTML::Parser> that will only parse the E<lt>HEAD>...E<lt>/HEAD> -section of an HTML document. The parse() method -will return a FALSE value as soon as some E<lt>BODY> element or body -text are found, and should not be called again after this. - -Note that the C<HTML::HeadParser> might get confused if raw undecoded -UTF-8 is passed to the parse() method. Make sure the strings are -properly decoded before passing them on. - -The C<HTML::HeadParser> keeps a reference to a header object, and the -parser will update this header object as the various elements of the -E<lt>HEAD> section of the HTML document are recognized. The following -header fields are affected: - -=over 4 - -=item Content-Base: - -The I<Content-Base> header is initialized from the E<lt>base -href="..."> element. - -=item Title: - -The I<Title> header is initialized from the E<lt>title>...E<lt>/title> -element. - -=item Isindex: - -The I<Isindex> header will be added if there is a E<lt>isindex> -element in the E<lt>head>. The header value is initialized from the -I<prompt> attribute if it is present. If no I<prompt> attribute is -given it will have '?' as the value. - -=item X-Meta-Foo: - -All E<lt>meta> elements will initialize headers with the prefix -"C<X-Meta->" on the name. If the E<lt>meta> element contains a -C<http-equiv> attribute, then it will be honored as the header name. - -=back - -=head1 METHODS - -The following methods (in addition to those provided by the -superclass) are available: - -=over 4 - -=cut - - -require HTML::Parser; -@ISA = qw(HTML::Parser); - -use HTML::Entities (); - -use strict; -use vars qw($VERSION $DEBUG); -#$DEBUG = 1; -$VERSION = sprintf("%d.%02d", q$Revision: 2.22 $ =~ /(\d+)\.(\d+)/); - -=item $hp = HTML::HeadParser->new - -=item $hp = HTML::HeadParser->new( $header ) - -The object constructor. The optional $header argument should be a -reference to an object that implement the header() and push_header() -methods as defined by the C<HTTP::Headers> class. Normally it will be -of some class that isa or delegates to the C<HTTP::Headers> class. - -If no $header is given C<HTML::HeadParser> will create an -C<HTTP::Header> object by itself (initially empty). - -=cut - -sub new -{ - my($class, $header) = @_; - unless ($header) { - require HTTP::Headers; - $header = HTTP::Headers->new; - } - - my $self = $class->SUPER::new(api_version => 2, - ignore_elements => [qw(script style)], - ); - $self->{'header'} = $header; - $self->{'tag'} = ''; # name of active element that takes textual content - $self->{'text'} = ''; # the accumulated text associated with the element - $self; -} - -=item $hp->header; - -Returns a reference to the header object. - -=item $hp->header( $key ) - -Returns a header value. It is just a shorter way to write -C<$hp-E<gt>header-E<gt>header($key)>. - -=cut - -sub header -{ - my $self = shift; - return $self->{'header'} unless @_; - $self->{'header'}->header(@_); -} - -sub as_string # legacy -{ - my $self = shift; - $self->{'header'}->as_string; -} - -sub flush_text # internal -{ - my $self = shift; - my $tag = $self->{'tag'}; - my $text = $self->{'text'}; - $text =~ s/^\s+//; - $text =~ s/\s+$//; - $text =~ s/\s+/ /g; - print "FLUSH $tag => '$text'\n" if $DEBUG; - if ($tag eq 'title') { - HTML::Entities::decode($text); - $self->{'header'}->push_header(Title => $text); - } - $self->{'tag'} = $self->{'text'} = ''; -} - -# This is an quote from the HTML3.2 DTD which shows which elements -# that might be present in a <HEAD>...</HEAD>. Also note that the -# <HEAD> tags themselves might be missing: -# -# <!ENTITY % head.content "TITLE & ISINDEX? & BASE? & STYLE? & -# SCRIPT* & META* & LINK*"> -# -# <!ELEMENT HEAD O O (%head.content)> - - -sub start -{ - my($self, $tag, $attr) = @_; # $attr is reference to a HASH - print "START[$tag]\n" if $DEBUG; - $self->flush_text if $self->{'tag'}; - if ($tag eq 'meta') { - my $key = $attr->{'http-equiv'}; - if (!defined($key) || !length($key)) { - return unless $attr->{'name'}; - $key = "X-Meta-\u$attr->{'name'}"; - } - $self->{'header'}->push_header($key => $attr->{content}); - } elsif ($tag eq 'base') { - return unless exists $attr->{href}; - $self->{'header'}->push_header('Content-Base' => $attr->{href}); - } elsif ($tag eq 'isindex') { - # This is a non-standard header. Perhaps we should just ignore - # this element - $self->{'header'}->push_header(Isindex => $attr->{prompt} || '?'); - } elsif ($tag =~ /^(?:title|script|style)$/) { - # Just remember tag. Initialize header when we see the end tag. - $self->{'tag'} = $tag; - } elsif ($tag eq 'link') { - return unless exists $attr->{href}; - # <link href="http:..." rel="xxx" rev="xxx" title="xxx"> - my $h_val = "<" . delete($attr->{href}) . ">"; - for (sort keys %{$attr}) { - $h_val .= qq(; $_="$attr->{$_}"); - } - $self->{'header'}->push_header(Link => $h_val); - } elsif ($tag eq 'head' || $tag eq 'html') { - # ignore - } else { - # stop parsing - $self->eof; - } -} - -sub end -{ - my($self, $tag) = @_; - print "END[$tag]\n" if $DEBUG; - $self->flush_text if $self->{'tag'}; - $self->eof if $tag eq 'head'; -} - -sub text -{ - my($self, $text) = @_; - $text =~ s/\x{FEFF}//; # drop Unicode BOM if found - print "TEXT[$text]\n" if $DEBUG; - my $tag = $self->{tag}; - if (!$tag && $text =~ /\S/) { - # Normal text means start of body - $self->eof; - return; - } - return if $tag ne 'title'; - $self->{'text'} .= $text; -} - -1; - -__END__ - -=back - -=head1 EXAMPLE - - $h = HTTP::Headers->new; - $p = HTML::HeadParser->new($h); - $p->parse(<<EOT); - <title>Stupid example</title> - <base href="http://www.linpro.no/lwp/"> - Normal text starts here. - EOT - undef $p; - print $h->title; # should print "Stupid example" - -=head1 SEE ALSO - -L<HTML::Parser>, L<HTTP::Headers> - -The C<HTTP::Headers> class is distributed as part of the -I<libwww-perl> package. If you don't have that distribution installed -you need to provide the $header argument to the C<HTML::HeadParser> -constructor with your own object that implements the documented -protocol. - -=head1 COPYRIGHT - -Copyright 1996-2001 Gisle Aas. All rights reserved. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut - diff --git a/ext/HTML/Parser/lib/HTML/LinkExtor.pm b/ext/HTML/Parser/lib/HTML/LinkExtor.pm deleted file mode 100644 index d543a5aba7..0000000000 --- a/ext/HTML/Parser/lib/HTML/LinkExtor.pm +++ /dev/null @@ -1,187 +0,0 @@ -package HTML::LinkExtor; - -# $Id: LinkExtor.pm,v 1.33 2003/10/10 10:20:56 gisle Exp $ - -require HTML::Parser; -@ISA = qw(HTML::Parser); -$VERSION = sprintf("%d.%02d", q$Revision: 1.33 $ =~ /(\d+)\.(\d+)/); - -=head1 NAME - -HTML::LinkExtor - Extract links from an HTML document - -=head1 SYNOPSIS - - require HTML::LinkExtor; - $p = HTML::LinkExtor->new(\&cb, "http://www.perl.org/"); - sub cb { - my($tag, %links) = @_; - print "$tag @{[%links]}\n"; - } - $p->parse_file("index.html"); - -=head1 DESCRIPTION - -I<HTML::LinkExtor> is an HTML parser that extracts links from an -HTML document. The I<HTML::LinkExtor> is a subclass of -I<HTML::Parser>. This means that the document should be given to the -parser by calling the $p->parse() or $p->parse_file() methods. - -=cut - -use strict; -use HTML::Tagset (); - -# legacy (some applications grabs this hash directly) -use vars qw(%LINK_ELEMENT); -*LINK_ELEMENT = \%HTML::Tagset::linkElements; - -=over 4 - -=item $p = HTML::LinkExtor->new - -=item $p = HTML::LinkExtor->new( $callback ) - -=item $p = HTML::LinkExtor->new( $callback, $base ) - -The constructor takes two optional arguments. The first is a reference -to a callback routine. It will be called as links are found. If a -callback is not provided, then links are just accumulated internally -and can be retrieved by calling the $p->links() method. - -The $base argument is an optional base URL used to absolutize all URLs found. -You need to have the I<URI> module installed if you provide $base. - -The callback is called with the lowercase tag name as first argument, -and then all link attributes as separate key/value pairs. All -non-link attributes are removed. - -=cut - -sub new -{ - my($class, $cb, $base) = @_; - my $self = $class->SUPER::new( - start_h => ["_start_tag", "self,tagname,attr"], - report_tags => [keys %HTML::Tagset::linkElements], - ); - $self->{extractlink_cb} = $cb; - if ($base) { - require URI; - $self->{extractlink_base} = URI->new($base); - } - $self; -} - -sub _start_tag -{ - my($self, $tag, $attr) = @_; - - my $base = $self->{extractlink_base}; - my $links = $HTML::Tagset::linkElements{$tag}; - $links = [$links] unless ref $links; - - my @links; - my $a; - for $a (@$links) { - next unless exists $attr->{$a}; - push(@links, $a, $base ? URI->new($attr->{$a}, $base)->abs($base) - : $attr->{$a}); - } - return unless @links; - $self->_found_link($tag, @links); -} - -sub _found_link -{ - my $self = shift; - my $cb = $self->{extractlink_cb}; - if ($cb) { - &$cb(@_); - } else { - push(@{$self->{'links'}}, [@_]); - } -} - -=item $p->links - -Returns a list of all links found in the document. The returned -values will be anonymous arrays with the follwing elements: - - [$tag, $attr => $url1, $attr2 => $url2,...] - -The $p->links method will also truncate the internal link list. This -means that if the method is called twice without any parsing -between them the second call will return an empty list. - -Also note that $p->links will always be empty if a callback routine -was provided when the I<HTML::LinkExtor> was created. - -=cut - -sub links -{ - my $self = shift; - exists($self->{'links'}) ? @{delete $self->{'links'}} : (); -} - -# We override the parse_file() method so that we can clear the links -# before we start a new file. -sub parse_file -{ - my $self = shift; - delete $self->{'links'}; - $self->SUPER::parse_file(@_); -} - -=back - -=head1 EXAMPLE - -This is an example showing how you can extract links from a document -received using LWP: - - use LWP::UserAgent; - use HTML::LinkExtor; - use URI::URL; - - $url = "http://www.perl.org/"; # for instance - $ua = LWP::UserAgent->new; - - # Set up a callback that collect image links - my @imgs = (); - sub callback { - my($tag, %attr) = @_; - return if $tag ne 'img'; # we only look closer at <img ...> - push(@imgs, values %attr); - } - - # Make the parser. Unfortunately, we don't know the base yet - # (it might be diffent from $url) - $p = HTML::LinkExtor->new(\&callback); - - # Request document and parse it as it arrives - $res = $ua->request(HTTP::Request->new(GET => $url), - sub {$p->parse($_[0])}); - - # Expand all image URLs to absolute ones - my $base = $res->base; - @imgs = map { $_ = url($_, $base)->abs; } @imgs; - - # Print them out - print join("\n", @imgs), "\n"; - -=head1 SEE ALSO - -L<HTML::Parser>, L<HTML::Tagset>, L<LWP>, L<URI::URL> - -=head1 COPYRIGHT - -Copyright 1996-2001 Gisle Aas. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut - -1; diff --git a/ext/HTML/Parser/lib/HTML/PullParser.pm b/ext/HTML/Parser/lib/HTML/PullParser.pm deleted file mode 100644 index e851fe001d..0000000000 --- a/ext/HTML/Parser/lib/HTML/PullParser.pm +++ /dev/null @@ -1,211 +0,0 @@ -package HTML::PullParser; - -# $Id: PullParser.pm,v 2.9 2006/04/26 08:00:28 gisle Exp $ - -require HTML::Parser; -@ISA=qw(HTML::Parser); -$VERSION = sprintf("%d.%02d", q$Revision: 2.9 $ =~ /(\d+)\.(\d+)/); - -use strict; -use Carp (); - -sub new -{ - my($class, %cnf) = @_; - - # Construct argspecs for the various events - my %argspec; - for (qw(start end text declaration comment process default)) { - my $tmp = delete $cnf{$_}; - next unless defined $tmp; - $argspec{$_} = $tmp; - } - Carp::croak("Info not collected for any events") - unless %argspec; - - my $file = delete $cnf{file}; - my $doc = delete $cnf{doc}; - Carp::croak("Can't parse from both 'doc' and 'file' at the same time") - if defined($file) && defined($doc); - Carp::croak("No 'doc' or 'file' given to parse from") - unless defined($file) || defined($doc); - - # Create object - $cnf{api_version} = 3; - my $self = $class->SUPER::new(%cnf); - - my $accum = $self->{pullparser_accum} = []; - while (my($event, $argspec) = each %argspec) { - $self->SUPER::handler($event => $accum, $argspec); - } - - if (defined $doc) { - $self->{pullparser_str_ref} = ref($doc) ? $doc : \$doc; - $self->{pullparser_str_pos} = 0; - } - else { - if (!ref($file) && ref(\$file) ne "GLOB") { - require IO::File; - $file = IO::File->new($file, "r") || return; - } - - $self->{pullparser_file} = $file; - } - $self; -} - - -sub handler -{ - Carp::croak("Can't set handlers for HTML::PullParser"); -} - - -sub get_token -{ - my $self = shift; - while (!@{$self->{pullparser_accum}} && !$self->{pullparser_eof}) { - if (my $f = $self->{pullparser_file}) { - # must try to parse more from the file - my $buf; - if (read($f, $buf, 512)) { - $self->parse($buf); - } else { - $self->eof; - $self->{pullparser_eof}++; - delete $self->{pullparser_file}; - } - } - elsif (my $sref = $self->{pullparser_str_ref}) { - # must try to parse more from the scalar - my $pos = $self->{pullparser_str_pos}; - my $chunk = substr($$sref, $pos, 512); - $self->parse($chunk); - $pos += length($chunk); - if ($pos < length($$sref)) { - $self->{pullparser_str_pos} = $pos; - } - else { - $self->eof; - $self->{pullparser_eof}++; - delete $self->{pullparser_str_ref}; - delete $self->{pullparser_str_pos}; - } - } - else { - die; - } - } - shift @{$self->{pullparser_accum}}; -} - - -sub unget_token -{ - my $self = shift; - unshift @{$self->{pullparser_accum}}, @_; - $self; -} - -1; - - -__END__ - -=head1 NAME - -HTML::PullParser - Alternative HTML::Parser interface - -=head1 SYNOPSIS - - use HTML::PullParser; - - $p = HTML::PullParser->new(file => "index.html", - start => 'event, tagname, @attr', - end => 'event, tagname', - ignore_elements => [qw(script style)], - ) || die "Can't open: $!"; - while (my $token = $p->get_token) { - #...do something with $token - } - -=head1 DESCRIPTION - -The HTML::PullParser is an alternative interface to the HTML::Parser class. -It basically turns the HTML::Parser inside out. You associate a file -(or any IO::Handle object or string) with the parser at construction time and -then repeatedly call $parser->get_token to obtain the tags and text -found in the parsed document. - -The following methods are provided: - -=over 4 - -=item $p = HTML::PullParser->new( file => $file, %options ) - -=item $p = HTML::PullParser->new( doc => \$doc, %options ) - -A C<HTML::PullParser> can be made to parse from either a file or a -literal document based on whether the C<file> or C<doc> option is -passed to the parser's constructor. - -The C<file> passed in can either be a file name or a file handle -object. If a file name is passed, and it can't be opened for reading, -then the constructor will return an undefined value and $! will tell -you why it failed. Otherwise the argument is taken to be some object -that the C<HTML::PullParser> can read() from when it needs more data. -The stream will be read() until EOF, but not closed. - -A C<doc> can be passed plain or as a reference -to a scalar. If a reference is passed then the value of this scalar -should not be changed before all tokens have been extracted. - -Next the information to be returned for the different token types must -be set up. This is done by simply associating an argspec (as defined -in L<HTML::Parser>) with the events you have an interest in. For -instance, if you want C<start> tokens to be reported as the string -C<'S'> followed by the tagname and the attributes you might pass an -C<start>-option like this: - - $p = HTML::PullParser->new( - doc => $document_to_parse, - start => '"S", tagname, @attr', - end => '"E", tagname', - ); - -At last other C<HTML::Parser> options, like C<ignore_tags>, and -C<unbroken_text>, can be passed in. Note that you should not use the -I<event>_h options to set up parser handlers. That would confuse the -inner logic of C<HTML::PullParser>. - -=item $token = $p->get_token - -This method will return the next I<token> found in the HTML document, -or C<undef> at the end of the document. The token is returned as an -array reference. The content of this array match the argspec set up -during C<HTML::PullParser> construction. - -=item $p->unget_token( @tokens ) - -If you find out you have read too many tokens you can push them back, -so that they are returned again the next time $p->get_token is called. - -=back - -=head1 EXAMPLES - -The 'eg/hform' script shows how we might parse the form section of -HTML::Documents using HTML::PullParser. - -=head1 SEE ALSO - -L<HTML::Parser>, L<HTML::TokeParser> - -=head1 COPYRIGHT - -Copyright 1998-2001 Gisle Aas. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut diff --git a/ext/HTML/Parser/lib/HTML/TokeParser.pm b/ext/HTML/Parser/lib/HTML/TokeParser.pm deleted file mode 100644 index a1b8837cb4..0000000000 --- a/ext/HTML/Parser/lib/HTML/TokeParser.pm +++ /dev/null @@ -1,371 +0,0 @@ -package HTML::TokeParser; - -# $Id: TokeParser.pm,v 2.37 2006/04/26 08:00:28 gisle Exp $ - -require HTML::PullParser; -@ISA=qw(HTML::PullParser); -$VERSION = sprintf("%d.%02d", q$Revision: 2.37 $ =~ /(\d+)\.(\d+)/); - -use strict; -use Carp (); -use HTML::Entities qw(decode_entities); -use HTML::Tagset (); - -my %ARGS = -( - start => "'S',tagname,attr,attrseq,text", - end => "'E',tagname,text", - text => "'T',text,is_cdata", - process => "'PI',token0,text", - comment => "'C',text", - declaration => "'D',text", - - # options that default on - unbroken_text => 1, -); - - -sub new -{ - my $class = shift; - my %cnf; - if (@_ == 1) { - my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file"; - %cnf = ($type => $_[0]); - } - else { - %cnf = @_; - } - - my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"}; - - my $self = $class->SUPER::new(%cnf, %ARGS) || return undef; - - $self->{textify} = $textify; - $self; -} - - -sub get_tag -{ - my $self = shift; - my $token; - while (1) { - $token = $self->get_token || return undef; - my $type = shift @$token; - next unless $type eq "S" || $type eq "E"; - substr($token->[0], 0, 0) = "/" if $type eq "E"; - return $token unless @_; - for (@_) { - return $token if $token->[0] eq $_; - } - } -} - - -sub _textify { - my($self, $token) = @_; - my $tag = $token->[1]; - return undef unless exists $self->{textify}{$tag}; - - my $alt = $self->{textify}{$tag}; - my $text; - if (ref($alt)) { - $text = &$alt(@$token); - } else { - $text = $token->[2]{$alt || "alt"}; - $text = "[\U$tag]" unless defined $text; - } - return $text; -} - - -sub get_text -{ - my $self = shift; - my @text; - while (my $token = $self->get_token) { - my $type = $token->[0]; - if ($type eq "T") { - my $text = $token->[1]; - decode_entities($text) unless $token->[2]; - push(@text, $text); - } elsif ($type =~ /^[SE]$/) { - my $tag = $token->[1]; - if ($type eq "S") { - if (defined(my $text = _textify($self, $token))) { - push(@text, $text); - next; - } - } else { - $tag = "/$tag"; - } - if (!@_ || grep $_ eq $tag, @_) { - $self->unget_token($token); - last; - } - push(@text, " ") - if $tag eq "br" || !$HTML::Tagset::isPhraseMarkup{$token->[1]}; - } - } - join("", @text); -} - - -sub get_trimmed_text -{ - my $self = shift; - my $text = $self->get_text(@_); - $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; - $text; -} - -sub get_phrase { - my $self = shift; - my @text; - while (my $token = $self->get_token) { - my $type = $token->[0]; - if ($type eq "T") { - my $text = $token->[1]; - decode_entities($text) unless $token->[2]; - push(@text, $text); - } elsif ($type =~ /^[SE]$/) { - my $tag = $token->[1]; - if ($type eq "S") { - if (defined(my $text = _textify($self, $token))) { - push(@text, $text); - next; - } - } - if (!$HTML::Tagset::isPhraseMarkup{$tag}) { - $self->unget_token($token); - last; - } - push(@text, " ") if $tag eq "br"; - } - } - my $text = join("", @text); - $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g; - $text; -} - -1; - - -__END__ - -=head1 NAME - -HTML::TokeParser - Alternative HTML::Parser interface - -=head1 SYNOPSIS - - require HTML::TokeParser; - $p = HTML::TokeParser->new("index.html") || - die "Can't open: $!"; - $p->empty_element_tags(1); # configure its behaviour - - while (my $token = $p->get_token) { - #... - } - -=head1 DESCRIPTION - -The C<HTML::TokeParser> is an alternative interface to the -C<HTML::Parser> class. It is an C<HTML::PullParser> subclass with a -predeclared set of token types. If you wish the tokens to be reported -differently you probably want to use the C<HTML::PullParser> directly. - -The following methods are available: - -=over 4 - -=item $p = HTML::TokeParser->new( $filename, %opt ); - -=item $p = HTML::TokeParser->new( $filehandle, %opt ); - -=item $p = HTML::TokeParser->new( \$document, %opt ); - -The object constructor argument is either a file name, a file handle -object, or the complete document to be parsed. Extra options can be -provided as key/value pairs and are processed as documented by the base -classes. - -If the argument is a plain scalar, then it is taken as the name of a -file to be opened and parsed. If the file can't be opened for -reading, then the constructor will return C<undef> and $! will tell -you why it failed. - -If the argument is a reference to a plain scalar, then this scalar is -taken to be the literal document to parse. The value of this -scalar should not be changed before all tokens have been extracted. - -Otherwise the argument is taken to be some object that the -C<HTML::TokeParser> can read() from when it needs more data. Typically -it will be a filehandle of some kind. The stream will be read() until -EOF, but not closed. - -A newly constructed C<HTML::TokeParser> differ from its base classes -by having the C<unbroken_text> attribute enabled by default. See -L<HTML::Parser> for a description of this and other attributes that -influence how the document is parsed. It is often a good idea to enable -C<empty_element_tags> behaviour. - -Note that the parsing result will likely not be valid if raw undecoded -UTF-8 is used as a source. When parsing UTF-8 encoded files turn -on UTF-8 decoding: - - open(my $fh, "<:utf8", "index.html") || die "Can't open 'index.html': $!"; - my $p = HTML::TokeParser->new( $fh ); - # ... - -If a $filename is passed to the constructor the file will be opened in -raw mode and the parsing result will only be valid if its content is -Latin-1 or pure ASCII. - -If parsing from an UTF-8 encoded string buffer decode it first: - - utf8::decode($document); - my $p = HTML::TokeParser->new( \$document ); - # ... - -=item $p->get_token - -This method will return the next I<token> found in the HTML document, -or C<undef> at the end of the document. The token is returned as an -array reference. The first element of the array will be a string -denoting the type of this token: "S" for start tag, "E" for end tag, -"T" for text, "C" for comment, "D" for declaration, and "PI" for -process instructions. The rest of the token array depend on the type -like this: - - ["S", $tag, $attr, $attrseq, $text] - ["E", $tag, $text] - ["T", $text, $is_data] - ["C", $text] - ["D", $text] - ["PI", $token0, $text] - -where $attr is a hash reference, $attrseq is an array reference and -the rest are plain scalars. The L<HTML::Parser/Argspec> explains the -details. - -=item $p->unget_token( @tokens ) - -If you find you have read too many tokens you can push them back, -so that they are returned the next time $p->get_token is called. - -=item $p->get_tag - -=item $p->get_tag( @tags ) - -This method returns the next start or end tag (skipping any other -tokens), or C<undef> if there are no more tags in the document. If -one or more arguments are given, then we skip tokens until one of the -specified tag types is found. For example: - - $p->get_tag("font", "/font"); - -will find the next start or end tag for a font-element. - -The tag information is returned as an array reference in the same form -as for $p->get_token above, but the type code (first element) is -missing. A start tag will be returned like this: - - [$tag, $attr, $attrseq, $text] - -The tagname of end tags are prefixed with "/", i.e. end tag is -returned like this: - - ["/$tag", $text] - -=item $p->get_text - -=item $p->get_text( @endtags ) - -This method returns all text found at the current position. It will -return a zero length string if the next token is not text. Any -entities will be converted to their corresponding character. - -If one or more arguments are given, then we return all text occurring -before the first of the specified tags found. For example: - - $p->get_text("p", "br"); - -will return the text up to either a paragraph of linebreak element. - -The text might span tags that should be I<textified>. This is -controlled by the $p->{textify} attribute, which is a hash that -defines how certain tags can be treated as text. If the name of a -start tag matches a key in this hash then this tag is converted to -text. The hash value is used to specify which tag attribute to obtain -the text from. If this tag attribute is missing, then the upper case -name of the tag enclosed in brackets is returned, e.g. "[IMG]". The -hash value can also be a subroutine reference. In this case the -routine is called with the start tag token content as its argument and -the return value is treated as the text. - -The default $p->{textify} value is: - - {img => "alt", applet => "alt"} - -This means that <IMG> and <APPLET> tags are treated as text, and that -the text to substitute can be found in the ALT attribute. - -=item $p->get_trimmed_text - -=item $p->get_trimmed_text( @endtags ) - -Same as $p->get_text above, but will collapse any sequences of white -space to a single space character. Leading and trailing white space is -removed. - -=item $p->get_phrase - -This will return all text found at the current position ignoring any -phrasal-level tags. Text is extracted until the first non -phrasal-level tag. Textification of tags is the same as for -get_text(). This method will collapse white space in the same way as -get_trimmed_text() does. - -The definition of <i>phrasal-level tags</i> is obtained from the -HTML::Tagset module. - -=back - -=head1 EXAMPLES - -This example extracts all links from a document. It will print one -line for each link, containing the URL and the textual description -between the <A>...</A> tags: - - use HTML::TokeParser; - $p = HTML::TokeParser->new(shift||"index.html"); - - while (my $token = $p->get_tag("a")) { - my $url = $token->[1]{href} || "-"; - my $text = $p->get_trimmed_text("/a"); - print "$url\t$text\n"; - } - -This example extract the <TITLE> from the document: - - use HTML::TokeParser; - $p = HTML::TokeParser->new(shift||"index.html"); - if ($p->get_tag("title")) { - my $title = $p->get_trimmed_text; - print "Title: $title\n"; - } - -=head1 SEE ALSO - -L<HTML::PullParser>, L<HTML::Parser> - -=head1 COPYRIGHT - -Copyright 1998-2005 Gisle Aas. - -This library is free software; you can redistribute it and/or -modify it under the same terms as Perl itself. - -=cut diff --git a/ext/HTML/Parser/mkhctype b/ext/HTML/Parser/mkhctype deleted file mode 100755 index eeae40df67..0000000000 --- a/ext/HTML/Parser/mkhctype +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/perl - -($progname = $0) =~ s,.*/,,; - -print "/* This file is autogenerated by $progname */\n"; - -print <<'EOT'; - -#define HCTYPE_SPACE 0x01 -#define HCTYPE_NAME_FIRST 0x02 -#define HCTYPE_NAME_CHAR 0x04 -#define HCTYPE_NOT_SPACE_GT 0x08 -#define HCTYPE_NOT_SPACE_EQ_GT 0x10 -#define HCTYPE_NOT_SPACE_SLASH_GT 0x20 -#define HCTYPE_NOT_SPACE_EQ_SLASH_GT 0x40 - -#define HCTYPE(c) hctype[(unsigned char)(c)] -#define isHCTYPE(c, w) (HCTYPE(c) & (w)) - -#define isHSPACE(c) isHCTYPE(c, HCTYPE_SPACE) -#define isHNAME_FIRST(c) isHCTYPE(c, HCTYPE_NAME_FIRST) -#define isHNAME_CHAR(c) isHCTYPE(c, HCTYPE_NAME_CHAR) -#define isHNOT_SPACE_GT(c) isHCTYPE(c, HCTYPE_NOT_SPACE_GT) - -typedef unsigned char hctype_t; - -EOT - -print "static hctype_t hctype[] = {\n"; - -for my $c (0 .. 255) { - print " " unless $c % 8; - - local $_ = chr($c); - my $v = 0; - if (/^\s$/) { # isSPACE - $v |= 0x1 - } - elsif ($_ ne ">") { - $v |= 0x08; - $v |= 0x10 if $_ ne "="; - $v |= 0x20 if $_ ne "/"; - $v |= 0x40 if $_ ne "="; - } - - if (/^[\w.\-:]$/) { - $v |= 0x4; - $v |= 0x2 unless /^[\d.-]$/; # XML allow /[:_]/ as first char - } - - printf "0x%02x, ", $v; - unless (($c+1) % 8) { - printf " /* %3d - %3d */\n", $c - 7, $c; - } -} -print "};\n"; - diff --git a/ext/HTML/Parser/mkpfunc b/ext/HTML/Parser/mkpfunc deleted file mode 100755 index 810bc1fb1c..0000000000 --- a/ext/HTML/Parser/mkpfunc +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/perl - -($progname = $0) =~ s,.*/,,; - -print "/* This file is autogenerated by $progname */\n"; - -print "typedef char*(*PFUNC)(PSTATE*, char *beg, char *end, U32 utf8, SV* self);\n"; -print "static PFUNC parsefunc[] = {\n"; - -for my $c (0..255) { - local $_ = chr($c); - my $func = "null"; - if (/^[A-Za-z]$/) { - $func = "start"; - } - elsif ($_ eq "/") { - $func = "end"; - } - elsif ($_ eq "!") { - $func = "decl"; - } - elsif ($_ eq "?") { - $func = "process"; - } - printf " %-15s /* %3d */\n", "parse_$func,", $c; -} - -print "};\n"; diff --git a/ext/HTML/Parser/t/api_version.t b/ext/HTML/Parser/t/api_version.t deleted file mode 100644 index 98031216c4..0000000000 --- a/ext/HTML/Parser/t/api_version.t +++ /dev/null @@ -1,22 +0,0 @@ -use Test::More tests => 4; - -use strict; -use HTML::Parser (); - -my $p = HTML::Parser->new(api_version => 3); - -ok(!$p->handler("start"), "API version 3"); - -my $failed; -eval { - my $p = HTML::Parser->new(api_version => 4); - $failed++; -}; -like($@, qr/^API version 4 not supported/); -ok(!$failed, "API version 4"); - -$p = HTML::Parser->new(api_version => 2); - -is($p->handler("start"), "start", "API version 2"); - - diff --git a/ext/HTML/Parser/t/argspec-bad.t b/ext/HTML/Parser/t/argspec-bad.t deleted file mode 100644 index 8c0b19975b..0000000000 --- a/ext/HTML/Parser/t/argspec-bad.t +++ /dev/null @@ -1,40 +0,0 @@ -use Test::More tests => 6; - -use strict; -use HTML::Parser (); - -my $p = HTML::Parser->new(api_version => 3); - -eval { - $p->handler(end => "end", q(xyzzy)); -}; -like($@, qr/^Unrecognized identifier xyzzy in argspec/); - - -eval { - $p->handler(end => "end", q(tagname text)); -}; -like($@, qr/^Missing comma separator in argspec/); - - -eval { - $p->handler(end => "end", q(tagname, "text)); -}; -like($@, qr/^Unterminated literal string in argspec/); - - -eval { - $p->handler(end => "end", q(tagname, "t\\t")); -}; -like($@, qr/^Backslash reserved for literal string in argspec/); - -eval { - $p->handler(end => "end", '"' . ("x" x 256) . '"'); -}; -like($@, qr/^Literal string is longer than 255 chars in argspec/); - -$p->handler(end => sub { is(length(shift), 255) }, - '"' . ("x" x 255) . '"'); -$p->parse("</x>"); - - diff --git a/ext/HTML/Parser/t/argspec.t b/ext/HTML/Parser/t/argspec.t deleted file mode 100644 index e8aa7a5c4a..0000000000 --- a/ext/HTML/Parser/t/argspec.t +++ /dev/null @@ -1,148 +0,0 @@ - -use strict; -require HTML::Parser; - -my $decl = '<!ENTITY nbsp CDATA " " -- no-break space -->'; -my $com1 = '<!-- Comment -->'; -my $com2 = '<!-- Comment -- -- Comment -->'; -my $start = '<a href="foo">'; -my $end = '</a>'; -my $empty = "<IMG SRC='foo'/>"; -my $proc = '<? something completely different ?>'; - -my @argspec = qw( self offset length - event tagname tag token0 - text - is_cdata dtext - tokens - tokenpos - attr - attrseq ); - -my @result = (); -my $p = HTML::Parser -> new(default_h => [\@result, join(',', @argspec)], - strict_comment => 1, xml_mode => 1); - -my @tests = - ( # string, expected results - $decl => [[$p, 0, 52, 'declaration', 'ENTITY', '!ENTITY', 'ENTITY', - '<!ENTITY nbsp CDATA " " -- no-break space -->', - undef, undef, - ['ENTITY', 'nbsp', 'CDATA', '" "', '-- no-break space --'], - [2, 6, 9, 4, 16, 5, 22, 8, 31, 20], - undef, undef ]], - $com1 => [[$p, 0, 16, 'comment', ' Comment ', '# Comment ', ' Comment ', - '<!-- Comment -->', - undef, undef, - [' Comment '], - [4, 9], - undef, undef ]], - $com2 => [[$p, 0, 30, 'comment', ' Comment ', '# Comment ', ' Comment ', - '<!-- Comment -- -- Comment -->', - undef, undef, - [' Comment ', ' Comment '], - [4, 9, 18, 9], - undef, undef ]], - $start => [[$p, 0, 14, 'start', 'a', 'a', 'a', - '<a href="foo">', - undef, undef, - ['a', 'href', '"foo"'], - [1, 1, 3, 4, 8, 5], - {'href', 'foo'}, ['href'] ]], - $end => [[$p, 0, 4, 'end', 'a', '/a', 'a', - '</a>', - undef, undef, - ['a'], - [2, 1], - undef, undef ]], - $empty => [[$p, 0, 16, 'start', 'IMG', 'IMG', 'IMG', - "<IMG SRC='foo'/>", - undef, undef, - ['IMG', 'SRC', "'foo'"], - [1, 3, 5, 3, 9, 5], - {'SRC', 'foo'}, ['SRC'] ], - [$p, 16, 0, 'end', 'IMG', '/IMG', 'IMG', - '', - undef, undef, - ['IMG'], - undef, - undef, undef ], - ], - $proc => [[$p, 0, 36, 'process', ' something completely different ', - '? something completely different ', - ' something completely different ', - '<? something completely different ?>', - undef, undef, - [' something completely different '], - [2, 32], - undef, undef ]], - "$end\n$end" => [[$p, 0, 4, 'end', 'a', '/a', 'a', - '</a>', - undef, undef, - ['a'], - [2, 1], - undef, undef], - [$p, 4, 1, 'text', undef, undef, undef, - "\n", - '', "\n", - undef, - undef, - undef, undef], - [$p, 5, 4, 'end', 'a', '/a', 'a', - '</a>', - undef, undef, - ['a'], - [2, 1], - undef, undef ]], - ); - -use Test::More; -plan tests => @tests / 2; - -sub string_tag { - my (@pieces) = @_; - my $part; - foreach $part ( @pieces ) { - if (!defined $part) { - $part = 'undef'; - } - elsif (!ref $part) { - $part = "'$part'" if $part !~ /^\d+$/; - } - elsif ('ARRAY' eq ref $part ) { - $part = '[' . join(', ', string_tag(@$part)) . ']'; - } - elsif ('HASH' eq ref $part ) { - $part = '{' . join(',', string_tag(%$part)) . '}'; - } - else { - $part = '<' . ref($part) . '>'; - } - } - return join(", ", @pieces ); -} - -my $i = 0; -TEST: -while (@tests) { - my($html, $expected) = splice @tests, 0, 2; - ++$i; - - @result = (); - $p->parse($html)->eof; - - shift(@result) if $result[0][3] eq "start_document"; - pop(@result) if $result[-1][3] eq "end_document"; - - # Compare results for each element expected - foreach (@$expected) { - my $want = string_tag($_); - my $got = string_tag(shift @result); - if ($want ne $got) { - is($want, $got); - next TEST; - } - } - - pass; -} diff --git a/ext/HTML/Parser/t/argspec2.t b/ext/HTML/Parser/t/argspec2.t deleted file mode 100644 index 6f594b97b6..0000000000 --- a/ext/HTML/Parser/t/argspec2.t +++ /dev/null @@ -1,21 +0,0 @@ -use Test::More tests => 2; - -use strict; -use HTML::Parser; - -my @start; -my @text; - -my $p = HTML::Parser->new(api_version => 3); -$p->handler(start => \@start, '@{tagname, @attr}'); -$p->handler(text => \@text, '@{dtext}'); -$p->parse(<<EOT)->eof; -Hi -<a href="abc">Foo</a><b>:-)</b> -EOT - -is("@start", "a href abc b"); - -is(join("", @text), "Hi\nFoo:-)\n"); - - diff --git a/ext/HTML/Parser/t/attr-encoded.t b/ext/HTML/Parser/t/attr-encoded.t deleted file mode 100644 index 4d458eb76c..0000000000 --- a/ext/HTML/Parser/t/attr-encoded.t +++ /dev/null @@ -1,32 +0,0 @@ -use strict; -use Test::More tests => 2; - -use HTML::Parser (); -my $p = HTML::Parser->new(); -$p->attr_encoded(1); - -my $text = ""; -$p->handler(start => - sub { - my($tag, $attr) = @_; - $text .= "S[$tag"; - for my $k (sort keys %$attr) { - my $v = $attr->{$k}; - $text .= " $k=$v"; - } - $text .= "]"; - }, "tagname,attr"); - -my $html = <<'EOT'; -<tag arg="&<>"> -EOT - -$p->parse($html)->eof; - -is($text, 'S[tag arg=&<>]'); - -$text = ""; -$p->attr_encoded(0); -$p->parse($html)->eof; - -is($text, 'S[tag arg=&<>]'); diff --git a/ext/HTML/Parser/t/callback.t b/ext/HTML/Parser/t/callback.t deleted file mode 100644 index 7a456cfd82..0000000000 --- a/ext/HTML/Parser/t/callback.t +++ /dev/null @@ -1,49 +0,0 @@ -use Test::More tests => 47; - -use strict; -use HTML::Parser; - -my @expected; -my $p = HTML::Parser->new(api_version => 3, - unbroken_text => 1, - default_h => [\@expected, '@{event, text}'], - ); - -my $doc = <<'EOT'; -<title>Hi</title> -<h1>Ho ho</h1> -<--comment-> -EOT - -$p->parse($doc)->eof; -#use Data::Dump; Data::Dump::dump(@expected); - -for my $i (1..length($doc)) { - my @t; - $p->handler(default => \@t); - $p->parse(chunk($doc, $i)); - - # check that we got the same stuff - #diag "X:", join(":", @t); - #diag "Y:", join(":", @expected); - is(join(":", @t), join(":", @expected)); -} - -sub chunk { - my $str = shift; - my $size = shift || 1; - sub { - my $res = substr($str, 0, $size); - #diag "...$res"; - substr($str, 0, $size) = ""; - $res; - } -} - -# Test croking behaviour -$p->handler(default => []); - -eval { - $p->parse(sub { die "Hi" }); -}; -like($@, qr/^Hi/); diff --git a/ext/HTML/Parser/t/case-sensitive.t b/ext/HTML/Parser/t/case-sensitive.t deleted file mode 100644 index 565b20b3e9..0000000000 --- a/ext/HTML/Parser/t/case-sensitive.t +++ /dev/null @@ -1,85 +0,0 @@ -use strict; -use Test::More tests => 8; - -use HTML::Parser (); -my $p = HTML::Parser->new(); -$p->case_sensitive(1); - -my $text = ""; -$p->handler(start => - sub { - my($tag, $attr, $attrseq) = @_; - $text .= "S[$tag"; - for my $k (sort keys %$attr) { - my $v = $attr->{$k}; - $text .= " $k=$v"; - } - if (@$attrseq) { $text.=" Order:" ; } - for my $k (@$attrseq) { - $text .= " $k"; - } - $text .= "]"; - }, "tagname,attr,attrseq"); -$p->handler(end => - sub { - my ($tag) = @_; - $text .= "E[$tag]"; - }, "tagname"); - -my $html = <<'EOT'; -<tAg aRg="Value" arg="other value"></tAg> -EOT -my $cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]'; -my $ci = 'S[tag arg=Value Order: arg arg]E[tag]'; - -$p->parse($html)->eof; -is($text, $cs); - -$text = ""; -$p->case_sensitive(0); -$p->parse($html)->eof; -is($text, $ci); - -$text = ""; -$p->case_sensitive(1); -$p->xml_mode(1); -$p->parse($html)->eof; -is($text, $cs); - -$text = ""; -$p->case_sensitive(0); -$p->parse($html)->eof; -is($text, $cs); - -$html = <<'EOT'; -<tAg aRg="Value" arg="other value"></tAg> -<iGnOrE></ignore> -EOT -$p->ignore_tags('ignore'); -$cs = 'S[tAg aRg=Value arg=other value Order: aRg arg]E[tAg]S[iGnOrE]'; -$ci = 'S[tag arg=Value Order: arg arg]E[tag]'; - -$text = ""; -$p->case_sensitive(0); -$p->xml_mode(0); -$p->parse($html)->eof; -is($text, $ci); - -$text = ""; -$p->case_sensitive(1); -$p->xml_mode(0); -$p->parse($html)->eof; -is($text, $cs); - -$text = ""; -$p->case_sensitive(0); -$p->xml_mode(1); -$p->parse($html)->eof; -is($text, $cs); - -$text = ""; -$p->case_sensitive(1); -$p->xml_mode(1); -$p->parse($html)->eof; -is($text, $cs); - diff --git a/ext/HTML/Parser/t/cases.t b/ext/HTML/Parser/t/cases.t deleted file mode 100644 index a53727950c..0000000000 --- a/ext/HTML/Parser/t/cases.t +++ /dev/null @@ -1,105 +0,0 @@ -use Test::More; - -require HTML::Parser; - -package P; @ISA = qw(HTML::Parser); - -my @result; -sub start -{ - my($self, $tag, $attr) = @_; - push @result, "START[$tag]"; - for (sort keys %$attr) { - push @result, "\t$_: " . $attr->{$_}; - } - $start++; -} - -sub end -{ - my($self, $tag) = @_; - push @result, "END[$tag]"; - $end++; -} - -sub text -{ - my $self = shift; - push @result, "TEXT[$_[0]]"; - $text++; -} - -sub comment -{ - my $self = shift; - push @result, "COMMENT[$_[0]]"; - $comment++; -} - -sub declaration -{ - my $self = shift; - push @result, "DECLARATION[$_[0]]"; - $declaration++; -} - -package main; - - -@tests = - ( - '<a ">' => ['START[a]', "\t\": \""], - '<a/>' => ['START[a/]',], - '<a />' => ['START[a]', "\t/: /"], - '<a a/>' => ['START[a]', "\ta/: a/"], - '<a a/=/>' => ['START[a]', "\ta/: /"], - '<a x="foo bar">' => ['START[a]', "\tx: foo\xA0bar"], - '<a x="foo bar">' => ['START[a]', "\tx: foo bar"], - '<å >' => ['TEXT[<å]', 'TEXT[ >]'], - '2 < 5' => ['TEXT[2 ]', 'TEXT[<]', 'TEXT[ 5]'], - '2 <5> 2' => ['TEXT[2 ]', 'TEXT[<5>]', 'TEXT[ 2]'], - '2 <a' => ['TEXT[2 ]', 'TEXT[<a]'], - '2 <a> 2' => ['TEXT[2 ]', 'START[a]', 'TEXT[ 2]'], - '2 <a href=foo' => ['TEXT[2 ]', 'TEXT[<a href=foo]'], - "2 <a href='foo bar'> 2" => - ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], - '2 <a href=foo bar> 2' => - ['TEXT[2 ]', 'START[a]', "\tbar: bar", "\thref: foo", 'TEXT[ 2]'], - '2 <a href="foo bar"> 2' => - ['TEXT[2 ]', 'START[a]', "\thref: foo bar", 'TEXT[ 2]'], - '2 <a href="foo\'bar"> 2' => - ['TEXT[2 ]', 'START[a]', "\thref: foo'bar", 'TEXT[ 2]'], - "2 <a href='foo\"bar'> 2" => - ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'], - "2 <a href='foo"bar'> 2" => - ['TEXT[2 ]', 'START[a]', "\thref: foo\"bar", 'TEXT[ 2]'], - '2 <a.b> 2' => ['TEXT[2 ]', 'START[a.b]', 'TEXT[ 2]'], - '2 <a.b-12 a.b = 2 a> 2' => - ['TEXT[2 ]', 'START[a.b-12]', "\ta: a", "\ta.b: 2", 'TEXT[ 2]'], - '2 <a_b> 2' => ['TEXT[2 ]', 'START[a_b]', 'TEXT[ 2]'], - '<!ENTITY nbsp CDATA " " -- no-break space -->' => - ['DECLARATION[ENTITY nbsp CDATA " " -- no-break space --]'], - '<!-- comment -->' => ['COMMENT[ comment ]'], - '<!-- comment -- --- comment -->' => - ['COMMENT[ comment ]', 'COMMENT[- comment ]'], - '<!-- comment <!-- not comment --> comment -->' => - ['COMMENT[ comment <!]', 'COMMENT[> comment ]'], - '<!-- <a href="foo"> -->' => ['COMMENT[ <a href="foo"> ]'], - ); - -plan tests => @tests / 2; - -my $i = 0; -TEST: -while (@tests) { - ++$i; - my ($html, $expected) = splice @tests, 0, 2; - @result = (); - - $p = new P; - $p->strict_comment(1); - $p->parse($html)->eof; - - ok(eq_array($expected, \@result)) or diag("Expected: @$expected\n", - "Got: @result\n"); -} diff --git a/ext/HTML/Parser/t/comment.t b/ext/HTML/Parser/t/comment.t deleted file mode 100644 index 303449ebaa..0000000000 --- a/ext/HTML/Parser/t/comment.t +++ /dev/null @@ -1,24 +0,0 @@ -use Test::More tests => 1; - -use strict; -use HTML::Parser; - -my $p = HTML::Parser->new(api_version => 3); -my @com; -$p->handler(comment => sub { push(@com, shift) }, "token0"); -$p->handler(default => sub { push(@com, shift() . "[" . shift() . "]") }, "event, text"); - -$p->parse("<foo><><!><!-><!--><!---><!----><!-----><!------>"); -$p->parse("<!--+--"); -$p->parse("\n\n"); -$p->parse(">"); -$p->parse("<!a'b>"); -$p->parse("<!--foo--->"); -$p->parse("<!--foo---->"); -$p->parse("<!--foo----->-->"); -$p->parse("<foo>"); -$p->parse("<!3453><!-3456><!FOO><>"); -$p->eof; - -my $com = join(":", @com); -is($com, "start_document[]:start[<foo>]:text[<>]::-:><!-::-:--:+:a'b:foo-:foo--:foo---:text[-->]:start[<foo>]:3453:-3456:FOO:text[<>]:end_document[]"); diff --git a/ext/HTML/Parser/t/crashme.t b/ext/HTML/Parser/t/crashme.t deleted file mode 100644 index 1a1e8e47c9..0000000000 --- a/ext/HTML/Parser/t/crashme.t +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/perl - -# This test will simply run the parser on random junk. - -my $no_tests = shift || 3; -use Test::More; -plan tests => $no_tests; - -use HTML::Parser (); - -my $file = "junk$$.html"; -die if -e $file; - -for (1..$no_tests) { - - open(JUNK, ">$file") || die; - for (1 .. rand(5000)) { - for (1 .. rand(200)) { - print JUNK pack("N", rand(2**32)); - } - print JUNK ("<", "&", ">")[rand(3)]; # make these a bit more likely - } - close(JUNK); - - #diag "Parse @{[-s $file]} bytes of junk"; - - HTML::Parser->new->parse_file($file); - pass(); - - #print_mem(); -} - -unlink($file); - - -sub print_mem -{ - # this probably only works on Linux - open(STAT, "/proc/self/status") || return; - while (<STAT>) { - diag $_ if /^VmSize/; - } -} diff --git a/ext/HTML/Parser/t/declaration.t b/ext/HTML/Parser/t/declaration.t deleted file mode 100644 index 17de561651..0000000000 --- a/ext/HTML/Parser/t/declaration.t +++ /dev/null @@ -1,62 +0,0 @@ -use Test::More tests => 2; - -use HTML::Parser; -my $res = ""; - -sub decl -{ - my $t = shift; - $res .= "[" . join("\n", map "<$_>", @$t) . "]"; -} - -sub text -{ - $res .= shift; -} - -my $p = HTML::Parser->new(declaration_h => [\&decl, "tokens"], - default_h => [\&text, "text"], - ); - -$p->parse(<<EOT)->eof; -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" --<comment>-- - "http://www.w3.org/TR/html40/strict.dtd"> - -<!ENTITY foo "<!-- foo -->"> -<!Entity foo "<!-- foo -->"> - -<!row --> foo -EOT - -is($res, <<EOT); -[<DOCTYPE> -<HTML> -<PUBLIC> -<"-//W3C//DTD HTML 4.01//EN"> -<--<comment>--> -<"http://www.w3.org/TR/html40/strict.dtd">] - -[<ENTITY> -<foo> -<"<!-- foo -->">] -[<Entity> -<foo> -<"<!-- foo -->">] - -<!row --> foo -EOT - -$res = ""; -$p->parse(<<EOT)->eof; -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" - "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"[]> -EOT -is($res, <<EOT); -[<DOCTYPE> -<html> -<PUBLIC> -<"-//W3C//DTD XHTML 1.0 Strict//EN"> -<"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> -<[]>] -EOT - diff --git a/ext/HTML/Parser/t/default.t b/ext/HTML/Parser/t/default.t deleted file mode 100644 index 4b5ed794fd..0000000000 --- a/ext/HTML/Parser/t/default.t +++ /dev/null @@ -1,43 +0,0 @@ -use strict; -use Test::More tests => 3; - -my $text = ""; -use HTML::Parser (); -my $p = HTML::Parser->new(default_h => [sub { $text .= shift }, "text"], - ); - -my $html = <<'EOT'; - -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" - "http://www.w3.org/TR/html40/strict.dtd"> - -<title>foo</title> -<!-- comment <a> --> -<?process instruction> - -EOT - -$p->parse($html)->eof; - -is($text, $html); - -$text = ""; -$p->handler(start => sub { }, ""); -$p->handler(declaration => sub { }, ""); -$p->parse($html)->eof; - -my $html2; -$html2 = $html; -$html2 =~ s/<title>//; -$html2 =~ s/<!DOCTYPE[^>]*>//; - -is($text, $html2); - -$text = ""; -$p->handler(start => undef); -$p->parse($html)->eof; - -$html2 = $html; -$html2 =~ s/<!DOCTYPE[^>]*>//; - -is($text, $html2); diff --git a/ext/HTML/Parser/t/document.t b/ext/HTML/Parser/t/document.t deleted file mode 100644 index 6696939050..0000000000 --- a/ext/HTML/Parser/t/document.t +++ /dev/null @@ -1,41 +0,0 @@ -#!perl -w - -use Test; -plan tests => 6; - - -use HTML::Parser; -use File::Spec; - -my $events; -my $p = HTML::Parser->new(default_h => [sub { $events .= "$_[0]\n";}, "event"]); - -$events = ""; -$p->eof; -ok($events, "start_document\nend_document\n"); - -$events = ""; -$p->parse_file(File::Spec->devnull); -ok($events, "start_document\nend_document\n"); - -$events = ""; -$p->parse(""); -$p->eof; -ok($events, "start_document\nend_document\n"); - -$events = ""; -$p->parse(""); -$p->parse(""); -$p->eof; -ok($events, "start_document\nend_document\n"); - -$events = ""; -$p->parse(""); -$p->parse("<a>"); -$p->eof; -ok($events, "start_document\nstart\nend_document\n"); - -$events = ""; -$p->parse("<a> "); -$p->eof; -ok($events, "start_document\nstart\ntext\nend_document\n"); diff --git a/ext/HTML/Parser/t/dtext.t b/ext/HTML/Parser/t/dtext.t deleted file mode 100644 index 883c61f045..0000000000 --- a/ext/HTML/Parser/t/dtext.t +++ /dev/null @@ -1,72 +0,0 @@ -#!perl -w - -use strict; -use Test::More tests => 2; - -use HTML::Parser (); - -my $dtext = ""; -my $text = ""; - -sub append -{ - $dtext .= shift; - $text .= shift; -} - -my $p = HTML::Parser->new(text_h => [\&append, "dtext, text"], - default_h => [\&append, "text, text" ], - ); - -my $doc = <<'EOT'; -<title>å</title> -<a href="fooå">ååAA<A>AA</a> -<?å> -foo bar -foo bar -&xyzzy -&xyzzy; -<!-- � --> - -ÿ -ÿ -ÿG -<!-- Ā --> -� -� -& -&# -&#x -<xmp>å</xmp> -<script>å</script> -<ScRIPT>å</scRIPT> -<skript>å</script> -EOT - -$p->parse($doc)->eof; - -is($text, $doc); -is($dtext, <<"EOT"); -<title>å</title> -<a href="fooå">ååAA<A>AA</a> -<?å> -foo\240bar -foo\240bar -&xyzzy -&xyzzy; -<!-- � --> -\1 -\377 -\377 -\377G -<!-- Ā --> -� -� -& -&# -&#x -<xmp>å</xmp> -<script>å</script> -<ScRIPT>å</scRIPT> -<skript>å</script> -EOT diff --git a/ext/HTML/Parser/t/entities.t b/ext/HTML/Parser/t/entities.t deleted file mode 100644 index b8342f5a7c..0000000000 --- a/ext/HTML/Parser/t/entities.t +++ /dev/null @@ -1,193 +0,0 @@ -use HTML::Entities qw(decode_entities encode_entities encode_entities_numeric); - -use Test::More tests => 12; - -$a = "Våre norske tegn bør æres"; - -decode_entities($a); - -is($a, "Våre norske tegn bør æres"); - -encode_entities($a); - -is($a, "Våre norske tegn bør æres"); - -decode_entities($a); -encode_entities_numeric($a); - -is($a, "Våre norske tegn bør æres"); - -$a = "<&>\"'"; -is(encode_entities($a), "<&>"'"); -is(encode_entities_numeric($a), "<&>"'"); - -$a = "abcdef"; -is(encode_entities($a, 'a-c'), "abcdef"); - - -# See how well it does against rfc1866... -$ent = $plain = ""; -while (<DATA>) { - next unless /^\s*<!ENTITY\s+(\w+)\s*CDATA\s*\"&\#(\d+)/; - $ent .= "&$1;"; - $plain .= chr($2); -} - -$a = $ent; -decode_entities($a); -is($a, $plain); - -# Try decoding when the ";" are left out -$a = $ent, -$a =~ s/;//g; -decode_entities($a); -is($a, $plain); - - -$a = $plain; -encode_entities($a); -is($a, $ent); - - -# From: Bill Simpson-Young <bill.simpson-young@cmis.csiro.au> -# Subject: HTML entities problem with 5.11 -# To: libwww-perl@ics.uci.edu -# Date: Fri, 05 Sep 1997 16:56:55 +1000 -# Message-Id: <199709050657.QAA10089@snowy.nsw.cmis.CSIRO.AU> -# -# Hi. I've got a problem that has surfaced with the changes to -# HTML::Entities.pm for 5.11 (it doesn't happen with 5.08). It's happening -# in the process of encoding then decoding special entities. Eg, what goes -# in as "abc&def&ghi" comes out as "abc&def;&ghi;". - -is(decode_entities("abc&def&ghi&abc;&def;"), "abc&def&ghi&abc;&def;"); - -# Decoding of ' -is(decode_entities("'"), "'"); -is(encode_entities("'", "'"), "'"); - - -__END__ -# Quoted from rfc1866.txt - -14. Proposed Entities - - The HTML DTD references the "Added Latin 1" entity set, which only - supplies named entities for a subset of the non-ASCII characters in - [ISO-8859-1], namely the accented characters. The following entities - should be supported so that all ISO 8859-1 characters may only be - referenced symbolically. The names for these entities are taken from - the appendixes of [SGML]. - - <!ENTITY nbsp CDATA " " -- no-break space --> - <!ENTITY iexcl CDATA "¡" -- inverted exclamation mark --> - <!ENTITY cent CDATA "¢" -- cent sign --> - <!ENTITY pound CDATA "£" -- pound sterling sign --> - <!ENTITY curren CDATA "¤" -- general currency sign --> - <!ENTITY yen CDATA "¥" -- yen sign --> - <!ENTITY brvbar CDATA "¦" -- broken (vertical) bar --> - <!ENTITY sect CDATA "§" -- section sign --> - <!ENTITY uml CDATA "¨" -- umlaut (dieresis) --> - <!ENTITY copy CDATA "©" -- copyright sign --> - <!ENTITY ordf CDATA "ª" -- ordinal indicator, feminine --> - <!ENTITY laquo CDATA "«" -- angle quotation mark, left --> - <!ENTITY not CDATA "¬" -- not sign --> - <!ENTITY shy CDATA "­" -- soft hyphen --> - <!ENTITY reg CDATA "®" -- registered sign --> - <!ENTITY macr CDATA "¯" -- macron --> - <!ENTITY deg CDATA "°" -- degree sign --> - <!ENTITY plusmn CDATA "±" -- plus-or-minus sign --> - <!ENTITY sup2 CDATA "²" -- superscript two --> - <!ENTITY sup3 CDATA "³" -- superscript three --> - <!ENTITY acute CDATA "´" -- acute accent --> - <!ENTITY micro CDATA "µ" -- micro sign --> - <!ENTITY para CDATA "¶" -- pilcrow (paragraph sign) --> - <!ENTITY middot CDATA "·" -- middle dot --> - <!ENTITY cedil CDATA "¸" -- cedilla --> - <!ENTITY sup1 CDATA "¹" -- superscript one --> - <!ENTITY ordm CDATA "º" -- ordinal indicator, masculine --> - <!ENTITY raquo CDATA "»" -- angle quotation mark, right --> - <!ENTITY frac14 CDATA "¼" -- fraction one-quarter --> - <!ENTITY frac12 CDATA "½" -- fraction one-half --> - <!ENTITY frac34 CDATA "¾" -- fraction three-quarters --> - <!ENTITY iquest CDATA "¿" -- inverted question mark --> - <!ENTITY Agrave CDATA "À" -- capital A, grave accent --> - <!ENTITY Aacute CDATA "Á" -- capital A, acute accent --> - <!ENTITY Acirc CDATA "Â" -- capital A, circumflex accent --> - - - -Berners-Lee & Connolly Standards Track [Page 75] - -RFC 1866 Hypertext Markup Language - 2.0 November 1995 - - - <!ENTITY Atilde CDATA "Ã" -- capital A, tilde --> - <!ENTITY Auml CDATA "Ä" -- capital A, dieresis or umlaut mark --> - <!ENTITY Aring CDATA "Å" -- capital A, ring --> - <!ENTITY AElig CDATA "Æ" -- capital AE diphthong (ligature) --> - <!ENTITY Ccedil CDATA "Ç" -- capital C, cedilla --> - <!ENTITY Egrave CDATA "È" -- capital E, grave accent --> - <!ENTITY Eacute CDATA "É" -- capital E, acute accent --> - <!ENTITY Ecirc CDATA "Ê" -- capital E, circumflex accent --> - <!ENTITY Euml CDATA "Ë" -- capital E, dieresis or umlaut mark --> - <!ENTITY Igrave CDATA "Ì" -- capital I, grave accent --> - <!ENTITY Iacute CDATA "Í" -- capital I, acute accent --> - <!ENTITY Icirc CDATA "Î" -- capital I, circumflex accent --> - <!ENTITY Iuml CDATA "Ï" -- capital I, dieresis or umlaut mark --> - <!ENTITY ETH CDATA "Ð" -- capital Eth, Icelandic --> - <!ENTITY Ntilde CDATA "Ñ" -- capital N, tilde --> - <!ENTITY Ograve CDATA "Ò" -- capital O, grave accent --> - <!ENTITY Oacute CDATA "Ó" -- capital O, acute accent --> - <!ENTITY Ocirc CDATA "Ô" -- capital O, circumflex accent --> - <!ENTITY Otilde CDATA "Õ" -- capital O, tilde --> - <!ENTITY Ouml CDATA "Ö" -- capital O, dieresis or umlaut mark --> - <!ENTITY times CDATA "×" -- multiply sign --> - <!ENTITY Oslash CDATA "Ø" -- capital O, slash --> - <!ENTITY Ugrave CDATA "Ù" -- capital U, grave accent --> - <!ENTITY Uacute CDATA "Ú" -- capital U, acute accent --> - <!ENTITY Ucirc CDATA "Û" -- capital U, circumflex accent --> - <!ENTITY Uuml CDATA "Ü" -- capital U, dieresis or umlaut mark --> - <!ENTITY Yacute CDATA "Ý" -- capital Y, acute accent --> - <!ENTITY THORN CDATA "Þ" -- capital THORN, Icelandic --> - <!ENTITY szlig CDATA "ß" -- small sharp s, German (sz ligature) --> - <!ENTITY agrave CDATA "à" -- small a, grave accent --> - <!ENTITY aacute CDATA "á" -- small a, acute accent --> - <!ENTITY acirc CDATA "â" -- small a, circumflex accent --> - <!ENTITY atilde CDATA "ã" -- small a, tilde --> - <!ENTITY auml CDATA "ä" -- small a, dieresis or umlaut mark --> - <!ENTITY aring CDATA "å" -- small a, ring --> - <!ENTITY aelig CDATA "æ" -- small ae diphthong (ligature) --> - <!ENTITY ccedil CDATA "ç" -- small c, cedilla --> - <!ENTITY egrave CDATA "è" -- small e, grave accent --> - <!ENTITY eacute CDATA "é" -- small e, acute accent --> - <!ENTITY ecirc CDATA "ê" -- small e, circumflex accent --> - <!ENTITY euml CDATA "ë" -- small e, dieresis or umlaut mark --> - <!ENTITY igrave CDATA "ì" -- small i, grave accent --> - <!ENTITY iacute CDATA "í" -- small i, acute accent --> - <!ENTITY icirc CDATA "î" -- small i, circumflex accent --> - <!ENTITY iuml CDATA "ï" -- small i, dieresis or umlaut mark --> - <!ENTITY eth CDATA "ð" -- small eth, Icelandic --> - <!ENTITY ntilde CDATA "ñ" -- small n, tilde --> - <!ENTITY ograve CDATA "ò" -- small o, grave accent --> - - - -Berners-Lee & Connolly Standards Track [Page 76] - -RFC 1866 Hypertext Markup Language - 2.0 November 1995 - - - <!ENTITY oacute CDATA "ó" -- small o, acute accent --> - <!ENTITY ocirc CDATA "ô" -- small o, circumflex accent --> - <!ENTITY otilde CDATA "õ" -- small o, tilde --> - <!ENTITY ouml CDATA "ö" -- small o, dieresis or umlaut mark --> - <!ENTITY divide CDATA "÷" -- divide sign --> - <!ENTITY oslash CDATA "ø" -- small o, slash --> - <!ENTITY ugrave CDATA "ù" -- small u, grave accent --> - <!ENTITY uacute CDATA "ú" -- small u, acute accent --> - <!ENTITY ucirc CDATA "û" -- small u, circumflex accent --> - <!ENTITY uuml CDATA "ü" -- small u, dieresis or umlaut mark --> - <!ENTITY yacute CDATA "ý" -- small y, acute accent --> - <!ENTITY thorn CDATA "þ" -- small thorn, Icelandic --> - <!ENTITY yuml CDATA "ÿ" -- small y, dieresis or umlaut mark --> diff --git a/ext/HTML/Parser/t/entities2.t b/ext/HTML/Parser/t/entities2.t deleted file mode 100644 index 7840c719ba..0000000000 --- a/ext/HTML/Parser/t/entities2.t +++ /dev/null @@ -1,57 +0,0 @@ -#!perl -w - -use strict; -use Test::More tests => 9; - -use HTML::Entities qw(_decode_entities); - -eval { - _decode_entities("<", undef); -}; -like($@, qr/^Can't inline decode readonly string/); - -eval { - my $a = ""; - _decode_entities($a, $a); -}; -like($@, qr/^2nd argument must be hash reference/); - -eval { - my $a = ""; - _decode_entities($a, []); -}; -like($@, qr/^2nd argument must be hash reference/); - -$a = "<"; -_decode_entities($a, undef); -is($a, "<"); - -_decode_entities($a, { "lt" => "<" }); -is($a, "<"); - -my $x = "x" x 20; - -my $err; -for (":", ":a", "a:", "a:a", "a:a:a", "a:::a") { - my $a = $_; - $a =~ s/:/&a;/g; - my $b = $_; - $b =~ s/:/$x/g; - _decode_entities($a, { "a" => $x }); - if ($a ne $b) { - diag "Something went wrong with '$_'"; - $err++; - } -} -ok(!$err); - -$a = "foo bar"; -_decode_entities($a, \%HTML::Entities::entity2char); -is($a, "foo\xA0bar"); - -$a = "foo bar"; -_decode_entities($a, \%HTML::Entities::entity2char); -is($a, "foo bar"); - -_decode_entities($a, \%HTML::Entities::entity2char, 1); -is($a, "foo\xA0bar"); diff --git a/ext/HTML/Parser/t/filter-methods.t b/ext/HTML/Parser/t/filter-methods.t deleted file mode 100644 index 9eccaf1655..0000000000 --- a/ext/HTML/Parser/t/filter-methods.t +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/perl -w - -use Test::More tests => 12; -use strict; - -use HTML::Parser; - -my $p = HTML::Parser->new(api_version => 3, ignore_tags => [qw(b i em tt)]); -$p->ignore_elements("script"); -$p->unbroken_text(1); - -$p->handler(default => [], "event, text"); -$p->parse(<<"EOT")->eof; -<html><head><title>foo</title><Script language="Perl"> - while (<B>) { - # ... - } -</Script><body> -This is an <i>italic</i> and <b>bold</b> text. -</body> -</html> -EOT - -my $t = join("||", map join("|", @$_), @{$p->handler("default")}); -#diag $t; - -is($t, "start_document|||start|<html>||start|<head>||start|<title>||text|foo||end|</title>||start|<body>||text| -This is an italic and bold text. -||end|</body>||text| -||end|</html>||text| -||end_document|", 'ignore_elements'); - - -#------------------------------------------------------ - -$p = HTML::Parser->new(api_version => 3); -$p->report_tags("a"); -$p->handler(start => sub { - my($tagname, %attr) = @_; - ok($tagname eq "a" && $attr{href} eq "#a", 'report_tags start'); - }, 'tagname, @attr'); -$p->handler(end => sub { - my $tagname = shift; - is($tagname, "a", 'report_tags end'); - }, 'tagname'); - -$p->parse(<<EOT)->eof; - -<h1>Next example</h1> - -This is <a href="#a">very nice</a> example. - -EOT - - -#------------------------------------------------------ - -my @tags; -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(a em)); -$p->ignore_tags(qw(em)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<<EOT)->eof; - -<h1>Next example</h1> - -This is <em>yet another</em> <a href="#a">very nice</a> example. - -EOT -is(join('|', @tags), 'a', 'report_tags followed by ignore_tags'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(h1)); -$p->report_tags(); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<<EOT)->eof; - -<h1>Next example</h1> -<h2>Next example</h2> - -EOT -is(join('|', @tags), 'h1|h2', 'reset report_tags filter'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(h1 h2)); -$p->ignore_tags(qw(h2)); -$p->report_tags(qw(h1 h2)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<<EOT)->eof; - -<h1>Next example</h1> -<h2>Next example</h2> - -EOT -is(join('|', @tags), 'h1', 'report_tags does not reset ignore_tags'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(h1 h2)); -$p->ignore_tags(qw(h2)); -$p->report_tags(); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<<EOT)->eof; - -<h1>Next example</h1> -<h2>Next example</h2> - -EOT -is(join('|', @tags), 'h1', 'reset report_tags does no reset ignore_tags'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->report_tags(qw(h1 h2)); -$p->report_tags(qw(h3)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<<EOT)->eof; - -<h1>Next example</h1> -<h2>Next example</h2> -<h3>Next example</h3> - -EOT -is(join('|', @tags), 'h3', 'report_tags replaces filter'); - - -#------------------------------------------------------ - - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->ignore_tags(qw(h1 h2)); -$p->ignore_tags(qw(h3)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<<EOT)->eof; - -<h1>Next example</h1> -<h2>Next example</h2> -<h3>Next example</h3> - -EOT -is(join('|', @tags), 'h1|h2', 'ignore_tags replaces filter'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->ignore_tags(qw(h2)); -$p->ignore_tags(); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<<EOT)->eof; - -<h1>Next example</h1> -<h2>Next example</h2> - -EOT -is(join('|', @tags), 'h1|h2', 'reset ignore_tags filter'); - - -#------------------------------------------------------ - -@tags = (); -$p = HTML::Parser->new(api_version => 3); -$p->ignore_tags(qw(h2)); -$p->report_tags(qw(h1 h2)); -$p->handler(end => sub {push @tags, @_;}, 'tagname'); - -$p->parse(<<EOT)->eof; - -<h1>Next example</h1> -<h2>Next example</h2> - -EOT -is(join('|', @tags), 'h1', 'ignore_tags before report_tags'); -#------------------------------------------------------ - -$p = HTML::Parser->new(api_version => 3); -$p->ignore_elements("script"); -my $res=""; -$p->handler(default=> sub {$res.=$_[0];}, 'text'); -$p->parse(<<'EOT')->eof; -A <script> B </script> C </script> D <script> E </script> F -EOT -is($res,"A C D F\n","ignore </script> without <script> correctly"); diff --git a/ext/HTML/Parser/t/filter.t b/ext/HTML/Parser/t/filter.t deleted file mode 100644 index 3b18f9ec71..0000000000 --- a/ext/HTML/Parser/t/filter.t +++ /dev/null @@ -1,60 +0,0 @@ -use Test::More tests => 3; - -my $HTML = <<EOT; - -<!DOCTYPE HTML> -<!-- comment -<h1>Foo</h1> ---> - -<H1 ->Bar</H1 -> - -<Table><tr><td>1<td>2<td>3 -<tr> -</table> - -<?process> - -EOT - -use HTML::Filter; -use SelectSaver; - -my $tmpfile = "test-$$.htm"; -die "$tmpfile already exists" if -e $tmpfile; - -open(HTML, ">$tmpfile") or die "$!"; - -{ - my $save = new SelectSaver(HTML); - HTML::Filter->new->parse($HTML)->eof; -} -close(HTML); - -open(HTML, $tmpfile) or die "$!"; -local($/) = undef; -my $FILTERED = <HTML>; -close(HTML); - -#print $FILTERED; -is($FILTERED, $HTML); - -{ - package MyFilter; - @ISA=qw(HTML::Filter); - sub comment {} - sub output { push(@{$_[0]->{fhtml}}, $_[1]) } - sub filtered_html { join("", @{$_[0]->{fhtml}}) } -} - -my $f2 = MyFilter->new->parse_file($tmpfile)->filtered_html; -unlink($tmpfile) or warn "Can't unlink $tmpfile: $!"; - -#diag $f2; - -unlike($f2, qr/Foo/); -like($f2, qr/Bar/); - - diff --git a/ext/HTML/Parser/t/handler-eof.t b/ext/HTML/Parser/t/handler-eof.t deleted file mode 100644 index 39419dc4da..0000000000 --- a/ext/HTML/Parser/t/handler-eof.t +++ /dev/null @@ -1,54 +0,0 @@ -use Test::More tests => 6; - -use strict; -use HTML::Parser (); - -my $p = HTML::Parser->new(api_version => 3); - -$p->handler(start => sub { my $attr = shift; is($attr->{testno}, 1) }, - "attr"); -$p->handler(end => sub { shift->eof }, "self"); -my $text; -$p->handler(text => sub { $text = shift }, "text"); - -is($p->parse("<foo testno=1>"), $p); - -$text = ''; -ok(!$p->parse("</foo><foo testno=999>")); -ok(!$text); - -$p->handler(end => sub { $p->parse("foo"); }, ""); -eval { - $p->parse("</foo>"); -}; -like($@, qr/Parse loop not allowed/); - -# We used to get into an infinite loop if the eof triggered -# handler called ->eof - -use HTML::Parser; -$p = HTML::Parser->new(api_version => 3); - -my $i; -$p->handler("default" => - sub { - my $p=shift; - #++$i; diag "$i @_"; - $p->eof; - }, "self, event"); -$p->parse("Foo"); -$p->eof; - -# We used to sometimes trigger events after a handler signaled eof -my $title=''; -$p = HTML::Parser->new(api_version => 3,); -$p->handler(start=> \&title_handler, 'tagname, self'); -$p->parse("<head><title>foo</title>\n</head>"); -is($title, "foo"); - -sub title_handler { - return if shift ne 'title'; - my $self = shift; - $self->handler(text => sub { $title .= shift}, 'dtext'); - $self->handler(end => sub { shift->eof if shift eq 'title' }, 'tagname, self'); -} diff --git a/ext/HTML/Parser/t/handler.t b/ext/HTML/Parser/t/handler.t deleted file mode 100644 index 8d7bbc51ee..0000000000 --- a/ext/HTML/Parser/t/handler.t +++ /dev/null @@ -1,67 +0,0 @@ -# Test handler method - -use Test::More tests => 11; - -my $testno; - -use HTML::Parser; -{ - package MyParser; - use vars qw(@ISA); - @ISA=(HTML::Parser); - - sub foo - { - Test::More::is($_[1]{testno}, Test::More->builder->current_test + 1); - } - - sub bar - { - Test::More::is($_[1], Test::More->builder->current_test + 1); - } -} - -$p = MyParser->new(api_version => 3); - -eval { - $p->handler(foo => "foo", "foo"); -}; - -like($@, qr/^No handler for foo events/); - -eval { - $p->handler(start => "foo", "foo"); -}; -like($@, qr/^Unrecognized identifier foo in argspec/); - -my $h = $p->handler(start => "foo", "self,tagname"); -ok(!defined($h)); - -$x = \substr("xfoo", 1); -$p->handler(start => $$x, "self,attr"); -$p->parse("<a testno=4>"); - -$p->handler(start => \&MyParser::foo, "self,attr"); -$p->parse("<a testno=5>"); - -$p->handler(start => "foo"); -$p->parse("<a testno=6>"); - -$p->handler(start => "bar", "self,'7'"); -$p->parse("<a>"); - -eval { - $p->handler(start => {}, "self"); -}; -like($@, qr/^Only code or array references allowed as handler/); - -$a = []; -$p->handler(start => $a); -$h = $p->handler("start"); -is($p->handler("start", "foo"), $a); - -is($p->handler("start", \&MyParser::foo, ""), "foo"); - -is($p->handler("start"), \&MyParser::foo); - - diff --git a/ext/HTML/Parser/t/headparser-http.t b/ext/HTML/Parser/t/headparser-http.t deleted file mode 100644 index b722c64f9c..0000000000 --- a/ext/HTML/Parser/t/headparser-http.t +++ /dev/null @@ -1,20 +0,0 @@ -use Test::More tests => 1; - -eval { - require HTML::HeadParser; - $p = HTML::HeadParser->new; -}; - -SKIP: { -skip $@, 1 if $@ =~ /^Can't locate HTTP/; - -$p = HTML::HeadParser->new($h); -$p->parse(<<EOT); -<title>Stupid example</title> -<base href="http://www.sn.no/libwww-perl/"> -Normal text starts here. -EOT -$h = $p->header; -undef $p; -is($h->title, "Stupid example"); -} diff --git a/ext/HTML/Parser/t/headparser.t b/ext/HTML/Parser/t/headparser.t deleted file mode 100644 index b4228be0b2..0000000000 --- a/ext/HTML/Parser/t/headparser.t +++ /dev/null @@ -1,154 +0,0 @@ -#!perl -w - -use strict; -use Test::More tests => 11; - -{ package H; - sub new { bless {}, shift; } - - sub header { - my $self = shift; - my $key = uc(shift); - my $old = $self->{$key}; - if (@_) { $self->{$key} = shift; } - $old; - } - - sub push_header { - my($self, $k, $v) = @_; - $k = uc($k); - if (exists $self->{$k}) { - $self->{$k} = [ $self->{$k} ] unless ref $self->{$k}; - push(@{$self->{$k}}, $v); - } else { - $self->{$k} = $v; - } - } - - sub as_string { - my $self = shift; - my $str = ""; - for (sort keys %$self) { - if (ref($self->{$_})) { - my $v; - for $v (@{$self->{$_}}) { - $str .= "$_: $v\n"; - } - } else { - $str .= "$_: $self->{$_}\n"; - } - } - $str; - } -} - - -my $HTML = <<'EOT'; - -<title>Å være eller å ikke være</title> -<meta http-equiv="Expires" content="Soon"> -<meta http-equiv="Foo" content="Bar"> -<link href="mailto:gisle@aas.no" rev=made title="Gisle Aas"> - -<script> - - "</script>" - ignore this - -</script> - -<base href="http://www.sn.no"> -<meta name="Keywords" content="test, test, test,..."> -<meta name="Keywords" content="more"> - -Dette er vanlig tekst. Denne teksten definerer også slutten på -<head> delen av dokumentet. - -<style> - - "</style>" - ignore this too - -</style> - -<isindex> - -Dette er også vanlig tekst som ikke skal blir parset i det hele tatt. - -EOT - -$| = 1; - -#$HTML::HeadParser::DEBUG = 1; -require HTML::HeadParser; -my $p = HTML::HeadParser->new( H->new ); - -if ($p->parse($HTML)) { - fail("Need more data which should not happen"); -} else { - #diag $p->as_string; - pass(); -} - -like($p->header('Title'), qr/Å være eller å ikke være/); -is($p->header('Expires'), 'Soon'); -is($p->header('Content-Base'), 'http://www.sn.no'); -like($p->header('Link'), qr/<mailto:gisle\@aas.no>/); - -# This header should not be present because the head ended -ok(!$p->header('Isindex')); - - -# Try feeding one char at a time -my $expected = $p->as_string; -my $nl = 1; -$p = HTML::HeadParser->new(H->new); -while ($HTML =~ /(.)/sg) { - #print STDERR '#' if $nl; - #print STDERR $1; - $nl = $1 eq "\n"; - $p->parse($1) or last; -} -is($p->as_string, $expected); - - -# Try reading it from a file -my $file = "hptest$$.html"; -die "$file already exists" if -e $file; - -open(FILE, ">$file") or die "Can't create $file: $!"; -binmode(FILE); -print FILE $HTML; -print FILE "<p>This is more content...</p>\n" x 2000; -print FILE "<title>Buuuh!</title>\n" x 200; -close FILE or die "Can't close $file: $!"; - -$p = HTML::HeadParser->new(H->new); -$p->parse_file($file); -unlink($file) or warn "Can't unlink $file: $!"; - -is($p->header("Title"), "Å være eller å ikke være"); - - -# We got into an infinite loop on data without tags and no EOL. -# This was actually a HTML::Parser bug. -open(FILE, ">$file") or die "Can't create $file: $!"; -print FILE "Foo"; -close(FILE); - -$p = HTML::HeadParser->new(H->new); -$p->parse_file($file); -unlink($file) or warn "Can't unlink $file: $!"; - -ok(!$p->as_string); - -SKIP: { - skip "Need Unicode support", 2 if $] < 5.008; - - # Test that the Unicode BOM does not confuse us? - $p = HTML::HeadParser->new(H->new); - ok($p->parse("\x{FEFF}\n<title>Hi <foo></title>")); - $p->eof; - - is($p->header("title"), "Hi <foo>"); -} diff --git a/ext/HTML/Parser/t/ignore.t b/ext/HTML/Parser/t/ignore.t deleted file mode 100644 index 008739e0ad..0000000000 --- a/ext/HTML/Parser/t/ignore.t +++ /dev/null @@ -1,27 +0,0 @@ - -use Test::More tests => 4; - -use strict; -use HTML::Parser (); - -my $html = '<A href="foo">text</A>'; - -my $text = ''; -my $p = HTML::Parser->new(default_h => [sub {$text .= shift;}, 'text']); -$p->parse($html)->eof; -is($text, $html); - -$text = ''; -$p->handler(start => ""); -$p->parse($html)->eof; -is($text, 'text</A>'); - -$text = ''; -$p->handler(end => 0); -$p->parse($html)->eof; -is($text, 'text'); - -$text = ''; -$p->handler(start => undef); -$p->parse($html)->eof; -is($text, '<A href="foo">text'); diff --git a/ext/HTML/Parser/t/largetags.t b/ext/HTML/Parser/t/largetags.t deleted file mode 100644 index a9ed3ff69e..0000000000 --- a/ext/HTML/Parser/t/largetags.t +++ /dev/null @@ -1,38 +0,0 @@ -# Exercise the tokenpos buffer allocation routines by feeding it -# very large tags. - -use Test::More tests => 2; - -use strict; -use HTML::Parser (); - -my $p = HTML::Parser->new(api_version => 3); - -$p->handler("start" => - sub { - my $tp = shift; - #diag int(@$tp), " - ", join(", ", @$tp); - is(@$tp, 2 + 26 * 6 * 4); - }, "tokenpos"); - -$p->handler("declaration" => - sub { - my $t = shift; - #diag int(@$t), " - @$t"; - is(@$t, 26 * 6 * 2 + 1); - }, "tokens"); - -$p->parse("<a "); -for ("aa" .. "fz") { - $p->parse("$_=1 "); -} -$p->parse(">"); - -$p->parse("<!DOCTYPE "); -for ("aa" .. "fz") { - $p->parse("$_ -- $_ -- "); -} -$p->parse(">"); -$p->eof; -exit; - diff --git a/ext/HTML/Parser/t/linkextor-base.t b/ext/HTML/Parser/t/linkextor-base.t deleted file mode 100644 index 7ef8f0254b..0000000000 --- a/ext/HTML/Parser/t/linkextor-base.t +++ /dev/null @@ -1,41 +0,0 @@ -# This test that HTML::LinkExtor really absolutize links correctly -# when a base URL is given to the constructor. - -use Test::More tests => 5; -require HTML::LinkExtor; - -SKIP: { -eval { - require URI; -}; -skip $@, 5 if $@; - -# Try with base URL and the $p->links interface. -$p = HTML::LinkExtor->new(undef, "http://www.sn.no/foo/foo.html"); -$p->parse(<<HTML)->eof; -<head> -<base href="http://www.sn.no/"> -</head> -<body background="http://www.sn.no/sn.gif"> - -This is <A HREF="link.html">link</a> and an <img SRC="img.jpg" -lowsrc="img.gif" alt="Image">. -HTML - -@p = $p->links; - -# There should be 4 links in the document -is(@p, 4); - -for (@p) { - ($t, %attr) = @$_ if $_->[0] eq 'img'; -} - -is($t, 'img'); - -is(delete $attr{src}, "http://www.sn.no/foo/img.jpg"); - -is(delete $attr{lowsrc}, "http://www.sn.no/foo/img.gif"); - -ok(!scalar(keys %attr)); # there should be no more attributes -} diff --git a/ext/HTML/Parser/t/linkextor-rel.t b/ext/HTML/Parser/t/linkextor-rel.t deleted file mode 100644 index 1190a96c3f..0000000000 --- a/ext/HTML/Parser/t/linkextor-rel.t +++ /dev/null @@ -1,36 +0,0 @@ -use Test::More tests => 4; - -require HTML::LinkExtor; - -$HTML = <<HTML; -<head> -<base href="http://www.sn.no/"> -</head> -<body background="http://www.sn.no/sn.gif"> - -This is <A HREF="link.html">link</a> and an <img SRC="img.jpg" -lowsrc="img.gif" alt="Image">. -HTML - - -# Try the callback interface -$links = ""; -$p = HTML::LinkExtor->new( - sub { - my($tag, %links) = @_; - #diag "$tag @{[%links]}"; - $links .= "$tag @{[%links]}\n"; - }); - -$p->parse($HTML); $p->eof; - -ok($links =~ m|^base href http://www\.sn\.no/$|m); -ok($links =~ m|^body background http://www\.sn\.no/sn\.gif$|m); -ok($links =~ m|^a href link\.html$|m); - -# Used to be problems when using the links method on a document with -# no links it it. This is a test to prove that it works. -$p = new HTML::LinkExtor; -$p->parse("this is a document with no links"); $p->eof; -@a = $p->links; -is(@a, 0); diff --git a/ext/HTML/Parser/t/magic.t b/ext/HTML/Parser/t/magic.t deleted file mode 100644 index 366f275236..0000000000 --- a/ext/HTML/Parser/t/magic.t +++ /dev/null @@ -1,41 +0,0 @@ -# Check that the magic signature at the top of struct p_state works and that we -# catch modifications to _hparser_xs_state gracefully - -use Test::More tests => 5; - -use HTML::Parser; - -$p = HTML::Parser->new(api_version => 3); - -$p->xml_mode(1); - -# We should not be able to simply modify this stuff -eval { - ${$p->{_hparser_xs_state}} += 4; -}; -like($@, qr/^Modification of a read-only value attempted/); - - -my $x = delete $p->{_hparser_xs_state}; - -eval { - $p->xml_mode(1); -}; -like($@, qr/^Can't find '_hparser_xs_state'/); - -$p->{_hparser_xs_state} = \($$x + 16); - -eval { - $p->xml_mode(1); -}; -like($@, $] >= 5.008 ? qr/^Lost parser state magic/ : qr/^Bad signature in parser state object/); - -$p->{_hparser_xs_state} = 33; -eval { - $p->xml_mode(1); -}; -like($@, qr/^_hparser_xs_state element is not a reference/); - -$p->{_hparser_xs_state} = $x; - -ok($p->xml_mode(0)); diff --git a/ext/HTML/Parser/t/marked-sect.t b/ext/HTML/Parser/t/marked-sect.t deleted file mode 100644 index 6a63478156..0000000000 --- a/ext/HTML/Parser/t/marked-sect.t +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -my $tag; -my $text; - -use HTML::Parser (); -my $p = HTML::Parser->new(start_h => [sub { $tag = shift }, "tagname"], - text_h => [sub { $text .= shift }, "dtext"], - ); - - -use Test::More tests => 14; - -SKIP: { -eval { - $p->marked_sections(1); -}; -skip $@, 14 if $@; - -$p->parse("<![[foo]]>"); -is($text, "foo"); - -$p->parse("<![TEMP INCLUDE[bar]]>"); -is($text, "foobar"); - -$p->parse("<![ INCLUDE -- IGNORE -- [foo<![IGNORE[bar]]>]]>\n<br>"); -is($text, "foobarfoo\n"); - -$text = ""; -$p->parse("<![ CDATA [<foo"); -$p->parse("<![IGNORE[bar]]>,bar>]]><br>"); -is($text, "<foo<![IGNORE[bar,bar>]]>"); - -$text = ""; -$p->parse("<![ RCDATA [å<a>]]><![CDATA[å<a>]]>å<a><br>"); -is($text, "å<a>å<a>å"); -is($tag, "br"); - -$text = ""; -$p->parse("<![INCLUDE RCDATA CDATA IGNORE [fooå<a>]]><br>"); -is($text, ""); - -$text = ""; -$p->parse("<![INCLUDE RCDATA CDATA [fooå<a>]]><br>"); -is($text, "fooå<a>"); - -$text = ""; -$p->parse("<![INCLUDE RCDATA [fooå<a>]]><br>"); -is($text, "fooå<a>"); - -$text = ""; -$p->parse("<![INCLUDE [fooå<a>]]><br>"); -is($text, "fooå"); - -$text = ""; -$p->parse("<![[fooå<a>]]><br>"); -is($text, "fooå"); - -# offsets/line/column numbers -$p = HTML::Parser->new(default_h => [\&x, "line,column,offset,event,text"], - marked_sections => 1, - ); -$p->parse(<<'EOT')->eof; -<title>Test</title> -<![CDATA - [fooå<a> -]]> -<![[ -INCLUDE -STUFF -]]> - <h1>Test</h1> -EOT - -my @x; -sub x { - my($line, $col, $offset, $event, $text) = @_; - $text =~ s/\n/\\n/g; - $text =~ s/ /./g; - push(@x, "$line.$col:$offset $event \"$text\"\n"); -} - -#diag @x; -is(join("", @x), <<'EOT'); -1.0:0 start_document "" -1.0:0 start "<title>" -1.7:7 text "Test" -1.11:11 end "</title>" -1.19:19 text "\n" -3.3:32 text "fooå<a>\n" -4.3:49 text "\n" -5.4:54 text "\nINCLUDE\nSTUFF\n" -8.3:72 text "\n.." -9.2:75 start "<h1>" -9.6:79 text "Test" -9.10:83 end "</h1>" -9.15:88 text "\n" -10.0:89 end_document "" -EOT - -my $doc = "<Tag><![CDATA[This is cdata]]></Tag>"; -my $result = ""; -$p = HTML::Parser->new( - marked_sections => 1, - handlers => { - default => [ sub { $result .= join("",@_); }, "skipped_text,text" ] - } -)->parse($doc)->eof; -is($doc, $result); - -$text = ""; -$p = HTML::Parser->new( - text_h => [sub { $text .= shift }, "dtext"], - marked_sections => 1, -); - -$p->parse("<![CDATA[foo [1]]]>"); -is($text, "foo [1]", "CDATA text ending in square bracket"); - -} # SKIP diff --git a/ext/HTML/Parser/t/msie-compat.t b/ext/HTML/Parser/t/msie-compat.t deleted file mode 100644 index 90d4b7e3b7..0000000000 --- a/ext/HTML/Parser/t/msie-compat.t +++ /dev/null @@ -1,58 +0,0 @@ -#!perl -w - -use strict; -use HTML::Parser; - -use Test::More tests => 2; - -my $TEXT = ""; -sub h -{ - my($event, $tagname, $text) = @_; - for ($event, $tagname, $text) { - if (defined) { - s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; - } - else { - $_ = "<undef>"; - } - } - - $TEXT .= "[$event,$tagname,$text]\n"; -} - -my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"]); -$p->parse("<a>"); -$p->parse("</a f>"); -$p->parse("</a 'foo<>' 'bar>' x>"); -$p->parse("</a \"foo<>\""); -$p->parse(" \"bar>\" x>"); -$p->parse("</ foo bar>"); -$p->parse("</ \"<>\" >"); -$p->parse("<!--comment>text<!--comment><p"); -$p->eof; - -is($TEXT, <<'EOT'); -[start_document,<undef>,] -[start,a,<a>] -[end,a,</a f>] -[end,a,</a 'foo<>' 'bar>' x>] -[end,a,</a "foo<>" "bar>" x>] -[comment, foo bar,</ foo bar>] -[comment, "<>" ,</ "<>" >] -[comment,comment,<!--comment>] -[text,<undef>,text] -[comment,comment,<!--comment>] -[comment,p,<p] -[end_document,<undef>,] -EOT - -$TEXT = ""; -$p->parse("<!comment>"); -$p->eof; - -is($TEXT, <<'EOT'); -[start_document,<undef>,] -[comment,comment,<!comment>] -[end_document,<undef>,] -EOT diff --git a/ext/HTML/Parser/t/offset.t b/ext/HTML/Parser/t/offset.t deleted file mode 100644 index 840728d7c3..0000000000 --- a/ext/HTML/Parser/t/offset.t +++ /dev/null @@ -1,58 +0,0 @@ -use strict; -use HTML::Parser (); -use Test::More tests => 1; - -my $HTML = <<'EOT'; - -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" - "http://www.w3.org/TR/html40/strict.dtd"> - -<foo bar baz=3>heisan -</foo> <?process> -<!-- comment --> -<xmp>xmp</xmp> - -EOT - -my $p = HTML::Parser->new(api_version => 3); - -my $sum_len = 0; -my $count = 0; -my $err; - -$p->handler(default => - sub { - my($offset, $length, $offset_end, $line, $col, $text) = @_; - my $copy = $text; - $copy =~ s/\n/\\n/g; - substr($copy, 30) = "..." if length($copy) > 32; - #diag sprintf ">>> %d.%d %s", $line, $col, $copy; - if ($offset != $sum_len) { - diag "offset mismatch $offset vs $sum_len"; - $err++; - } - if ($offset_end != $offset + $length) { - diag "offset_end $offset_end wrong"; - $err++; - } - if ($length != length($text)) { - diag "length mismatch"; - $err++; - } - if (substr($HTML, $offset, $length) ne $text) { - diag "content mismatch"; - $err++; - } - $sum_len += $length; - $count++; - }, - 'offset,length,offset_end,line,column,text'); - -for (split(//, $HTML)) { - $p->parse($_); -} -$p->eof; - -ok($count > 5 && !$err); - - diff --git a/ext/HTML/Parser/t/options.t b/ext/HTML/Parser/t/options.t deleted file mode 100644 index ff5f7db564..0000000000 --- a/ext/HTML/Parser/t/options.t +++ /dev/null @@ -1,36 +0,0 @@ -# Test option setting methods - -use Test::More tests => 10; - -use strict; -use HTML::Parser (); - -my $p = HTML::Parser->new(api_version => 3, - xml_mode => 1); -my $old; - -$old = $p->boolean_attribute_value("foo"); -ok(!defined $old); - -$old = $p->boolean_attribute_value(); -is($old, "foo"); - -$old = $p->boolean_attribute_value(undef); -is($old, "foo"); -ok(!defined($p->boolean_attribute_value)); - -ok($p->xml_mode(0)); -ok(!$p->xml_mode); - -my $seen_buggy_comment_warning; -$SIG{__WARN__} = - sub { - local $_ = shift; - $seen_buggy_comment_warning++ - if /^netscape_buggy_comment\(\) is deprecated/; - }; - -ok(!$p->strict_comment(1)); -ok($p->strict_comment); -ok(!$p->netscape_buggy_comment); -ok($seen_buggy_comment_warning); diff --git a/ext/HTML/Parser/t/parsefile.t b/ext/HTML/Parser/t/parsefile.t deleted file mode 100644 index f373f066f6..0000000000 --- a/ext/HTML/Parser/t/parsefile.t +++ /dev/null @@ -1,45 +0,0 @@ -use Test::More tests => 6; - -my $filename = "file$$.htm"; -die "$filename is already there" if -e $filename; -open(FILE, ">$filename") || die "Can't create $filename: $!"; -print FILE <<'EOT'; close(FILE); -<title>Heisan</title> -EOT - -{ - package MyParser; - require HTML::Parser; - @ISA=qw(HTML::Parser); - - sub start - { - my($self, $tag, $attr) = @_; - Test::More::is($tag, "title"); - } -} - -MyParser->new->parse_file($filename); -open(FILE, $filename) || die; -MyParser->new->parse_file(*FILE); -seek(FILE, 0, 0) || die; -MyParser->new->parse_file(\*FILE); -close(FILE); - -require IO::File; -my $io = IO::File->new($filename) || die; -MyParser->new->parse_file($io); -$io->seek(0, 0) || die; -MyParser->new->parse_file(*$io); - -my $text = ''; -$io->seek(0, 0) || die; -MyParser->new( - start_h => [ sub{ shift->eof; }, "self" ], - text_h => [ sub{ $text = shift; }, "text" ])->parse_file(*$io); -ok(!$text); - -close($io); # needed because of bug in perl -undef($io); - -unlink($filename) or warn "Can't unlink $filename: $!"; diff --git a/ext/HTML/Parser/t/parser.t b/ext/HTML/Parser/t/parser.t deleted file mode 100644 index 0ce4d95803..0000000000 --- a/ext/HTML/Parser/t/parser.t +++ /dev/null @@ -1,184 +0,0 @@ -use Test::More tests => 7; - -$HTML = <<'HTML'; - -<!DOCTYPE HTML> - -<body> - -Various entities. The parser must never break them in the middle: - -/ -/ -È -௖ - -å-Å - -<ul> -<li><a href="foo 'bar' baz>" id=33>This is a link</a> -<li><a href='foo "bar" baz> å' id=34>This is another one</a> -</ul> - -<p><div align="center"><img src="http://www.perl.com/perl.gif" -alt="camel"></div> - -<!-- this is -a comment --> and this is not. - -<!-- this is the kind of >comment< -- --> that Netscape hates --> - -< this > was not a tag. <this is/not either> - -</body> - -HTML - -#------------------------------------------------------------------- - -{ - package P; - require HTML::Parser; - @ISA=qw(HTML::Parser); - $OUT=''; - $COUNT=0; - - sub new - { - my $class = shift; - my $self = $class->SUPER::new; - $OUT = ''; - die "Can only have one" if $COUNT++; - $self; - } - - sub DESTROY - { - my $self = shift; - eval { $self->SUPER::DESTROY; }; - $COUNT--; - } - - sub declaration - { - my($self, $decl) = @_; - $OUT .= "[[$decl]]|"; - } - - sub start - { - my($self, $tag, $attr) = @_; - $attr = join("/", map "$_=$attr->{$_}", sort keys %$attr); - $attr = "/$attr" if length $attr; - $OUT .= "<<$tag$attr>>|"; - } - - sub end - { - my($self, $tag) = @_; - $OUT .= ">>$tag<<|"; - } - - sub comment - { - my($self, $comment) = @_; - $OUT .= "##$comment##|"; - } - - sub text - { - my($self, $text) = @_; - #$text =~ s/\n/\\n/g; - #$text =~ s/\t/\\t/g; - #$text =~ s/ /·/g; - $OUT .= "$text|"; - } - - sub result - { - $OUT; - } -} - -for $chunksize (64*1024, 64, 13, 3, 1, "file", "filehandle") { -#for $chunksize (1) { - if ($chunksize =~ /^file/) { - #print "Parsing from $chunksize"; - } else { - #print "Parsing using $chunksize byte chunks"; - } - my $p = P->new; - - if ($chunksize =~ /^file/) { - # First we must create the file - my $tmpfile = "tmp-$$.html"; - my $file = $tmpfile; - die "$file already exists" if -e $file; - open(FILE, ">$file") or die "Can't create $file: $!"; - binmode FILE; - print FILE $HTML; - close(FILE); - - if ($chunksize eq "filehandle") { - require FileHandle; - my $fh = FileHandle->new($file) || die "Can't open $file: $!"; - $file = $fh; - } - - # then we can parse it. - $p->parse_file($file); - close $file if $chunksize eq "filehandle"; - unlink($tmpfile) || warn "Can't unlink $tmpfile: $!"; - } else { - my $copy = $HTML; - while (length $copy) { - my $chunk = substr($copy, 0, $chunksize); - substr($copy, 0, $chunksize) = ''; - $p->parse($chunk); - } - $p->eof; - } - - my $res = $p->result; - my $bad; - - # Then we start looking for things that should not happen - if ($res =~ /\s\|\s/) { - diag "broken space"; - $bad++; - } - for ( - # Make sure entities are not broken - '/', '/', 'È', '௖', '', 'å', 'Å', - - # Some elements that should be produced - "|[[DOCTYPE HTML]]|", - "|## this is\na comment ##|", - "|<<ul>>|\n|<<li>>|<<a/href=foo 'bar' baz>/id=33>>|", - '|<<li>>|<<a/href=foo "bar" baz> å/id=34>>', - "|>>ul<<|", "|>>body<<|\n\n|", - ) - { - if (index($res, $_) < 0) { - diag "Can't find '$_' in parsed document"; - $bad++; - } - } - - diag $res if $bad || $ENV{PRINT_RESULTS}; - - # And we check that we get the same result all the time - $res =~ s/\|//g; # remove all break marks - if ($last_res && $res ne $last_res) { - diag "The result is not the same as last time"; - $bad++; - } - $last_res = $res; - - unless ($res =~ /Various entities/) { - diag "Some text must be missing"; - $bad++; - } - - ok(!$bad); -} diff --git a/ext/HTML/Parser/t/plaintext.t b/ext/HTML/Parser/t/plaintext.t deleted file mode 100644 index b2e1e19e11..0000000000 --- a/ext/HTML/Parser/t/plaintext.t +++ /dev/null @@ -1,45 +0,0 @@ -use Test::More tests => 2; - -use strict; -use HTML::Parser; - -my @a; -my $p = HTML::Parser->new(api_version => 3); -$p->handler(default => \@a, '@{event, text, is_cdata}'); -$p->parse(<<EOT)->eof; -<xmp><foo></xmp>x<plaintext><foo> -</plaintext> -foo -EOT - -for (@a) { - $_ = "" unless defined; -} - -my $doc = join(":", @a); - -#diag $doc; - -is($doc, "start_document:::start:<xmp>::text:<foo>:1:end:</xmp>::text:x::start:<plaintext>::text:<foo> -</plaintext> -foo -:1:end_document::"); - -@a = (); -$p->closing_plaintext('yep, emulate gecko'); -$p->parse(<<EOT)->eof; -<plaintext><foo> -</plaintext>foo<b></b> -EOT - -for (@a) { - $_ = "" unless defined; -} - -$doc = join(":", @a); - -#diag $doc; - -is($doc, "start_document:::start:<plaintext>::text:<foo> -:1:end:</plaintext>::text:foo::start:<b>::end:</b>::text: -::end_document::"); diff --git a/ext/HTML/Parser/t/pod.t b/ext/HTML/Parser/t/pod.t deleted file mode 100644 index 437887a0a2..0000000000 --- a/ext/HTML/Parser/t/pod.t +++ /dev/null @@ -1,4 +0,0 @@ -use Test::More; -eval "use Test::Pod 1.00"; -plan skip_all => "Test::Pod 1.00 required for testing POD" if $@; -all_pod_files_ok(); diff --git a/ext/HTML/Parser/t/process.t b/ext/HTML/Parser/t/process.t deleted file mode 100644 index 9d27250a92..0000000000 --- a/ext/HTML/Parser/t/process.t +++ /dev/null @@ -1,43 +0,0 @@ -use strict; - -use Test::More tests => 12; - -my $pi; -my $orig; - -use HTML::Parser (); -my $p = HTML::Parser->new(process_h => [sub { $pi = shift; $orig = shift; }, - "token0,text"] - ); - -$p->parse("<a><?foo><a>"); - -is($pi, "foo"); -is($orig, "<?foo>"); - -$p->parse("<a><?><a>"); -is($pi, ""); -is($orig, "<?>"); - -$p->parse("<a><? -foo -><a>"); -is($pi, "\nfoo\n"); -is($orig, "<?\nfoo\n>"); - -for (qw(< a > < ? b a r > < a >)) { - $p->parse($_); -} - -is($pi, "bar"); -is($orig, "<?bar>"); - -$p->xml_mode(1); - -$p->parse("<a><?foo>bar??><a>"); -is($pi, "foo>bar?"); -is($orig, "<?foo>bar??>"); - -$p->parse("<a><??></a>"); -is($pi, ""); -is($orig, "<??>"); diff --git a/ext/HTML/Parser/t/pullparser.t b/ext/HTML/Parser/t/pullparser.t deleted file mode 100644 index 80a186b399..0000000000 --- a/ext/HTML/Parser/t/pullparser.t +++ /dev/null @@ -1,55 +0,0 @@ -use Test::More tests => 3; - -use HTML::PullParser; - -my $doc = <<'EOT'; -<title>Title</title> -<style> h1 { background: white } -<foo> -</style> -<H1 ID="3">Heading</H1> -<!-- ignore this --> - -This is a text with a <A HREF="http://www.sol.no" name="l1">link</a>. -EOT - -my $p = HTML::PullParser->new(doc => $doc, - start => 'event,tagname,@attr', - end => 'event,tagname', - text => 'event,dtext', - - ignore_elements => [qw(script style)], - unbroken_text => 1, - boolean_attribute_value => 1, - ); - -my $t = $p->get_token; -is($t->[0], "start"); -is($t->[1], "title"); -$p->unget_token($t); - -my @a; -while (my $t = $p->get_token) { - for (@$t) { - s/\s/./g; - } - push(@a, join("|", @$t)); -} - -my $res = join("\n", @a, ""); -#diag $res; -is($res, <<'EOT'); -start|title -text|Title -end|title -text|.. -start|h1|id|3 -text|Heading -end|h1 -text|...This.is.a.text.with.a. -start|a|href|http://www.sol.no|name|l1 -text|link -end|a -text|.. -EOT - diff --git a/ext/HTML/Parser/t/script.t b/ext/HTML/Parser/t/script.t deleted file mode 100644 index 2a75ccb35b..0000000000 --- a/ext/HTML/Parser/t/script.t +++ /dev/null @@ -1,41 +0,0 @@ -#!perl -w - -use strict; -use Test; -plan tests => 1; - -use HTML::Parser; - -my $TEXT = ""; -sub h -{ - my($event, $tagname, $text) = @_; - for ($event, $tagname, $text) { - if (defined) { - s/([\n\r\t])/sprintf "\\%03o", ord($1)/ge; - } - else { - $_ = "<undef>"; - } - } - - $TEXT .= "[$event,$tagname,$text]\n"; -} - -my $p = HTML::Parser->new(default_h => [\&h, "event,tagname,text"], empty_element_tags => 1); -$p->parse(q(<tr><td align="center" height="100"><script src="whatever"/><SCRIPT language="JavaScript1.1">bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');</SCRIPT></td></tr>)); -$p->eof; - -ok($TEXT, <<'EOT'); -[start_document,<undef>,] -[start,tr,<tr>] -[start,td,<td align="center" height="100">] -[start,script,<script src="whatever"/>] -[end,script,] -[start,script,<SCRIPT language="JavaScript1.1">] -[text,<undef>,bust = Math.floor(1000000*Math.random());document.write('<SCR' + 'IPT LANGUAGE="JavaScript1.1" SRC="http://adv.virgilio.it/js.ng/site=virg&adsize=728x90&subsite=mail&sez=comfree&pos=43&bust='+bust+'?">\n');document.write('</SCR' + 'IPT>\n');] -[end,script,</SCRIPT>] -[end,td,</td>] -[end,tr,</tr>] -[end_document,<undef>,] -EOT diff --git a/ext/HTML/Parser/t/skipped-text.t b/ext/HTML/Parser/t/skipped-text.t deleted file mode 100644 index 8bd2704b20..0000000000 --- a/ext/HTML/Parser/t/skipped-text.t +++ /dev/null @@ -1,74 +0,0 @@ -use Test::More tests => 3; - -use strict; -use HTML::Parser; - -my $p = HTML::Parser->new(api_version => 3); - -$p->report_tags("a"); - -my @doc; - -$p->handler(start => \&a_handler, "skipped_text, text"); -$p->handler(end_document => \@doc, '@{skipped_text}'); - -$p->parse(<<EOT)->eof; -<title>hi</title> -<h1><a href="foo">link</a></h1> -and <a foo="">some</a> text. -EOT - -sub a_handler { - push(@doc, shift); - my $text = shift; - push(@doc, uc($text)); -} - - -is(join("", @doc), <<'EOT'); -<title>hi</title> -<h1><A HREF="FOO">link</a></h1> -and <A FOO="">some</a> text. -EOT - -# -# Comment stripper. Interaction with "" handlers. -# -my $doc = <<EOT; -<html>text</html> -<!-- comment --> -and some more <b>text</b>. -EOT -(my $expected = $doc) =~ s/<!--.*?-->//; - -$p = HTML::Parser->new(api_version => 3); -$p->handler(comment => ""); -$p->handler(end_document => sub { - my $stripped = shift; - #diag $stripped; - is($stripped, $expected); - }, "skipped_text"); -for (split(//, $doc)) { - $p->parse($_); -} -$p->eof; - -# -# Interaction with unbroken text -# -my @x; -$p = HTML::Parser->new(api_version => 3, unbroken_text => 1); -$p->handler(text => \@x, '@{"X", skipped_text, text}'); -$p->handler(end => ""); -$p->handler(end_document => \@x, '@{"Y", skipped_text}'); - -$doc = "a a<a>b b</a>c c<x>d d</x>e"; - -for (split(//, $doc)) { - $p->parse($_); -} -$p->eof; - -#diag join(":", @x); -is(join(":", @x), "X::a a:X:<a>:b bc c:X:<x>:d de:Y:"); - diff --git a/ext/HTML/Parser/t/stack-realloc.t b/ext/HTML/Parser/t/stack-realloc.t deleted file mode 100644 index 46c7d35666..0000000000 --- a/ext/HTML/Parser/t/stack-realloc.t +++ /dev/null @@ -1,17 +0,0 @@ -#!perl -w - -# HTML-Parser 3.33 and older used to core dump on this program because -# of missing SPAGAIN calls in parse() XS code. It was not prepared for -# the stack to get realloced. - -$| = 1; - -use Test::More tests => 1; - -use HTML::Parser; -my $x = HTML::Parser->new(api_version => 3); -my @row; -$x->handler(end => sub { push(@row, (1) x 505); 1 }, "tagname"); -$x->parse("</TD>"); - -pass; diff --git a/ext/HTML/Parser/t/textarea.t b/ext/HTML/Parser/t/textarea.t deleted file mode 100644 index 120f79bb29..0000000000 --- a/ext/HTML/Parser/t/textarea.t +++ /dev/null @@ -1,70 +0,0 @@ -use Test::More tests => 1; - -use strict; -use HTML::Parser; - -my $html = <<'EOT'; -<html> -<title>This is a <nice> title</title> -<!--comment--> -<script language="perl">while (<DATA>) { & }</script> - -<FORM> - -<textarea name="foo" cols=50 rows=10> - -foo -<foo> -<!--comment--> -& -foo -</FORM> - -</textarea> - -</FORM> - -</html> -EOT - -my $dump = ""; -sub tdump { - my @a = @_; - for (@a) { - $_ = "<undef>" unless defined; - s/\n/\\n/g; - } - $dump .= join("|", @a) . "\n"; -} - -my $p = HTML::Parser->new(default_h => [\&tdump, "event,text,dtext,is_cdata"]); -$p->parse($html)->eof; - -#diag $dump; - -is($dump, <<'EOT'); -start_document||<undef>|<undef> -start|<html>|<undef>|<undef> -text|\n|\n| -start|<title>|<undef>|<undef> -text|This is a <nice> title|This is a <nice> title| -end|</title>|<undef>|<undef> -text|\n|\n| -comment|<!--comment-->|<undef>|<undef> -text|\n|\n| -start|<script language="perl">|<undef>|<undef> -text|while (<DATA>) { & }|while (<DATA>) { & }|1 -end|</script>|<undef>|<undef> -text|\n\n|\n\n| -start|<FORM>|<undef>|<undef> -text|\n\n|\n\n| -start|<textarea name="foo" cols=50 rows=10>|<undef>|<undef> -text|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n|\n\nfoo\n<foo>\n<!--comment-->\n&\nfoo\n</FORM>\n\n| -end|</textarea>|<undef>|<undef> -text|\n\n|\n\n| -end|</FORM>|<undef>|<undef> -text|\n\n|\n\n| -end|</html>|<undef>|<undef> -text|\n|\n| -end_document||<undef>|<undef> -EOT diff --git a/ext/HTML/Parser/t/threads.t b/ext/HTML/Parser/t/threads.t deleted file mode 100644 index 8da91e9b78..0000000000 --- a/ext/HTML/Parser/t/threads.t +++ /dev/null @@ -1,39 +0,0 @@ -# Verify thread safety. - -use Config; -use Test::More; - -BEGIN { - plan(skip_all => "Not configured for threads") - unless $Config{useithreads} && $] >= 5.008; - plan(tests => 1); -} - -use threads; -use HTML::Parser; - -my $ok=0; - -sub start -{ - my($tag,$attr)=@_; - - $ok += ($tag eq "foo"); - $ok += (defined($attr->{param}) && $attr->{param} eq "bar"); -} - -my $p = HTML::Parser->new - (api_version => 3, - handlers => { - start => [\&start, "tagname,attr"], - }); - -$p->parse("<foo pa"); - -$ok=async { - $p->parse("ram=bar>"); - $ok; -}->join(); - -is($ok,2); - diff --git a/ext/HTML/Parser/t/tokeparser.t b/ext/HTML/Parser/t/tokeparser.t deleted file mode 100644 index 2084201fb9..0000000000 --- a/ext/HTML/Parser/t/tokeparser.t +++ /dev/null @@ -1,164 +0,0 @@ -use Test::More tests => 17; - -use strict; -use HTML::TokeParser; - -# First we create an HTML document to test - -my $file = "ttest$$.htm"; -die "$file already exists" if -e $file; - -open(F, ">$file") or die "Can't create $file: $!"; -print F <<'EOT'; close(F); - -<!--This is a test--> -<html><head><title> - This is the <title> -</title> - - <base href="http://www.perl.com"> -</head> - -<body background="bg.gif"> - - <h1>This is the <b>title</b> again - </h1> - - And this is a link to the <a href="http://www.perl.com"><img src="camel.gif" alt="Perl"> <!--nice isn't it-->Institute</a> - - <br/><? process instruction > - -</body> -</html> - -EOT - -END { unlink($file) || warn "Can't unlink $file: $!"; } - - -my $p; - - -$p = HTML::TokeParser->new($file) || die "Can't open $file: $!"; -ok($p->unbroken_text); -if ($p->get_tag("foo", "title")) { - my $title = $p->get_trimmed_text; - #diag "Title: $title"; - is($title, "This is the <title>"); -} -undef($p); - -# Test with reference to glob -open(F, $file) || die "Can't open $file: $!"; -$p = HTML::TokeParser->new(\*F); -my $scount = 0; -my $ecount = 0; -my $tcount = 0; -my $pcount = 0; -while (my $token = $p->get_token) { - $scount++ if $token->[0] eq "S"; - $ecount++ if $token->[0] eq "E"; - $pcount++ if $token->[0] eq "PI"; -} -undef($p); -close F; - -# Test with glob -open(F, $file) || die "Can't open $file: $!"; -$p = HTML::TokeParser->new(*F); -$tcount++ while $p->get_tag; -undef($p); -close F; - -# Test with plain file name -$p = HTML::TokeParser->new($file) || die; -$tcount++ while $p->get_tag; -undef($p); - -#diag "Number of tokens found: $tcount/2 = $scount + $ecount"; -is($tcount, 34); -is($scount, 10); -is($ecount, 7); -is($pcount, 1); -is($tcount/2, $scount + $ecount); - -ok(!HTML::TokeParser->new("/noT/thEre/$$")); - - -$p = HTML::TokeParser->new($file) || die; -$p->get_tag("a"); -my $atext = $p->get_text; -undef($p); - -is($atext, "Perl\240Institute"); - -# test parsing of embeded document -$p = HTML::TokeParser->new(\<<HTML); -<title>Title</title> -<H1> -Heading -</h1> -HTML - -ok($p->get_tag("h1")); -is($p->get_trimmed_text, "Heading"); -undef($p); - -# test parsing of large embedded documents -my $doc = "<a href='foo'>foo is bar</a>\n\n\n" x 2022; - -#use Time::HiRes qw(time); -my $start = time; -$p = HTML::TokeParser->new(\$doc); -#diag "Construction time: ", time - $start; - -my $count; -while (my $t = $p->get_token) { - $count++ if $t->[0] eq "S"; -} -#diag "Parse time: ", time - $start; - -is($count, 2022); - -$p = HTML::TokeParser->new(\<<'EOT'); -<H1>This is a heading</H1> -This is s<b>o</b>me<hr>text. -<br /> -This is some more text. -<p> -This is even some more. -EOT - -$p->get_tag("/h1"); - -my $t = $p->get_trimmed_text("br", "p"); -is($t, "This is some text."); - -$p->get_tag; - -$t = $p->get_trimmed_text("br", "p"); -is($t,"This is some more text."); - -undef($p); - -$p = HTML::TokeParser->new(\<<'EOT'); -<H1>This is a <b>bold</b> heading</H1> -This is some <i>italic</i> text.<br />This is some <span id=x>more text</span>. -<p> -This is even some more. -EOT - -$p->get_tag("h1"); - -$t = $p->get_phrase; -is($t, "This is a bold heading"); - -$t = $p->get_phrase; -is($t, ""); - -$p->get_tag; - -$t = $p->get_phrase; -is($t, "This is some italic text. This is some more text."); - -undef($p); diff --git a/ext/HTML/Parser/t/uentities.t b/ext/HTML/Parser/t/uentities.t deleted file mode 100644 index b9decc54a0..0000000000 --- a/ext/HTML/Parser/t/uentities.t +++ /dev/null @@ -1,67 +0,0 @@ -# Test Unicode entities - -use HTML::Entities; - -use Test::More tests => 27; - -SKIP: { -skip "This perl does not support Unicode or Unicode entities not selected", - 27 if $] < 5.008 || !&HTML::Entities::UNICODE_SUPPORT; - -is(decode_entities("&euro"), "&euro"); -is(decode_entities("€"), "\x{20AC}"); - -is(decode_entities("å"), "å"); -is(decode_entities("å"), "å"); - -is(decode_entities("񺄠"), chr(500000)); - -is(decode_entities("􏿽"), "\x{10FFFD}"); - -is(decode_entities(""), "\x{FFFC}"); - - -is(decode_entities(""), "\x{FFFD}"); -is(decode_entities(""), "\x{FFFD}"); -is(decode_entities(""), "\x{FFFD}"); -is(decode_entities(""), "\x{FFFD}"); -is(decode_entities(""), "\x{FFFD}"); -is(decode_entities(""), "\x{FFFD}"); -is(decode_entities("�"), chr(0xFFFD)); -is(decode_entities("�"), chr(0xFFFD)); - -is(decode_entities("�"), "\0"); -is(decode_entities("�"), "\0"); -is(decode_entities("�"), "\0"); -is(decode_entities("�"), "\0"); - -is(decode_entities("&#ååå࿿"), "&#ååå\x{FFF}"); - -# This might fail when we get more than 64 bit UVs -is(decode_entities("�"), "�"); -is(decode_entities("�"), "�"); - -my $err; -for ([32, 48], [120, 169], [240, 250], [250, 260], [965, 975], [3000, 3005]) { - my $a = join("", map chr, $_->[0] .. $_->[1]); - - my $e = encode_entities($a); - my $d = decode_entities($e); - - unless ($d eq $a) { - diag "Wrong decoding in range $_->[0] .. $_->[1]"; - # use Devel::Peek; Dump($a); Dump($d); - $err++; - } -} -ok(!$err); - - -is(decode_entities("��"), chr(0x100085)); - -is(decode_entities("��"), chr(0x100085)); - -is(decode_entities("�"), chr(0xFFFD)); - -is(decode_entities("\260’\260"), "\x{b0}\x{2019}\x{b0}"); -} diff --git a/ext/HTML/Parser/t/unbroken-text.t b/ext/HTML/Parser/t/unbroken-text.t deleted file mode 100644 index 7de85a9880..0000000000 --- a/ext/HTML/Parser/t/unbroken-text.t +++ /dev/null @@ -1,60 +0,0 @@ -use strict; -use HTML::Parser; - -use Test::More tests => 3; - -my $text = ""; -sub text -{ - my $cdata = shift() ? "CDATA" : "TEXT"; - my($offset, $line, $col, $t) = @_; - $text .= "[$cdata:$offset:$line.$col:$t]"; -} - -sub tag -{ - $text .= shift; -} - -my $p = HTML::Parser->new(unbroken_text => 1, - text_h => [\&text, "is_cdata,offset,line,column,text"], - start_h => [\&tag, "text"], - end_h => [\&tag, "text"], - ); - -$p->parse("foo "); -$p->parse("bar "); -$p->parse("<foo>"); -$p->parse("bar\n"); -$p->parse("</foo>"); -$p->parse("<xmp>xmp</xmp>"); -$p->parse("atend"); - -#diag $text; -is($text, "[TEXT:0:1.0:foo bar ]<foo>[TEXT:13:1.13:bar\n]</foo><xmp>[CDATA:28:2.11:xmp]</xmp>"); - -$text = ""; -$p->eof; - -#diag $text; -is($text, "[TEXT:37:2.20:atend]"); - - -$p = HTML::Parser->new(unbroken_text => 1, - text_h => [\&text, "is_cdata,offset,line,column,text"], - ); - -$text = ""; -$p->parse("foo"); -$p->parse("<foo"); -$p->parse(">bar\n"); -$p->parse("foo<xm"); -$p->parse("p>xmp"); -$p->parse("</xmp"); -$p->parse(">bar"); -$p->eof; - -#diag $text; -is($text, "[TEXT:0:1.0:foobar\nfoo][CDATA:20:2.8:xmp][TEXT:29:2.17:bar]"); - - diff --git a/ext/HTML/Parser/t/unicode-bom.t b/ext/HTML/Parser/t/unicode-bom.t deleted file mode 100644 index 34e066fb6c..0000000000 --- a/ext/HTML/Parser/t/unicode-bom.t +++ /dev/null @@ -1,59 +0,0 @@ -#!perl -w - -use strict; -use Test::More tests => 2; -use HTML::Parser; - -SKIP: { -skip "This perl does not support Unicode", 2 if $] < 5.008; - -my @parsed; -my $p = HTML::Parser->new( - api_version => 3, - start_h => [\@parsed, 'tag, attr'], -); - -my @warn; -$SIG{__WARN__} = sub { - push(@warn, $_[0]); -}; - -$p->parse("\xEF\xBB\xBF<head>Hi there</head>"); -$p->eof; - -#use Encode; -$p->parse("\xEF\xBB\xBF<head>Hi there</head>" . chr(0x263A)); -$p->eof; - -$p->parse("\xFF\xFE<head>Hi there</head>"); -$p->eof; - -$p->parse("\xFE\xFF<head>Hi there</head>"); -$p->eof; - -$p->parse("\0\0\xFF\xFE<head>Hi there</head>"); -$p->eof; - -$p->parse("\xFE\xFF\0\0<head>Hi there</head>"); -$p->eof; - -is(join("", @warn), <<EOT); -Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line 21. -Parsing of undecoded UTF-8 will give garbage when decoding entities at $0 line 25. -Parsing of undecoded UTF-16 at $0 line 28. -Parsing of undecoded UTF-16 at $0 line 31. -Parsing of undecoded UTF-32 at $0 line 34. -Parsing of undecoded UTF-32 at $0 line 37. -EOT - -@warn = (); - -$p = HTML::Parser->new( - api_version => 3, - start_h => [\@parsed, 'tag'], -); - -$p->parse("\xEF\xBB\xBF<head>Hi there</head>"); -$p->eof; -ok(!@warn); -} diff --git a/ext/HTML/Parser/t/unicode.t b/ext/HTML/Parser/t/unicode.t deleted file mode 100644 index 82902dec8b..0000000000 --- a/ext/HTML/Parser/t/unicode.t +++ /dev/null @@ -1,183 +0,0 @@ -#!perl -w - -use strict; -use HTML::Parser; -use Test::More tests => 103; - -SKIP: { -skip "This perl does not support Unicode", 103 if $] < 5.008; - -my @warn; -$SIG{__WARN__} = sub { - push(@warn, $_[0]); -}; - -my @parsed; -my $p = HTML::Parser->new( - api_version => 3, - default_h => [\@parsed, 'event, text, dtext, offset, length, offset_end, column, tokenpos, attr'], -); - -my $doc = "<title>\x{263A}</title><h1 id=\x{2600} f>Smile ☺</h1>\x{0420}"; -is(length($doc), 46); - -$p->parse($doc)->eof; - -#use Data::Dump; Data::Dump::dump(@parsed); - -is(@parsed, 9); -is($parsed[0][0], "start_document"); - -is($parsed[1][0], "start"); -is($parsed[1][1], "<title>"); -SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(utf8::is_utf8($parsed[1][1]), "is_utf8") }; -is($parsed[1][3], 0); -is($parsed[1][4], 7); - -is($parsed[2][0], "text"); -is(ord($parsed[2][1]), 0x263A); -is($parsed[2][2], chr(0x263A)); -is($parsed[2][3], 7); -is($parsed[2][4], 1); -is($parsed[2][5], 8); -is($parsed[2][6], 7); - -is($parsed[3][0], "end"); -is($parsed[3][1], "</title>"); -is($parsed[3][3], 8); -is($parsed[3][6], 8); - -is($parsed[4][0], "start"); -is($parsed[4][1], "<h1 id=\x{2600} f>"); -is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|1|9|1|0|0"); -is($parsed[4][8]{id}, "\x{2600}"); - -is($parsed[5][0], "text"); -is($parsed[5][1], "Smile ☺"); -is($parsed[5][2], "Smile \x{263A}"); - -is($parsed[7][0], "text"); -is($parsed[7][1], "\x{0420}"); -is($parsed[7][2], "\x{0420}"); - -is($parsed[8][0], "end_document"); -is($parsed[8][3], length($doc)); -is($parsed[8][5], length($doc)); -is($parsed[8][6], length($doc)); -is(@warn, 0); - -# Try to parse it as an UTF8 encoded string -utf8::encode($doc); -is(length($doc), 51); - -@parsed = (); -$p->parse($doc)->eof; - -#use Data::Dump; Data::Dump::dump(@parsed); - -is(@parsed, 9); -is($parsed[0][0], "start_document"); - -is($parsed[1][0], "start"); -is($parsed[1][1], "<title>"); -SKIP: { skip "no utf8::is_utf8", 1 if !defined(&utf8::is_utf8); ok(!utf8::is_utf8($parsed[1][1]), "!is_utf8") }; -is($parsed[1][3], 0); -is($parsed[1][4], 7); - -is($parsed[2][0], "text"); -is(ord($parsed[2][1]), 226); -is($parsed[2][1], "\xE2\x98\xBA"); -is($parsed[2][2], "\xE2\x98\xBA"); -is($parsed[2][3], 7); -is($parsed[2][4], 3); -is($parsed[2][5], 10); -is($parsed[2][6], 7); - -is($parsed[3][0], "end"); -is($parsed[3][1], "</title>"); -is($parsed[3][3], 10); -is($parsed[3][6], 10); - -is($parsed[4][0], "start"); -is($parsed[4][1], "<h1 id=\xE2\x98\x80 f>"); -is(join("|", @{$parsed[4][7]}), "1|2|4|2|7|3|11|1|0|0"); -is($parsed[4][8]{id}, "\xE2\x98\x80"); - -is($parsed[5][0], "text"); -is($parsed[5][1], "Smile ☺"); -is($parsed[5][2], "Smile \x{263A}"); - -is($parsed[8][0], "end_document"); -is($parsed[8][3], length($doc)); -is($parsed[8][5], length($doc)); -is($parsed[8][6], length($doc)); - -is(@warn, 1); -like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/); - -my $file = "test-$$.html"; -open(my $fh, ">:utf8", $file) || die; -print $fh <<EOT; -\x{FEFF} -<title>\x{263A} Love! </title> -<h1 id=♥\x{2665}>♥ Love \x{2665}<h1> -EOT -close($fh) || die; - -@warn = (); -@parsed = (); -$p->parse_file($file); -is(@parsed, "11"); -is($parsed[6][0], "start"); -is($parsed[6][8]{id}, "\x{2665}\xE2\x99\xA5"); -is($parsed[7][0], "text"); -is($parsed[7][1], "♥ Love \xE2\x99\xA5"); -is($parsed[7][2], "\x{2665} Love \xE2\x99\xA5"); # expected garbage -is($parsed[10][3], -s $file); -is(@warn, 1); -like($warn[0], qr/^Parsing of undecoded UTF-8 will give garbage when decoding entities/); - -@warn = (); -@parsed = (); -open($fh, "<:raw:utf8", $file) || die; -$p->parse_file($fh); -is(@parsed, "11"); -is($parsed[6][0], "start"); -is($parsed[6][8]{id}, "\x{2665}\x{2665}"); -is($parsed[7][0], "text"); -is($parsed[7][1], "♥ Love \x{2665}"); -is($parsed[7][2], "\x{2665} Love \x{2665}"); -is($parsed[10][3], (-s $file) - 2 * 4); -is(@warn, 0); - -@warn = (); -@parsed = (); -open($fh, "<:raw", $file) || die; -$p->utf8_mode(1); -$p->parse_file($fh); -is(@parsed, "11"); -is($parsed[6][0], "start"); -is($parsed[6][8]{id}, "\xE2\x99\xA5\xE2\x99\xA5"); -is($parsed[7][0], "text"); -is($parsed[7][1], "♥ Love \xE2\x99\xA5"); -is($parsed[7][2], "\xE2\x99\xA5 Love \xE2\x99\xA5"); -is($parsed[10][3], -s $file); -is(@warn, 0); - -unlink($file); - -@parsed = (); -$p->parse(q(<a href="a=1&lang=2×=3">foo</a>))->eof; -is(@parsed, "5"); -is($parsed[1][0], "start"); -is($parsed[1][8]{href}, "a=1&lang=2\xd7=3"); - -ok(!HTML::Entities::_probably_utf8_chunk("")); -ok(!HTML::Entities::_probably_utf8_chunk("f")); -ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5")); -ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o")); -ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2")); -ok(HTML::Entities::_probably_utf8_chunk("f\xE2\x99\xA5o\xE2\x99")); -ok(!HTML::Entities::_probably_utf8_chunk("f\xE2")); -ok(!HTML::Entities::_probably_utf8_chunk("f\xE2\x99")); -} diff --git a/ext/HTML/Parser/t/xml-mode.t b/ext/HTML/Parser/t/xml-mode.t deleted file mode 100644 index cdfc5b0ca0..0000000000 --- a/ext/HTML/Parser/t/xml-mode.t +++ /dev/null @@ -1,112 +0,0 @@ -use strict; -use Test::More tests => 8; - -use HTML::Parser (); -my $p = HTML::Parser->new(xml_mode => 1, - ); - -my $text = ""; -$p->handler(start => - sub { - my($tag, $attr) = @_; - $text .= "S[$tag"; - for my $k (sort keys %$attr) { - my $v = $attr->{$k}; - $text .= " $k=$v"; - } - $text .= "]"; - }, "tagname,attr"); -$p->handler(end => - sub { - $text .= "E[" . shift() . "]"; - }, "tagname"); -$p->handler(process => - sub { - $text .= "PI[" . shift() . "]"; - }, "token0"); -$p->handler(text => - sub { - $text .= shift; - }, "text"); - -my $xml = <<'EOT'; -<?xml version="1.0"?> -<?IS10744:arch name="html"?><!-- comment --> -<DOC> -<title html="h1">My first architectual document</title> -<author html="address">Geir Ove Gronmo, grove@infotek.no</author> -<para>This is the first paragraph in this document</para> -<para html="p">This is the second paragraph</para> -<para/> -<xmp><foo></foo></xmp> -</DOC> -EOT - -$p->parse($xml)->eof; - -is($text, <<'EOT'); -PI[xml version="1.0"] -PI[IS10744:arch name="html"] -S[DOC] -S[title html=h1]My first architectual documentE[title] -S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author] -S[para]This is the first paragraph in this documentE[para] -S[para html=p]This is the second paragraphE[para] -S[para]E[para] -S[xmp]S[foo]E[foo]E[xmp] -E[DOC] -EOT - -$text = ""; -$p->xml_mode(0); -$p->parse($xml)->eof; - -is($text, <<'EOT'); -PI[xml version="1.0"?] -PI[IS10744:arch name="html"?] -S[doc] -S[title html=h1]My first architectual documentE[title] -S[author html=address]Geir Ove Gronmo, grove@infotek.noE[author] -S[para]This is the first paragraph in this documentE[para] -S[para html=p]This is the second paragraphE[para] -S[para/] -S[xmp]<foo></foo>E[xmp] -E[doc] -EOT - -# Test that we get an empty tag back -$p = HTML::Parser->new(api_version => 3, - xml_mode => 1); - -$p->handler("end" => - sub { - my($tagname, $text) = @_; - is($tagname, "Xyzzy"); - ok(!length($text)); - }, "tagname,text"); -$p->parse("<Xyzzy foo=bar/>and some more")->eof; - -# Test that we get an empty tag back -$p = HTML::Parser->new(api_version => 3, - empty_element_tags => 1); - -$p->handler("end" => - sub { - my($tagname, $text) = @_; - is($tagname, "xyzzy"); - ok(!length($text)); - }, "tagname,text"); -$p->parse("<Xyzzy foo=bar/>and some more")->eof; - -$p = HTML::Parser->new( - api_version => 3, - xml_pic => 1, -); - -$p->handler( - "process" => sub { - my($text, $t0) = @_; - is($text, "<?foo > bar?>"); - is($t0, "foo > bar"); - }, "text, token0"); -$p->parse("<?foo > bar?> and then")->eof; diff --git a/ext/HTML/Parser/tokenpos.h b/ext/HTML/Parser/tokenpos.h deleted file mode 100644 index aa971bfd4c..0000000000 --- a/ext/HTML/Parser/tokenpos.h +++ /dev/null @@ -1,49 +0,0 @@ -struct token_pos -{ - char *beg; - char *end; -}; -typedef struct token_pos token_pos_t; - -#define dTOKENS(init_lim) \ - token_pos_t token_buf[init_lim]; \ - int token_lim = init_lim; \ - token_pos_t *tokens = token_buf; \ - int num_tokens = 0 - -#define PUSH_TOKEN(p_beg, p_end) \ - STMT_START { \ - ++num_tokens; \ - if (num_tokens == token_lim) \ - tokens_grow(&tokens, &token_lim, (bool)(tokens != token_buf)); \ - tokens[num_tokens-1].beg = p_beg; \ - tokens[num_tokens-1].end = p_end; \ - } STMT_END - -#define FREE_TOKENS \ - STMT_START { \ - if (tokens != token_buf) \ - Safefree(tokens); \ - } STMT_END - -static void -tokens_grow(token_pos_t **token_ptr, int *token_lim_ptr, bool tokens_on_heap) -{ - int new_lim = *token_lim_ptr; - if (new_lim < 4) - new_lim = 4; - new_lim *= 2; - - if (tokens_on_heap) { - Renew(*token_ptr, new_lim, token_pos_t); - } - else { - token_pos_t *new_tokens; - int i; - New(57, new_tokens, new_lim, token_pos_t); - for (i = 0; i < *token_lim_ptr; i++) - new_tokens[i] = (*token_ptr)[i]; - *token_ptr = new_tokens; - } - *token_lim_ptr = new_lim; -} diff --git a/ext/HTML/Parser/typemap b/ext/HTML/Parser/typemap deleted file mode 100644 index a3238548a9..0000000000 --- a/ext/HTML/Parser/typemap +++ /dev/null @@ -1,5 +0,0 @@ -PSTATE* T_PSTATE - -INPUT -T_PSTATE - $var = get_pstate_hv(aTHX_ $arg) diff --git a/ext/HTML/Parser/util.c b/ext/HTML/Parser/util.c deleted file mode 100644 index 7e626bf714..0000000000 --- a/ext/HTML/Parser/util.c +++ /dev/null @@ -1,312 +0,0 @@ -/* $Id: util.c,v 2.30 2006/03/22 09:15:17 gisle Exp $ - * - * Copyright 1999-2006, Gisle Aas. - * - * This library is free software; you can redistribute it and/or - * modify it under the same terms as Perl itself. - */ - -#ifndef EXTERN -#define EXTERN extern -#endif - - -EXTERN SV* -sv_lower(pTHX_ SV* sv) -{ - STRLEN len; - char *s = SvPV_force(sv, len); - for (; len--; s++) - *s = toLOWER(*s); - return sv; -} - -EXTERN int -strnEQx(const char* s1, const char* s2, STRLEN n, int ignore_case) -{ - while (n--) { - if (ignore_case) { - if (toLOWER(*s1) != toLOWER(*s2)) - return 0; - } - else { - if (*s1 != *s2) - return 0; - } - s1++; - s2++; - } - return 1; -} - -static void -grow_gap(pTHX_ SV* sv, STRLEN grow, char** t, char** s, char** e) -{ - /* - SvPVX ---> AAAAAA...BBBBBB - ^ ^ ^ - t s e - */ - STRLEN t_offset = *t - SvPVX(sv); - STRLEN s_offset = *s - SvPVX(sv); - STRLEN e_offset = *e - SvPVX(sv); - - SvGROW(sv, e_offset + grow + 1); - - *t = SvPVX(sv) + t_offset; - *s = SvPVX(sv) + s_offset; - *e = SvPVX(sv) + e_offset; - - Move(*s, *s+grow, *e - *s, char); - *s += grow; - *e += grow; -} - -EXTERN SV* -decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) -{ - STRLEN len; - char *s = SvPV_force(sv, len); - char *t = s; - char *end = s + len; - char *ent_start; - - char *repl; - STRLEN repl_len; -#ifdef UNICODE_HTML_PARSER - char buf[UTF8_MAXLEN]; - int repl_utf8; - int high_surrogate = 0; -#else - char buf[1]; -#endif - -#if defined(__GNUC__) && defined(UNICODE_HTML_PARSER) - /* gcc -Wall reports this variable as possibly used uninitialized */ - repl_utf8 = 0; -#endif - - while (s < end) { - assert(t <= s); - - if ((*t++ = *s++) != '&') - continue; - - ent_start = s; - repl = 0; - - if (*s == '#') { - UV num = 0; - UV prev = 0; - int ok = 0; - s++; - if (*s == 'x' || *s == 'X') { - s++; - while (*s) { - char *tmp = strchr(PL_hexdigit, *s); - if (!tmp) - break; - num = num << 4 | ((tmp - PL_hexdigit) & 15); - if (prev && num <= prev) { - /* overflow */ - ok = 0; - break; - } - prev = num; - s++; - ok = 1; - } - } - else { - while (isDIGIT(*s)) { - num = num * 10 + (*s - '0'); - if (prev && num < prev) { - /* overflow */ - ok = 0; - break; - } - prev = num; - s++; - ok = 1; - } - } - if (ok) { -#ifdef UNICODE_HTML_PARSER - if (!SvUTF8(sv) && num <= 255) { - buf[0] = (char) num; - repl = buf; - repl_len = 1; - repl_utf8 = 0; - } - else { - char *tmp; - if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */ - if (high_surrogate != 0) { - t -= 3; /* Back up past 0xFFFD */ - num = ((high_surrogate - 0xD800) << 10) + - (num - 0xDC00) + 0x10000; - high_surrogate = 0; - } else { - num = 0xFFFD; - } - } - else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */ - high_surrogate = num; - num = 0xFFFD; - } - else { - high_surrogate = 0; - /* otherwise invalid? */ - if ((num >= 0xFDD0 && num <= 0xFDEF) || - ((num & 0xFFFE) == 0xFFFE) || - num > 0x10FFFF) - { - num = 0xFFFD; - } - } - - tmp = (char*)uvuni_to_utf8((U8*)buf, num); - repl = buf; - repl_len = tmp - buf; - repl_utf8 = 1; - } -#else - if (num <= 255) { - buf[0] = (char) num & 0xFF; - repl = buf; - repl_len = 1; - } -#endif - } - } - else { - char *ent_name = s; - while (isALNUM(*s)) - s++; - if (ent_name != s && entity2char) { - SV** svp; - if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) || - (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0))) - ) - { - repl = SvPV(*svp, repl_len); -#ifdef UNICODE_HTML_PARSER - repl_utf8 = SvUTF8(*svp); -#endif - } - else if (expand_prefix) { - char *ss = s - 1; - while (ss > ent_name) { - svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0); - if (svp) { - repl = SvPV(*svp, repl_len); -#ifdef UNICODE_HTML_PARSER - repl_utf8 = SvUTF8(*svp); -#endif - s = ss; - break; - } - ss--; - } - } - } -#ifdef UNICODE_HTML_PARSER - high_surrogate = 0; -#endif - } - - if (repl) { - char *repl_allocated = 0; - if (*s == ';') - s++; - t--; /* '&' already copied, undo it */ - -#ifdef UNICODE_HTML_PARSER - if (*s != '&') { - high_surrogate = 0; - } - - if (!SvUTF8(sv) && repl_utf8) { - /* need to upgrade sv before we continue */ - STRLEN before_gap_len = t - SvPVX(sv); - char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len); - STRLEN after_gap_len = end - s; - char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len); - - sv_setpvn(sv, before_gap, before_gap_len); - sv_catpvn(sv, after_gap, after_gap_len); - SvUTF8_on(sv); - - Safefree(before_gap); - Safefree(after_gap); - - s = t = SvPVX(sv) + before_gap_len; - end = SvPVX(sv) + before_gap_len + after_gap_len; - } - else if (SvUTF8(sv) && !repl_utf8) { - repl = (char*)bytes_to_utf8((U8*)repl, &repl_len); - repl_allocated = repl; - } -#endif - - if (t + repl_len > s) { - /* need to grow the string */ - grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end); - } - - /* copy replacement string into string */ - while (repl_len--) - *t++ = *repl++; - - if (repl_allocated) - Safefree(repl_allocated); - } - else { - while (ent_start < s) - *t++ = *ent_start++; - } - } - - *t = '\0'; - SvCUR_set(sv, t - SvPVX(sv)); - - return sv; -} - -#ifdef UNICODE_HTML_PARSER -static bool -has_hibit(char *s, char *e) -{ - while (s < e) { - U8 ch = *s++; - if (!UTF8_IS_INVARIANT(ch)) { - return 1; - } - } - return 0; -} - - -EXTERN bool -probably_utf8_chunk(pTHX_ char *s, STRLEN len) -{ - char *e = s + len; - STRLEN clen; - - /* ignore partial utf8 char at end of buffer */ - while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1))) - e--; - if (s < e && UTF8_IS_START((U8)*(e - 1))) - e--; - clen = len - (e - s); - if (clen && UTF8SKIP(e) == clen) { - /* all promised continuation bytes are present */ - e = s + len; - } - - if (!has_hibit(s, e)) - return 0; - - return is_utf8_string((U8*)s, e - s); -} -#endif |