summaryrefslogtreecommitdiff
path: root/XPath/XMLParser.pm
diff options
context:
space:
mode:
Diffstat (limited to 'XPath/XMLParser.pm')
-rw-r--r--XPath/XMLParser.pm385
1 files changed, 385 insertions, 0 deletions
diff --git a/XPath/XMLParser.pm b/XPath/XMLParser.pm
new file mode 100644
index 0000000..78664d3
--- /dev/null
+++ b/XPath/XMLParser.pm
@@ -0,0 +1,385 @@
+# $Id: XMLParser.pm,v 1.49 2001/03/14 17:13:57 matt Exp $
+
+package XML::XPath::XMLParser;
+
+use strict;
+
+use XML::Parser;
+#use XML::XPath;
+use XML::XPath::Node;
+use XML::XPath::Node::Element;
+use XML::XPath::Node::Text;
+use XML::XPath::Node::Comment;
+use XML::XPath::Node::PI;
+use XML::XPath::Node::Attribute;
+use XML::XPath::Node::Namespace;
+
+my @options = qw(
+ filename
+ xml
+ parser
+ ioref
+ );
+
+my ($_current, $_namespaces_on);
+my %IdNames;
+
+use vars qw/$xmlns_ns $xml_ns/;
+
+$xmlns_ns = "http://www.w3.org/2000/xmlns/";
+$xml_ns = "http://www.w3.org/XML/1998/namespace";
+
+sub new {
+ my $proto = shift;
+ my $class = ref($proto) || $proto;
+ my %args = @_;
+ my %hash = map(( "_$_" => $args{$_} ), @options);
+ bless \%hash, $class;
+}
+
+sub parse {
+ my $self = shift;
+
+ $self->{IdNames} = {};
+ $self->{InScopeNamespaceStack} = [ {
+ '_Default' => undef,
+ 'xmlns' => $xmlns_ns,
+ 'xml' => $xml_ns,
+ } ];
+
+ $self->{NodeStack} = [ ];
+
+ $self->set_xml($_[0]) if $_[0];
+
+ my $parser = $self->get_parser || XML::Parser->new(
+ ErrorContext => 2,
+ ParseParamEnt => 1,
+ );
+
+ $parser->setHandlers(
+ Init => sub { $self->parse_init(@_) },
+ Char => sub { $self->parse_char(@_) },
+ Start => sub { $self->parse_start(@_) },
+ End => sub { $self->parse_end(@_) },
+ Final => sub { $self->parse_final(@_) },
+ Proc => sub { $self->parse_pi(@_) },
+ Comment => sub { $self->parse_comment(@_) },
+ Attlist => sub { $self->parse_attlist(@_) },
+ );
+
+ my $toparse;
+ if ($toparse = $self->get_filename) {
+ return $parser->parsefile($toparse);
+ }
+ else {
+ return $parser->parse($self->get_xml || $self->get_ioref);
+ }
+}
+
+sub parsefile {
+ my $self = shift;
+ my ($filename) = @_;
+ $self->set_filename($filename);
+ $self->parse;
+}
+
+sub parse_init {
+ my $self = shift;
+ my $e = shift;
+ my $document = XML::XPath::Node::Element->new();
+ my $newns = XML::XPath::Node::Namespace->new('xml', $xml_ns);
+ $document->appendNamespace($newns);
+ $self->{current} = $self->{DOC_Node} = $document;
+}
+
+sub parse_final {
+ my $self = shift;
+ return $self->{DOC_Node};
+}
+
+sub parse_char {
+ my $self = shift;
+ my $e = shift;
+ my $text = shift;
+
+ my $parent = $self->{current};
+
+ my $last = $parent->getLastChild;
+ if ($last && $last->isTextNode) {
+ # append to previous text node
+ $last->appendText($text);
+ return;
+ }
+
+ my $node = XML::XPath::Node::Text->new($text);
+ $parent->appendChild($node, 1);
+}
+
+sub parse_start {
+ my $self = shift;
+ my $e = shift;
+ my $tag = shift;
+
+ push @{ $self->{InScopeNamespaceStack} },
+ { %{ $self->{InScopeNamespaceStack}[-1] } };
+ $self->_scan_namespaces(@_);
+
+ my ($prefix, $namespace) = $self->_namespace($tag);
+
+ my $node = XML::XPath::Node::Element->new($tag, $prefix);
+
+ my @attributes;
+ for (my $ii = 0; $ii < $#_; $ii += 2) {
+ my ($name, $value) = ($_[$ii], $_[$ii+1]);
+ if ($name =~ /^xmlns(:(.*))?$/) {
+ # namespace node
+ my $prefix = $2 || '#default';
+# warn "Creating NS node: $prefix = $value\n";
+ my $newns = XML::XPath::Node::Namespace->new($prefix, $value);
+ $node->appendNamespace($newns);
+ }
+ else {
+ my ($prefix, $namespace) = $self->_namespace($name);
+ undef $namespace unless $prefix;
+
+ my $newattr = XML::XPath::Node::Attribute->new($name, $value, $prefix);
+ $node->appendAttribute($newattr, 1);
+ if (exists($self->{IdNames}{$tag}) && ($self->{IdNames}{$tag} eq $name)) {
+ # warn "appending Id Element: $val for ", $node->getName, "\n";
+ $self->{DOC_Node}->appendIdElement($value, $node);
+ }
+ }
+ }
+
+ $self->{current}->appendChild($node, 1);
+ $self->{current} = $node;
+}
+
+sub parse_end {
+ my $self = shift;
+ my $e = shift;
+ $self->{current} = $self->{current}->getParentNode;
+}
+
+sub parse_pi {
+ my $self = shift;
+ my $e = shift;
+ my ($target, $data) = @_;
+ my $node = XML::XPath::Node::PI->new($target, $data);
+ $self->{current}->appendChild($node, 1);
+}
+
+sub parse_comment {
+ my $self = shift;
+ my $e = shift;
+ my ($data) = @_;
+ my $node = XML::XPath::Node::Comment->new($data);
+ $self->{current}->appendChild($node, 1);
+}
+
+sub parse_attlist {
+ my $self = shift;
+ my $e = shift;
+ my ($elname, $attname, $type, $default, $fixed) = @_;
+ if ($type eq 'ID') {
+ $self->{IdNames}{$elname} = $attname;
+ }
+}
+
+sub _scan_namespaces {
+ my ($self, %attributes) = @_;
+
+ while (my ($attr_name, $value) = each %attributes) {
+ if ($attr_name eq 'xmlns') {
+ $self->{InScopeNamespaceStack}[-1]{'_Default'} = $value;
+ } elsif ($attr_name =~ /^xmlns:(.*)$/) {
+ my $prefix = $1;
+ $self->{InScopeNamespaceStack}[-1]{$prefix} = $value;
+ }
+ }
+}
+
+sub _namespace {
+ my ($self, $name) = @_;
+
+ my ($prefix, $localname) = split(/:/, $name);
+ if (!defined($localname)) {
+ if ($prefix eq 'xmlns') {
+ return '', undef;
+ } else {
+ return '', $self->{InScopeNamespaceStack}[-1]{'_Default'};
+ }
+ } else {
+ return $prefix, $self->{InScopeNamespaceStack}[-1]{$prefix};
+ }
+}
+
+sub as_string {
+ my $node = shift;
+ $node->toString;
+}
+
+sub get_parser { shift->{_parser}; }
+sub get_filename { shift->{_filename}; }
+sub get_xml { shift->{_xml}; }
+sub get_ioref { shift->{_ioref}; }
+
+sub set_parser { $_[0]->{_parser} = $_[1]; }
+sub set_filename { $_[0]->{_filename} = $_[1]; }
+sub set_xml { $_[0]->{_xml} = $_[1]; }
+sub set_ioref { $_[0]->{_ioref} = $_[1]; }
+
+1;
+
+__END__
+
+=head1 NAME
+
+XML::XPath::XMLParser - The default XML parsing class that produces a node tree
+
+=head1 SYNOPSIS
+
+ my $parser = XML::XPath::XMLParser->new(
+ filename => $self->get_filename,
+ xml => $self->get_xml,
+ ioref => $self->get_ioref,
+ parser => $self->get_parser,
+ );
+ my $root_node = $parser->parse;
+
+=head1 DESCRIPTION
+
+This module generates a node tree for use as the context node for XPath processing.
+It aims to be a quick parser, nothing fancy, and yet has to store more information
+than most parsers. To achieve this I've used array refs everywhere - no hashes.
+I don't have any performance figures for the speedups achieved, so I make no
+appologies for anyone not used to using arrays instead of hashes. I think they
+make good sense here where we know the attributes of each type of node.
+
+=head1 Node Structure
+
+All nodes have the same first 2 entries in the array: node_parent
+and node_pos. The type of the node is determined using the ref() function.
+The node_parent always contains an entry for the parent of the current
+node - except for the root node which has undef in there. And node_pos is the
+position of this node in the array that it is in (think:
+$node == $node->[node_parent]->[node_children]->[$node->[node_pos]] )
+
+Nodes are structured as follows:
+
+=head2 Root Node
+
+The root node is just an element node with no parent.
+
+ [
+ undef, # node_parent - check for undef to identify root node
+ undef, # node_pos
+ undef, # node_prefix
+ [ ... ], # node_children (see below)
+ ]
+
+=head2 Element Node
+
+ [
+ $parent, # node_parent
+ <position in current array>, # node_pos
+ 'xxx', # node_prefix - namespace prefix on this element
+ [ ... ], # node_children
+ 'yyy', # node_name - element tag name
+ [ ... ], # node_attribs - attributes on this element
+ [ ... ], # node_namespaces - namespaces currently in scope
+ ]
+
+=head2 Attribute Node
+
+ [
+ $parent, # node_parent - the element node
+ <position in current array>, # node_pos
+ 'xxx', # node_prefix - namespace prefix on this element
+ 'href', # node_key - attribute name
+ 'ftp://ftp.com/', # node_value - value in the node
+ ]
+
+=head2 Namespace Nodes
+
+Each element has an associated set of namespace nodes that are currently
+in scope. Each namespace node stores a prefix and the expanded name (retrieved
+from the xmlns:prefix="..." attribute).
+
+ [
+ $parent,
+ <pos>,
+ 'a', # node_prefix - the namespace as it was written as a prefix
+ 'http://my.namespace.com', # node_expanded - the expanded name.
+ ]
+
+=head2 Text Nodes
+
+ [
+ $parent,
+ <pos>,
+ 'This is some text' # node_text - the text in the node
+ ]
+
+=head2 Comment Nodes
+
+ [
+ $parent,
+ <pos>,
+ 'This is a comment' # node_comment
+ ]
+
+=head2 Processing Instruction Nodes
+
+ [
+ $parent,
+ <pos>,
+ 'target', # node_target
+ 'data', # node_data
+ ]
+
+=head1 Usage
+
+If you feel the need to use this module outside of XML::XPath (for example
+you might use this module directly so that you can cache parsed trees), you
+can follow the following API:
+
+=head2 new
+
+The new method takes either no parameters, or any of the following parameters:
+
+ filename
+ xml
+ parser
+ ioref
+
+This uses the familiar hash syntax, so an example might be:
+
+ use XML::XPath::XMLParser;
+
+ my $parser = XML::XPath::XMLParser->new(filename => 'example.xml');
+
+The parameters represent a filename, a string containing XML, an XML::Parser
+instance and an open filehandle ref respectively. You can also set or get all
+of these properties using the get_ and set_ functions that have the same
+name as the property: e.g. get_filename, set_ioref, etc.
+
+=head2 parse
+
+The parse method generally takes no parameters, however you are free to
+pass either an open filehandle reference or an XML string if you so require.
+The return value is a tree that XML::XPath can use. The parse method will
+die if there is an error in your XML, so be sure to use perl's exception
+handling mechanism (eval{};) if you want to avoid this.
+
+=head2 parsefile
+
+The parsefile method is identical to parse() except it expects a single
+parameter that is a string naming a file to open and parse. Again it
+returns a tree and also dies if there are XML errors.
+
+=head1 NOTICES
+
+This file is distributed as part of the XML::XPath module, and is copyright
+2000 Fastnet Software Ltd. Please see the documentation for the module as a
+whole for licencing information.