summaryrefslogtreecommitdiff
path: root/XPath.pm
diff options
context:
space:
mode:
Diffstat (limited to 'XPath.pm')
-rw-r--r--XPath.pm553
1 files changed, 553 insertions, 0 deletions
diff --git a/XPath.pm b/XPath.pm
new file mode 100644
index 0000000..579d2cd
--- /dev/null
+++ b/XPath.pm
@@ -0,0 +1,553 @@
+# $Id: XPath.pm,v 1.56 2003/01/26 19:33:17 matt Exp $
+
+package XML::XPath;
+
+use strict;
+use vars qw($VERSION $AUTOLOAD $revision);
+
+$VERSION = '1.13';
+
+$XML::XPath::Namespaces = 1;
+$XML::XPath::Debug = 0;
+
+use XML::XPath::XMLParser;
+use XML::XPath::Parser;
+use IO::File;
+
+# For testing
+#use Data::Dumper;
+#$Data::Dumper::Indent = 1;
+
+# Parameters for new()
+my @options = qw(
+ filename
+ parser
+ xml
+ ioref
+ context
+ );
+
+sub new {
+ my $proto = shift;
+ my $class = ref($proto) || $proto;
+
+ my(%args);
+ # Try to figure out what the user passed
+ if ($#_ == 0) { # passed a scalar
+ my $string = $_[0];
+ if ($string =~ m{<.*?>}s) { # it's an XML string
+ $args{'xml'} = $string;
+ } elsif (ref($string)) { # read XML from file handle
+ $args{'ioref'} = $string;
+ } elsif ($string eq '-') { # read XML from stdin
+ $args{'ioref'} = IO::File->new($string);
+ } else { # read XML from a file
+ $args{'filename'} = $string;
+ }
+ } else { # passed a hash or hash reference
+ # just pass the parameters on to the XPath constructor
+ %args = ((ref($_[0]) eq "HASH") ? %{$_[0]} : @_);
+ }
+
+ if ($args{filename} && (!-e $args{filename} || !-r $args{filename})) {
+ die "Cannot open file '$args{filename}'";
+ }
+ my %hash = map(( "_$_" => $args{$_} ), @options);
+ $hash{path_parser} = XML::XPath::Parser->new();
+ return bless \%hash, $class;
+}
+
+sub find {
+ my $self = shift;
+ my $path = shift;
+ my $context = shift;
+ die "No path to find" unless $path;
+
+ if (!defined $context) {
+ $context = $self->get_context;
+ }
+ if (!defined $context) {
+ # Still no context? Need to parse...
+ my $parser = XML::XPath::XMLParser->new(
+ filename => $self->get_filename,
+ xml => $self->get_xml,
+ ioref => $self->get_ioref,
+ parser => $self->get_parser,
+ );
+ $context = $parser->parse;
+ $self->set_context($context);
+# warn "CONTEXT:\n", Data::Dumper->Dumpxs([$context], ['context']);
+ }
+
+ my $parsed_path = $self->{path_parser}->parse($path);
+# warn "\n\nPATH: ", $parsed_path->as_string, "\n\n";
+
+# warn "evaluating path\n";
+ return $parsed_path->evaluate($context);
+}
+
+# sub memsize {
+# print STDERR @_, "\t";
+# open(FH, '/proc/self/status');
+# while(<FH>) {
+# print STDERR $_ if /^VmSize/;
+# }
+# close FH;
+# }
+#
+sub findnodes {
+ my $self = shift;
+ my ($path, $context) = @_;
+
+ my $results = $self->find($path, $context);
+
+ if ($results->isa('XML::XPath::NodeSet')) {
+ return wantarray ? $results->get_nodelist : $results;
+# return $results->get_nodelist;
+ }
+
+# warn("findnodes returned a ", ref($results), " object\n") if $XML::XPath::Debug;
+ return wantarray ? () : XML::XPath::NodeSet->new();
+}
+
+sub matches {
+ my $self = shift;
+ my ($node, $path, $context) = @_;
+
+ my @nodes = $self->findnodes($path, $context);
+
+ if (grep { "$node" eq "$_" } @nodes) {
+ return 1;
+ }
+ return;
+}
+
+sub findnodes_as_string {
+ my $self = shift;
+ my ($path, $context) = @_;
+
+ my $results = $self->find($path, $context);
+
+ if ($results->isa('XML::XPath::NodeSet')) {
+ return join('', map { $_->toString } $results->get_nodelist);
+ }
+ elsif ($results->isa('XML::XPath::Node')) {
+ return $results->toString;
+ }
+ else {
+ return XML::XPath::Node::XMLescape($results->value);
+ }
+}
+
+sub findvalue {
+ my $self = shift;
+ my ($path, $context) = @_;
+
+ my $results = $self->find($path, $context);
+
+ if ($results->isa('XML::XPath::NodeSet')) {
+ return $results->to_literal;
+ }
+
+ return $results;
+}
+
+sub exists
+{
+ my $self = shift;
+ my ($path, $context) = @_;
+ $path = '/' if (!defined $path);
+ my @nodeset = $self->findnodes($path, $context);
+ return 1 if (scalar( @nodeset ));
+ return 0;
+}
+
+sub getNodeAsXML {
+ my $self = shift;
+ my $node_path = shift;
+ $node_path = '/' if (!defined $node_path);
+ if (ref($node_path)) {
+ return $node_path->as_string();
+ } else {
+ return $self->findnodes_as_string($node_path);
+ }
+}
+
+sub getNodeText {
+ my $self = shift;
+ my $node_path = shift;
+ if (ref($node_path)) {
+ return $node_path->string_value();
+ } else {
+ return $self->findvalue($node_path);
+ }
+}
+
+sub setNodeText {
+ my $self = shift;
+ my($node_path, $new_text) = @_;
+ my $nodeset = $self->findnodes($node_path);
+ return undef if (!defined $nodeset); # could not find node
+ my @nodes = $nodeset->get_nodelist;
+ if ($#nodes < 0) {
+ if ($node_path =~ m|/@([^/]+)$|) {
+ # attribute not found, so try to create it
+ my $parent_path = $`;
+ my $attr = $1;
+ $nodeset = $self->findnodes($parent_path);
+ return undef if (!defined $nodeset); # could not find node
+ foreach my $node ($nodeset->get_nodelist) {
+ my $newnode = XML::XPath::Node::Attribute->new($attr, $new_text);
+ return undef if (!defined $newnode); # could not create new node
+ $node->appendAttribute($newnode);
+ }
+ } else {
+ return undef; # could not find node
+ }
+ }
+ foreach my $node (@nodes) {
+ if ($node->getNodeType == XML::XPath::Node::ATTRIBUTE_NODE) {
+ $node->setNodeValue($new_text);
+ } else {
+ foreach my $delnode ($node->getChildNodes()) {
+ $node->removeChild($delnode);
+ }
+ my $newnode = XML::XPath::Node::Text->new($new_text);
+ return undef if (!defined $newnode); # could not create new node
+ $node->appendChild($newnode);
+ }
+ }
+ return 1;
+}
+
+sub createNode {
+ my $self = shift;
+ my($node_path) = @_;
+ my $path_steps = $self->{path_parser}->parse($node_path);
+ my @path_steps = ();
+ foreach my $step (@{$path_steps->get_lhs()}) {
+ my $string = $step->as_string();
+ push(@path_steps, $string) if (defined $string && $string ne "");
+ }
+ my $prev_node = undef;
+ my $nodeset = undef;
+ my $nodes = undef;
+ my $p = undef;
+ my $test_path = "";
+ # Start with the deepest node, working up the path (right to left),
+ # trying to find a node that exists.
+ for ($p = $#path_steps; $p >= 0; $p--) {
+ my $path = $path_steps[$p];
+ $test_path = "(/" . join("/", @path_steps[0..$p]) . ")";
+ $nodeset = $self->findnodes($test_path);
+ return undef if (!defined $nodeset); # error looking for node
+ $nodes = $nodeset->size;
+ return undef if ($nodes > 1); # too many paths - path not specific enough
+ if ($nodes == 1) { # found a node -- need to create nodes below it
+ $prev_node = $nodeset->get_node(1);
+ last;
+ }
+ }
+ if (!defined $prev_node) {
+ my @root_nodes = $self->findnodes('/')->get_nodelist();
+ $prev_node = $root_nodes[0];
+ }
+ # We found a node that exists, or we'll start at the root.
+ # Create all lower nodes working left to right along the path.
+ for ($p++ ; $p <= $#path_steps; $p++) {
+ my $path = $path_steps[$p];
+ my $newnode = undef;
+ my($axis,$name) = ($path =~ /^(.*?)::(.*)$/);
+ if ($axis =~ /^child$/i) {
+ $newnode = XML::XPath::Node::Element->new($name);
+ return undef if (!defined $newnode); # could not create new node
+ $prev_node->appendChild($newnode);
+ } elsif ($axis =~ /^attribute$/i) {
+ $newnode = XML::XPath::Node::Attribute->new($name, "");
+ return undef if (!defined $newnode); # could not create new node
+ $prev_node->appendAttribute($newnode);
+ }
+ $prev_node = $newnode;
+ }
+ return $prev_node;
+}
+
+sub get_filename {
+ my $self = shift;
+ $self->{_filename};
+}
+
+sub set_filename {
+ my $self = shift;
+ $self->{_filename} = shift;
+}
+
+sub get_parser {
+ my $self = shift;
+ $self->{_parser};
+}
+
+sub set_parser {
+ my $self = shift;
+ $self->{_parser} = shift;
+}
+
+sub get_xml {
+ my $self = shift;
+ $self->{_xml};
+}
+
+sub set_xml {
+ my $self = shift;
+ $self->{_xml} = shift;
+}
+
+sub get_ioref {
+ my $self = shift;
+ $self->{_ioref};
+}
+
+sub set_ioref {
+ my $self = shift;
+ $self->{_ioref} = shift;
+}
+
+sub get_context {
+ my $self = shift;
+ $self->{_context};
+}
+
+sub set_context {
+ my $self = shift;
+ $self->{_context} = shift;
+}
+
+sub cleanup {
+ my $self = shift;
+ if ($XML::XPath::SafeMode) {
+ my $context = $self->get_context;
+ return unless $context;
+ $context->dispose;
+ }
+}
+
+sub set_namespace {
+ my $self = shift;
+ my ($prefix, $expanded) = @_;
+ $self->{path_parser}->set_namespace($prefix, $expanded);
+}
+
+sub clear_namespaces {
+ my $self = shift;
+ $self->{path_parser}->clear_namespaces();
+}
+
+1;
+__END__
+
+=head1 NAME
+
+XML::XPath - a set of modules for parsing and evaluating XPath statements
+
+=head1 DESCRIPTION
+
+This module aims to comply exactly to the XPath specification at
+http://www.w3.org/TR/xpath and yet allow extensions to be added in the
+form of functions. Modules such as XSLT and XPointer may need to do
+this as they support functionality beyond XPath.
+
+=head1 SYNOPSIS
+
+ use XML::XPath;
+ use XML::XPath::XMLParser;
+
+ my $xp = XML::XPath->new(filename => 'test.xhtml');
+
+ my $nodeset = $xp->find('/html/body/p'); # find all paragraphs
+
+ foreach my $node ($nodeset->get_nodelist) {
+ print "FOUND\n\n",
+ XML::XPath::XMLParser::as_string($node),
+ "\n\n";
+ }
+
+=head1 DETAILS
+
+There's an awful lot to all of this, so bear with it - if you stick it
+out it should be worth it. Please get a good understanding of XPath
+by reading the spec before asking me questions. All of the classes
+and parts herein are named to be synonimous with the names in the
+specification, so consult that if you don't understand why I'm doing
+something in the code.
+
+=head1 API
+
+The API of XML::XPath itself is extremely simple to allow you to get
+going almost immediately. The deeper API's are more complex, but you
+shouldn't have to touch most of that.
+
+=head2 new()
+
+This constructor follows the often seen named parameter method call.
+Parameters you can use are: filename, parser, xml, ioref and context.
+The filename parameter specifies an XML file to parse. The xml
+parameter specifies a string to parse, and the ioref parameter
+specifies an ioref to parse. The context option allows you to
+specify a context node. The context node has to be in the format
+of a node as specified in L<XML::XPath::XMLParser>. The 4 parameters
+filename, xml, ioref and context are mutually exclusive - you should
+only specify one (if you specify anything other than context, the
+context node is the root of your document).
+The parser option allows you to pass in an already prepared
+XML::Parser object, to save you having to create more than one
+in your application (if, for example, you're doing more than just XPath).
+
+ my $xp = XML::XPath->new( context => $node );
+
+It is very much recommended that you use only 1 XPath object throughout
+the life of your application. This is because the object (and it's sub-objects)
+maintain certain bits of state information that will be useful (such
+as XPath variables) to later calls to find(). It's also a good idea because
+you'll use less memory this way.
+
+=head2 I<nodeset> = find($path, [$context])
+
+The find function takes an XPath expression (a string) and returns either an
+XML::XPath::NodeSet object containing the nodes it found (or empty if
+no nodes matched the path), or one of XML::XPath::Literal (a string),
+XML::XPath::Number, or XML::XPath::Boolean. It should always return
+something - and you can use ->isa() to find out what it returned. If you
+need to check how many nodes it found you should check $nodeset->size.
+See L<XML::XPath::NodeSet>. An optional second parameter of a context
+node allows you to use this method repeatedly, for example XSLT needs
+to do this.
+
+=head2 findnodes($path, [$context])
+
+Returns a list of nodes found by $path, optionally in context $context.
+In scalar context returns an XML::XPath::NodeSet object.
+
+=head2 findnodes_as_string($path, [$context])
+
+Returns the nodes found reproduced as XML. The result is not guaranteed
+to be valid XML though.
+
+=head2 findvalue($path, [$context])
+
+Returns either a C<XML::XPath::Literal>, a C<XML::XPath::Boolean> or a
+C<XML::XPath::Number> object. If the path returns a NodeSet,
+$nodeset->to_literal is called automatically for you (and thus a
+C<XML::XPath::Literal> is returned). Note that
+for each of the objects stringification is overloaded, so you can just
+print the value found, or manipulate it in the ways you would a normal
+perl value (e.g. using regular expressions).
+
+=head2 exists($path, [$context])
+
+Returns true if the given path exists.
+
+=head2 matches($node, $path, [$context])
+
+Returns true if the node matches the path (optionally in context $context).
+
+=head2 getNodeText($path)
+
+Returns the text string for a particular XML node. Returns a string,
+or undef if the node doesn't exist.
+
+=head2 setNodeText($path, $text)
+
+Sets the text string for a particular XML node. The node can be an
+element or an attribute. If the node to be set is an attribute, and
+the attribute node does not exist, it will be created automatically.
+
+=head2 createNode($path)
+
+Creates the node matching the path given. If part of the path given, or
+all of the path do not exist, the necessary nodes will be created
+automatically.
+
+=head2 set_namespace($prefix, $uri)
+
+Sets the namespace prefix mapping to the uri.
+
+Normally in XML::XPath the prefixes in XPath node tests take their
+context from the current node. This means that foo:bar will always
+match an element <foo:bar> regardless of the namespace that the prefix
+foo is mapped to (which might even change within the document, resulting
+in unexpected results). In order to make prefixes in XPath node tests
+actually map to a real URI, you need to enable that via a call
+to the set_namespace method of your XML::XPath object.
+
+=head2 clear_namespaces()
+
+Clears all previously set namespace mappings.
+
+=head2 $XML::XPath::Namespaces
+
+Set this to 0 if you I<don't> want namespace processing to occur. This
+will make everything a little (tiny) bit faster, but you'll suffer for it,
+probably.
+
+=head1 Node Object Model
+
+See L<XML::XPath::Node>, L<XML::XPath::Node::Element>,
+L<XML::XPath::Node::Text>, L<XML::XPath::Node::Comment>,
+L<XML::XPath::Node::Attribute>, L<XML::XPath::Node::Namespace>,
+and L<XML::XPath::Node::PI>.
+
+=head1 On Garbage Collection
+
+XPath nodes work in a special way that allows circular references, and
+yet still lets Perl's reference counting garbage collector to clean up
+the nodes after use. This should be totally transparent to the user,
+with one caveat: B<If you free your tree before letting go of a sub-tree,
+consider that playing with fire and you may get burned>. What does this
+mean to the average user? Not much. Provided you don't free (or let go
+out of scope) either the tree you passed to XML::XPath->new, or if you
+didn't pass a tree, and passed a filename or IO-ref, then provided you
+don't let the XML::XPath object go out of scope before you let results
+of find() and its friends go out of scope, then you'll be fine. Even if
+you B<do> let the tree go out of scope before results, you'll probably
+still be fine. The only case where you may get stung is when the last
+part of your path/query is either an ancestor or parent axis. In that
+case the worst that will happen is you'll end up with a circular reference
+that won't get cleared until interpreter destruction time. You can get
+around that by explicitly calling $node->DESTROY on each of your result
+nodes, if you really need to do that.
+
+Mail me direct if that's not clear. Note that it's not doom and gloom. It's
+by no means perfect, but the worst that will happen is a long running process
+could leak memory. Most long running processes will therefore be able to
+explicitly be careful not to free the tree (or XML::XPath object) before
+freeing results. AxKit, an application that uses XML::XPath, does this and
+I didn't have to make any changes to the code - it's already sensible
+programming.
+
+If you I<really> don't want all this to happen, then set the variable
+$XML::XPath::SafeMode, and call $xp->cleanup() on the XML::XPath object
+when you're finished, or $tree->dispose() if you have a tree instead.
+
+=head1 Example
+
+Please see the test files in t/ for examples on how to use XPath.
+
+=head1 Support/Author
+
+This module is copyright 2000 AxKit.com Ltd. This is free
+software, and as such comes with NO WARRANTY. No dates are used in this
+module. You may distribute this module under the terms of either the
+Gnu GPL, or the Artistic License (the same terms as Perl itself).
+
+For support, please subscribe to the Perl-XML mailing list at the URL
+http://listserv.activestate.com/mailman/listinfo/perl-xml
+
+Matt Sergeant, matt@sergeant.org
+
+=head1 SEE ALSO
+
+L<XML::XPath::Literal>, L<XML::XPath::Boolean>, L<XML::XPath::Number>,
+L<XML::XPath::XMLParser>, L<XML::XPath::NodeSet>, L<XML::XPath::PerlSAX>,
+L<XML::XPath::Builder>.
+
+=cut