diff options
Diffstat (limited to 'XPath/Parser.pm')
-rw-r--r-- | XPath/Parser.pm | 821 |
1 files changed, 821 insertions, 0 deletions
diff --git a/XPath/Parser.pm b/XPath/Parser.pm new file mode 100644 index 0000000..ec69b0d --- /dev/null +++ b/XPath/Parser.pm @@ -0,0 +1,821 @@ +# $Id: Parser.pm,v 1.33 2001/11/26 17:41:18 matt Exp $ + +package XML::XPath::Parser; + +use strict; +use vars qw/ + $NCName + $QName + $NCWild + $QNWild + $NUMBER_RE + $NODE_TYPE + $AXIS_NAME + %AXES + $LITERAL + %CACHE/; + +use XML::XPath::XMLParser; +use XML::XPath::Step; +use XML::XPath::Expr; +use XML::XPath::Function; +use XML::XPath::LocationPath; +use XML::XPath::Variable; +use XML::XPath::Literal; +use XML::XPath::Number; +use XML::XPath::NodeSet; + +# Axis name to principal node type mapping +%AXES = ( + 'ancestor' => 'element', + 'ancestor-or-self' => 'element', + 'attribute' => 'attribute', + 'namespace' => 'namespace', + 'child' => 'element', + 'descendant' => 'element', + 'descendant-or-self' => 'element', + 'following' => 'element', + 'following-sibling' => 'element', + 'parent' => 'element', + 'preceding' => 'element', + 'preceding-sibling' => 'element', + 'self' => 'element', + ); + +$NCName = '([A-Za-z_][\w\\.\\-]*)'; +$QName = "($NCName:)?$NCName"; +$NCWild = "${NCName}:\\*"; +$QNWild = "\\*"; +$NODE_TYPE = '((text|comment|processing-instruction|node)\\(\\))'; +$AXIS_NAME = '(' . join('|', keys %AXES) . ')::'; +$NUMBER_RE = '\d+(\\.\d*)?|\\.\d+'; +$LITERAL = '\\"[^\\"]*\\"|\\\'[^\\\']*\\\''; + +sub new { + my $class = shift; + my $self = bless {}, $class; + debug("New Parser being created.\n"); + $self->{context_set} = XML::XPath::NodeSet->new(); + $self->{context_pos} = undef; # 1 based position in array context + $self->{context_size} = 0; # total size of context + $self->clear_namespaces(); + $self->{vars} = {}; + $self->{direction} = 'forward'; + $self->{cache} = {}; + return $self; +} + +sub get_var { + my $self = shift; + my $var = shift; + $self->{vars}->{$var}; +} + +sub set_var { + my $self = shift; + my $var = shift; + my $val = shift; + $self->{vars}->{$var} = $val; +} + +sub set_namespace { + my $self = shift; + my ($prefix, $expanded) = @_; + $self->{namespaces}{$prefix} = $expanded; +} + +sub clear_namespaces { + my $self = shift; + $self->{namespaces} = {}; +} + +sub get_namespace { + my $self = shift; + my ($prefix, $node) = @_; + if (my $ns = $self->{namespaces}{$prefix}) { + return $ns; + } + if (my $nsnode = $node->getNamespace($prefix)) { + return $nsnode->getValue(); + } +} + +sub get_context_set { $_[0]->{context_set}; } +sub set_context_set { $_[0]->{context_set} = $_[1]; } +sub get_context_pos { $_[0]->{context_pos}; } +sub set_context_pos { $_[0]->{context_pos} = $_[1]; } +sub get_context_size { $_[0]->{context_set}->size; } +sub get_context_node { $_[0]->{context_set}->get_node($_[0]->{context_pos}); } + +sub my_sub { + return (caller(1))[3]; +} + +sub parse { + my $self = shift; + my $path = shift; + if ($CACHE{$path}) { + return $CACHE{$path}; + } + my $tokens = $self->tokenize($path); + + $self->{_tokpos} = 0; + my $tree = $self->analyze($tokens); + + if ($self->{_tokpos} < scalar(@$tokens)) { + # didn't manage to parse entire expression - throw an exception + die "Parse of expression $path failed - junk after end of expression: $tokens->[$self->{_tokpos}]"; + } + + $CACHE{$path} = $tree; + + debug("PARSED Expr to:\n", $tree->as_string, "\n") if $XML::XPath::Debug; + + return $tree; +} + +sub tokenize { + my $self = shift; + my $path = shift; + study $path; + + my @tokens; + + debug("Parsing: $path\n"); + + # Bug: We don't allow "'@' NodeType" which is in the grammar, but I think is just plain stupid. + + while($path =~ m/\G + \s* # ignore all whitespace + ( # tokens + $LITERAL| + $NUMBER_RE| # Match digits + \.\.| # match parent + \.| # match current + ($AXIS_NAME)?$NODE_TYPE| # match tests + processing-instruction| + \@($NCWild|$QName|$QNWild)| # match attrib + \$$QName| # match variable reference + ($AXIS_NAME)?($NCWild|$QName|$QNWild)| # match NCName,NodeType,Axis::Test + \!=|<=|\-|>=|\/\/|and|or|mod|div| # multi-char seps + [,\+=\|<>\/\(\[\]\)]| # single char seps + (?<!(\@|\(|\[))\*| # multiply operator rules (see xpath spec) + (?<!::)\*| + $ # match end of query + ) + \s* # ignore all whitespace + /gcxso) { + + my ($token) = ($1); + + if (length($token)) { + debug("TOKEN: $token\n"); + push @tokens, $token; + } + + } + + if (pos($path) < length($path)) { + my $marker = ("." x (pos($path)-1)); + $path = substr($path, 0, pos($path) + 8) . "..."; + $path =~ s/\n/ /g; + $path =~ s/\t/ /g; + die "Query:\n", + "$path\n", + $marker, "^^^\n", + "Invalid query somewhere around here (I think)\n"; + } + + return \@tokens; +} + +sub analyze { + my $self = shift; + my $tokens = shift; + # lexical analysis + + return Expr($self, $tokens); +} + +sub match { + my ($self, $tokens, $match, $fatal) = @_; + + $self->{_curr_match} = ''; + return 0 unless $self->{_tokpos} < @$tokens; + + local $^W; + +# debug ("match: $match\n"); + + if ($tokens->[$self->{_tokpos}] =~ /^$match$/) { + $self->{_curr_match} = $tokens->[$self->{_tokpos}]; + $self->{_tokpos}++; + return 1; + } + else { + if ($fatal) { + die "Invalid token: ", $tokens->[$self->{_tokpos}], "\n"; + } + else { + return 0; + } + } +} + +sub Expr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + return OrExpr($self, $tokens); +} + +sub OrExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $expr = AndExpr($self, $tokens); + while (match($self, $tokens, 'or')) { + my $or_expr = XML::XPath::Expr->new($self); + $or_expr->set_lhs($expr); + $or_expr->set_op('or'); + + my $rhs = AndExpr($self, $tokens); + + $or_expr->set_rhs($rhs); + $expr = $or_expr; + } + + return $expr; +} + +sub AndExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $expr = EqualityExpr($self, $tokens); + while (match($self, $tokens, 'and')) { + my $and_expr = XML::XPath::Expr->new($self); + $and_expr->set_lhs($expr); + $and_expr->set_op('and'); + + my $rhs = EqualityExpr($self, $tokens); + + $and_expr->set_rhs($rhs); + $expr = $and_expr; + } + + return $expr; +} + +sub EqualityExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $expr = RelationalExpr($self, $tokens); + while (match($self, $tokens, '!?=')) { + my $eq_expr = XML::XPath::Expr->new($self); + $eq_expr->set_lhs($expr); + $eq_expr->set_op($self->{_curr_match}); + + my $rhs = RelationalExpr($self, $tokens); + + $eq_expr->set_rhs($rhs); + $expr = $eq_expr; + } + + return $expr; +} + +sub RelationalExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $expr = AdditiveExpr($self, $tokens); + while (match($self, $tokens, '(<|>|<=|>=)')) { + my $rel_expr = XML::XPath::Expr->new($self); + $rel_expr->set_lhs($expr); + $rel_expr->set_op($self->{_curr_match}); + + my $rhs = AdditiveExpr($self, $tokens); + + $rel_expr->set_rhs($rhs); + $expr = $rel_expr; + } + + return $expr; +} + +sub AdditiveExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $expr = MultiplicativeExpr($self, $tokens); + while (match($self, $tokens, '[\\+\\-]')) { + my $add_expr = XML::XPath::Expr->new($self); + $add_expr->set_lhs($expr); + $add_expr->set_op($self->{_curr_match}); + + my $rhs = MultiplicativeExpr($self, $tokens); + + $add_expr->set_rhs($rhs); + $expr = $add_expr; + } + + return $expr; +} + +sub MultiplicativeExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $expr = UnaryExpr($self, $tokens); + while (match($self, $tokens, '(\\*|div|mod)')) { + my $mult_expr = XML::XPath::Expr->new($self); + $mult_expr->set_lhs($expr); + $mult_expr->set_op($self->{_curr_match}); + + my $rhs = UnaryExpr($self, $tokens); + + $mult_expr->set_rhs($rhs); + $expr = $mult_expr; + } + + return $expr; +} + +sub UnaryExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + if (match($self, $tokens, '-')) { + my $expr = XML::XPath::Expr->new($self); + $expr->set_lhs(XML::XPath::Number->new(0)); + $expr->set_op('-'); + $expr->set_rhs(UnaryExpr($self, $tokens)); + return $expr; + } + else { + return UnionExpr($self, $tokens); + } +} + +sub UnionExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $expr = PathExpr($self, $tokens); + while (match($self, $tokens, '\\|')) { + my $un_expr = XML::XPath::Expr->new($self); + $un_expr->set_lhs($expr); + $un_expr->set_op('|'); + + my $rhs = PathExpr($self, $tokens); + + $un_expr->set_rhs($rhs); + $expr = $un_expr; + } + + return $expr; +} + +sub PathExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + # PathExpr is LocationPath | FilterExpr | FilterExpr '//?' RelativeLocationPath + + # Since we are being predictive we need to find out which function to call next, then. + + # LocationPath either starts with "/", "//", ".", ".." or a proper Step. + + my $expr = XML::XPath::Expr->new($self); + + my $test = $tokens->[$self->{_tokpos}]; + + # Test for AbsoluteLocationPath and AbbreviatedRelativeLocationPath + if ($test =~ /^(\/\/?|\.\.?)$/) { + # LocationPath + $expr->set_lhs(LocationPath($self, $tokens)); + } + # Test for AxisName::... + elsif (is_step($self, $tokens)) { + $expr->set_lhs(LocationPath($self, $tokens)); + } + else { + # Not a LocationPath + # Use FilterExpr instead: + + $expr = FilterExpr($self, $tokens); + if (match($self, $tokens, '//?')) { + my $loc_path = XML::XPath::LocationPath->new(); + push @$loc_path, $expr; + if ($self->{_curr_match} eq '//') { + push @$loc_path, XML::XPath::Step->new($self, 'descendant-or-self', + XML::XPath::Step::test_nt_node); + } + push @$loc_path, RelativeLocationPath($self, $tokens); + my $new_expr = XML::XPath::Expr->new($self); + $new_expr->set_lhs($loc_path); + return $new_expr; + } + } + + return $expr; +} + +sub FilterExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $expr = PrimaryExpr($self, $tokens); + while (match($self, $tokens, '\\[')) { + # really PredicateExpr... + $expr->push_predicate(Expr($self, $tokens)); + match($self, $tokens, '\\]', 1); + } + + return $expr; +} + +sub PrimaryExpr { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $expr = XML::XPath::Expr->new($self); + + if (match($self, $tokens, $LITERAL)) { + # new Literal with $self->{_curr_match}... + $self->{_curr_match} =~ m/^(["'])(.*)\1$/; + $expr->set_lhs(XML::XPath::Literal->new($2)); + } + elsif (match($self, $tokens, $NUMBER_RE)) { + # new Number with $self->{_curr_match}... + $expr->set_lhs(XML::XPath::Number->new($self->{_curr_match})); + } + elsif (match($self, $tokens, '\\(')) { + $expr->set_lhs(Expr($self, $tokens)); + match($self, $tokens, '\\)', 1); + } + elsif (match($self, $tokens, "\\\$$QName")) { + # new Variable with $self->{_curr_match}... + $self->{_curr_match} =~ /^\$(.*)$/; + $expr->set_lhs(XML::XPath::Variable->new($self, $1)); + } + elsif (match($self, $tokens, $QName)) { + # check match not Node_Type - done in lexer... + # new Function + my $func_name = $self->{_curr_match}; + match($self, $tokens, '\\(', 1); + $expr->set_lhs( + XML::XPath::Function->new( + $self, + $func_name, + Arguments($self, $tokens) + ) + ); + match($self, $tokens, '\\)', 1); + } + else { + die "Not a PrimaryExpr at ", $tokens->[$self->{_tokpos}], "\n"; + } + + return $expr; +} + +sub Arguments { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my @args; + + if($tokens->[$self->{_tokpos}] eq ')') { + return \@args; + } + + push @args, Expr($self, $tokens); + while (match($self, $tokens, ',')) { + push @args, Expr($self, $tokens); + } + + return \@args; +} + +sub LocationPath { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $loc_path = XML::XPath::LocationPath->new(); + + if (match($self, $tokens, '/')) { + # root + debug("SUB: Matched root\n"); + push @$loc_path, XML::XPath::Root->new(); + if (is_step($self, $tokens)) { + debug("Next is step\n"); + push @$loc_path, RelativeLocationPath($self, $tokens); + } + } + elsif (match($self, $tokens, '//')) { + # root + push @$loc_path, XML::XPath::Root->new(); + my $optimised = optimise_descendant_or_self($self, $tokens); + if (!$optimised) { + push @$loc_path, XML::XPath::Step->new($self, 'descendant-or-self', + XML::XPath::Step::test_nt_node); + push @$loc_path, RelativeLocationPath($self, $tokens); + } + else { + push @$loc_path, $optimised, RelativeLocationPath($self, $tokens); + } + } + else { + push @$loc_path, RelativeLocationPath($self, $tokens); + } + + return $loc_path; +} + +sub optimise_descendant_or_self { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my $tokpos = $self->{_tokpos}; + + # // must be followed by a Step. + if ($tokens->[$tokpos+1] && $tokens->[$tokpos+1] eq '[') { + # next token is a predicate + return; + } + elsif ($tokens->[$tokpos] =~ /^\.\.?$/) { + # abbreviatedStep - can't optimise. + return; + } + else { + debug("Trying to optimise //\n"); + my $step = Step($self, $tokens); + if ($step->{axis} ne 'child') { + # can't optimise axes other than child for now... + $self->{_tokpos} = $tokpos; + return; + } + $step->{axis} = 'descendant'; + $step->{axis_method} = 'axis_descendant'; + $self->{_tokpos}--; + $tokens->[$self->{_tokpos}] = '.'; + return $step; + } +} + +sub RelativeLocationPath { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + my @steps; + + push @steps, Step($self, $tokens); + while (match($self, $tokens, '//?')) { + if ($self->{_curr_match} eq '//') { + my $optimised = optimise_descendant_or_self($self, $tokens); + if (!$optimised) { + push @steps, XML::XPath::Step->new($self, 'descendant-or-self', + XML::XPath::Step::test_nt_node); + } + else { + push @steps, $optimised; + } + } + push @steps, Step($self, $tokens); + if (@steps > 1 && + $steps[-1]->{axis} eq 'self' && + $steps[-1]->{test} == XML::XPath::Step::test_nt_node) { + pop @steps; + } + } + + return @steps; +} + +sub Step { + my ($self, $tokens) = @_; + + debug("in SUB\n"); + + if (match($self, $tokens, '\\.')) { + # self::node() + return XML::XPath::Step->new($self, 'self', XML::XPath::Step::test_nt_node); + } + elsif (match($self, $tokens, '\\.\\.')) { + # parent::node() + return XML::XPath::Step->new($self, 'parent', XML::XPath::Step::test_nt_node); + } + else { + # AxisSpecifier NodeTest Predicate(s?) + my $token = $tokens->[$self->{_tokpos}]; + + debug("SUB: Checking $token\n"); + + my $step; + if ($token eq 'processing-instruction') { + $self->{_tokpos}++; + match($self, $tokens, '\\(', 1); + match($self, $tokens, $LITERAL); + $self->{_curr_match} =~ /^["'](.*)["']$/; + $step = XML::XPath::Step->new($self, 'child', + XML::XPath::Step::test_nt_pi, + XML::XPath::Literal->new($1)); + match($self, $tokens, '\\)', 1); + } + elsif ($token =~ /^\@($NCWild|$QName|$QNWild)$/o) { + $self->{_tokpos}++; + if ($token eq '@*') { + $step = XML::XPath::Step->new($self, + 'attribute', + XML::XPath::Step::test_attr_any, + '*'); + } + elsif ($token =~ /^\@($NCName):\*$/o) { + $step = XML::XPath::Step->new($self, + 'attribute', + XML::XPath::Step::test_attr_ncwild, + $1); + } + elsif ($token =~ /^\@($QName)$/o) { + $step = XML::XPath::Step->new($self, + 'attribute', + XML::XPath::Step::test_attr_qname, + $1); + } + } + elsif ($token =~ /^($NCName):\*$/o) { # ns:* + $self->{_tokpos}++; + $step = XML::XPath::Step->new($self, 'child', + XML::XPath::Step::test_ncwild, + $1); + } + elsif ($token =~ /^$QNWild$/o) { # * + $self->{_tokpos}++; + $step = XML::XPath::Step->new($self, 'child', + XML::XPath::Step::test_any, + $token); + } + elsif ($token =~ /^$QName$/o) { # name:name + $self->{_tokpos}++; + $step = XML::XPath::Step->new($self, 'child', + XML::XPath::Step::test_qname, + $token); + } + elsif ($token eq 'comment()') { + $self->{_tokpos}++; + $step = XML::XPath::Step->new($self, 'child', + XML::XPath::Step::test_nt_comment); + } + elsif ($token eq 'text()') { + $self->{_tokpos}++; + $step = XML::XPath::Step->new($self, 'child', + XML::XPath::Step::test_nt_text); + } + elsif ($token eq 'node()') { + $self->{_tokpos}++; + $step = XML::XPath::Step->new($self, 'child', + XML::XPath::Step::test_nt_node); + } + elsif ($token eq 'processing-instruction()') { + $self->{_tokpos}++; + $step = XML::XPath::Step->new($self, 'child', + XML::XPath::Step::test_nt_pi); + } + elsif ($token =~ /^$AXIS_NAME($NCWild|$QName|$QNWild|$NODE_TYPE)$/o) { + my $axis = $1; + $self->{_tokpos}++; + $token = $2; + if ($token eq 'processing-instruction') { + match($self, $tokens, '\\(', 1); + match($self, $tokens, $LITERAL); + $self->{_curr_match} =~ /^["'](.*)["']$/; + $step = XML::XPath::Step->new($self, $axis, + XML::XPath::Step::test_nt_pi, + XML::XPath::Literal->new($1)); + match($self, $tokens, '\\)', 1); + } + elsif ($token =~ /^($NCName):\*$/o) { # ns:* + $step = XML::XPath::Step->new($self, $axis, + (($axis eq 'attribute') ? + XML::XPath::Step::test_attr_ncwild + : + XML::XPath::Step::test_ncwild), + $1); + } + elsif ($token =~ /^$QNWild$/o) { # * + $step = XML::XPath::Step->new($self, $axis, + (($axis eq 'attribute') ? + XML::XPath::Step::test_attr_any + : + XML::XPath::Step::test_any), + $token); + } + elsif ($token =~ /^$QName$/o) { # name:name + $step = XML::XPath::Step->new($self, $axis, + (($axis eq 'attribute') ? + XML::XPath::Step::test_attr_qname + : + XML::XPath::Step::test_qname), + $token); + } + elsif ($token eq 'comment()') { + $step = XML::XPath::Step->new($self, $axis, + XML::XPath::Step::test_nt_comment); + } + elsif ($token eq 'text()') { + $step = XML::XPath::Step->new($self, $axis, + XML::XPath::Step::test_nt_text); + } + elsif ($token eq 'node()') { + $step = XML::XPath::Step->new($self, $axis, + XML::XPath::Step::test_nt_node); + } + elsif ($token eq 'processing-instruction()') { + $step = XML::XPath::Step->new($self, $axis, + XML::XPath::Step::test_nt_pi); + } + else { + die "Shouldn't get here"; + } + } + else { + die "token $token doesn't match format of a 'Step'\n"; + } + + while (match($self, $tokens, '\\[')) { + push @{$step->{predicates}}, Expr($self, $tokens); + match($self, $tokens, '\\]', 1); + } + + return $step; + } +} + +sub is_step { + my ($self, $tokens) = @_; + + my $token = $tokens->[$self->{_tokpos}]; + + return unless defined $token; + + debug("SUB: Checking if '$token' is a step\n"); + + local $^W; + + if ($token eq 'processing-instruction') { + return 1; + } + elsif ($token =~ /^\@($NCWild|$QName|$QNWild)$/o) { + return 1; + } + elsif ($token =~ /^($NCWild|$QName|$QNWild)$/o && $tokens->[$self->{_tokpos}+1] ne '(') { + return 1; + } + elsif ($token =~ /^$NODE_TYPE$/o) { + return 1; + } + elsif ($token =~ /^$AXIS_NAME($NCWild|$QName|$QNWild|$NODE_TYPE)$/o) { + return 1; + } + + debug("SUB: '$token' not a step\n"); + + return; +} + +sub debug { + return unless $XML::XPath::Debug; + + my ($pkg, $file, $line, $sub) = caller(1); + + $sub =~ s/^$pkg\:://; + + while (@_) { + my $x = shift; + $x =~ s/\bPKG\b/$pkg/g; + $x =~ s/\bLINE\b/$line/g; + $x =~ s/\bSUB\b/$sub/g; + print STDERR $x; + } +} + +1; |