This code is an implementation of an XPath parser using the ply
module, which defines the rules for parsing XPath expressions and creates an abstract syntax tree (AST) representation of the parsed expressions. The parser is composed of multiple grammar rules for basic expressions, path expressions, and paths, which are used to construct an AST representation of the parsed expressions.
"""XPath parsing rules.
To understand how this module works, it is valuable to have a strong
understanding of the `ply <http://www.dabeaz.com/ply/>` module.
"""
from __future__ import unicode_literals
from eulxml.xpath import ast
from eulxml.xpath.lexrules import tokens
precedence = (
('left', 'OR_OP'),
('left', 'AND_OP'),
('left', 'EQUAL_OP'),
('left', 'REL_OP'),
('left', 'PLUS_OP', 'MINUS_OP'),
('left', 'MULT_OP', 'DIV_OP', 'MOD_OP'),
('right', 'UMINUS_OP'),
('left', 'UNION_OP'),
)
#
# basic expressions
#
def p_expr_boolean(p):
"""
Expr : Expr OR_OP Expr
| Expr AND_OP Expr
| Expr EQUAL_OP Expr
| Expr REL_OP Expr
| Expr PLUS_OP Expr
| Expr MINUS_OP Expr
| Expr MULT_OP Expr
| Expr DIV_OP Expr
| Expr MOD_OP Expr
| Expr UNION_OP Expr
"""
p[0] = ast.BinaryExpression(p[1], p[2], p[3])
def p_expr_unary(p):
"""
Expr : MINUS_OP Expr %prec UMINUS_OP
"""
p[0] = ast.UnaryExpression(p[1], p[2])
#
# path expressions
#
def p_path_expr_binary(p):
"""
Expr : FilterExpr PATH_SEP RelativeLocationPath
| FilterExpr ABBREV_PATH_SEP RelativeLocationPath
"""
p[0] = ast.BinaryExpression(p[1], p[2], p[3])
def p_path_expr_unary(p):
"""
Expr : RelativeLocationPath
| AbsoluteLocationPath
| AbbreviatedAbsoluteLocationPath
| FilterExpr
"""
p[0] = p[1]
#
# paths
#
def p_absolute_location_path_rootonly(p):
"""
AbsoluteLocationPath : PATH_SEP
"""
p[0] = ast.AbsolutePath(p[1])
def p_absolute_location_path_subpath(p):
"""
AbsoluteLocationPath : PATH_SEP RelativeLocationPath
"""
p[0] = ast.AbsolutePath(p[1], p[2])
def p_abbreviated_absolute_location_path(p):
"""
AbbreviatedAbsoluteLocationPath : ABBREV_PATH_SEP RelativeLocationPath
"""
p[0] = ast.AbsolutePath(p[1], p[2])
def p_relative_location_path_simple(p):
"""
RelativeLocationPath : Step
"""
p[0] = p[1]
def p_relative_location_path_binary(p):
"""
RelativeLocationPath : RelativeLocationPath PATH_SEP Step
| RelativeLocationPath ABBREV_PATH_SEP Step
"""
p[0] = ast.BinaryExpression(p[1], p[2], p[3])
#
# path steps
#
def p_step_nodetest(p):
"""
Step : NodeTest
"""
p[0] = ast.Step(None, p[1], [])
def p_step_nodetest_predicates(p):
"""
Step : NodeTest PredicateList
"""
p[0] = ast.Step(None, p[1], p[2])
def p_step_axis_nodetest(p):
"""
Step : AxisSpecifier NodeTest
"""
p[0] = ast.Step(p[1], p[2], [])
def p_step_axis_nodetest_predicates(p):
"""
Step : AxisSpecifier NodeTest PredicateList
"""
p[0] = ast.Step(p[1], p[2], p[3])
def p_step_abbrev(p):
"""
Step : ABBREV_STEP_SELF
| ABBREV_STEP_PARENT
"""
p[0] = ast.AbbreviatedStep(p[1])
#
# axis specifier
#
def p_axis_specifier_full(p):
"""
AxisSpecifier : AXISNAME AXIS_SEP
"""
p[0] = p[1]
def p_axis_specifier_abbrev(p):
"""
AxisSpecifier : ABBREV_AXIS_AT
"""
p[0] = '@'
#
# node test
#
def p_node_test_name_test(p):
"""
NodeTest : NameTest
"""
p[0] = p[1]
def p_node_test_type_simple(p):
"""
NodeTest : NODETYPE OPEN_PAREN CLOSE_PAREN
"""
# NOTE: Strictly speaking p[1] must come from a list of recognized
# NodeTypes. Since we don't actually do anything with them, we don't
# need to recognize them.
p[0] = ast.NodeType(p[1])
def p_node_test_type_literal(p):
"""
NodeTest : NODETYPE OPEN_PAREN LITERAL CLOSE_PAREN
"""
# NOTE: Technically this only allows 'processing-instruction' for p[1].
# We'll go light on that restriction since we don't actually need it for
# processing.
p[0] = ast.NodeType(p[1], p[3])
#
# name test
#
def p_name_test_star(p):
"""
NameTest : STAR_OP
"""
p[0] = ast.NameTest(None, p[1])
def p_name_test_prefix_star(p):
"""
NameTest : NCNAME COLON STAR_OP
"""
p[0] = ast.NameTest(p[1], p[3])
def p_name_test_qname(p):
"""
NameTest : QName
"""
qname = p[1]
p[0] = ast.NameTest(qname[0], qname[1])
#
# qname
#
def p_qname_prefixed(p):
"""
QName : NCNAME COLON NCNAME
"""
p[0] = (p[1], p[3])
def p_qname_unprefixed(p):
"""
QName : NCNAME
"""
p[0] = (None, p[1])
def p_funcqname_prefixed(p):
"""
FuncQName : NCNAME COLON FUNCNAME
"""
p[0] = (p[1], p[3])
def p_funcqname_unprefixed(p):
"""
FuncQName : FUNCNAME
"""
p[0] = (None, p[1])
#
# filter expressions
#
def p_filter_expr_simple(p):
"""
FilterExpr : VariableReference
| LITERAL
| Number
| FunctionCall
"""
# FIXME: | FunctionCall moved so as not to conflict with NodeTest :
# FunctionCall
p[0] = p[1]
def p_filter_expr_grouped(p):
"""
FilterExpr : OPEN_PAREN Expr CLOSE_PAREN
"""
p[0] = p[2]
def p_filter_expr_predicate(p):
"""
FilterExpr : FilterExpr Predicate
"""
if not hasattr(p[1], 'append_predicate'):
p[1] = ast.PredicatedExpression(p[1])
p[1].append_predicate(p[2])
p[0] = p[1]
#
# predicates
#
def p_predicate_list_single(p):
"""
PredicateList : Predicate
"""
p[0] = [p[1]]
def p_predicate_list_recursive(p):
"""
PredicateList : PredicateList Predicate
"""
p[0] = p[1]
p[0].append(p[2])
def p_predicate(p):
"""
Predicate : OPEN_BRACKET Expr CLOSE_BRACKET
"""
p[0] = p[2]
#
# variable
#
def p_variable_reference(p):
"""
VariableReference : DOLLAR QName
"""
p[0] = ast.VariableReference(p[2])
#
# number
#
def p_number(p):
"""
Number : FLOAT
| INTEGER
"""
p[0] = p[1]
#
# funcall
#
def p_function_call(p):
"""
FunctionCall : FuncQName FormalArguments
"""
# FIXME: This production also matches NodeType() or
# processing-instruction("foo"), which are technically NodeTest
qname = p[1]
p[0] = ast.FunctionCall(qname[0], qname[1], p[2])
def p_formal_arguments_empty(p):
"""
FormalArguments : OPEN_PAREN CLOSE_PAREN
"""
p[0] = []
def p_formal_arguments_list(p):
"""
FormalArguments : OPEN_PAREN ArgumentList CLOSE_PAREN
"""
p[0] = p[2]
def p_argument_list_single(p):
"""
ArgumentList : Expr
"""
p[0] = [p[1]]
def p_argument_list_recursive(p):
"""
ArgumentList : ArgumentList COMMA Expr
"""
p[0] = p[1]
p[0].append(p[3])
#
# error handling
#
def p_error(p):
# In some cases, p could actually be None.
# However, stack trace should have enough information to identify the problem.
raise RuntimeError("Syntax error at '%s'" % repr(p))
# XPath Parsing Rules
======================
This module provides an implementation of XPath parsing rules using the ply module.
For a detailed understanding of the module, refer to the ply documentation.
```python
from __future__ import unicode_literals
from eulxml.xpath import ast
from eulxml.xpath.lexrules import tokens
# Precedence rules for operators
precedence = (
('left', 'OR_OP'),
('left', 'AND_OP'),
('left', 'EQUAL_OP'),
('left', 'REL_OP'),
('left', 'PLUS_OP', 'MINUS_OP'),
('left', 'MULT_OP', 'DIV_OP', 'MOD_OP'),
('right', 'UMINUS_OP'),
('left', 'UNION_OP'),
)
# Basic Expressions
----------------
class ExprNode:
def __init__(self, left_node, operator, right_node):
self.left_node = left_node
self.operator = operator
self.right_node = right_node
def accept(self, visitor):
raise NotImplementedError
class BinaryExpression(ExprNode):
def __init__(self, left_node, operator, right_node):
super().__init__(left_node, operator, right_node)
def accept(self, visitor):
return visitor.visit_binary_expression(self)
class UnaryExpression(ExprNode):
def __init__(self, operator, right_node):
super().__init__(None, operator, right_node)
def accept(self, visitor):
return visitor.visit_unary_expression(self)
def p_expr_boolean(p):
"""
Expr : Expr OR_OP Expr
| Expr AND_OP Expr
| Expr EQUAL_OP Expr
| Expr REL_OP Expr
| Expr PLUS_OP Expr
| Expr MINUS_OP Expr
| Expr MULT_OP Expr
| Expr DIV_OP Expr
| Expr MOD_OP Expr
| Expr UNION_OP Expr
"""
left_node = p[1]
operator = p[2]
right_node = p[3]
result = BinaryExpression(left_node, operator, right_node)
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_expr_unary(p):
"""
Expr : MINUS_OP Expr %prec UMINUS_OP
"""
operator = p[1]
right_node = p[2]
result = UnaryExpression(operator, right_node)
result.accept(p.lexer.lexinfo.visitor)
return [result]
# Path Expressions
----------------
def p_path_expr_binary(p):
"""
Expr : FilterExpr PATH_SEP RelativeLocationPath
| FilterExpr ABBREV_PATH_SEP RelativeLocationPath
"""
left_node = p[1]
separator = p[2]
right_node = p[3]
result = ast.BinaryExpression(left_node, separator, right_node)
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_path_expr_unary(p):
"""
Expr : RelativeLocationPath
| AbsoluteLocationPath
| AbbreviatedAbsoluteLocationPath
| FilterExpr
"""
return [p[1]]
# Paths
--------
class AbsolutePathNode:
def __init__(self, separator, path):
self.separator = separator
self.path = path
def accept(self, visitor):
raise NotImplementedError
class AbsolutePath(AbsolutePathNode):
def __init__(self, separator, path):
super().__init__(separator, path)
def accept(self, visitor):
return visitor.visit_absolute_path(self)
def p_absolute_location_path_rootonly(p):
"""
AbsoluteLocationPath : PATH_SEP
"""
separator = p[1]
path = ast.AbsolutePath(separator)
path.accept(p.lexer.lexinfo.visitor)
return [path]
def p_absolute_location_path_subpath(p):
"""
AbsoluteLocationPath : PATH_SEP RelativeLocationPath
"""
separator = p[1]
path = p[2]
result = AbsolutePath(separator, path)
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_abbreviated_absolute_location_path(p):
"""
AbbreviatedAbsoluteLocationPath : ABBREV_PATH_SEP RelativeLocationPath
"""
separator = p[1]
path = p[2]
result = AbsolutePath(separator, path)
result.accept(p.lexer.lexinfo.visitor)
return [result]
# Relative Paths
----------------
def p_relative_location_path_simple(p):
"""
RelativeLocationPath : Step
"""
return [p[1]]
def p_relative_location_path_binary(p):
"""
RelativeLocationPath : RelativeLocationPath PATH_SEP Step
| RelativeLocationPath ABBREV_PATH_SEP Step
"""
left_node = p[1]
separator = p[2]
right_node = p[3]
result = ast.BinaryExpression(left_node, separator, right_node)
result.accept(p.lexer.lexinfo.visitor)
return [result]
# Steps
--------
class StepNode:
def __init__(self, axis, node_test, predicates):
self.axis = axis
self.node_test = node_test
self.predicates = predicates
def accept(self, visitor):
raise NotImplementedError
class Step(StepNode):
def __init__(self, axis, node_test, predicates):
super().__init__(axis, node_test, predicates)
def accept(self, visitor):
return visitor.visit_step(self)
class AbbreviatedStep(StepNode):
def __init__(self, abbreviation):
super().__init__(None, None, None)
self.abbreviation = abbreviation
def accept(self, visitor):
return visitor.visit_abbreviated_step(self)
def p_step_nodetest(p):
"""
Step : NodeTest
"""
node_test = p[1]
result = Step(None, node_test, [])
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_step_nodetest_predicates(p):
"""
Step : NodeTest PredicateList
"""
node_test = p[1]
predicates = p[2]
result = Step(None, node_test, predicates)
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_step_axis_nodetest(p):
"""
Step : AxisSpecifier NodeTest
"""
axis = p[1]
node_test = p[2]
result = Step(axis, node_test, [])
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_step_axis_nodetest_predicates(p):
"""
Step : AxisSpecifier NodeTest PredicateList
"""
axis = p[1]
node_test = p[2]
predicates = p[3]
result = Step(axis, node_test, predicates)
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_step_abbrev(p):
"""
Step : ABBREV_STEP_SELF
| ABBREV_STEP_PARENT
"""
abbreviation = p[1]
result = AbbreviatedStep(abbreviation)
result.accept(p.lexer.lexinfo.visitor)
return [result]
# Axis Specifier
----------------
def p_axis_specifier_full(p):
"""
AxisSpecifier : AXISNAME AXIS_SEP
"""
axis_name = p[1]
axis_separator = p[2]
return [axis_name]
def p_axis_specifier_abbrev(p):
"""
AxisSpecifier : ABBREV_AXIS_AT
"""
return ['@']
# Node Test
------------
class NodeTestNode:
def __init__(self, name_test):
self.name_test = name_test
def accept(self, visitor):
raise NotImplementedError
class NodeTest(NodeTestNode):
def __init__(self, name_test):
super().__init__(name_test)
def accept(self, visitor):
return visitor.visit_node_test(self)
class NodeType(NodeTestNode):
def __init__(self, node_type):
super().__init__(None)
self.node_type = node_type
def accept(self, visitor):
return visitor.visit_node_type(self)
def p_node_test_name_test(p):
"""
NodeTest : NameTest
"""
name_test = p[1]
result = NodeTest(name_test)
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_node_test_type_simple(p):
"""
NodeTest : NODETYPE OPEN_PAREN CLOSE_PAREN
"""
node_type = p[1]
result = NodeType(node_type)
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_node_test_type_literal(p):
"""
NodeTest : NODETYPE OPEN_PAREN LITERAL CLOSE_PAREN
"""
node_type = p[1]
literal = p[3]
result = NodeType(node_type, literal)
result.accept(p.lexer.lexinfo.visitor)
return [result]
# Name Test
------------
class NameTestNode:
def __init__(self, prefix, name):
self.prefix = prefix
self.name = name
def accept(self, visitor):
raise NotImplementedError
class NameTest(NameTestNode):
def __init__(self, prefix, name):
super().__init__(prefix, name)
def accept(self, visitor):
return visitor.visit_name_test(self)
class Star(NameTestNode):
def __init__(self):
super().__init__(None, None)
def accept(self, visitor):
return visitor.visit_star(self)
def p_name_test_star(p):
"""
NameTest : STAR_OP
"""
result = Star()
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_name_test_prefix_star(p):
"""
NameTest : NCNAME COLON STAR_OP
"""
prefix = p[1]
name = p[3]
result = NameTest(prefix, name)
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_name_test_qname(p):
"""
NameTest : QName
"""
qname = p[1]
result = NameTest(qname[0], qname[1])
result.accept(p.lexer.lexinfo.visitor)
return [result]
# QName
---------
def p_qname_prefixed(p):
"""
QName : NCNAME COLON NCNAME
"""
prefix = p[1]
name = p[3]
return [(prefix, name)]
def p_qname_unprefixed(p):
"""
QName : NCNAME
"""
return [(None, p[1])]
def p_funcqname_prefixed(p):
"""
FuncQName : NCNAME COLON FUNCNAME
"""
prefix = p[1]
name = p[3]
return [(prefix, name)]
def p_funcqname_unprefixed(p):
"""
FuncQName : FUNCNAME
"""
return [(None, p[1])]
# Filter Expressions
--------------------
class FilterExpressionNode:
def __init__(self, expression):
self.expression = expression
def accept(self, visitor):
raise NotImplementedError
class FilterExpression(FilterExpressionNode):
def __init__(self, expression):
super().__init__(expression)
def accept(self, visitor):
return visitor.visit_filter_expression(self)
class PredicatedExpression(FilterExpressionNode):
def __init__(self, expression, predicate):
super().__init__(expression)
self.predicate = predicate
def accept(self, visitor):
return visitor.visit_predicated_expression(self)
def p_filter_expr_simple(p):
"""
FilterExpr : VariableReference
| LITERAL
| Number
| FunctionCall
"""
return [p[1]]
def p_filter_expr_grouped(p):
"""
FilterExpr : OPEN_PAREN Expr CLOSE_PAREN
"""
result = FilterExpression(p[2])
result.accept(p.lexer.lexinfo.visitor)
return [result]
def p_filter_expr_predicate(p):
"""
FilterExpr : FilterExpr Predicate
"""
expression = p[1]
predicate = p[2]
result = PredicatedExpression(expression, predicate)
result.accept(p.lexer.lexinfo.visitor)
return [result]
# Predicates
-------------
def p_predicate_list_single(p):
"""
PredicateList : Predicate
"""
return [p[1]]
def p_predicate_list_recursive(p):
"""
PredicateList : PredicateList Predicate
"""
list = p[1]
predicate = p[2]
list.append(predicate)
return list
def p_predicate(p):
"""
Predicate : OPEN_BRACKET Expr CLOSE_BRACKET
"""
return [p[2]]
# Variable
------------
def p_variable_reference(p):
"""
VariableReference : DOLLAR QName
"""
qname = p[2]
return [ast.VariableReference(qname[0], qname[1])]
# Number
---------
def p_number(p):
"""
Number : FLOAT
| INTEGER
"""
return [p[1]]
# Function Call
----------------
def p_function_call(p):
"""
FunctionCall : FuncQName FormalArguments
"""
qname = p[1]
arguments = p[2]
return [ast.FunctionCall(qname[0], qname[1], arguments)]
def p_formal_arguments_empty(p):
"""
FormalArguments : OPEN_PAREN CLOSE_PAREN
"""
return []
def p_formal_arguments_list(p):
"""
FormalArguments : OPEN_PAREN ArgumentList CLOSE_PAREN
"""
return [p[2]]
def p_argument_list_single(p):
"""
ArgumentList : Expr
"""
return [p[1]]
def p_argument_list_recursive(p):
"""
ArgumentList : ArgumentList COMMA Expr
"""
list = p[1]
list.append(p[3])
return list
# Error Handling
-----------------
def p_error(p):
raise RuntimeError("Syntax error at '%s'" % repr(p))
class Visitor:
def visit_binary_expression(self, node):
raise NotImplementedError
def visit_unary_expression(self, node):
raise NotImplementedError
def visit_absolute_path(self, node):
raise NotImplementedError
def visit_step(self, node):
raise NotImplementedError
def visit_abbreviated_step(self, node):
raise NotImplementedError
def visit_node_test(self, node):
raise NotImplementedError
def visit_node_type(self, node):
raise NotImplementedError
def visit_star(self, node):
raise NotImplementedError
def visit_name_test(self, node):
raise NotImplementedError
def visit_predicated_expression(self, node):
raise NotImplementedError
def visit_filter_expression(self, node):
raise NotImplementedError
# lexer and parser
from eulxml.xpath import lexer
from eulxml.xpath import parser
lexer = lexer.Lexer(tokens)
parser = parser.Parser(lexer, precedence)
Overview
This code is an implementation of an XPath parser using the ply
module. It defines the rules for parsing XPath expressions and creates an abstract syntax tree (AST) representation of the parsed expressions.
Module Imports
The code imports the following modules:
__future__.unicode_literals
: enables Unicode literals in Python 2.eulxml.xpath.ast
: provides the ast
module for creating the abstract syntax tree representation of the parsed expressions.eulxml.xpath.lexrules
: provides the tokens
module, which defines the tokens used in the parser.Precedence Rules
The code defines the precedence rules for the parser using a tuple of tuples. The rules specify the operator precedence and associativity for each operator.
Grammar Rules
The code defines several grammar rules for the parser, which are used to parse XPath expressions. The rules are defined using the p_
prefix and are grouped into three categories:
Expr
, OR_OP
, AND_OP
, etc. The p_expr_boolean
function is used to parse boolean expressions, while the p_expr_unary
function is used to parse unary expressions.FilterExpr
, RelativeLocationPath
, etc. The p_path_expr_binary
function is used to parse binary path expressions, while the p_path_expr_unary
function is used to parse unary path expressions.AbsoluteLocationPath
, RelativeLocationPath
, etc. The p_absolute_location_path_rootonly
function is used to parse absolute location paths, while the p_absolute_location_path_subpath
function is used to parse subpaths.AST Construction
The code uses the ast
module to construct an abstract syntax tree (AST) representation of the parsed expressions. The AST nodes are created using the following functions:
ast.BinaryExpression
: creates a binary expression node.ast.UnaryExpression
: creates a unary expression node.ast.AbsolutePath
: creates an absolute path node.p[0]
: assigns the parsed expression to the AST node.Token Definitions
The code defines several tokens, such as PATH_SEP
, OR_OP
, AND_OP
, etc., which are used in the parser rules. These tokens are imported from the eulxml.xpath.lexrules
module.