xpath | Cell 2 | Cell 4 | Search

This code is an implementation of an XPath parser using the ply module, which defines the rules for parsing XPath expressions and creates an abstract syntax tree (AST) representation of the parsed expressions. The parser is composed of multiple grammar rules for basic expressions, path expressions, and paths, which are used to construct an AST representation of the parsed expressions.

Cell 3

"""XPath parsing rules.

To understand how this module works, it is valuable to have a strong
understanding of the `ply <http://www.dabeaz.com/ply/>` module.
"""

from __future__ import unicode_literals
from eulxml.xpath import ast
from eulxml.xpath.lexrules import tokens

precedence = (
    ('left', 'OR_OP'),
    ('left', 'AND_OP'),
    ('left', 'EQUAL_OP'),
    ('left', 'REL_OP'),
    ('left', 'PLUS_OP', 'MINUS_OP'),
    ('left', 'MULT_OP', 'DIV_OP', 'MOD_OP'),
    ('right', 'UMINUS_OP'),
    ('left', 'UNION_OP'),
)

#
# basic expressions
#

def p_expr_boolean(p):
    """
    Expr : Expr OR_OP Expr
         | Expr AND_OP Expr
         | Expr EQUAL_OP Expr
         | Expr REL_OP Expr
         | Expr PLUS_OP Expr
         | Expr MINUS_OP Expr
         | Expr MULT_OP Expr
         | Expr DIV_OP Expr
         | Expr MOD_OP Expr
         | Expr UNION_OP Expr
    """
    p[0] = ast.BinaryExpression(p[1], p[2], p[3])

def p_expr_unary(p):
    """
    Expr : MINUS_OP Expr %prec UMINUS_OP
    """
    p[0] = ast.UnaryExpression(p[1], p[2])

#
# path expressions
#

def p_path_expr_binary(p):
    """
    Expr : FilterExpr PATH_SEP RelativeLocationPath
         | FilterExpr ABBREV_PATH_SEP RelativeLocationPath
    """
    p[0] = ast.BinaryExpression(p[1], p[2], p[3])

def p_path_expr_unary(p):
    """
    Expr : RelativeLocationPath
         | AbsoluteLocationPath
         | AbbreviatedAbsoluteLocationPath
         | FilterExpr
    """
    p[0] = p[1]

#
# paths
#

def p_absolute_location_path_rootonly(p):
    """
    AbsoluteLocationPath : PATH_SEP
    """
    p[0] = ast.AbsolutePath(p[1])

def p_absolute_location_path_subpath(p):
    """
    AbsoluteLocationPath : PATH_SEP RelativeLocationPath
    """
    p[0] = ast.AbsolutePath(p[1], p[2])

def p_abbreviated_absolute_location_path(p):
    """
    AbbreviatedAbsoluteLocationPath : ABBREV_PATH_SEP RelativeLocationPath
    """
    p[0] = ast.AbsolutePath(p[1], p[2])

def p_relative_location_path_simple(p):
    """
    RelativeLocationPath : Step
    """
    p[0] = p[1]

def p_relative_location_path_binary(p):
    """
    RelativeLocationPath : RelativeLocationPath PATH_SEP Step
                         | RelativeLocationPath ABBREV_PATH_SEP Step
    """
    p[0] = ast.BinaryExpression(p[1], p[2], p[3])

#
# path steps
#

def p_step_nodetest(p):
    """
    Step : NodeTest
    """
    p[0] = ast.Step(None, p[1], [])

def p_step_nodetest_predicates(p):
    """
    Step : NodeTest PredicateList
    """
    p[0] = ast.Step(None, p[1], p[2])

def p_step_axis_nodetest(p):
    """
    Step : AxisSpecifier NodeTest
    """
    p[0] = ast.Step(p[1], p[2], [])

def p_step_axis_nodetest_predicates(p):
    """
    Step : AxisSpecifier NodeTest PredicateList
    """
    p[0] = ast.Step(p[1], p[2], p[3])

def p_step_abbrev(p):
    """
    Step : ABBREV_STEP_SELF
         | ABBREV_STEP_PARENT
    """
    p[0] = ast.AbbreviatedStep(p[1])

#
# axis specifier
#

def p_axis_specifier_full(p):
    """
    AxisSpecifier : AXISNAME AXIS_SEP
    """
    p[0] = p[1]

def p_axis_specifier_abbrev(p):
    """
    AxisSpecifier : ABBREV_AXIS_AT
    """
    p[0] = '@'

#
# node test
#

def p_node_test_name_test(p):
    """
    NodeTest : NameTest
    """
    p[0] = p[1]

def p_node_test_type_simple(p):
    """
    NodeTest : NODETYPE OPEN_PAREN CLOSE_PAREN
    """
    # NOTE: Strictly speaking p[1] must come from a list of recognized
    # NodeTypes. Since we don't actually do anything with them, we don't
    # need to recognize them.
    p[0] = ast.NodeType(p[1])

def p_node_test_type_literal(p):
    """
    NodeTest : NODETYPE OPEN_PAREN LITERAL CLOSE_PAREN
    """
    # NOTE: Technically this only allows 'processing-instruction' for p[1].
    # We'll go light on that restriction since we don't actually need it for
    # processing.
    p[0] = ast.NodeType(p[1], p[3])

#
# name test
#

def p_name_test_star(p):
    """
    NameTest : STAR_OP
    """
    p[0] = ast.NameTest(None, p[1])

def p_name_test_prefix_star(p):
    """
    NameTest : NCNAME COLON STAR_OP
    """
    p[0] = ast.NameTest(p[1], p[3])

def p_name_test_qname(p):
    """
    NameTest : QName
    """
    qname = p[1]
    p[0] = ast.NameTest(qname[0], qname[1])


#
# qname
#

def p_qname_prefixed(p):
    """
    QName : NCNAME COLON NCNAME
    """
    p[0] = (p[1], p[3])

def p_qname_unprefixed(p):
    """
    QName : NCNAME
    """
    p[0] = (None, p[1])

def p_funcqname_prefixed(p):
    """
    FuncQName : NCNAME COLON FUNCNAME
    """
    p[0] = (p[1], p[3])

def p_funcqname_unprefixed(p):
    """
    FuncQName : FUNCNAME
    """
    p[0] = (None, p[1])

#
# filter expressions
#

def p_filter_expr_simple(p):
    """
    FilterExpr : VariableReference
               | LITERAL
               | Number
               | FunctionCall
    """
    # FIXME: | FunctionCall moved so as not to conflict with NodeTest :
    # FunctionCall
    p[0] = p[1]

def p_filter_expr_grouped(p):
    """
    FilterExpr : OPEN_PAREN Expr CLOSE_PAREN
    """
    p[0] = p[2]

def p_filter_expr_predicate(p):
    """
    FilterExpr : FilterExpr Predicate
    """
    if not hasattr(p[1], 'append_predicate'):
        p[1] = ast.PredicatedExpression(p[1])
    p[1].append_predicate(p[2])
    p[0] = p[1]

#
# predicates
#

def p_predicate_list_single(p):
    """
    PredicateList : Predicate
    """
    p[0] = [p[1]]

def p_predicate_list_recursive(p):
    """
    PredicateList : PredicateList Predicate
    """
    p[0] = p[1]
    p[0].append(p[2])

def p_predicate(p):
    """
    Predicate : OPEN_BRACKET Expr CLOSE_BRACKET
    """
    p[0] = p[2]

#
# variable
#

def p_variable_reference(p):
    """
    VariableReference : DOLLAR QName
    """
    p[0] = ast.VariableReference(p[2])

#
# number
#

def p_number(p):
    """
    Number : FLOAT
           | INTEGER
    """
    p[0] = p[1]

#
# funcall
#

def p_function_call(p):
    """
    FunctionCall : FuncQName FormalArguments
    """
    # FIXME: This production also matches NodeType() or
    # processing-instruction("foo"), which are technically NodeTest
    qname = p[1]
    p[0] = ast.FunctionCall(qname[0], qname[1], p[2])

def p_formal_arguments_empty(p):
    """
    FormalArguments : OPEN_PAREN CLOSE_PAREN
    """
    p[0] = []

def p_formal_arguments_list(p):
    """
    FormalArguments : OPEN_PAREN ArgumentList CLOSE_PAREN
    """
    p[0] = p[2]

def p_argument_list_single(p):
    """
    ArgumentList : Expr
    """
    p[0] = [p[1]]

def p_argument_list_recursive(p):
    """
    ArgumentList : ArgumentList COMMA Expr
    """
    p[0] = p[1]
    p[0].append(p[3])

#
# error handling
#

def p_error(p):
    # In some cases, p could actually be None.
    # However, stack trace should have enough information to identify the problem.
    raise RuntimeError("Syntax error at '%s'" % repr(p))

What the code could have been:

# XPath Parsing Rules
======================

This module provides an implementation of XPath parsing rules using the ply module.
For a detailed understanding of the module, refer to the ply documentation.

```python
from __future__ import unicode_literals
from eulxml.xpath import ast
from eulxml.xpath.lexrules import tokens

# Precedence rules for operators
precedence = (
    ('left', 'OR_OP'),
    ('left', 'AND_OP'),
    ('left', 'EQUAL_OP'),
    ('left', 'REL_OP'),
    ('left', 'PLUS_OP', 'MINUS_OP'),
    ('left', 'MULT_OP', 'DIV_OP', 'MOD_OP'),
    ('right', 'UMINUS_OP'),
    ('left', 'UNION_OP'),
)


# Basic Expressions
----------------

class ExprNode:
    def __init__(self, left_node, operator, right_node):
        self.left_node = left_node
        self.operator = operator
        self.right_node = right_node

    def accept(self, visitor):
        raise NotImplementedError


class BinaryExpression(ExprNode):
    def __init__(self, left_node, operator, right_node):
        super().__init__(left_node, operator, right_node)

    def accept(self, visitor):
        return visitor.visit_binary_expression(self)


class UnaryExpression(ExprNode):
    def __init__(self, operator, right_node):
        super().__init__(None, operator, right_node)

    def accept(self, visitor):
        return visitor.visit_unary_expression(self)


def p_expr_boolean(p):
    """
    Expr : Expr OR_OP Expr
         | Expr AND_OP Expr
         | Expr EQUAL_OP Expr
         | Expr REL_OP Expr
         | Expr PLUS_OP Expr
         | Expr MINUS_OP Expr
         | Expr MULT_OP Expr
         | Expr DIV_OP Expr
         | Expr MOD_OP Expr
         | Expr UNION_OP Expr
    """
    left_node = p[1]
    operator = p[2]
    right_node = p[3]
    result = BinaryExpression(left_node, operator, right_node)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_expr_unary(p):
    """
    Expr : MINUS_OP Expr %prec UMINUS_OP
    """
    operator = p[1]
    right_node = p[2]
    result = UnaryExpression(operator, right_node)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


# Path Expressions
----------------

def p_path_expr_binary(p):
    """
    Expr : FilterExpr PATH_SEP RelativeLocationPath
         | FilterExpr ABBREV_PATH_SEP RelativeLocationPath
    """
    left_node = p[1]
    separator = p[2]
    right_node = p[3]
    result = ast.BinaryExpression(left_node, separator, right_node)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_path_expr_unary(p):
    """
    Expr : RelativeLocationPath
         | AbsoluteLocationPath
         | AbbreviatedAbsoluteLocationPath
         | FilterExpr
    """
    return [p[1]]


# Paths
--------

class AbsolutePathNode:
    def __init__(self, separator, path):
        self.separator = separator
        self.path = path

    def accept(self, visitor):
        raise NotImplementedError


class AbsolutePath(AbsolutePathNode):
    def __init__(self, separator, path):
        super().__init__(separator, path)

    def accept(self, visitor):
        return visitor.visit_absolute_path(self)


def p_absolute_location_path_rootonly(p):
    """
    AbsoluteLocationPath : PATH_SEP
    """
    separator = p[1]
    path = ast.AbsolutePath(separator)
    path.accept(p.lexer.lexinfo.visitor)
    return [path]


def p_absolute_location_path_subpath(p):
    """
    AbsoluteLocationPath : PATH_SEP RelativeLocationPath
    """
    separator = p[1]
    path = p[2]
    result = AbsolutePath(separator, path)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_abbreviated_absolute_location_path(p):
    """
    AbbreviatedAbsoluteLocationPath : ABBREV_PATH_SEP RelativeLocationPath
    """
    separator = p[1]
    path = p[2]
    result = AbsolutePath(separator, path)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


# Relative Paths
----------------

def p_relative_location_path_simple(p):
    """
    RelativeLocationPath : Step
    """
    return [p[1]]


def p_relative_location_path_binary(p):
    """
    RelativeLocationPath : RelativeLocationPath PATH_SEP Step
                         | RelativeLocationPath ABBREV_PATH_SEP Step
    """
    left_node = p[1]
    separator = p[2]
    right_node = p[3]
    result = ast.BinaryExpression(left_node, separator, right_node)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


# Steps
--------

class StepNode:
    def __init__(self, axis, node_test, predicates):
        self.axis = axis
        self.node_test = node_test
        self.predicates = predicates

    def accept(self, visitor):
        raise NotImplementedError


class Step(StepNode):
    def __init__(self, axis, node_test, predicates):
        super().__init__(axis, node_test, predicates)

    def accept(self, visitor):
        return visitor.visit_step(self)


class AbbreviatedStep(StepNode):
    def __init__(self, abbreviation):
        super().__init__(None, None, None)
        self.abbreviation = abbreviation

    def accept(self, visitor):
        return visitor.visit_abbreviated_step(self)


def p_step_nodetest(p):
    """
    Step : NodeTest
    """
    node_test = p[1]
    result = Step(None, node_test, [])
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_step_nodetest_predicates(p):
    """
    Step : NodeTest PredicateList
    """
    node_test = p[1]
    predicates = p[2]
    result = Step(None, node_test, predicates)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_step_axis_nodetest(p):
    """
    Step : AxisSpecifier NodeTest
    """
    axis = p[1]
    node_test = p[2]
    result = Step(axis, node_test, [])
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_step_axis_nodetest_predicates(p):
    """
    Step : AxisSpecifier NodeTest PredicateList
    """
    axis = p[1]
    node_test = p[2]
    predicates = p[3]
    result = Step(axis, node_test, predicates)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_step_abbrev(p):
    """
    Step : ABBREV_STEP_SELF
         | ABBREV_STEP_PARENT
    """
    abbreviation = p[1]
    result = AbbreviatedStep(abbreviation)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


# Axis Specifier
----------------

def p_axis_specifier_full(p):
    """
    AxisSpecifier : AXISNAME AXIS_SEP
    """
    axis_name = p[1]
    axis_separator = p[2]
    return [axis_name]


def p_axis_specifier_abbrev(p):
    """
    AxisSpecifier : ABBREV_AXIS_AT
    """
    return ['@']


# Node Test
------------

class NodeTestNode:
    def __init__(self, name_test):
        self.name_test = name_test

    def accept(self, visitor):
        raise NotImplementedError


class NodeTest(NodeTestNode):
    def __init__(self, name_test):
        super().__init__(name_test)

    def accept(self, visitor):
        return visitor.visit_node_test(self)


class NodeType(NodeTestNode):
    def __init__(self, node_type):
        super().__init__(None)
        self.node_type = node_type

    def accept(self, visitor):
        return visitor.visit_node_type(self)


def p_node_test_name_test(p):
    """
    NodeTest : NameTest
    """
    name_test = p[1]
    result = NodeTest(name_test)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_node_test_type_simple(p):
    """
    NodeTest : NODETYPE OPEN_PAREN CLOSE_PAREN
    """
    node_type = p[1]
    result = NodeType(node_type)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_node_test_type_literal(p):
    """
    NodeTest : NODETYPE OPEN_PAREN LITERAL CLOSE_PAREN
    """
    node_type = p[1]
    literal = p[3]
    result = NodeType(node_type, literal)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


# Name Test
------------

class NameTestNode:
    def __init__(self, prefix, name):
        self.prefix = prefix
        self.name = name

    def accept(self, visitor):
        raise NotImplementedError


class NameTest(NameTestNode):
    def __init__(self, prefix, name):
        super().__init__(prefix, name)

    def accept(self, visitor):
        return visitor.visit_name_test(self)


class Star(NameTestNode):
    def __init__(self):
        super().__init__(None, None)

    def accept(self, visitor):
        return visitor.visit_star(self)


def p_name_test_star(p):
    """
    NameTest : STAR_OP
    """
    result = Star()
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_name_test_prefix_star(p):
    """
    NameTest : NCNAME COLON STAR_OP
    """
    prefix = p[1]
    name = p[3]
    result = NameTest(prefix, name)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_name_test_qname(p):
    """
    NameTest : QName
    """
    qname = p[1]
    result = NameTest(qname[0], qname[1])
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


# QName
---------

def p_qname_prefixed(p):
    """
    QName : NCNAME COLON NCNAME
    """
    prefix = p[1]
    name = p[3]
    return [(prefix, name)]


def p_qname_unprefixed(p):
    """
    QName : NCNAME
    """
    return [(None, p[1])]


def p_funcqname_prefixed(p):
    """
    FuncQName : NCNAME COLON FUNCNAME
    """
    prefix = p[1]
    name = p[3]
    return [(prefix, name)]


def p_funcqname_unprefixed(p):
    """
    FuncQName : FUNCNAME
    """
    return [(None, p[1])]


# Filter Expressions
--------------------

class FilterExpressionNode:
    def __init__(self, expression):
        self.expression = expression

    def accept(self, visitor):
        raise NotImplementedError


class FilterExpression(FilterExpressionNode):
    def __init__(self, expression):
        super().__init__(expression)

    def accept(self, visitor):
        return visitor.visit_filter_expression(self)


class PredicatedExpression(FilterExpressionNode):
    def __init__(self, expression, predicate):
        super().__init__(expression)
        self.predicate = predicate

    def accept(self, visitor):
        return visitor.visit_predicated_expression(self)


def p_filter_expr_simple(p):
    """
    FilterExpr : VariableReference
               | LITERAL
               | Number
               | FunctionCall
    """
    return [p[1]]


def p_filter_expr_grouped(p):
    """
    FilterExpr : OPEN_PAREN Expr CLOSE_PAREN
    """
    result = FilterExpression(p[2])
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


def p_filter_expr_predicate(p):
    """
    FilterExpr : FilterExpr Predicate
    """
    expression = p[1]
    predicate = p[2]
    result = PredicatedExpression(expression, predicate)
    result.accept(p.lexer.lexinfo.visitor)
    return [result]


# Predicates
-------------

def p_predicate_list_single(p):
    """
    PredicateList : Predicate
    """
    return [p[1]]


def p_predicate_list_recursive(p):
    """
    PredicateList : PredicateList Predicate
    """
    list = p[1]
    predicate = p[2]
    list.append(predicate)
    return list


def p_predicate(p):
    """
    Predicate : OPEN_BRACKET Expr CLOSE_BRACKET
    """
    return [p[2]]


# Variable
------------

def p_variable_reference(p):
    """
    VariableReference : DOLLAR QName
    """
    qname = p[2]
    return [ast.VariableReference(qname[0], qname[1])]


# Number
---------

def p_number(p):
    """
    Number : FLOAT
           | INTEGER
    """
    return [p[1]]


# Function Call
----------------

def p_function_call(p):
    """
    FunctionCall : FuncQName FormalArguments
    """
    qname = p[1]
    arguments = p[2]
    return [ast.FunctionCall(qname[0], qname[1], arguments)]


def p_formal_arguments_empty(p):
    """
    FormalArguments : OPEN_PAREN CLOSE_PAREN
    """
    return []


def p_formal_arguments_list(p):
    """
    FormalArguments : OPEN_PAREN ArgumentList CLOSE_PAREN
    """
    return [p[2]]


def p_argument_list_single(p):
    """
    ArgumentList : Expr
    """
    return [p[1]]


def p_argument_list_recursive(p):
    """
    ArgumentList : ArgumentList COMMA Expr
    """
    list = p[1]
    list.append(p[3])
    return list


# Error Handling
-----------------

def p_error(p):
    raise RuntimeError("Syntax error at '%s'" % repr(p))


class Visitor:
    def visit_binary_expression(self, node):
        raise NotImplementedError

    def visit_unary_expression(self, node):
        raise NotImplementedError

    def visit_absolute_path(self, node):
        raise NotImplementedError

    def visit_step(self, node):
        raise NotImplementedError

    def visit_abbreviated_step(self, node):
        raise NotImplementedError

    def visit_node_test(self, node):
        raise NotImplementedError

    def visit_node_type(self, node):
        raise NotImplementedError

    def visit_star(self, node):
        raise NotImplementedError

    def visit_name_test(self, node):
        raise NotImplementedError

    def visit_predicated_expression(self, node):
        raise NotImplementedError

    def visit_filter_expression(self, node):
        raise NotImplementedError


# lexer and parser
from eulxml.xpath import lexer
from eulxml.xpath import parser

lexer = lexer.Lexer(tokens)
parser = parser.Parser(lexer, precedence)

Overview

This code is an implementation of an XPath parser using the ply module. It defines the rules for parsing XPath expressions and creates an abstract syntax tree (AST) representation of the parsed expressions.

Module Imports

The code imports the following modules:

Precedence Rules

The code defines the precedence rules for the parser using a tuple of tuples. The rules specify the operator precedence and associativity for each operator.

Grammar Rules

The code defines several grammar rules for the parser, which are used to parse XPath expressions. The rules are defined using the p_ prefix and are grouped into three categories:

AST Construction

The code uses the ast module to construct an abstract syntax tree (AST) representation of the parsed expressions. The AST nodes are created using the following functions:

Token Definitions

The code defines several tokens, such as PATH_SEP, OR_OP, AND_OP, etc., which are used in the parser rules. These tokens are imported from the eulxml.xpath.lexrules module.