"""Core XPath parsing glue.
This module builds a lexer and parser for XPath expressions for import into
eulxml.xpath. To understand how this module builds the lexer and parser, it
is helpful to understand how the `ply <http://www.dabeaz.com/ply/>`_ module
works.
Note that most client applications will import htese objects from
eulxml.xpath, not directly from here."""
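# A minimal usage sketch (assuming only the public names re-exported by
# eulxml.xpath; the expression and variable names are illustrative):
#
#     from eulxml.xpath import parse, serialize
#     ast = parse('//a[@href="help.html"]')
#     print(serialize(ast))   # serializes the AST back to an XPath string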
from __future__ import unicode_literals
import os
import re
from ply import lex, yacc
import tempfile
from eulxml.xpath import lexrules
from eulxml.xpath import parserules
from eulxml.xpath.ast import serialize
__all__ = ['lexer', 'parser', 'parse', 'serialize']
# build the lexer. This will generate a lextab.py in the eulxml.xpath
# directory. Unfortunately, xpath requires some wonky lexing.
# Per http://www.w3.org/TR/xpath/#exprlex :
# 1 If there is a preceding token and the preceding token is not one of @,
# ::, (, [, , or an Operator, then a * must be recognized as a
# MultiplyOperator and an NCName must be recognized as an OperatorName.
# 2 If the character following an NCName (possibly after intervening
# ExprWhitespace) is (, then the token must be recognized as a NodeType
# or a FunctionName.
# 3 If the two characters following an NCName (possibly after intervening
# ExprWhitespace) are ::, then the token must be recognized as an
# AxisName.
# 4 Otherwise, the token must not be recognized as a MultiplyOperator, an
# OperatorName, a NodeType, a FunctionName, or an AxisName.
#
# To implement this, we create a wrapper class that extends token() for the
# described lookahead/lookback lexing, and we dynamically set the lexer's
# __class__ to this wrapper. That's pretty weird and ugly, but Python allows
# it. If you can find a prettier solution to the problem then I welcome a
# fix.
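#
# A few concrete illustrations of points 2 and 3 (approximate token streams
# using the token names defined in lexrules; nothing here is executed):
#
#   "text()"    ->  NODETYPE  OPEN_PAREN ...       (point 2: NCName followed
#                   by "(" and listed in NODE_TYPES below)
#   "count(x)"  ->  FUNCNAME  OPEN_PAREN NCNAME .. (point 2: any other NCName
#                   followed by "(")
#   "child::a"  ->  AXISNAME  AXIS_SEP   NCNAME    (point 3: NCName followed
#                   by "::")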
OPERATOR_FORCERS = set([
    # @, ::, (, [
    'ABBREV_AXIS_AT', 'AXIS_SEP', 'OPEN_PAREN', 'OPEN_BRACKET',
    # Operators: OperatorName
    'AND_OP', 'OR_OP', 'MOD_OP', 'DIV_OP',
    # Operators: MultiplyOperator
    'MULT_OP',
    # Operators: /, //, |, +, -
    'PATH_SEP', 'ABBREV_PATH_SEP', 'UNION_OP', 'PLUS_OP', 'MINUS_OP',
    # Operators: =, !=, <, <=, >, >=
    'EQUAL_OP', 'REL_OP',
    # Also need to add : . Official XPath lexing rules are in terms of
    # QNames, but we produce QNames in the parse layer. We need to include :
    # here to force foo:div to be a single step, otherwise that last div
    # would be interpreted as an operator (where standard XPath would just
    # call it part of the QName).
    'COLON',
])
NODE_TYPES = set(['comment', 'text', 'processing-instruction', 'node'])
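# For example, "foo:div" lexes (approximately) as NCNAME COLON NCNAME: the
# COLON entry above keeps the trailing "div" a plain name so the parse layer
# can assemble the QName. Without it, "div" would follow a non-forcing NCName
# and be promoted to DIV_OP, as it should be in "foo div bar".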
class LexerWrapper(lex.Lexer):
    def token(self):
        tok = lex.Lexer.token(self)
        if tok is not None:
            if tok.type == 'STAR_OP':
                if self.last is not None and self.last.type not in OPERATOR_FORCERS:
                    # first half of point 1
                    tok.type = 'MULT_OP'

            if tok.type == 'NCNAME':
                if self.last is not None and self.last.type not in OPERATOR_FORCERS:
                    # second half of point 1
                    operator = lexrules.operator_names.get(tok.value, None)
                    if operator is not None:
                        tok.type = operator
                else:
                    next = self.peek()
                    if next is not None:
                        if next.type == 'OPEN_PAREN':
                            # point 2
                            if tok.value in NODE_TYPES:
                                tok.type = 'NODETYPE'
                            else:
                                tok.type = 'FUNCNAME'
                        elif next.type == 'AXIS_SEP':
                            # point 3
                            tok.type = 'AXISNAME'

        self.last = tok
        return tok

    def peek(self):
        clone = self.clone()
        return clone.token()
# try to build the lexer with cached lex table generation. this will fail if
# the user doesn't have write perms on the source directory. in that case,
# try again without lex table generation.
lexdir = os.path.dirname(lexrules.__file__)
lexer = None
try:
    lexer = lex.lex(module=lexrules, optimize=1, outputdir=lexdir,
                    reflags=re.UNICODE)
except IOError as e:
    import errno
    if e.errno != errno.EACCES:
        raise
if lexer is None:
    lexer = lex.lex(module=lexrules, reflags=re.UNICODE)
# then dynamically rewrite the lexer class to use the wonky override logic
# above
lexer.__class__ = LexerWrapper
lexer.last = None
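# A comment-only sketch of the wrapper's effect on point 1 (approximate token
# stream; the types are the ones named in OPERATOR_FORCERS above):
#
#     lexer.input('a div b * c')
#     # -> NCNAME(a) DIV_OP(div) NCNAME(b) MULT_OP(*) NCNAME(c)
#
# The first 'a' has no preceding token, so it stays an NCNAME; 'div' and '*'
# each follow a name, so they are promoted to DIV_OP and MULT_OP.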
# build the parser. This will generate a parsetab.py in the eulxml.xpath
# directory. Unlike lex, though, this just logs a complaint when it fails
# (contrast lex's explosion). Other than that, it's much less exciting
# than the lexer wackiness.
parsedir = os.path.dirname(parserules.__file__)
# By default, store generated parse files with the code
# If we don't have write permission, put them in the configured tempdir
if not os.access(parsedir, os.W_OK):
    parsedir = tempfile.gettempdir()
parser = yacc.yacc(module=parserules, outputdir=parsedir, debug=0)
def parse(xpath):
    '''Parse an xpath.'''
    # Expose the parse method of the constructed parser,
    # but explicitly specify the lexer created here,
    # since otherwise parse will use the most-recently created lexer.
    return parser.parse(xpath, lexer=lexer)
def ptokens(s):
    '''Lex a string as XPath tokens, and print each token as it is lexed.
    This is used primarily for debugging. You probably don't want this
    function.'''
    lexer.input(s)
    for tok in lexer:
        print(tok)
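# Minimal command-line driver for ad-hoc testing. This is a convenience
# addition only, not part of the eulxml.xpath API; it simply reuses the
# lexer, parser, and helpers defined above. The default expression is
# arbitrary.
if __name__ == '__main__':
    import sys
    for expr in sys.argv[1:] or ['//a[@href]']:
        print('tokens for %r:' % expr)
        ptokens(expr)
        print('parsed: %s' % serialize(parse(expr)))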