3
# Copyright (c) 2003-2016 Paul T. McGuire
5
# Permission is hereby granted, free of charge, to any person obtaining
6
# a copy of this software and associated documentation files (the
7
# "Software"), to deal in the Software without restriction, including
8
# without limitation the rights to use, copy, modify, merge, publish,
9
# distribute, sublicense, and/or sell copies of the Software, and to
10
# permit persons to whom the Software is furnished to do so, subject to
11
# the following conditions:
13
# The above copyright notice and this permission notice shall be
14
# included in all copies or substantial portions of the Software.
16
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27
pyparsing module - Classes and methods to define and execute parsing grammars
29
The pyparsing module is an alternative approach to creating and executing simple grammars,
30
vs. the traditional lex/yacc approach, or the use of regular expressions. With pyparsing, you
31
don't need to learn a new syntax for defining grammars or matching expressions - the parsing module
32
provides a library of classes that you use to construct the grammar directly in Python.
34
Here is a program to parse "Hello, World!" (or any greeting of the form
35
C{"<salutation>, <addressee>!"}), built up using L{Word}, L{Literal}, and L{And} elements
36
(L{'+'<ParserElement.__add__>} operator gives L{And} expressions, strings are auto-converted to
37
L{Literal} expressions)::
39
from pyparsing import Word, alphas
41
# define grammar of a greeting
42
greet = Word(alphas) + "," + Word(alphas) + "!"
44
hello = "Hello, World!"
45
print (hello, "->", greet.parseString(hello))
47
The program outputs the following::
49
Hello, World! -> ['Hello', ',', 'World', '!']
51
The Python representation of the grammar is quite readable, owing to the self-explanatory
52
class names, and the use of '+', '|' and '^' operators.
54
The L{ParseResults} object returned from L{ParserElement.parseString<ParserElement.parseString>} can be accessed as a nested list, a dictionary, or an
55
object with named attributes.
57
The pyparsing module handles some of the problems that are typically vexing when writing text parsers:
58
- extra or missing whitespace (the above program will also handle "Hello,World!", "Hello , World !", etc.)
63
__version__ = "2.1.10"
64
__versionTime__ = "07 Oct 2016 01:31 UTC"
65
__author__ = "Paul McGuire <ptmcg@users.sourceforge.net>"
68
from weakref import ref as wkref
78
from datetime import datetime
81
from _thread import RLock
83
from threading import RLock
86
from collections import OrderedDict as _OrderedDict
89
from ordereddict import OrderedDict as _OrderedDict
93
#~ sys.stderr.write( "testing pyparsing module, version %s, %s\n" % (__version__,__versionTime__ ) )
96
'And', 'CaselessKeyword', 'CaselessLiteral', 'CharsNotIn', 'Combine', 'Dict', 'Each', 'Empty',
97
'FollowedBy', 'Forward', 'GoToColumn', 'Group', 'Keyword', 'LineEnd', 'LineStart', 'Literal',
98
'MatchFirst', 'NoMatch', 'NotAny', 'OneOrMore', 'OnlyOnce', 'Optional', 'Or',
99
'ParseBaseException', 'ParseElementEnhance', 'ParseException', 'ParseExpression', 'ParseFatalException',
100
'ParseResults', 'ParseSyntaxException', 'ParserElement', 'QuotedString', 'RecursiveGrammarException',
101
'Regex', 'SkipTo', 'StringEnd', 'StringStart', 'Suppress', 'Token', 'TokenConverter',
102
'White', 'Word', 'WordEnd', 'WordStart', 'ZeroOrMore',
103
'alphanums', 'alphas', 'alphas8bit', 'anyCloseTag', 'anyOpenTag', 'cStyleComment', 'col',
104
'commaSeparatedList', 'commonHTMLEntity', 'countedArray', 'cppStyleComment', 'dblQuotedString',
105
'dblSlashComment', 'delimitedList', 'dictOf', 'downcaseTokens', 'empty', 'hexnums',
106
'htmlComment', 'javaStyleComment', 'line', 'lineEnd', 'lineStart', 'lineno',
107
'makeHTMLTags', 'makeXMLTags', 'matchOnlyAtCol', 'matchPreviousExpr', 'matchPreviousLiteral',
108
'nestedExpr', 'nullDebugAction', 'nums', 'oneOf', 'opAssoc', 'operatorPrecedence', 'printables',
109
'punc8bit', 'pythonStyleComment', 'quotedString', 'removeQuotes', 'replaceHTMLEntity',
110
'replaceWith', 'restOfLine', 'sglQuotedString', 'srange', 'stringEnd',
111
'stringStart', 'traceParseAction', 'unicodeString', 'upcaseTokens', 'withAttribute',
112
'indentedBlock', 'originalTextFor', 'ungroup', 'infixNotation','locatedExpr', 'withClass',
113
'CloseMatch', 'tokenMap', 'pyparsing_common',
116
system_version = tuple(sys.version_info)[:3]
117
PY_3 = system_version[0] == 3
119
_MAX_INT = sys.maxsize
124
# build list of single arg builtins, that can be used as parse actions
125
singleArgBuiltins = [sum, len, sorted, reversed, list, tuple, set, any, all, min, max]
128
_MAX_INT = sys.maxint
132
"""Drop-in replacement for str(obj) that tries to be Unicode friendly. It first tries
133
str(obj). If that fails with a UnicodeEncodeError, then it tries unicode(obj). It
134
then < returns the unicode object | encodes it with the default encoding | ... >.
136
if isinstance(obj,unicode):
140
# If this works, then _ustr(obj) has the same behaviour as str(obj), so
141
# it won't break any existing code.
144
except UnicodeEncodeError:
146
ret = unicode(obj).encode(sys.getdefaultencoding(), 'xmlcharrefreplace')
147
xmlcharref = Regex('&#\d+;')
148
xmlcharref.setParseAction(lambda t: '\\u' + hex(int(t[0][2:-1]))[2:])
149
return xmlcharref.transformString(ret)
151
# build list of single arg builtins, tolerant of Python version, that can be used as parse actions
152
singleArgBuiltins = []
154
for fname in "sum len sorted reversed list tuple set any all min max".split():
156
singleArgBuiltins.append(getattr(__builtin__,fname))
157
except AttributeError:
160
_generatorType = type((y for y in range(1)))
162
def _xml_escape(data):
    """Escape &, <, >, ", ', etc. in a string of data.

    Each of the five characters is replaced with its named entity
    reference (``&amp;``, ``&gt;``, ``&lt;``, ``&quot;``, ``&apos;``) and
    the escaped string is returned.
    """
    # ampersand must be replaced first, otherwise the '&' introduced by the
    # other entity references would itself be escaped a second time
    replacements = (
        ('&', '&amp;'),
        ('>', '&gt;'),
        ('<', '&lt;'),
        ('"', '&quot;'),
        ("'", '&apos;'),
    )
    for from_, to_ in replacements:
        data = data.replace(from_, to_)
    return data
172
class _Constants(object):
175
alphas = string.ascii_uppercase + string.ascii_lowercase
177
hexnums = nums + "ABCDEFabcdef"
178
alphanums = alphas + nums
180
printables = "".join(c for c in string.printable if c not in string.whitespace)
182
class ParseBaseException(Exception):
183
"""base exception class for all parsing runtime exceptions"""
184
# Performance tuning: we construct a *lot* of these, so keep this
185
# constructor as small and fast as possible
186
def __init__( self, pstr, loc=0, msg=None, elem=None ):
194
self.parserElement = elem
195
self.args = (pstr, loc, msg)
198
def _from_exception(cls, pe):
200
internal factory method to simplify creating one type of ParseException
201
from another - avoids having __init__ signature conflicts among subclasses
203
return cls(pe.pstr, pe.loc, pe.msg, pe.parserElement)
205
def __getattr__( self, aname ):
206
"""supported attributes by name are:
207
- lineno - returns the line number of the exception text
208
- col - returns the column number of the exception text
209
- line - returns the line containing the exception text
211
if( aname == "lineno" ):
212
return lineno( self.loc, self.pstr )
213
elif( aname in ("col", "column") ):
214
return col( self.loc, self.pstr )
215
elif( aname == "line" ):
216
return line( self.loc, self.pstr )
218
raise AttributeError(aname)
221
return "%s (at char %d), (line:%d, col:%d)" % \
222
( self.msg, self.loc, self.lineno, self.column )
223
def __repr__( self ):
225
def markInputline( self, markerString = ">!<" ):
226
"""Extracts the exception line from the input string, and marks
227
the location of the exception with a special symbol.
230
line_column = self.column - 1
232
line_str = "".join((line_str[:line_column],
233
markerString, line_str[line_column:]))
234
return line_str.strip()
236
return "lineno col line".split() + dir(type(self))
238
class ParseException(ParseBaseException):
240
Exception thrown when parse expressions don't match class;
241
supported attributes by name are:
242
- lineno - returns the line number of the exception text
243
- col - returns the column number of the exception text
244
- line - returns the line containing the exception text
248
Word(nums).setName("integer").parseString("ABC")
249
except ParseException as pe:
251
print("column: {}".format(pe.col))
254
Expected integer (at char 0), (line:1, col:1)
259
class ParseFatalException(ParseBaseException):
    """user-throwable exception thrown when inconsistent parse content
       is found; stops all parsing immediately"""
    pass
264
class ParseSyntaxException(ParseFatalException):
    """just like L{ParseFatalException}, but thrown internally when an
       L{ErrorStop<And._ErrorStop>} ('-' operator) indicates that parsing is to stop
       immediately because an unbacktrackable syntax error has been found"""
    pass
270
#~ class ReparseException(ParseBaseException):
271
#~ """Experimental class - parse actions can raise this exception to cause
272
#~ pyparsing to reparse the input string:
273
#~ - with a modified input string, and/or
274
#~ - with a modified start location
275
#~ Set the values of the ReparseException in the constructor, and raise the
276
#~ exception in a parse action to cause pyparsing to use the new string/location.
277
#~ Setting the values as None causes no change to be made.
279
#~ def __init_( self, newstring, restartLoc ):
280
#~ self.newParseText = newstring
281
#~ self.reparseLoc = restartLoc
283
class RecursiveGrammarException(Exception):
    """exception thrown by L{ParserElement.validate} if the grammar could be improperly recursive"""
    def __init__( self, parseElementList ):
        # keep the element trace supplied by the caller for later reporting
        self.parseElementTrace = parseElementList

    def __str__( self ):
        # wrap the trace in a 1-tuple so that a tuple-valued trace is formatted
        # as a single value instead of being unpacked as '%' arguments
        return "RecursiveGrammarException: %s" % (self.parseElementTrace,)
291
class _ParseResultsWithOffset(object):
292
def __init__(self,p1,p2):
294
def __getitem__(self,i):
297
return repr(self.tup[0])
298
def setOffset(self,i):
299
self.tup = (self.tup[0],i)
301
class ParseResults(object):
303
Structured parse results, to provide multiple means of access to the parsed data:
304
- as a list (C{len(results)})
305
- by list index (C{results[0], results[1]}, etc.)
306
- by attribute (C{results.<resultsName>} - see L{ParserElement.setResultsName})
310
date_str = (integer.setResultsName("year") + '/'
311
+ integer.setResultsName("month") + '/'
312
+ integer.setResultsName("day"))
314
# date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
316
# parseString returns a ParseResults object
317
result = date_str.parseString("1999/12/31")
319
def test(s, fn=repr):
320
print("%s -> %s" % (s, fn(eval(s))))
323
test("result['month']")
325
test("'month' in result")
326
test("'minutes' in result")
327
test("result.dump()", str)
329
list(result) -> ['1999', '/', '12', '/', '31']
331
result['month'] -> '12'
333
'month' in result -> True
334
'minutes' in result -> False
335
result.dump() -> ['1999', '/', '12', '/', '31']
340
def __new__(cls, toklist=None, name=None, asList=True, modal=True ):
341
if isinstance(toklist, cls):
343
retobj = object.__new__(cls)
344
retobj.__doinit = True
347
# Performance tuning: we construct a *lot* of these, so keep this
348
# constructor as small and fast as possible
349
def __init__( self, toklist=None, name=None, asList=True, modal=True, isinstance=isinstance ):
351
self.__doinit = False
354
self.__accumNames = {}
355
self.__asList = asList
359
if isinstance(toklist, list):
360
self.__toklist = toklist[:]
361
elif isinstance(toklist, _generatorType):
362
self.__toklist = list(toklist)
364
self.__toklist = [toklist]
365
self.__tokdict = dict()
367
if name is not None and name:
369
self.__accumNames[name] = 0
370
if isinstance(name,int):
371
name = _ustr(name) # will always return a str, but use _ustr for consistency
373
if not (isinstance(toklist, (type(None), basestring, list)) and toklist in (None,'',[])):
374
if isinstance(toklist,basestring):
375
toklist = [ toklist ]
377
if isinstance(toklist,ParseResults):
378
self[name] = _ParseResultsWithOffset(toklist.copy(),0)
380
self[name] = _ParseResultsWithOffset(ParseResults(toklist[0]),0)
381
self[name].__name = name
384
self[name] = toklist[0]
385
except (KeyError,TypeError,IndexError):
388
def __getitem__( self, i ):
389
if isinstance( i, (int,slice) ):
390
return self.__toklist[i]
392
if i not in self.__accumNames:
393
return self.__tokdict[i][-1][0]
395
return ParseResults([ v[0] for v in self.__tokdict[i] ])
397
def __setitem__( self, k, v, isinstance=isinstance ):
398
if isinstance(v,_ParseResultsWithOffset):
399
self.__tokdict[k] = self.__tokdict.get(k,list()) + [v]
401
elif isinstance(k,(int,slice)):
402
self.__toklist[k] = v
405
self.__tokdict[k] = self.__tokdict.get(k,list()) + [_ParseResultsWithOffset(v,0)]
407
if isinstance(sub,ParseResults):
408
sub.__parent = wkref(self)
410
def __delitem__( self, i ):
411
if isinstance(i,(int,slice)):
412
mylen = len( self.__toklist )
413
del self.__toklist[i]
415
# convert int to slice
416
if isinstance(i, int):
420
# get removed indices
421
removed = list(range(*i.indices(mylen)))
423
# fixup indices in token dictionary
424
for name,occurrences in self.__tokdict.items():
426
for k, (value, position) in enumerate(occurrences):
427
occurrences[k] = _ParseResultsWithOffset(value, position - (position > j))
429
del self.__tokdict[i]
431
def __contains__( self, k ):
432
return k in self.__tokdict
434
def __len__( self ): return len( self.__toklist )
435
def __bool__(self): return ( not not self.__toklist )
436
__nonzero__ = __bool__
437
def __iter__( self ): return iter( self.__toklist )
438
def __reversed__( self ): return iter( self.__toklist[::-1] )
439
def _iterkeys( self ):
440
if hasattr(self.__tokdict, "iterkeys"):
441
return self.__tokdict.iterkeys()
443
return iter(self.__tokdict)
445
def _itervalues( self ):
446
return (self[k] for k in self._iterkeys())
448
def _iteritems( self ):
449
return ((k, self[k]) for k in self._iterkeys())
453
"""Returns an iterator of all named result keys (Python 3.x only)."""
456
"""Returns an iterator of all named result values (Python 3.x only)."""
459
"""Returns an iterator of all named result key-value tuples (Python 3.x only)."""
463
"""Returns an iterator of all named result keys (Python 2.x only)."""
465
itervalues = _itervalues
466
"""Returns an iterator of all named result values (Python 2.x only)."""
468
iteritems = _iteritems
469
"""Returns an iterator of all named result key-value tuples (Python 2.x only)."""
472
"""Returns all named result keys (as a list in Python 2.x, as an iterator in Python 3.x)."""
473
return list(self.iterkeys())
476
"""Returns all named result values (as a list in Python 2.x, as an iterator in Python 3.x)."""
477
return list(self.itervalues())
480
"""Returns all named result key-values (as a list of tuples in Python 2.x, as an iterator in Python 3.x)."""
481
return list(self.iteritems())
484
"""Since keys() returns an iterator, this method is helpful in bypassing
485
code that looks for the existence of any defined results names."""
486
return bool(self.__tokdict)
488
def pop( self, *args, **kwargs):
490
Removes and returns item at specified index (default=C{last}).
491
Supports both C{list} and C{dict} semantics for C{pop()}. If passed no
492
argument or an integer argument, it will use C{list} semantics
493
and pop tokens from the list of parsed tokens. If passed a
494
non-integer argument (most likely a string), it will use C{dict}
495
semantics and pop the corresponding value from any defined
496
results names. A second default return value argument is
497
supported, just as in C{dict.pop()}.
500
def remove_first(tokens):
502
print(OneOrMore(Word(nums)).parseString("0 123 321")) # -> ['0', '123', '321']
503
print(OneOrMore(Word(nums)).addParseAction(remove_first).parseString("0 123 321")) # -> ['123', '321']
506
patt = label("LABEL") + OneOrMore(Word(nums))
507
print(patt.parseString("AAB 123 321").dump())
509
# Use pop() in a parse action to remove named result (note that corresponding value is not
510
# removed from list form of results)
511
def remove_LABEL(tokens):
514
patt.addParseAction(remove_LABEL)
515
print(patt.parseString("AAB 123 321").dump())
517
['AAB', '123', '321']
520
['AAB', '123', '321']
524
for k,v in kwargs.items():
528
raise TypeError("pop() got an unexpected keyword argument '%s'" % k)
529
if (isinstance(args[0], int) or
537
defaultvalue = args[1]
540
def get(self, key, defaultValue=None):
542
Returns named result matching the given key, or if there is no
543
such name, then returns the given C{defaultValue} or C{None} if no
544
C{defaultValue} is specified.
546
Similar to C{dict.get()}.
550
date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
552
result = date_str.parseString("1999/12/31")
553
print(result.get("year")) # -> '1999'
554
print(result.get("hour", "not specified")) # -> 'not specified'
555
print(result.get("hour")) # -> None
562
def insert( self, index, insStr ):
564
Inserts new element at location index in the list of parsed tokens.
566
Similar to C{list.insert()}.
569
print(OneOrMore(Word(nums)).parseString("0 123 321")) # -> ['0', '123', '321']
571
# use a parse action to insert the parse location in the front of the parsed results
572
def insert_locn(locn, tokens):
573
tokens.insert(0, locn)
574
print(OneOrMore(Word(nums)).addParseAction(insert_locn).parseString("0 123 321")) # -> [0, '0', '123', '321']
576
self.__toklist.insert(index, insStr)
577
# fixup indices in token dictionary
578
for name,occurrences in self.__tokdict.items():
579
for k, (value, position) in enumerate(occurrences):
580
occurrences[k] = _ParseResultsWithOffset(value, position + (position > index))
582
def append( self, item ):
584
Add single element to end of ParseResults list of elements.
587
print(OneOrMore(Word(nums)).parseString("0 123 321")) # -> ['0', '123', '321']
589
# use a parse action to compute the sum of the parsed integers, and add it to the end
590
def append_sum(tokens):
591
tokens.append(sum(map(int, tokens)))
592
print(OneOrMore(Word(nums)).addParseAction(append_sum).parseString("0 123 321")) # -> ['0', '123', '321', 444]
594
self.__toklist.append(item)
596
def extend( self, itemseq ):
598
Add sequence of elements to end of ParseResults list of elements.
601
patt = OneOrMore(Word(alphas))
603
# use a parse action to append the reverse of the matched strings, to make a palindrome
604
def make_palindrome(tokens):
605
tokens.extend(reversed([t[::-1] for t in tokens]))
606
return ''.join(tokens)
607
print(patt.addParseAction(make_palindrome).parseString("lskdj sdlkjf lksd")) # -> 'lskdjsdlkjflksddsklfjkldsjdksl'
609
if isinstance(itemseq, ParseResults):
612
self.__toklist.extend(itemseq)
616
Clear all elements and results names.
618
del self.__toklist[:]
619
self.__tokdict.clear()
621
def __getattr__( self, name ):
627
if name in self.__tokdict:
628
if name not in self.__accumNames:
629
return self.__tokdict[name][-1][0]
631
return ParseResults([ v[0] for v in self.__tokdict[name] ])
635
def __add__( self, other ):
640
def __iadd__( self, other ):
642
offset = len(self.__toklist)
643
addoffset = lambda a: offset if a<0 else a+offset
644
otheritems = other.__tokdict.items()
645
otherdictitems = [(k, _ParseResultsWithOffset(v[0],addoffset(v[1])) )
646
for (k,vlist) in otheritems for v in vlist]
647
for k,v in otherdictitems:
649
if isinstance(v[0],ParseResults):
650
v[0].__parent = wkref(self)
652
self.__toklist += other.__toklist
653
self.__accumNames.update( other.__accumNames )
656
def __radd__(self, other):
657
if isinstance(other,int) and other == 0:
658
# useful for merging many ParseResults using sum() builtin
661
# this may raise a TypeError - so be it
664
def __repr__( self ):
665
return "(%s, %s)" % ( repr( self.__toklist ), repr( self.__tokdict ) )
668
return '[' + ', '.join(_ustr(i) if isinstance(i, ParseResults) else repr(i) for i in self.__toklist) + ']'
670
def _asStringList( self, sep='' ):
672
for item in self.__toklist:
675
if isinstance( item, ParseResults ):
676
out += item._asStringList()
678
out.append( _ustr(item) )
683
Returns the parse results as a nested list of matching tokens, all converted to strings.
686
patt = OneOrMore(Word(alphas))
687
result = patt.parseString("sldkj lsdkj sldkj")
688
# even though the result prints in string-like form, it is actually a pyparsing ParseResults
689
print(type(result), result) # -> <class 'pyparsing.ParseResults'> ['sldkj', 'lsdkj', 'sldkj']
691
# Use asList() to create an actual list
692
result_list = result.asList()
693
print(type(result_list), result_list) # -> <class 'list'> ['sldkj', 'lsdkj', 'sldkj']
695
return [res.asList() if isinstance(res,ParseResults) else res for res in self.__toklist]
699
Returns the named parse results as a nested dictionary.
703
date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
705
result = date_str.parseString('12/31/1999')
706
print(type(result), repr(result)) # -> <class 'pyparsing.ParseResults'> (['12', '/', '31', '/', '1999'], {'day': [('1999', 4)], 'year': [('12', 0)], 'month': [('31', 2)]})
708
result_dict = result.asDict()
709
print(type(result_dict), repr(result_dict)) # -> <class 'dict'> {'day': '1999', 'year': '12', 'month': '31'}
711
# even though a ParseResults supports dict-like access, sometimes you just need to have a dict
713
print(json.dumps(result)) # -> Exception: TypeError: ... is not JSON serializable
714
print(json.dumps(result.asDict())) # -> {"month": "31", "day": "1999", "year": "12"}
719
item_fn = self.iteritems
722
if isinstance(obj, ParseResults):
726
return [toItem(v) for v in obj]
730
return dict((k,toItem(v)) for k,v in item_fn())
734
Returns a new copy of a C{ParseResults} object.
736
ret = ParseResults( self.__toklist )
737
ret.__tokdict = self.__tokdict.copy()
738
ret.__parent = self.__parent
739
ret.__accumNames.update( self.__accumNames )
740
ret.__name = self.__name
743
def asXML( self, doctag=None, namedItemsOnly=False, indent="", formatted=True ):
745
(Deprecated) Returns the parse results as XML. Tags are created for tokens and lists that have defined results names.
749
namedItems = dict((v[1],k) for (k,vlist) in self.__tokdict.items()
751
nextLevelIndent = indent + " "
753
# collapse out indents if formatting is not desired
760
if doctag is not None:
764
selfTag = self.__name
772
out += [ nl, indent, "<", selfTag, ">" ]
774
for i,res in enumerate(self.__toklist):
775
if isinstance(res,ParseResults):
777
out += [ res.asXML(namedItems[i],
778
namedItemsOnly and doctag is None,
782
out += [ res.asXML(None,
783
namedItemsOnly and doctag is None,
787
# individual token, see if there is a name for it
790
resTag = namedItems[i]
796
xmlBodyText = _xml_escape(_ustr(res))
797
out += [ nl, nextLevelIndent, "<", resTag, ">",
801
out += [ nl, indent, "</", selfTag, ">" ]
804
def __lookup(self,sub):
805
for k,vlist in self.__tokdict.items():
813
Returns the results name for this token expression. Useful when several
814
different expressions might match at a particular location.
818
ssn_expr = Regex(r"\d\d\d-\d\d-\d\d\d\d")
819
house_number_expr = Suppress('#') + Word(nums, alphanums)
820
user_data = (Group(house_number_expr)("house_number")
821
| Group(ssn_expr)("ssn")
822
| Group(integer)("age"))
823
user_info = OneOrMore(user_data)
825
result = user_info.parseString("22 111-22-3333 #221B")
827
print(item.getName(), ':', item[0])
836
par = self.__parent()
838
return par.__lookup(self)
841
elif (len(self) == 1 and
842
len(self.__tokdict) == 1 and
843
next(iter(self.__tokdict.values()))[0][1] in (0,-1)):
844
return next(iter(self.__tokdict.keys()))
848
def dump(self, indent='', depth=0, full=True):
850
Diagnostic method for listing out the contents of a C{ParseResults}.
851
Accepts an optional C{indent} argument so that this string can be embedded
852
in a nested display of other data.
856
date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
858
result = date_str.parseString('12/31/1999')
861
['12', '/', '31', '/', '1999']
868
out.append( indent+_ustr(self.asList()) )
871
items = sorted((str(k), v) for k,v in self.items())
875
out.append( "%s%s- %s: " % (indent,(' '*depth), k) )
876
if isinstance(v,ParseResults):
878
out.append( v.dump(indent,depth+1) )
883
elif any(isinstance(vv,ParseResults) for vv in self):
885
for i,vv in enumerate(v):
886
if isinstance(vv,ParseResults):
887
out.append("\n%s%s[%d]:\n%s%s%s" % (indent,(' '*(depth)),i,indent,(' '*(depth+1)),vv.dump(indent,depth+1) ))
889
out.append("\n%s%s[%d]:\n%s%s%s" % (indent,(' '*(depth)),i,indent,(' '*(depth+1)),_ustr(vv)))
893
def pprint(self, *args, **kwargs):
895
Pretty-printer for parsed results as a list, using the C{pprint} module.
896
Accepts additional positional or keyword args as defined for the
897
C{pprint.pprint} method. (U{http://docs.python.org/3/library/pprint.html#pprint.pprint})
900
ident = Word(alphas, alphanums)
903
term = ident | num | Group('(' + func + ')')
904
func <<= ident + Group(Optional(delimitedList(term)))
905
result = func.parseString("fna a,b,(fnb c,d,200),100")
906
result.pprint(width=40)
911
['(', 'fnb', ['c', 'd', '200'], ')'],
914
pprint.pprint(self.asList(), *args, **kwargs)
916
# add support for pickle protocol
917
def __getstate__(self):
918
return ( self.__toklist,
919
( self.__tokdict.copy(),
920
self.__parent is not None and self.__parent() or None,
924
def __setstate__(self,state):
925
self.__toklist = state[0]
929
self.__name) = state[1]
930
self.__accumNames = {}
931
self.__accumNames.update(inAccumNames)
933
self.__parent = wkref(par)
937
def __getnewargs__(self):
938
return self.__toklist, self.__name, self.__asList, self.__modal
941
return (dir(type(self)) + list(self.keys()))
943
collections.MutableMapping.register(ParseResults)
946
"""Returns current column within a string, counting newlines as line separators.
947
The first column is number 1.
949
Note: the default parsing behavior is to expand tabs in the input string
950
before starting the parsing process. See L{I{ParserElement.parseString}<ParserElement.parseString>} for more information
951
on parsing strings containing C{<TAB>}s, and suggested methods to maintain a
952
consistent view of the parsed string, the parse location, and line and column
953
positions within the parsed string.
956
return 1 if 0<loc<len(s) and s[loc-1] == '\n' else loc - s.rfind("\n", 0, loc)
958
def lineno(loc,strg):
    """Returns current line number within a string, counting newlines as line separators.
   The first line is number 1.

   Note: the default parsing behavior is to expand tabs in the input string
   before starting the parsing process.  See L{I{ParserElement.parseString}<ParserElement.parseString>} for more information
   on parsing strings containing C{<TAB>}s, and suggested methods to maintain a
   consistent view of the parsed string, the parse location, and line and column
   positions within the parsed string.
   """
    # number of newlines strictly before loc, plus one for the 1-based line number
    return strg.count("\n",0,loc) + 1
970
def line( loc, strg ):
    """Returns the line of text containing loc within a string, counting newlines as line separators.
       """
    # index of the newline preceding loc (-1 when loc is on the first line)
    lastCR = strg.rfind("\n", 0, loc)
    # index of the first newline at or after loc (-1 when loc is on the last line)
    nextCR = strg.find("\n", loc)
    if nextCR >= 0:
        return strg[lastCR+1:nextCR]
    else:
        return strg[lastCR+1:]
980
def _defaultStartDebugAction( instring, loc, expr ):
    """Print a "Match <expr> at loc <loc>(<line>,<col>)" message for the given
    expression and location within C{instring}."""
    print (("Match " + _ustr(expr) + " at loc " + _ustr(loc) + "(%d,%d)" % ( lineno(loc,instring), col(loc,instring) )))
983
def _defaultSuccessDebugAction( instring, startloc, endloc, expr, toks ):
    """Print a "Matched <expr> -> <tokens>" message, showing the tokens in
    their C{asList()} form."""
    print ("Matched " + _ustr(expr) + " -> " + str(toks.asList()))
986
def _defaultExceptionDebugAction( instring, loc, expr, exc ):
    """Print an "Exception raised:" message followed by the string form of C{exc}."""
    print ("Exception raised:" + _ustr(exc))
989
def nullDebugAction(*args):
    """'Do-nothing' debug action, to suppress debugging output during parsing."""
    pass
993
# Only works on Python 3.x - nonlocal is toxic to Python 2 installs
994
#~ 'decorator to trim function calls to match the arity of the target'
995
#~ def _trim_arity(func, maxargs=3):
996
#~ if func in singleArgBuiltins:
997
#~ return lambda s,l,t: func(t)
999
#~ foundArity = False
1000
#~ def wrapper(*args):
1001
#~ nonlocal limit,foundArity
1004
#~ ret = func(*args[limit:])
1005
#~ foundArity = True
1007
#~ except TypeError:
1008
#~ if limit == maxargs or foundArity:
1014
# this version is Python 2.x-3.x cross-compatible
1015
'decorator to trim function calls to match the arity of the target'
1016
def _trim_arity(func, maxargs=2):
1017
if func in singleArgBuiltins:
1018
return lambda s,l,t: func(t)
1020
foundArity = [False]
1022
# traceback return data structure changed in Py3.5 - normalize back to plain tuples
1023
if system_version[:2] >= (3,5):
1024
def extract_stack(limit=0):
1025
# special handling for Python 3.5.0 - extra deep call stack by 1
1026
offset = -3 if system_version == (3,5,0) else -2
1027
frame_summary = traceback.extract_stack(limit=-offset+limit-1)[offset]
1028
return [(frame_summary.filename, frame_summary.lineno)]
1029
def extract_tb(tb, limit=0):
1030
frames = traceback.extract_tb(tb, limit=limit)
1031
frame_summary = frames[-1]
1032
return [(frame_summary.filename, frame_summary.lineno)]
1034
extract_stack = traceback.extract_stack
1035
extract_tb = traceback.extract_tb
1037
# synthesize what would be returned by traceback.extract_stack at the call to
1038
# user's parse action 'func', so that we don't incur call penalty at parse time
1041
# IF ANY CODE CHANGES, EVEN JUST COMMENTS OR BLANK LINES, BETWEEN THE NEXT LINE AND
1042
# THE CALL TO FUNC INSIDE WRAPPER, LINE_DIFF MUST BE MODIFIED!!!!
1043
this_line = extract_stack(limit=2)[-1]
1044
pa_call_line_synth = (this_line[0], this_line[1]+LINE_DIFF)
1049
ret = func(*args[limit[0]:])
1050
foundArity[0] = True
1053
# re-raise TypeErrors if they did not come from our arity testing
1058
tb = sys.exc_info()[-1]
1059
if not extract_tb(tb, limit=2)[-1][:2] == pa_call_line_synth:
1064
if limit[0] <= maxargs:
1069
# copy func name to wrapper for sensible debug output
1070
func_name = "<parse action>"
1072
func_name = getattr(func, '__name__',
1073
getattr(func, '__class__').__name__)
1075
func_name = str(func)
1076
wrapper.__name__ = func_name
1080
class ParserElement(object):
1081
"""Abstract base level parser element class."""
1082
DEFAULT_WHITE_CHARS = " \n\t\r"
1083
verbose_stacktrace = False
1086
def setDefaultWhitespaceChars( chars ):
1088
Overrides the default whitespace chars
1091
# default whitespace chars are space, <TAB> and newline
1092
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def', 'ghi', 'jkl']
1094
# change to just treat newline as significant
1095
ParserElement.setDefaultWhitespaceChars(" \t")
1096
OneOrMore(Word(alphas)).parseString("abc def\nghi jkl") # -> ['abc', 'def']
1098
ParserElement.DEFAULT_WHITE_CHARS = chars
1101
def inlineLiteralsUsing(cls):
1103
Set class to be used for inclusion of string literals into a parser.
1106
# default literal class used is Literal
1107
integer = Word(nums)
1108
date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
1110
date_str.parseString("1999/12/31") # -> ['1999', '/', '12', '/', '31']
1113
# change to Suppress
1114
ParserElement.inlineLiteralsUsing(Suppress)
1115
date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
1117
date_str.parseString("1999/12/31") # -> ['1999', '12', '31']
1119
ParserElement._literalStringClass = cls
1121
def __init__( self, savelist=False ):
1122
self.parseAction = list()
1123
self.failAction = None
1124
#~ self.name = "<unknown>" # don't define self.name, let subclasses try/except upcall
1126
self.resultsName = None
1127
self.saveAsList = savelist
1128
self.skipWhitespace = True
1129
self.whiteChars = ParserElement.DEFAULT_WHITE_CHARS
1130
self.copyDefaultWhiteChars = True
1131
self.mayReturnEmpty = False # used when checking for left-recursion
1132
self.keepTabs = False
1133
self.ignoreExprs = list()
1135
self.streamlined = False
1136
self.mayIndexError = True # used to optimize exception handling for subclasses that don't advance parse index
1138
self.modalResults = True # used to mark results names as modal (report only last) or cumulative (list all)
1139
self.debugActions = ( None, None, None ) #custom debug actions
1141
self.callPreparse = True # used to avoid redundant calls to preParse
1142
self.callDuringTry = False
1146
Make a copy of this C{ParserElement}. Useful for defining different parse actions
1147
for the same parsing pattern, using copies of the original parse element.
1150
integer = Word(nums).setParseAction(lambda toks: int(toks[0]))
1151
integerK = integer.copy().addParseAction(lambda toks: toks[0]*1024) + Suppress("K")
1152
integerM = integer.copy().addParseAction(lambda toks: toks[0]*1024*1024) + Suppress("M")
1154
print(OneOrMore(integerK | integerM | integer).parseString("5K 100 640K 256M"))
1156
[5120, 100, 655360, 268435456]
1157
Equivalent form of C{expr.copy()} is just C{expr()}::
1158
integerM = integer().addParseAction(lambda toks: toks[0]*1024*1024) + Suppress("M")
1160
cpy = copy.copy( self )
1161
cpy.parseAction = self.parseAction[:]
1162
cpy.ignoreExprs = self.ignoreExprs[:]
1163
if self.copyDefaultWhiteChars:
1164
cpy.whiteChars = ParserElement.DEFAULT_WHITE_CHARS
1167
def setName( self, name ):
1169
Define name for this expression, makes debugging and exception messages clearer.
1172
Word(nums).parseString("ABC") # -> Exception: Expected W:(0123...) (at char 0), (line:1, col:1)
1173
Word(nums).setName("integer").parseString("ABC") # -> Exception: Expected integer (at char 0), (line:1, col:1)
1176
self.errmsg = "Expected " + self.name
1177
if hasattr(self,"exception"):
1178
self.exception.msg = self.errmsg
1181
def setResultsName( self, name, listAllMatches=False ):
1183
Define name for referencing matching tokens as a nested attribute
1184
of the returned parse results.
1185
NOTE: this returns a *copy* of the original C{ParserElement} object;
1186
this is so that the client can define a basic element, such as an
1187
integer, and reference it in multiple places with different names.
1189
You can also set results names using the abbreviated syntax,
1190
C{expr("name")} in place of C{expr.setResultsName("name")} -
1191
see L{I{__call__}<__call__>}.
1194
date_str = (integer.setResultsName("year") + '/'
1195
+ integer.setResultsName("month") + '/'
1196
+ integer.setResultsName("day"))
1199
date_str = integer("year") + '/' + integer("month") + '/' + integer("day")
1201
newself = self.copy()
1202
if name.endswith("*"):
1205
newself.resultsName = name
1206
newself.modalResults = not listAllMatches
1209
def setBreak(self,breakFlag = True):
1210
"""Method to invoke the Python pdb debugger when this element is
1211
about to be parsed. Set C{breakFlag} to True to enable, False to
1215
_parseMethod = self._parse
1216
def breaker(instring, loc, doActions=True, callPreParse=True):
1219
return _parseMethod( instring, loc, doActions, callPreParse )
1220
breaker._originalParseMethod = _parseMethod
1221
self._parse = breaker
1223
if hasattr(self._parse,"_originalParseMethod"):
1224
self._parse = self._parse._originalParseMethod
1227
def setParseAction( self, *fns, **kwargs ):
1229
Define action to perform when successfully matching parse element definition.
1230
Parse action fn is a callable method with 0-3 arguments, called as C{fn(s,loc,toks)},
1231
C{fn(loc,toks)}, C{fn(toks)}, or just C{fn()}, where:
1232
- s = the original string being parsed (see note below)
1233
- loc = the location of the matching substring
1234
- toks = a list of the matched tokens, packaged as a C{L{ParseResults}} object
1235
If the functions in fns modify the tokens, they can return them as the return
1236
value from fn, and the modified list of tokens will replace the original.
1237
Otherwise, fn does not need to return any value.
1239
Optional keyword arguments:
1240
- callDuringTry = (default=C{False}) indicate if parse action should be run during lookaheads and alternate testing
1242
Note: the default parsing behavior is to expand tabs in the input string
1243
before starting the parsing process. See L{I{parseString}<parseString>} for more information
1244
on parsing strings containing C{<TAB>}s, and suggested methods to maintain a
1245
consistent view of the parsed string, the parse location, and line and column
1246
positions within the parsed string.
1249
integer = Word(nums)
1250
date_str = integer + '/' + integer + '/' + integer
1252
date_str.parseString("1999/12/31") # -> ['1999', '/', '12', '/', '31']
1254
# use parse action to convert to ints at parse time
1255
integer = Word(nums).setParseAction(lambda toks: int(toks[0]))
1256
date_str = integer + '/' + integer + '/' + integer
1258
# note that integer fields are now ints, not strings
1259
date_str.parseString("1999/12/31") # -> [1999, '/', 12, '/', 31]
1261
self.parseAction = list(map(_trim_arity, list(fns)))
1262
self.callDuringTry = kwargs.get("callDuringTry", False)
1265
def addParseAction( self, *fns, **kwargs ):
1267
Add parse action to expression's list of parse actions. See L{I{setParseAction}<setParseAction>}.
1269
See examples in L{I{copy}<copy>}.
1271
self.parseAction += list(map(_trim_arity, list(fns)))
1272
self.callDuringTry = self.callDuringTry or kwargs.get("callDuringTry", False)
1275
def addCondition(self, *fns, **kwargs):
1276
"""Add a boolean predicate function to expression's list of parse actions. See
1277
L{I{setParseAction}<setParseAction>} for function call signatures. Unlike C{setParseAction},
1278
functions passed to C{addCondition} need to return boolean success/fail of the condition.
1280
Optional keyword arguments:
1281
- message = define a custom message to be used in the raised exception
1282
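- fatal = if True, will raise ParseFatalException to stop parsing immediately; otherwise will raise ParseException
- callDuringTry = (default=C{False}) indicate if the condition should be run during lookaheads and alternate testing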
1285
integer = Word(nums).setParseAction(lambda toks: int(toks[0]))
1286
year_int = integer.copy()
1287
year_int.addCondition(lambda toks: toks[0] >= 2000, message="Only support years 2000 and later")
1288
date_str = year_int + '/' + integer + '/' + integer
1290
result = date_str.parseString("1999/12/31") # -> Exception: Only support years 2000 and later (at char 0), (line:1, col:1)
1292
msg = kwargs.get("message", "failed user-defined condition")
1293
exc_type = ParseFatalException if kwargs.get("fatal", False) else ParseException
1296
if not bool(_trim_arity(fn)(s,l,t)):
1297
raise exc_type(s,l,msg)
1298
self.parseAction.append(pa)
1299
self.callDuringTry = self.callDuringTry or kwargs.get("callDuringTry", False)
1302
def setFailAction( self, fn ):
1303
"""Define action to perform if parsing fails at this expression.
1304
Fail action fn is a callable function that takes the arguments
1305
C{fn(s,loc,expr,err)} where:
1306
- s = string being parsed
1307
- loc = location where expression match was attempted and failed
1308
- expr = the parse expression that failed
1309
- err = the exception thrown
1310
The function returns no value. It may throw C{L{ParseFatalException}}
1311
if it is desired to stop parsing immediately."""
1312
self.failAction = fn
1315
def _skipIgnorables( self, instring, loc ):
1319
for e in self.ignoreExprs:
1322
loc,dummy = e._parse( instring, loc )
1324
except ParseException:
1328
def preParse( self, instring, loc ):
1329
if self.ignoreExprs:
1330
loc = self._skipIgnorables( instring, loc )
1332
if self.skipWhitespace:
1333
wt = self.whiteChars
1334
instrlen = len(instring)
1335
while loc < instrlen and instring[loc] in wt:
1340
def parseImpl( self, instring, loc, doActions=True ):
1343
def postParse( self, instring, loc, tokenlist ):
1347
def _parseNoCache( self, instring, loc, doActions=True, callPreParse=True ):
1348
debugging = ( self.debug ) #and doActions )
1350
if debugging or self.failAction:
1351
#~ print ("Match",self,"at loc",loc,"(%d,%d)" % ( lineno(loc,instring), col(loc,instring) ))
1352
if (self.debugActions[0] ):
1353
self.debugActions[0]( instring, loc, self )
1354
if callPreParse and self.callPreparse:
1355
preloc = self.preParse( instring, loc )
1358
tokensStart = preloc
1361
loc,tokens = self.parseImpl( instring, preloc, doActions )
1363
raise ParseException( instring, len(instring), self.errmsg, self )
1364
except ParseBaseException as err:
1365
#~ print ("Exception raised:", err)
1366
if self.debugActions[2]:
1367
self.debugActions[2]( instring, tokensStart, self, err )
1369
self.failAction( instring, tokensStart, self, err )
1372
if callPreParse and self.callPreparse:
1373
preloc = self.preParse( instring, loc )
1376
tokensStart = preloc
1377
if self.mayIndexError or loc >= len(instring):
1379
loc,tokens = self.parseImpl( instring, preloc, doActions )
1381
raise ParseException( instring, len(instring), self.errmsg, self )
1383
loc,tokens = self.parseImpl( instring, preloc, doActions )
1385
tokens = self.postParse( instring, loc, tokens )
1387
retTokens = ParseResults( tokens, self.resultsName, asList=self.saveAsList, modal=self.modalResults )
1388
if self.parseAction and (doActions or self.callDuringTry):
1391
for fn in self.parseAction:
1392
tokens = fn( instring, tokensStart, retTokens )
1393
if tokens is not None:
1394
retTokens = ParseResults( tokens,
1396
asList=self.saveAsList and isinstance(tokens,(ParseResults,list)),
1397
modal=self.modalResults )
1398
except ParseBaseException as err:
1399
#~ print "Exception raised in user parse action:", err
1400
if (self.debugActions[2] ):
1401
self.debugActions[2]( instring, tokensStart, self, err )
1404
for fn in self.parseAction:
1405
tokens = fn( instring, tokensStart, retTokens )
1406
if tokens is not None:
1407
retTokens = ParseResults( tokens,
1409
asList=self.saveAsList and isinstance(tokens,(ParseResults,list)),
1410
modal=self.modalResults )
1413
#~ print ("Matched",self,"->",retTokens.asList())
1414
if (self.debugActions[1] ):
1415
self.debugActions[1]( instring, tokensStart, loc, self, retTokens )
1417
return loc, retTokens
1419
def tryParse( self, instring, loc ):
1421
return self._parse( instring, loc, doActions=False )[0]
1422
except ParseFatalException:
1423
raise ParseException( instring, loc, self.errmsg, self)
1425
def canParseNext(self, instring, loc):
1427
self.tryParse(instring, loc)
1428
except (ParseException, IndexError):
1433
class _UnboundedCache(object):
1436
self.not_in_cache = not_in_cache = object()
1439
return cache.get(key, not_in_cache)
1441
def set(self, key, value):
1447
self.get = types.MethodType(get, self)
1448
self.set = types.MethodType(set, self)
1449
self.clear = types.MethodType(clear, self)
1451
if _OrderedDict is not None:
1452
class _FifoCache(object):
1453
def __init__(self, size):
1454
self.not_in_cache = not_in_cache = object()
1456
cache = _OrderedDict()
1459
return cache.get(key, not_in_cache)
1461
def set(self, key, value):
1463
if len(cache) > size:
1464
cache.popitem(False)
1469
self.get = types.MethodType(get, self)
1470
self.set = types.MethodType(set, self)
1471
self.clear = types.MethodType(clear, self)
1474
class _FifoCache(object):
1475
def __init__(self, size):
1476
self.not_in_cache = not_in_cache = object()
1479
key_fifo = collections.deque([], size)
1482
return cache.get(key, not_in_cache)
1484
def set(self, key, value):
1486
if len(cache) > size:
1487
cache.pop(key_fifo.popleft(), None)
1488
key_fifo.append(key)
1494
self.get = types.MethodType(get, self)
1495
self.set = types.MethodType(set, self)
1496
self.clear = types.MethodType(clear, self)
1498
# argument cache for optimizing repeated calls when backtracking through recursive expressions
1499
packrat_cache = {} # this is set later by enablePackrat(); this is here so that resetCache() doesn't fail
1500
# lock acquired by _parseCache before it consults the packrat cache
packrat_cache_lock = RLock()
1501
# two counters, indexed by the HIT and MISS constants used in _parseCache; zeroed again by resetCache()
packrat_cache_stats = [0, 0]
1503
# this method gets repeatedly called during backtracking with the same arguments -
1504
# we can cache these arguments and save ourselves the trouble of re-parsing the contained expression
1505
def _parseCache( self, instring, loc, doActions=True, callPreParse=True ):
1507
lookup = (self, instring, loc, callPreParse, doActions)
1508
with ParserElement.packrat_cache_lock:
1509
cache = ParserElement.packrat_cache
1510
value = cache.get(lookup)
1511
if value is cache.not_in_cache:
1512
ParserElement.packrat_cache_stats[MISS] += 1
1514
value = self._parseNoCache(instring, loc, doActions, callPreParse)
1515
except ParseBaseException as pe:
1516
# cache a copy of the exception, without the traceback
1517
cache.set(lookup, pe.__class__(*pe.args))
1520
cache.set(lookup, (value[0], value[1].copy()))
1523
ParserElement.packrat_cache_stats[HIT] += 1
1524
if isinstance(value, Exception):
1526
return (value[0], value[1].copy())
1528
_parse = _parseNoCache
1532
ParserElement.packrat_cache.clear()
1533
ParserElement.packrat_cache_stats[:] = [0] * len(ParserElement.packrat_cache_stats)
1535
_packratEnabled = False
1537
def enablePackrat(cache_size_limit=128):
1538
"""Enables "packrat" parsing, which adds memoizing to the parsing logic.
1539
Repeated parse attempts at the same string location (which happens
1540
often in many complex grammars) can immediately return a cached value,
1541
instead of re-executing parsing/validating code. Memoizing is done of
1542
both valid results and parsing exceptions.
1545
- cache_size_limit - (default=C{128}) - if an integer value is provided
1546
will limit the size of the packrat cache; if None is passed, then
1547
the cache size will be unbounded; if 0 is passed, the cache will
1548
be effectively disabled.
1550
This speedup may break existing programs that use parse actions that
1551
have side-effects. For this reason, packrat parsing is disabled when
1552
you first import pyparsing. To activate the packrat feature, your
1553
program must call the class method C{ParserElement.enablePackrat()}. If
1554
your program uses C{psyco} to "compile as you go", you must call
1555
C{enablePackrat} before calling C{psyco.full()}. If you do not do this,
1556
Python will crash. For best results, call C{enablePackrat()} immediately
1557
after importing pyparsing.
1561
pyparsing.ParserElement.enablePackrat()
1563
if not ParserElement._packratEnabled:
1564
ParserElement._packratEnabled = True
1565
if cache_size_limit is None:
1566
ParserElement.packrat_cache = ParserElement._UnboundedCache()
1568
ParserElement.packrat_cache = ParserElement._FifoCache(cache_size_limit)
1569
ParserElement._parse = ParserElement._parseCache
1571
def parseString( self, instring, parseAll=False ):
1573
Execute the parse expression with the given string.
1574
This is the main interface to the client code, once the complete
1575
expression has been built.
1577
If you want the grammar to require that the entire input string be
1578
successfully parsed, then set C{parseAll} to True (equivalent to ending
1579
the grammar with C{L{StringEnd()}}).
1581
Note: C{parseString} implicitly calls C{expandtabs()} on the input string,
1582
in order to report proper column numbers in parse actions.
1583
If the input string contains tabs and
1584
the grammar uses parse actions that use the C{loc} argument to index into the
1585
string being parsed, you can ensure you have a consistent view of the input
1587
- calling C{parseWithTabs} on your grammar before calling C{parseString}
1588
(see L{I{parseWithTabs}<parseWithTabs>})
1589
- define your parse action using the full C{(s,loc,toks)} signature, and
1590
reference the input string using the parse action's C{s} argument
1591
- explicitly expand the tabs in your input string before calling
1595
Word('a').parseString('aaaaabaaa') # -> ['aaaaa']
1596
Word('a').parseString('aaaaabaaa', parseAll=True) # -> Exception: Expected end of text
1598
ParserElement.resetCache()
1599
if not self.streamlined:
1601
#~ self.saveAsList = True
1602
for e in self.ignoreExprs:
1604
if not self.keepTabs:
1605
instring = instring.expandtabs()
1607
loc, tokens = self._parse( instring, 0 )
1609
loc = self.preParse( instring, loc )
1610
se = Empty() + StringEnd()
1611
se._parse( instring, loc )
1612
except ParseBaseException as exc:
1613
if ParserElement.verbose_stacktrace:
1616
# catch and re-raise exception from here, clears out pyparsing internal stack trace
1621
def scanString( self, instring, maxMatches=_MAX_INT, overlap=False ):
1623
Scan the input string for expression matches. Each match will return the
1624
matching tokens, start location, and end location. May be called with optional
1625
C{maxMatches} argument, to clip scanning after 'n' matches are found. If
1626
C{overlap} is specified, then overlapping matches will be reported.
1628
Note that the start and end locations are reported relative to the string
1629
being parsed. See L{I{parseString}<parseString>} for more information on parsing
1630
strings with embedded tabs.
1633
source = "sldjf123lsdjjkf345sldkjf879lkjsfd987"
1635
for tokens,start,end in Word(alphas).scanString(source):
1636
print(' '*start + '^'*(end-start))
1637
print(' '*start + tokens[0])
1641
sldjf123lsdjjkf345sldkjf879lkjsfd987
1651
if not self.streamlined:
1653
for e in self.ignoreExprs:
1656
if not self.keepTabs:
1657
instring = _ustr(instring).expandtabs()
1658
instrlen = len(instring)
1660
preparseFn = self.preParse
1661
parseFn = self._parse
1662
ParserElement.resetCache()
1665
while loc <= instrlen and matches < maxMatches:
1667
preloc = preparseFn( instring, loc )
1668
nextLoc,tokens = parseFn( instring, preloc, callPreParse=False )
1669
except ParseException:
1674
yield tokens, preloc, nextLoc
1676
nextloc = preparseFn( instring, loc )
1685
except ParseBaseException as exc:
1686
if ParserElement.verbose_stacktrace:
1689
# catch and re-raise exception from here, clears out pyparsing internal stack trace
1692
def transformString( self, instring ):
1694
Extension to C{L{scanString}}, to modify matching text with modified tokens that may
1695
be returned from a parse action. To use C{transformString}, define a grammar and
1696
attach a parse action to it that modifies the returned token list.
1697
Invoking C{transformString()} on a target string will then scan for matches,
1698
and replace the matched text patterns according to the logic in the parse
1699
action. C{transformString()} returns the resulting transformed string.
1703
wd.setParseAction(lambda toks: toks[0].title())
1705
print(wd.transformString("now is the winter of our discontent made glorious summer by this sun of york."))
1707
Now Is The Winter Of Our Discontent Made Glorious Summer By This Sun Of York.
1711
# force preservation of <TAB>s, to minimize unwanted transformation of string, and to
1712
# keep string locs straight between transformString and scanString
1713
self.keepTabs = True
1715
for t,s,e in self.scanString( instring ):
1716
out.append( instring[lastE:s] )
1718
if isinstance(t,ParseResults):
1720
elif isinstance(t,list):
1725
out.append(instring[lastE:])
1726
out = [o for o in out if o]
1727
return "".join(map(_ustr,_flatten(out)))
1728
except ParseBaseException as exc:
1729
if ParserElement.verbose_stacktrace:
1732
# catch and re-raise exception from here, clears out pyparsing internal stack trace
1735
def searchString( self, instring, maxMatches=_MAX_INT ):
1737
Another extension to C{L{scanString}}, simplifying the access to the tokens found
1738
to match the given parse expression. May be called with optional
1739
C{maxMatches} argument, to clip searching after 'n' matches are found.
1742
# a capitalized word starts with an uppercase letter, followed by zero or more lowercase letters
1743
cap_word = Word(alphas.upper(), alphas.lower())
1745
print(cap_word.searchString("More than Iron, more than Lead, more than Gold I need Electricity"))
1747
['More', 'Iron', 'Lead', 'Gold', 'I']
1750
return ParseResults([ t for t,s,e in self.scanString( instring, maxMatches ) ])
1751
except ParseBaseException as exc:
1752
if ParserElement.verbose_stacktrace:
1755
# catch and re-raise exception from here, clears out pyparsing internal stack trace
1758
def split(self, instring, maxsplit=_MAX_INT, includeSeparators=False):
1760
Generator method to split a string using the given expression as a separator.
1761
May be called with optional C{maxsplit} argument, to limit the number of splits;
1762
and the optional C{includeSeparators} argument (default=C{False}), if the separating
1763
matching text should be included in the split results.
1766
punc = oneOf(list(".,;:/-!?"))
1767
print(list(punc.split("This, this?, this sentence, is badly punctuated!")))
1769
['This', ' this', '', ' this sentence', ' is badly punctuated', '']
1773
for t,s,e in self.scanString(instring, maxMatches=maxsplit):
1774
yield instring[last:s]
1775
if includeSeparators:
1778
yield instring[last:]
1780
def __add__(self, other ):
1782
Implementation of + operator - returns C{L{And}}. Adding strings to a ParserElement
1783
converts them to L{Literal}s by default.
1786
greet = Word(alphas) + "," + Word(alphas) + "!"
1787
hello = "Hello, World!"
1788
print (hello, "->", greet.parseString(hello))
1790
Hello, World! -> ['Hello', ',', 'World', '!']
1792
if isinstance( other, basestring ):
1793
other = ParserElement._literalStringClass( other )
1794
if not isinstance( other, ParserElement ):
1795
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1796
SyntaxWarning, stacklevel=2)
1798
return And( [ self, other ] )
1800
def __radd__(self, other ):
1802
Implementation of + operator when left operand is not a C{L{ParserElement}}
1804
if isinstance( other, basestring ):
1805
other = ParserElement._literalStringClass( other )
1806
if not isinstance( other, ParserElement ):
1807
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1808
SyntaxWarning, stacklevel=2)
1812
def __sub__(self, other):
1814
Implementation of - operator, returns C{L{And}} with error stop
1816
if isinstance( other, basestring ):
1817
other = ParserElement._literalStringClass( other )
1818
if not isinstance( other, ParserElement ):
1819
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1820
SyntaxWarning, stacklevel=2)
1822
return And( [ self, And._ErrorStop(), other ] )
1824
def __rsub__(self, other ):
1826
Implementation of - operator when left operand is not a C{L{ParserElement}}
1828
if isinstance( other, basestring ):
1829
other = ParserElement._literalStringClass( other )
1830
if not isinstance( other, ParserElement ):
1831
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1832
SyntaxWarning, stacklevel=2)
1836
def __mul__(self,other):
1838
Implementation of * operator, allows use of C{expr * 3} in place of
1839
C{expr + expr + expr}.  Expressions may also be multiplied by a 2-integer
1840
tuple, similar to C{{min,max}} multipliers in regular expressions. Tuples
1841
may also include C{None} as in:
1842
- C{expr*(n,None)} or C{expr*(n,)} is equivalent
1843
to C{expr*n + L{ZeroOrMore}(expr)}
1844
(read as "at least n instances of C{expr}")
1845
- C{expr*(None,n)} is equivalent to C{expr*(0,n)}
1846
(read as "0 to n instances of C{expr}")
1847
- C{expr*(None,None)} is equivalent to C{L{ZeroOrMore}(expr)}
1848
- C{expr*(1,None)} is equivalent to C{L{OneOrMore}(expr)}
1850
Note that C{expr*(None,n)} does not raise an exception if
1851
more than n exprs exist in the input stream; that is,
1852
C{expr*(None,n)} does not enforce a maximum number of expr
1853
occurrences. If this behavior is desired, then write
1854
C{expr*(None,n) + ~expr}
1856
if isinstance(other,int):
1857
minElements, optElements = other,0
1858
elif isinstance(other,tuple):
1859
other = (other + (None, None))[:2]
1860
if other[0] is None:
1861
other = (0, other[1])
1862
if isinstance(other[0],int) and other[1] is None:
1864
return ZeroOrMore(self)
1866
return OneOrMore(self)
1868
return self*other[0] + ZeroOrMore(self)
1869
elif isinstance(other[0],int) and isinstance(other[1],int):
1870
minElements, optElements = other
1871
optElements -= minElements
1873
raise TypeError("cannot multiply 'ParserElement' and ('%s','%s') objects", type(other[0]),type(other[1]))
1875
raise TypeError("cannot multiply 'ParserElement' and '%s' objects", type(other))
1878
raise ValueError("cannot multiply ParserElement by negative value")
1880
raise ValueError("second tuple value must be greater or equal to first tuple value")
1881
if minElements == optElements == 0:
1882
raise ValueError("cannot multiply ParserElement by 0 or (0,0)")
1885
def makeOptionalList(n):
1887
return Optional(self + makeOptionalList(n-1))
1889
return Optional(self)
1891
if minElements == 1:
1892
ret = self + makeOptionalList(optElements)
1894
ret = And([self]*minElements) + makeOptionalList(optElements)
1896
ret = makeOptionalList(optElements)
1898
if minElements == 1:
1901
ret = And([self]*minElements)
1904
def __rmul__(self, other):
    """
    Implementation of * operator when left operand is not a C{L{ParserElement}};
    hands C{other} straight to C{__mul__} and returns whatever that produces.
    """
    multiply = self.__mul__
    return multiply(other)
1907
def __or__(self, other ):
1909
Implementation of | operator - returns C{L{MatchFirst}}
1911
if isinstance( other, basestring ):
1912
other = ParserElement._literalStringClass( other )
1913
if not isinstance( other, ParserElement ):
1914
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1915
SyntaxWarning, stacklevel=2)
1917
return MatchFirst( [ self, other ] )
1919
def __ror__(self, other ):
1921
Implementation of | operator when left operand is not a C{L{ParserElement}}
1923
if isinstance( other, basestring ):
1924
other = ParserElement._literalStringClass( other )
1925
if not isinstance( other, ParserElement ):
1926
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1927
SyntaxWarning, stacklevel=2)
1931
def __xor__(self, other ):
1933
Implementation of ^ operator - returns C{L{Or}}
1935
if isinstance( other, basestring ):
1936
other = ParserElement._literalStringClass( other )
1937
if not isinstance( other, ParserElement ):
1938
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1939
SyntaxWarning, stacklevel=2)
1941
return Or( [ self, other ] )
1943
def __rxor__(self, other ):
1945
Implementation of ^ operator when left operand is not a C{L{ParserElement}}
1947
if isinstance( other, basestring ):
1948
other = ParserElement._literalStringClass( other )
1949
if not isinstance( other, ParserElement ):
1950
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1951
SyntaxWarning, stacklevel=2)
1955
def __and__(self, other ):
1957
Implementation of & operator - returns C{L{Each}}
1959
if isinstance( other, basestring ):
1960
other = ParserElement._literalStringClass( other )
1961
if not isinstance( other, ParserElement ):
1962
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1963
SyntaxWarning, stacklevel=2)
1965
return Each( [ self, other ] )
1967
def __rand__(self, other ):
1969
Implementation of & operator when left operand is not a C{L{ParserElement}}
1971
if isinstance( other, basestring ):
1972
other = ParserElement._literalStringClass( other )
1973
if not isinstance( other, ParserElement ):
1974
warnings.warn("Cannot combine element of type %s with ParserElement" % type(other),
1975
SyntaxWarning, stacklevel=2)
1979
def __invert__( self ):
1981
Implementation of ~ operator - returns C{L{NotAny}}
1983
return NotAny( self )
1985
def __call__(self, name=None):
1987
Shortcut for C{L{setResultsName}}, with C{listAllMatches=False}.
1989
If C{name} is given with a trailing C{'*'} character, then C{listAllMatches} will be
1992
If C{name} is omitted, same as calling C{L{copy}}.
1995
# these are equivalent
1996
userdata = Word(alphas).setResultsName("name") + Word(nums+"-").setResultsName("socsecno")
1997
userdata = Word(alphas)("name") + Word(nums+"-")("socsecno")
1999
if name is not None:
2000
return self.setResultsName(name)
2004
def suppress( self ):
2006
Suppresses the output of this C{ParserElement}; useful to keep punctuation from
2007
cluttering up returned output.
2009
return Suppress( self )
2011
def leaveWhitespace( self ):
2013
Disables the skipping of whitespace before matching the characters in the
2014
C{ParserElement}'s defined pattern. This is normally only used internally by
2015
the pyparsing module, but may be needed in some whitespace-sensitive grammars.
2017
self.skipWhitespace = False
2020
def setWhitespaceChars( self, chars ):
2022
Overrides the default whitespace chars
2024
self.skipWhitespace = True
2025
self.whiteChars = chars
2026
self.copyDefaultWhiteChars = False
2029
def parseWithTabs( self ):
2031
Overrides default behavior to expand C{<TAB>}s to spaces before parsing the input string.
2032
Must be called before C{parseString} when the input grammar contains elements that
2033
match C{<TAB>} characters.
2035
self.keepTabs = True
2038
def ignore( self, other ):
2040
Define expression to be ignored (e.g., comments) while doing pattern
2041
matching; may be called repeatedly, to define multiple comment or other
2045
patt = OneOrMore(Word(alphas))
2046
patt.parseString('ablaj /* comment */ lskjd') # -> ['ablaj']
2048
patt.ignore(cStyleComment)
2049
patt.parseString('ablaj /* comment */ lskjd') # -> ['ablaj', 'lskjd']
2051
if isinstance(other, basestring):
2052
other = Suppress(other)
2054
if isinstance( other, Suppress ):
2055
if other not in self.ignoreExprs:
2056
self.ignoreExprs.append(other)
2058
self.ignoreExprs.append( Suppress( other.copy() ) )
2061
def setDebugActions( self, startAction, successAction, exceptionAction ):
2063
Enable display of debugging messages while doing pattern matching.
2065
self.debugActions = (startAction or _defaultStartDebugAction,
2066
successAction or _defaultSuccessDebugAction,
2067
exceptionAction or _defaultExceptionDebugAction)
2071
def setDebug( self, flag=True ):
2073
Enable display of debugging messages while doing pattern matching.
2074
Set C{flag} to True to enable, False to disable.
2077
wd = Word(alphas).setName("alphaword")
2078
integer = Word(nums).setName("numword")
2081
# turn on debugging for wd
2084
OneOrMore(term).parseString("abc 123 xyz 890")
2087
Match alphaword at loc 0(1,1)
2088
Matched alphaword -> ['abc']
2089
Match alphaword at loc 3(1,4)
2090
Exception raised:Expected alphaword (at char 4), (line:1, col:5)
2091
Match alphaword at loc 7(1,8)
2092
Matched alphaword -> ['xyz']
2093
Match alphaword at loc 11(1,12)
2094
Exception raised:Expected alphaword (at char 12), (line:1, col:13)
2095
Match alphaword at loc 15(1,16)
2096
Exception raised:Expected alphaword (at char 15), (line:1, col:16)
2098
The output shown is that produced by the default debug actions - custom debug actions can be
2099
specified using L{setDebugActions}. Prior to attempting
2100
to match the C{wd} expression, the debugging message C{"Match <exprname> at loc <n>(<line>,<col>)"}
2101
is shown. Then if the parse succeeds, a C{"Matched"} message is shown, or an C{"Exception raised"}
2102
message is shown. Also note the use of L{setName} to assign a human-readable name to the expression,
2103
which makes debugging and exception messages easier to understand - for instance, the default
2104
name created for the C{Word} expression without calling C{setName} is C{"W:(ABCD...)"}.
2107
self.setDebugActions( _defaultStartDebugAction, _defaultSuccessDebugAction, _defaultExceptionDebugAction )
2112
def __str__( self ):
2115
def __repr__( self ):
2118
def streamline( self ):
2119
self.streamlined = True
2123
def checkRecursion( self, parseElementList ):
2126
def validate( self, validateTrace=[] ):
    """
    Check defined expressions for valid structure, check for infinite recursive definitions.

    The C{validateTrace} argument is accepted but not used by this
    implementation; the recursion check is always started with an empty list.
    """
    # a brand-new list is handed to checkRecursion on every call
    self.checkRecursion( list() )
2132
def parseFile( self, file_or_filename, parseAll=False ):
2134
Execute the parse expression on the given file or filename.
2135
If a filename is specified (instead of a file object),
2136
the entire file is opened, read, and closed before parsing.
2139
file_contents = file_or_filename.read()
2140
except AttributeError:
2141
with open(file_or_filename, "r") as f:
2142
file_contents = f.read()
2144
return self.parseString(file_contents, parseAll)
2145
except ParseBaseException as exc:
2146
if ParserElement.verbose_stacktrace:
2149
# catch and re-raise exception from here, clears out pyparsing internal stack trace
2152
def __eq__(self,other):
2153
if isinstance(other, ParserElement):
2154
return self is other or vars(self) == vars(other)
2155
elif isinstance(other, basestring):
2156
return self.matches(other)
2158
return super(ParserElement,self)==other
2160
def __ne__(self,other):
2161
return not (self == other)
2164
return hash(id(self))
2166
def __req__(self,other):
2167
return self == other
2169
def __rne__(self,other):
2170
return not (self == other)
2172
def matches(self, testString, parseAll=True):
2174
Method for quick testing of a parser against a test string. Good for simple
2175
inline microtests of sub expressions while building up larger parser.
2178
- testString - to test against this expression for a match
2179
- parseAll - (default=C{True}) - flag to pass to C{L{parseString}} when running tests
2183
assert expr.matches("100")
2186
self.parseString(_ustr(testString), parseAll=parseAll)
2188
except ParseBaseException:
2191
def runTests(self, tests, parseAll=True, comment='#', fullDump=True, printResults=True, failureTests=False):
2193
Execute the parse expression on a series of test strings, showing each
2194
test, the parsed results or where the parse failed. Quick and easy way to
2195
run a parse expression against a list of sample strings.
2198
- tests - a list of separate test strings, or a multiline string of test strings
2199
- parseAll - (default=C{True}) - flag to pass to C{L{parseString}} when running tests
2200
- comment - (default=C{'#'}) - expression for indicating embedded comments in the test
2201
string; pass None to disable comment filtering
2202
- fullDump - (default=C{True}) - dump results as list followed by results names in nested outline;
2203
if False, only dump nested list
2204
- printResults - (default=C{True}) prints test output to stdout
2205
- failureTests - (default=C{False}) indicates if these tests are expected to fail parsing
2207
Returns: a (success, results) tuple, where success indicates that all tests succeeded
2208
(or failed if C{failureTests} is True), and the results contain a list of lines of each
2212
number_expr = pyparsing_common.number.copy()
2214
result = number_expr.runTests('''
2219
# float with scientific notation
2221
# integer with scientific notation
2224
print("Success" if result[0] else "Failed!")
2226
result = number_expr.runTests('''
2229
# missing leading digit before '.'
2233
''', failureTests=True)
2234
print("Success" if result[0] else "Failed!")
2244
# float with scientific notation
2248
# integer with scientific notation
2257
FAIL: Expected end of text (at char 3), (line:1, col:4)
2259
# missing leading digit before '.'
2262
FAIL: Expected {real number with scientific notation | real number | signed integer} (at char 0), (line:1, col:1)
2267
FAIL: Expected end of text (at char 4), (line:1, col:5)
2271
Each test string must be on a single line. If you want to test a string that spans multiple
2272
lines, create a test like this::
2274
expr.runTests(r"this is a test\\n of strings that spans \\n 3 lines")
2276
(Note that this is a raw string literal, you must include the leading 'r'.)
2278
if isinstance(tests, basestring):
2279
tests = list(map(str.strip, tests.rstrip().splitlines()))
2280
if isinstance(comment, basestring):
2281
comment = Literal(comment)
2286
if comment is not None and comment.matches(t, False) or comments and not t:
2291
out = ['\n'.join(comments), t]
2294
t = t.replace(r'\n','\n')
2295
result = self.parseString(t, parseAll=parseAll)
2296
out.append(result.dump(full=fullDump))
2297
success = success and not failureTests
2298
except ParseBaseException as pe:
2299
fatal = "(FATAL)" if isinstance(pe, ParseFatalException) else ""
2301
out.append(line(pe.loc, t))
2302
out.append(' '*(col(pe.loc,t)-1) + '^' + fatal)
2304
out.append(' '*pe.loc + '^' + fatal)
2305
out.append("FAIL: " + str(pe))
2306
success = success and failureTests
2308
except Exception as exc:
2309
out.append("FAIL-EXCEPTION: " + str(exc))
2310
success = success and failureTests
2316
print('\n'.join(out))
2318
allResults.append((t, result))
2320
return success, allResults
2323
class Token(ParserElement):
2325
Abstract C{ParserElement} subclass, for defining atomic matching patterns.
2327
def __init__( self ):
    """Run the parent initializer with C{savelist} turned off."""
    parent = super(Token,self)
    parent.__init__( savelist=False )
2333
An empty token, will always match.
2335
def __init__( self ):
2336
super(Empty,self).__init__()
2338
self.mayReturnEmpty = True
2339
self.mayIndexError = False
2342
class NoMatch(Token):
2344
A token that will never match.
2346
def __init__( self ):
    """Set the fixed name, error message and matching flags of this token."""
    super(NoMatch,self).__init__()
    # identification and the message carried by the exception parseImpl raises
    self.name = "NoMatch"
    self.errmsg = "Unmatchable token"
    # matching flags
    self.mayReturnEmpty, self.mayIndexError = True, False
2353
def parseImpl( self, instring, loc, doActions=True ):
    """Always fail: raise a C{ParseException} at C{loc} carrying C{self.errmsg}."""
    failure = ParseException(instring, loc, self.errmsg, self)
    raise failure
2357
class Literal(Token):
2359
Token to exactly match a specified string.
2362
Literal('blah').parseString('blah') # -> ['blah']
2363
Literal('blah').parseString('blahfooblah') # -> ['blah']
2364
Literal('blah').parseString('bla') # -> Exception: Expected "blah"
2366
For case-insensitive matching, use L{CaselessLiteral}.
2368
For keyword matching (force word break before and after the matched string),
2369
use L{Keyword} or L{CaselessKeyword}.
2371
def __init__( self, matchString ):
2372
super(Literal,self).__init__()
2373
self.match = matchString
2374
self.matchLen = len(matchString)
2376
self.firstMatchChar = matchString[0]
2378
warnings.warn("null string passed to Literal; use Empty() instead",
2379
SyntaxWarning, stacklevel=2)
2380
self.__class__ = Empty
2381
self.name = '"%s"' % _ustr(self.match)
2382
self.errmsg = "Expected " + self.name
2383
self.mayReturnEmpty = False
2384
self.mayIndexError = False
2386
# Performance tuning: this routine gets called a *lot*
2387
# if this is a single character match string and the first character matches,
2388
# short-circuit as quickly as possible, and avoid calling startswith
2390
def parseImpl( self, instring, loc, doActions=True ):
    """
    Match the literal text at position C{loc} of C{instring}.

    Returns a C{(new_loc, matched_text)} tuple, where C{new_loc} is C{loc}
    advanced by the literal's length; raises C{ParseException} if the text
    at C{loc} is not the literal.
    """
    # cheap first-character test comes first; the lookup raises
    # IndexError when loc is past the end of a string
    if instring[loc] != self.firstMatchChar:
        raise ParseException(instring, loc, self.errmsg, self)
    # a one-character literal is already fully matched; otherwise compare the rest
    if self.matchLen != 1 and not instring.startswith(self.match, loc):
        raise ParseException(instring, loc, self.errmsg, self)
    return loc+self.matchLen, self.match
2396
ParserElement._literalStringClass = Literal
2398
class Keyword(Token):
2400
Token to exactly match a specified string as a keyword, that is, it must be
2401
immediately followed by a non-keyword character. Compare with C{L{Literal}}:
2402
- C{Literal("if")} will match the leading C{'if'} in C{'ifAndOnlyIf'}.
2403
- C{Keyword("if")} will not; it will only match the leading C{'if'} in C{'if x=1'}, or C{'if(y==2)'}
2404
Accepts two optional constructor arguments in addition to the keyword string:
2405
- C{identChars} is a string of characters that would be valid identifier characters,
2406
defaulting to all alphanumerics + "_" and "$"
2407
- C{caseless} allows case-insensitive matching, default is C{False}.
2410
Keyword("start").parseString("start") # -> ['start']
2411
Keyword("start").parseString("starting") # -> Exception
2413
For case-insensitive matching, use L{CaselessKeyword}.
2415
DEFAULT_KEYWORD_CHARS = alphanums+"_$"
2417
def __init__( self, matchString, identChars=None, caseless=False ):
2418
super(Keyword,self).__init__()
2419
if identChars is None:
2420
identChars = Keyword.DEFAULT_KEYWORD_CHARS
2421
self.match = matchString
2422
self.matchLen = len(matchString)
2424
self.firstMatchChar = matchString[0]
2426
warnings.warn("null string passed to Keyword; use Empty() instead",
2427
SyntaxWarning, stacklevel=2)
2428
self.name = '"%s"' % self.match
2429
self.errmsg = "Expected " + self.name
2430
self.mayReturnEmpty = False
2431
self.mayIndexError = False
2432
self.caseless = caseless
2434
self.caselessmatch = matchString.upper()
2435
identChars = identChars.upper()
2436
self.identChars = set(identChars)
2438
def parseImpl( self, instring, loc, doActions=True ):
2440
if ( (instring[ loc:loc+self.matchLen ].upper() == self.caselessmatch) and
2441
(loc >= len(instring)-self.matchLen or instring[loc+self.matchLen].upper() not in self.identChars) and
2442
(loc == 0 or instring[loc-1].upper() not in self.identChars) ):
2443
return loc+self.matchLen, self.match
2445
if (instring[loc] == self.firstMatchChar and
2446
(self.matchLen==1 or instring.startswith(self.match,loc)) and
2447
(loc >= len(instring)-self.matchLen or instring[loc+self.matchLen] not in self.identChars) and
2448
(loc == 0 or instring[loc-1] not in self.identChars) ):
2449
return loc+self.matchLen, self.match
2450
raise ParseException(instring, loc, self.errmsg, self)
2453
c = super(Keyword,self).copy()
2454
c.identChars = Keyword.DEFAULT_KEYWORD_CHARS
2458
def setDefaultKeywordChars( chars ):
2459
"""Overrides the default Keyword chars
2461
Keyword.DEFAULT_KEYWORD_CHARS = chars
2463
class CaselessLiteral(Literal):
2465
Token to match a specified string, ignoring case of letters.
2466
Note: the matched results will always be in the case of the given
2467
match string, NOT the case of the input text.
2470
OneOrMore(CaselessLiteral("CMD")).parseString("cmd CMD Cmd10") # -> ['CMD', 'CMD', 'CMD']
2472
(Contrast with example for L{CaselessKeyword}.)
2474
def __init__( self, matchString ):
    """
    Pass the upper-cased form of C{matchString} to the parent initializer
    and keep the text as originally given in C{self.returnString}.
    """
    upper_form = matchString.upper()
    super(CaselessLiteral,self).__init__( upper_form )
    # Preserve the defining literal.
    self.returnString = matchString
    self.name = "'%s'" % self.returnString
    self.errmsg = "Expected " + self.name
2481
def parseImpl( self, instring, loc, doActions=True ):
    """
    Compare the upper-cased slice of C{instring} starting at C{loc} with
    C{self.match}.

    Returns C{(end_of_match, self.returnString)} when they are equal;
    raises C{ParseException} otherwise.
    """
    end = loc + self.matchLen
    candidate = instring[loc:end].upper()
    if candidate != self.match:
        raise ParseException(instring, loc, self.errmsg, self)
    return end, self.returnString
2486
class CaselessKeyword(Keyword):
2488
Caseless version of L{Keyword}.
2491
OneOrMore(CaselessKeyword("CMD")).parseString("cmd CMD Cmd10") # -> ['CMD', 'CMD']
2493
(Contrast with example for L{CaselessLiteral}.)
2495
def __init__( self, matchString, identChars=None ):
    """Run the parent initializer with C{caseless} set to True."""
    parent = super(CaselessKeyword,self)
    parent.__init__( matchString, identChars, caseless=True )
2498
def parseImpl( self, instring, loc, doActions=True ):
    """
    Match the keyword at C{loc} ignoring case, requiring a keyword boundary
    on both sides of the match.

    The upper-cased text at C{loc} must equal C{self.caselessmatch}, and
    neither the character just before C{loc} nor the one just after the
    match may (upper-cased) be in C{self.identChars}. The start and end of
    C{instring} count as boundaries.

    Returns C{(loc + matchLen, self.match)}; raises C{ParseException} when
    the keyword is not found at C{loc} or is embedded in a longer identifier.
    """
    end = loc + self.matchLen
    ident_chars = self.identChars
    if ( (instring[ loc:end ].upper() == self.caselessmatch) and
         # trailing boundary: end of input, or next char is not an identifier char
         (loc >= len(instring)-self.matchLen or instring[end].upper() not in ident_chars) and
         # leading boundary: the same test Keyword.parseImpl applies in caseless
         # mode; without it the keyword matched inside text such as "xcmd"
         (loc == 0 or instring[loc-1].upper() not in ident_chars) ):
        return end, self.match
    raise ParseException(instring, loc, self.errmsg, self)
2504
class CloseMatch(Token):
2506
A variation on L{Literal} which matches "close" matches, that is,
2507
strings with at most 'n' mismatching characters. C{CloseMatch} takes parameters:
2508
- C{match_string} - string to be matched
2509
- C{maxMismatches} - (C{default=1}) maximum number of mismatches allowed to count as a match
2511
The results from a successful parse will contain the matched text from the input string and the following named results:
2512
- C{mismatches} - a list of the positions within the match_string where mismatches were found
2513
- C{original} - the original match_string used to compare against the input string
2515
If C{mismatches} is an empty list, then the match was an exact match.
2518
patt = CloseMatch("ATCATCGAATGGA")
2519
patt.parseString("ATCATCGAAXGGA") # -> (['ATCATCGAAXGGA'], {'mismatches': [[9]], 'original': ['ATCATCGAATGGA']})
2520
patt.parseString("ATCAXCGAAXGGA") # -> Exception: Expected 'ATCATCGAATGGA' (with up to 1 mismatches) (at char 0), (line:1, col:1)
2523
patt.parseString("ATCATCGAATGGA") # -> (['ATCATCGAATGGA'], {'mismatches': [[]], 'original': ['ATCATCGAATGGA']})
2525
# close match allowing up to 2 mismatches
2526
patt = CloseMatch("ATCATCGAATGGA", maxMismatches=2)
2527
patt.parseString("ATCAXCGAAXGGA") # -> (['ATCAXCGAAXGGA'], {'mismatches': [[4, 9]], 'original': ['ATCATCGAATGGA']})
2529
def __init__(self, match_string, maxMismatches=1):
    """
    Store the reference string and the mismatch limit, and derive the
    expression name and error message from them.
    """
    super(CloseMatch,self).__init__()
    self.match_string = match_string
    self.maxMismatches = maxMismatches
    # the reference string doubles as the expression's name
    self.name = match_string
    self.errmsg = "Expected %r (with up to %d mismatches)" % (match_string, maxMismatches)
    self.mayReturnEmpty = False
    self.mayIndexError = False
2538
def parseImpl( self, instring, loc, doActions=True ):
2540
instrlen = len(instring)
2541
maxloc = start + len(self.match_string)
2543
if maxloc <= instrlen:
2544
match_string = self.match_string
2547
maxMismatches = self.maxMismatches
2549
for match_stringloc,s_m in enumerate(zip(instring[loc:maxloc], self.match_string)):
2552
mismatches.append(match_stringloc)
2553
if len(mismatches) > maxMismatches:
2556
loc = match_stringloc + 1
2557
results = ParseResults([instring[start:loc]])
2558
results['original'] = self.match_string
2559
results['mismatches'] = mismatches
2562
raise ParseException(instring, loc, self.errmsg, self)
2567
Token for matching words composed of allowed character sets.
2568
Defined with string containing all allowed initial characters,
2569
an optional string containing allowed body characters (if omitted,
2570
defaults to the initial character set), and an optional minimum,
2571
maximum, and/or exact length. The default value for C{min} is 1 (a
2572
minimum value < 1 is not valid); the default values for C{max} and C{exact}
2573
are 0, meaning no maximum or exact length restriction. An optional
2574
C{excludeChars} parameter can list characters that might be found in
2575
the input C{bodyChars} string; useful to define a word of all printables
2576
except for one or two characters, for instance.
2578
L{srange} is useful for defining custom character set strings for defining
2579
C{Word} expressions, using range notation from regular expression character sets.
2581
A common mistake is to use C{Word} to match a specific literal string, as in
2582
C{Word("Address")}. Remember that C{Word} uses the string argument to define
2583
I{sets} of matchable characters. This expression would match "Add", "AAA",
2584
"dAred", or any other word made up of the characters 'A', 'd', 'r', 'e', and 's'.
2585
To match an exact literal string, use L{Literal} or L{Keyword}.
2587
pyparsing includes helper strings for building Words:
2592
- L{alphas8bit} (alphabetic characters in ASCII range 128-255 - accented, tilded, umlauted, etc.)
2593
- L{punc8bit} (non-alphabetic characters in ASCII range 128-255 - currency, symbols, superscripts, diacriticals, etc.)
2594
- L{printables} (any non-whitespace character)
2597
# a word composed of digits
2598
integer = Word(nums) # equivalent to Word("0123456789") or Word(srange("0-9"))
2600
# a word with a leading capital, and zero or more lowercase
2601
capital_word = Word(alphas.upper(), alphas.lower())
2603
# hostnames are alphanumeric, with leading alpha, and '-'
2604
hostname = Word(alphas, alphanums+'-')
2606
# roman numeral (not a strict parser, accepts invalid mix of characters)
2607
roman = Word("IVXLCDM")
2609
# any string of non-whitespace characters, except for ','
2610
csv_value = Word(printables, excludeChars=",")
2612
def __init__( self, initChars, bodyChars=None, min=1, max=0, exact=0, asKeyword=False, excludeChars=None ):
2613
super(Word,self).__init__()
2615
initChars = ''.join(c for c in initChars if c not in excludeChars)
2617
bodyChars = ''.join(c for c in bodyChars if c not in excludeChars)
2618
self.initCharsOrig = initChars
2619
self.initChars = set(initChars)
2621
self.bodyCharsOrig = bodyChars
2622
self.bodyChars = set(bodyChars)
2624
self.bodyCharsOrig = initChars
2625
self.bodyChars = set(initChars)
2627
self.maxSpecified = max > 0
2630
raise ValueError("cannot specify a minimum length < 1; use Optional(Word()) if zero-length word is permitted")
2637
self.maxLen = _MAX_INT
2643
self.name = _ustr(self)
2644
self.errmsg = "Expected " + self.name
2645
self.mayIndexError = False
2646
self.asKeyword = asKeyword
2648
if ' ' not in self.initCharsOrig+self.bodyCharsOrig and (min==1 and max==0 and exact==0):
2649
if self.bodyCharsOrig == self.initCharsOrig:
2650
self.reString = "[%s]+" % _escapeRegexRangeChars(self.initCharsOrig)
2651
elif len(self.initCharsOrig) == 1:
2652
self.reString = "%s[%s]*" % \
2653
(re.escape(self.initCharsOrig),
2654
_escapeRegexRangeChars(self.bodyCharsOrig),)
2656
self.reString = "[%s][%s]*" % \
2657
(_escapeRegexRangeChars(self.initCharsOrig),
2658
_escapeRegexRangeChars(self.bodyCharsOrig),)
2660
self.reString = r"\b"+self.reString+r"\b"
2662
self.re = re.compile( self.reString )
2666
def parseImpl( self, instring, loc, doActions=True ):
2668
result = self.re.match(instring,loc)
2670
raise ParseException(instring, loc, self.errmsg, self)
2673
return loc, result.group()
2675
if not(instring[ loc ] in self.initChars):
2676
raise ParseException(instring, loc, self.errmsg, self)
2680
instrlen = len(instring)
2681
bodychars = self.bodyChars
2682
maxloc = start + self.maxLen
2683
maxloc = min( maxloc, instrlen )
2684
while loc < maxloc and instring[loc] in bodychars:
2687
throwException = False
2688
if loc - start < self.minLen:
2689
throwException = True
2690
if self.maxSpecified and loc < instrlen and instring[loc] in bodychars:
2691
throwException = True
2693
if (start>0 and instring[start-1] in bodychars) or (loc<instrlen and instring[loc] in bodychars):
2694
throwException = True
2697
raise ParseException(instring, loc, self.errmsg, self)
2699
return loc, instring[start:loc]
2701
def __str__( self ):
2703
return super(Word,self).__str__()
2708
if self.strRepr is None:
2716
if ( self.initCharsOrig != self.bodyCharsOrig ):
2717
self.strRepr = "W:(%s,%s)" % ( charsAsStr(self.initCharsOrig), charsAsStr(self.bodyCharsOrig) )
2719
self.strRepr = "W:(%s)" % charsAsStr(self.initCharsOrig)
2726
Token for matching strings that match a given regular expression.
2727
Defined with string specifying the regular expression in a form recognized by the inbuilt Python re module.
2728
If the given regex contains named groups (defined using C{(?P<name>...)}), these will be preserved as
2729
named parse results.
2732
realnum = Regex(r"[+-]?\d+\.\d*")
2733
date = Regex(r'(?P<year>\d{4})-(?P<month>\d\d?)-(?P<day>\d\d?)')
2734
# ref: http://stackoverflow.com/questions/267399/how-do-you-match-only-valid-roman-numerals-with-a-regular-expression
2735
roman = Regex(r"M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})")
2737
compiledREtype = type(re.compile("[A-Z]"))
2738
def __init__( self, pattern, flags=0):
2739
"""The parameters C{pattern} and C{flags} are passed to the C{re.compile()} function as-is. See the Python C{re} module for an explanation of the acceptable patterns and flags."""
2740
super(Regex,self).__init__()
2742
if isinstance(pattern, basestring):
2744
warnings.warn("null string passed to Regex; use Empty() instead",
2745
SyntaxWarning, stacklevel=2)
2747
self.pattern = pattern
2751
self.re = re.compile(self.pattern, self.flags)
2752
self.reString = self.pattern
2753
except sre_constants.error:
2754
warnings.warn("invalid pattern (%s) passed to Regex" % pattern,
2755
SyntaxWarning, stacklevel=2)
2758
elif isinstance(pattern, Regex.compiledREtype):
2761
self.reString = str(pattern)
2765
raise ValueError("Regex may only be constructed with a string or a compiled RE object")
2767
self.name = _ustr(self)
2768
self.errmsg = "Expected " + self.name
2769
self.mayIndexError = False
2770
self.mayReturnEmpty = True
2772
def parseImpl( self, instring, loc, doActions=True ):
2773
result = self.re.match(instring,loc)
2775
raise ParseException(instring, loc, self.errmsg, self)
2778
d = result.groupdict()
2779
ret = ParseResults(result.group())
2785
def __str__( self ):
2787
return super(Regex,self).__str__()
2791
if self.strRepr is None:
2792
self.strRepr = "Re:(%s)" % repr(self.pattern)
2797
class QuotedString(Token):
2799
Token for matching strings that are delimited by quoting characters.
2801
Defined with the following parameters:
2802
- quoteChar - string of one or more characters defining the quote delimiting string
2803
- escChar - character to escape quotes, typically backslash (default=C{None})
2804
- escQuote - special quote sequence to escape an embedded quote string (such as SQL's "" to escape an embedded ") (default=C{None})
2805
- multiline - boolean indicating whether quotes can span multiple lines (default=C{False})
2806
- unquoteResults - boolean indicating whether the matched text should be unquoted (default=C{True})
2807
- endQuoteChar - string of one or more characters defining the end of the quote delimited string (default=C{None} => same as quoteChar)
2808
- convertWhitespaceEscapes - convert escaped whitespace (C{'\t'}, C{'\n'}, etc.) to actual whitespace (default=C{True})
2811
qs = QuotedString('"')
2812
print(qs.searchString('lsjdf "This is the quote" sldjf'))
2813
complex_qs = QuotedString('{{', endQuoteChar='}}')
2814
print(complex_qs.searchString('lsjdf {{This is the "quote"}} sldjf'))
2815
sql_qs = QuotedString('"', escQuote='""')
2816
print(sql_qs.searchString('lsjdf "This is the quote with ""embedded"" quotes" sldjf'))
2818
[['This is the quote']]
2819
[['This is the "quote"']]
2820
[['This is the quote with "embedded" quotes']]
2822
def __init__( self, quoteChar, escChar=None, escQuote=None, multiline=False, unquoteResults=True, endQuoteChar=None, convertWhitespaceEscapes=True):
2823
super(QuotedString,self).__init__()
2825
# remove white space from quote chars - won't work anyway
2826
quoteChar = quoteChar.strip()
2828
warnings.warn("quoteChar cannot be the empty string",SyntaxWarning,stacklevel=2)
2831
if endQuoteChar is None:
2832
endQuoteChar = quoteChar
2834
endQuoteChar = endQuoteChar.strip()
2835
if not endQuoteChar:
2836
warnings.warn("endQuoteChar cannot be the empty string",SyntaxWarning,stacklevel=2)
2839
self.quoteChar = quoteChar
2840
self.quoteCharLen = len(quoteChar)
2841
self.firstQuoteChar = quoteChar[0]
2842
self.endQuoteChar = endQuoteChar
2843
self.endQuoteCharLen = len(endQuoteChar)
2844
self.escChar = escChar
2845
self.escQuote = escQuote
2846
self.unquoteResults = unquoteResults
2847
self.convertWhitespaceEscapes = convertWhitespaceEscapes
2850
self.flags = re.MULTILINE | re.DOTALL
2851
self.pattern = r'%s(?:[^%s%s]' % \
2852
( re.escape(self.quoteChar),
2853
_escapeRegexRangeChars(self.endQuoteChar[0]),
2854
(escChar is not None and _escapeRegexRangeChars(escChar) or '') )
2857
self.pattern = r'%s(?:[^%s\n\r%s]' % \
2858
( re.escape(self.quoteChar),
2859
_escapeRegexRangeChars(self.endQuoteChar[0]),
2860
(escChar is not None and _escapeRegexRangeChars(escChar) or '') )
2861
if len(self.endQuoteChar) > 1:
2863
'|(?:' + ')|(?:'.join("%s[^%s]" % (re.escape(self.endQuoteChar[:i]),
2864
_escapeRegexRangeChars(self.endQuoteChar[i]))
2865
for i in range(len(self.endQuoteChar)-1,0,-1)) + ')'
2868
self.pattern += (r'|(?:%s)' % re.escape(escQuote))
2870
self.pattern += (r'|(?:%s.)' % re.escape(escChar))
2871
self.escCharReplacePattern = re.escape(self.escChar)+"(.)"
2872
self.pattern += (r')*%s' % re.escape(self.endQuoteChar))
2875
self.re = re.compile(self.pattern, self.flags)
2876
self.reString = self.pattern
2877
except sre_constants.error:
2878
warnings.warn("invalid pattern (%s) passed to Regex" % self.pattern,
2879
SyntaxWarning, stacklevel=2)
2882
self.name = _ustr(self)
2883
self.errmsg = "Expected " + self.name
2884
self.mayIndexError = False
2885
self.mayReturnEmpty = True
2887
def parseImpl( self, instring, loc, doActions=True ):
2888
result = instring[loc] == self.firstQuoteChar and self.re.match(instring,loc) or None
2890
raise ParseException(instring, loc, self.errmsg, self)
2893
ret = result.group()
2895
if self.unquoteResults:
2898
ret = ret[self.quoteCharLen:-self.endQuoteCharLen]
2900
if isinstance(ret,basestring):
2901
# replace escaped whitespace
2902
if '\\' in ret and self.convertWhitespaceEscapes:
2909
for wslit,wschar in ws_map.items():
2910
ret = ret.replace(wslit, wschar)
2912
# replace escaped characters
2914
ret = re.sub(self.escCharReplacePattern,"\g<1>",ret)
2916
# replace escaped quotes
2918
ret = ret.replace(self.escQuote, self.endQuoteChar)
2922
def __str__( self ):
2924
return super(QuotedString,self).__str__()
2928
if self.strRepr is None:
2929
self.strRepr = "quoted string, starting with %s ending with %s" % (self.quoteChar, self.endQuoteChar)
2934
class CharsNotIn(Token):
2936
Token for matching words composed of characters I{not} in a given set (will
2937
include whitespace in matched characters if not listed in the provided exclusion set - see example).
2938
Defined with string containing all disallowed characters, and an optional
2939
minimum, maximum, and/or exact length. The default value for C{min} is 1 (a
2940
minimum value < 1 is not valid); the default values for C{max} and C{exact}
2941
are 0, meaning no maximum or exact length restriction.
2944
# define a comma-separated-value as anything that is not a ','
2945
csv_value = CharsNotIn(',')
2946
print(delimitedList(csv_value).parseString("dkls,lsdkjf,s12 34,@!#,213"))
2948
['dkls', 'lsdkjf', 's12 34', '@!#', '213']
2950
def __init__( self, notChars, min=1, max=0, exact=0 ):
2951
super(CharsNotIn,self).__init__()
2952
self.skipWhitespace = False
2953
self.notChars = notChars
2956
raise ValueError("cannot specify a minimum length < 1; use Optional(CharsNotIn()) if zero-length char group is permitted")
2963
self.maxLen = _MAX_INT
2969
self.name = _ustr(self)
2970
self.errmsg = "Expected " + self.name
2971
self.mayReturnEmpty = ( self.minLen == 0 )
2972
self.mayIndexError = False
2974
def parseImpl( self, instring, loc, doActions=True ):
2975
if instring[loc] in self.notChars:
2976
raise ParseException(instring, loc, self.errmsg, self)
2980
notchars = self.notChars
2981
maxlen = min( start+self.maxLen, len(instring) )
2982
while loc < maxlen and \
2983
(instring[loc] not in notchars):
2986
if loc - start < self.minLen:
2987
raise ParseException(instring, loc, self.errmsg, self)
2989
return loc, instring[start:loc]
2991
def __str__( self ):
2993
return super(CharsNotIn, self).__str__()
2997
if self.strRepr is None:
2998
if len(self.notChars) > 4:
2999
self.strRepr = "!W:(%s...)" % self.notChars[:4]
3001
self.strRepr = "!W:(%s)" % self.notChars
3007
Special matching class for matching whitespace. Normally, whitespace is ignored
3008
by pyparsing grammars. This class is included when some whitespace structures
3009
are significant. Define with a string containing the whitespace characters to be
3010
matched; default is C{" \\t\\r\\n"}. Also takes optional C{min}, C{max}, and C{exact} arguments,
3011
as defined for the C{L{Word}} class.
3020
def __init__(self, ws=" \t\r\n", min=1, max=0, exact=0):
3021
super(White,self).__init__()
3022
self.matchWhite = ws
3023
self.setWhitespaceChars( "".join(c for c in self.whiteChars if c not in self.matchWhite) )
3024
#~ self.leaveWhitespace()
3025
self.name = ("".join(White.whiteStrs[c] for c in self.matchWhite))
3026
self.mayReturnEmpty = True
3027
self.errmsg = "Expected " + self.name
3034
self.maxLen = _MAX_INT
3040
def parseImpl( self, instring, loc, doActions=True ):
3041
if not(instring[ loc ] in self.matchWhite):
3042
raise ParseException(instring, loc, self.errmsg, self)
3045
maxloc = start + self.maxLen
3046
maxloc = min( maxloc, len(instring) )
3047
while loc < maxloc and instring[loc] in self.matchWhite:
3050
if loc - start < self.minLen:
3051
raise ParseException(instring, loc, self.errmsg, self)
3053
return loc, instring[start:loc]
3056
class _PositionToken(Token):
    """
    Shared base for the position-oriented tokens defined after it
    (C{GoToColumn}, C{LineStart}, C{LineEnd}, C{StringStart}, C{StringEnd},
    C{WordStart}, C{WordEnd}).
    """
    def __init__( self ):
        super(_PositionToken,self).__init__()
        # each concrete subclass is named after its own class
        self.name = type(self).__name__
        self.mayReturnEmpty, self.mayIndexError = True, False
3063
class GoToColumn(_PositionToken):
3065
Token to advance to a specific column of input text; useful for tabular report scraping.
3067
def __init__( self, colno ):
3068
super(GoToColumn,self).__init__()
3071
def preParse( self, instring, loc ):
3072
if col(loc,instring) != self.col:
3073
instrlen = len(instring)
3074
if self.ignoreExprs:
3075
loc = self._skipIgnorables( instring, loc )
3076
while loc < instrlen and instring[loc].isspace() and col( loc, instring ) != self.col :
3080
def parseImpl( self, instring, loc, doActions=True ):
3081
thiscol = col( loc, instring )
3082
if thiscol > self.col:
3083
raise ParseException( instring, loc, "Text not in expected column", self )
3084
newloc = loc + self.col - thiscol
3085
ret = instring[ loc: newloc ]
3089
class LineStart(_PositionToken):
3091
Matches if current position is at the beginning of a line within the parse string
3098
AAA but not this one
3099
B AAA and definitely not this one
3102
for t in (LineStart() + 'AAA' + restOfLine).searchString(test):
3106
['AAA', ' this line']
3107
['AAA', ' and this line']
3110
def __init__( self ):
    """Initialize the token and set the error message used when the match fails."""
    super(LineStart,self).__init__()
    message = "Expected start of line"
    self.errmsg = message
3114
def parseImpl( self, instring, loc, doActions=True ):
3115
if col(loc, instring) == 1:
3117
raise ParseException(instring, loc, self.errmsg, self)
3119
class LineEnd(_PositionToken):
3121
Matches if current position is at the end of a line within the parse string
3123
def __init__( self ):
    """
    Initialize the token, removing the newline from the default set of
    skippable whitespace characters, and set the error message used when
    the match fails.
    """
    super(LineEnd,self).__init__()
    # keep "\n" out of the skipped whitespace so parseImpl can see it
    skippable = ParserElement.DEFAULT_WHITE_CHARS.replace("\n","")
    self.setWhitespaceChars( skippable )
    self.errmsg = "Expected end of line"
3128
def parseImpl( self, instring, loc, doActions=True ):
3129
if loc<len(instring):
3130
if instring[loc] == "\n":
3133
raise ParseException(instring, loc, self.errmsg, self)
3134
elif loc == len(instring):
3137
raise ParseException(instring, loc, self.errmsg, self)
3139
class StringStart(_PositionToken):
3141
Matches if current position is at the beginning of the parse string
3143
def __init__( self ):
    """Initialize the token and set the error message used when the match fails."""
    super(StringStart,self).__init__()
    message = "Expected start of text"
    self.errmsg = message
3147
def parseImpl( self, instring, loc, doActions=True ):
3149
# see if entire string up to here is just whitespace and ignorables
3150
if loc != self.preParse( instring, 0 ):
3151
raise ParseException(instring, loc, self.errmsg, self)
3154
class StringEnd(_PositionToken):
3156
Matches if current position is at the end of the parse string
3158
def __init__( self ):
    """Initialize the token and set the error message used when the match fails."""
    super(StringEnd,self).__init__()
    message = "Expected end of text"
    self.errmsg = message
3162
def parseImpl( self, instring, loc, doActions=True ):
3163
if loc < len(instring):
3164
raise ParseException(instring, loc, self.errmsg, self)
3165
elif loc == len(instring):
3167
elif loc > len(instring):
3170
raise ParseException(instring, loc, self.errmsg, self)
3172
class WordStart(_PositionToken):
3174
Matches if the current position is at the beginning of a Word, and
3175
is not preceded by any character in a given set of C{wordChars}
3176
(default=C{printables}). To emulate the C{\b} behavior of regular expressions,
3177
use C{WordStart(alphanums)}. C{WordStart} will also match at the beginning of
3178
the string being parsed, or at the beginning of a line.
3180
def __init__(self, wordChars = printables):
    """
    Initialize the token with the characters that count as word characters.

    C{wordChars} is stored as a set for membership tests.
    """
    super(WordStart,self).__init__()
    self.errmsg = "Not at the start of a word"
    self.wordChars = set(wordChars)
3185
def parseImpl(self, instring, loc, doActions=True ):
3187
if (instring[loc-1] in self.wordChars or
3188
instring[loc] not in self.wordChars):
3189
raise ParseException(instring, loc, self.errmsg, self)
3192
class WordEnd(_PositionToken):
    """
    Matches if the current position is at the end of a Word, and
    is not followed by any character in a given set of C{wordChars}
    (default=C{printables}). To emulate the C{\\b} behavior of regular expressions,
    use C{WordEnd(alphanums)}. C{WordEnd} will also match at the end of
    the string being parsed, or at the end of a line.
    """
    def __init__(self, wordChars = printables):
        super(WordEnd,self).__init__()
        # a set gives constant-time membership tests in parseImpl
        self.wordChars = set(wordChars)
        # whitespace skipping is turned off for this element
        self.skipWhitespace = False
        self.errmsg = "Not at the end of a word"

    def parseImpl(self, instring, loc, doActions=True ):
        """Succeed (consuming nothing) if a word ends at C{loc}.

        Any position at or past the end of a string, and any position in an
        empty string, matches. Elsewhere the current character must not be
        in C{wordChars} and the previous character must be.

        Raises C{ParseException} otherwise.
        """
        instrlen = len(instring)
        if instrlen > 0 and loc < instrlen:
            if (instring[loc] in self.wordChars or
                    instring[loc-1] not in self.wordChars):
                raise ParseException(instring, loc, self.errmsg, self)
        return loc, []
3215
class ParseExpression(ParserElement):
    """
    Abstract subclass of ParserElement, for combining and post-processing parsed tokens.
    """
    def __init__( self, exprs, savelist = False ):
        """Store C{exprs} as the list C{self.exprs}.

        C{exprs} may be a generator, a single string, an iterable of
        expressions and/or strings, or a single non-iterable expression.
        A lone string, or an iterable made up only of strings, is wrapped
        with C{ParserElement._literalStringClass}.
        """
        super(ParseExpression,self).__init__(savelist)
        # collections.Iterable was removed in Python 3.10; fall back to the
        # old location only where collections.abc does not exist
        try:
            from collections.abc import Iterable
        except ImportError:
            from collections import Iterable

        if isinstance( exprs, _generatorType ):
            exprs = list(exprs)

        if isinstance( exprs, basestring ):
            self.exprs = [ ParserElement._literalStringClass( exprs ) ]
        elif isinstance( exprs, Iterable ):
            exprs = list(exprs)
            # if sequence of strings provided, wrap with Literal
            if all(isinstance(expr, basestring) for expr in exprs):
                exprs = [ ParserElement._literalStringClass(expr) for expr in exprs ]
            self.exprs = list(exprs)
        else:
            try:
                self.exprs = list( exprs )
            except TypeError:
                # a single, non-iterable expression
                self.exprs = [ exprs ]
        self.callPreparse = False

    def __getitem__( self, i ):
        """Return the C{i}'th contained expression."""
        return self.exprs[i]

    def append( self, other ):
        """Add C{other} to the contained expressions and return C{self}."""
        self.exprs.append( other )
        # cached string form no longer describes this expression
        self.strRepr = None
        return self

    def leaveWhitespace( self ):
        """Extends C{leaveWhitespace} defined in base class, and also invokes C{leaveWhitespace} on
           all contained expressions."""
        self.skipWhitespace = False
        # work on copies so the originals are left untouched
        self.exprs = [ e.copy() for e in self.exprs ]
        for e in self.exprs:
            e.leaveWhitespace()
        return self

    def ignore( self, other ):
        """Register C{other} as ignorable here and on every contained
        expression; a C{Suppress} already registered is not added twice.
        Returns C{self}."""
        if isinstance( other, Suppress ):
            if other not in self.ignoreExprs:
                super( ParseExpression, self).ignore( other )
                for e in self.exprs:
                    e.ignore( self.ignoreExprs[-1] )
        else:
            super( ParseExpression, self).ignore( other )
            for e in self.exprs:
                e.ignore( self.ignoreExprs[-1] )
        return self

    def __str__( self ):
        # prefer the base-class string form; fall back to a generic one
        try:
            return super(ParseExpression,self).__str__()
        except Exception:
            pass

        if self.strRepr is None:
            self.strRepr = "%s:(%s)" % ( self.__class__.__name__, _ustr(self.exprs) )
        return self.strRepr

    def streamline( self ):
        """Streamline all contained expressions and flatten directly nested
        expressions of the same class. Returns C{self}."""
        super(ParseExpression,self).streamline()

        for e in self.exprs:
            e.streamline()

        # collapse nested And's of the form And( And( And( a,b), c), d) to And( a,b,c,d )
        # but only if there are no parse actions or resultsNames on the nested And's
        # (likewise for Or's and MatchFirst's)
        if ( len(self.exprs) == 2 ):
            other = self.exprs[0]
            if ( isinstance( other, self.__class__ ) and
                  not(other.parseAction) and
                  other.resultsName is None and
                  not other.debug ):
                self.exprs = other.exprs[:] + [ self.exprs[1] ]
                self.strRepr = None
                self.mayReturnEmpty |= other.mayReturnEmpty
                self.mayIndexError  |= other.mayIndexError

            other = self.exprs[-1]
            if ( isinstance( other, self.__class__ ) and
                  not(other.parseAction) and
                  other.resultsName is None and
                  not other.debug ):
                self.exprs = self.exprs[:-1] + other.exprs[:]
                self.strRepr = None
                self.mayReturnEmpty |= other.mayReturnEmpty
                self.mayIndexError  |= other.mayIndexError

        self.errmsg = "Expected " + _ustr(self)

        return self

    def setResultsName( self, name, listAllMatches=False ):
        ret = super(ParseExpression,self).setResultsName(name,listAllMatches)
        return ret

    def validate( self, validateTrace=[] ):
        """Validate every contained expression, then check this expression
        for recursion."""
        # the default list is only copied here, never mutated
        tmp = validateTrace[:]+[self]
        for e in self.exprs:
            e.validate(tmp)
        self.checkRecursion( [] )

    def copy(self):
        """Return a copy whose contained expressions are copies as well."""
        ret = super(ParseExpression,self).copy()
        ret.exprs = [e.copy() for e in self.exprs]
        return ret
3327
class And(ParseExpression):
    """
    Requires all given C{ParseExpression}s to be found in the given order.
    Expressions may be separated by whitespace.
    May be constructed using the C{'+'} operator.
    May also be constructed using the C{'-'} operator, which will suppress backtracking.

    Example::
        integer = Word(nums)
        name_expr = OneOrMore(Word(alphas))

        expr = And([integer("id"),name_expr("name"),integer("age")])
        # more easily written as:
        expr = integer("id") + name_expr("name") + integer("age")
    """

    class _ErrorStop(Empty):
        # marker element: once parseImpl meets one, later failures are
        # raised as ParseSyntaxException
        def __init__(self, *args, **kwargs):
            super(And._ErrorStop,self).__init__(*args, **kwargs)
            self.name = '-'
            self.leaveWhitespace()

    def __init__( self, exprs, savelist = True ):
        super(And,self).__init__(exprs, savelist)
        self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs)
        # whitespace handling follows the first expression
        self.setWhitespaceChars( self.exprs[0].whiteChars )
        self.skipWhitespace = self.exprs[0].skipWhitespace
        self.callPreparse = True

    def parseImpl( self, instring, loc, doActions=True ):
        """Parse every contained expression in order, accumulating tokens.

        Returns C{(loc, resultlist)} after the last expression.

        After an C{_ErrorStop} marker, a C{ParseBaseException} or
        C{IndexError} from a later expression is raised as a
        C{ParseSyntaxException}.
        """
        # pass False as last arg to _parse for first element, since we already
        # pre-parsed the string as part of our And pre-parsing
        loc, resultlist = self.exprs[0]._parse( instring, loc, doActions, callPreParse=False )
        errorStop = False
        for e in self.exprs[1:]:
            if isinstance(e, And._ErrorStop):
                errorStop = True
                continue
            if errorStop:
                try:
                    loc, exprtokens = e._parse( instring, loc, doActions )
                except ParseSyntaxException:
                    raise
                except ParseBaseException as pe:
                    pe.__traceback__ = None
                    raise ParseSyntaxException._from_exception(pe)
                except IndexError:
                    raise ParseSyntaxException(instring, len(instring), self.errmsg, self)
            else:
                loc, exprtokens = e._parse( instring, loc, doActions )
            if exprtokens or exprtokens.haskeys():
                resultlist += exprtokens
        return loc, resultlist

    def __iadd__(self, other ):
        if isinstance( other, basestring ):
            other = ParserElement._literalStringClass( other )
        return self.append( other ) #And( [ self, other ] )

    def checkRecursion( self, parseElementList ):
        subRecCheckList = parseElementList[:] + [ self ]
        for e in self.exprs:
            e.checkRecursion( subRecCheckList )
            # stop at the first expression that cannot match empty
            if not e.mayReturnEmpty:
                break

    def __str__( self ):
        if hasattr(self,"name"):
            return self.name

        if self.strRepr is None:
            self.strRepr = "{" + " ".join(_ustr(e) for e in self.exprs) + "}"

        return self.strRepr
3403
class Or(ParseExpression):
    """
    Requires that at least one C{ParseExpression} is found.
    If two expressions match, the expression that matches the longest string will be used.
    May be constructed using the C{'^'} operator.

    Example::
        # construct Or using '^' operator

        number = Word(nums) ^ Combine(Word(nums) + '.' + Word(nums))
        print(number.searchString("123 3.1416 789"))
    prints::
        [['123'], ['3.1416'], ['789']]
    """
    def __init__( self, exprs, savelist = False ):
        super(Or,self).__init__(exprs, savelist)
        if self.exprs:
            self.mayReturnEmpty = any(e.mayReturnEmpty for e in self.exprs)
        else:
            self.mayReturnEmpty = True

    def parseImpl( self, instring, loc, doActions=True ):
        """Try every alternative at C{loc} and return the result of the one
        that matches furthest.

        Each alternative is first probed with C{tryParse}; the successful
        ones are then re-parsed, longest probe first, and the first re-parse
        that succeeds is returned.

        Raises the C{ParseException} that got furthest into the input (with
        its message replaced by this expression's), or a generic
        C{ParseException} when there is nothing to report.
        """
        maxExcLoc = -1
        maxException = None
        matches = []
        for e in self.exprs:
            try:
                loc2 = e.tryParse( instring, loc )
            except ParseException as err:
                err.__traceback__ = None
                if err.loc > maxExcLoc:
                    maxException = err
                    maxExcLoc = err.loc
            except IndexError:
                if len(instring) > maxExcLoc:
                    maxException = ParseException(instring,len(instring),e.errmsg,self)
                    maxExcLoc = len(instring)
            else:
                # save match among all matches, to retry longest to shortest
                matches.append((loc2, e))

        if matches:
            matches.sort(key=lambda x: -x[0])
            for _,e in matches:
                try:
                    return e._parse( instring, loc, doActions )
                except ParseException as err:
                    err.__traceback__ = None
                    if err.loc > maxExcLoc:
                        maxException = err
                        maxExcLoc = err.loc

        if maxException is not None:
            maxException.msg = self.errmsg
            raise maxException
        else:
            raise ParseException(instring, loc, "no defined alternatives to match", self)

    def __ixor__(self, other ):
        if isinstance( other, basestring ):
            other = ParserElement._literalStringClass( other )
        return self.append( other ) #Or( [ self, other ] )

    def __str__( self ):
        if hasattr(self,"name"):
            return self.name

        if self.strRepr is None:
            self.strRepr = "{" + " ^ ".join(_ustr(e) for e in self.exprs) + "}"

        return self.strRepr

    def checkRecursion( self, parseElementList ):
        subRecCheckList = parseElementList[:] + [ self ]
        for e in self.exprs:
            e.checkRecursion( subRecCheckList )
3482
class MatchFirst(ParseExpression):
    """
    Requires that at least one C{ParseExpression} is found.
    If two expressions match, the first one listed is the one that will match.
    May be constructed using the C{'|'} operator.

    Example::
        # construct MatchFirst using '|' operator

        # watch the order of expressions to match
        number = Word(nums) | Combine(Word(nums) + '.' + Word(nums))
        print(number.searchString("123 3.1416 789")) #  Fail! -> [['123'], ['3'], ['1416'], ['789']]

        # put more selective expression first
        number = Combine(Word(nums) + '.' + Word(nums)) | Word(nums)
        print(number.searchString("123 3.1416 789")) #  Better -> [['123'], ['3.1416'], ['789']]
    """
    def __init__( self, exprs, savelist = False ):
        super(MatchFirst,self).__init__(exprs, savelist)
        if self.exprs:
            self.mayReturnEmpty = any(e.mayReturnEmpty for e in self.exprs)
        else:
            self.mayReturnEmpty = True

    def parseImpl( self, instring, loc, doActions=True ):
        """Return the result of the first alternative that parses at C{loc}.

        Raises the C{ParseException} that got furthest into the input (with
        its message replaced by this expression's), or a generic
        C{ParseException} when there is nothing to report.
        """
        maxExcLoc = -1
        maxException = None
        for e in self.exprs:
            try:
                ret = e._parse( instring, loc, doActions )
                return ret
            except ParseException as err:
                if err.loc > maxExcLoc:
                    maxException = err
                    maxExcLoc = err.loc
            except IndexError:
                if len(instring) > maxExcLoc:
                    maxException = ParseException(instring,len(instring),e.errmsg,self)
                    maxExcLoc = len(instring)

        # only got here if no expression matched, raise exception for match that made it the furthest
        else:
            if maxException is not None:
                maxException.msg = self.errmsg
                raise maxException
            else:
                raise ParseException(instring, loc, "no defined alternatives to match", self)

    def __ior__(self, other ):
        if isinstance( other, basestring ):
            other = ParserElement._literalStringClass( other )
        return self.append( other ) #MatchFirst( [ self, other ] )

    def __str__( self ):
        if hasattr(self,"name"):
            return self.name

        if self.strRepr is None:
            self.strRepr = "{" + " | ".join(_ustr(e) for e in self.exprs) + "}"

        return self.strRepr

    def checkRecursion( self, parseElementList ):
        subRecCheckList = parseElementList[:] + [ self ]
        for e in self.exprs:
            e.checkRecursion( subRecCheckList )
3550
class Each(ParseExpression):
    """
    Requires all given C{ParseExpression}s to be found, but in any order.
    Expressions may be separated by whitespace.
    May be constructed using the C{'&'} operator.

    Example::
        color = oneOf("RED ORANGE YELLOW GREEN BLUE PURPLE BLACK WHITE BROWN")
        shape_type = oneOf("SQUARE CIRCLE TRIANGLE STAR HEXAGON OCTAGON")
        integer = Word(nums)
        shape_attr = "shape:" + shape_type("shape")
        posn_attr = "posn:" + Group(integer("x") + ',' + integer("y"))("posn")
        color_attr = "color:" + color("color")
        size_attr = "size:" + integer("size")

        # use Each (using operator '&') to accept attributes in any order
        # (shape and posn are required, color and size are optional)
        shape_spec = shape_attr & posn_attr & Optional(color_attr) & Optional(size_attr)

        shape_spec.runTests('''
            shape: SQUARE color: BLACK posn: 100, 120
            shape: CIRCLE size: 50 color: BLUE posn: 50,80
            color:GREEN size:20 shape:TRIANGLE posn:20,40
            '''
            )
    prints::
        shape: SQUARE color: BLACK posn: 100, 120
        ['shape:', 'SQUARE', 'color:', 'BLACK', 'posn:', ['100', ',', '120']]
        - color: BLACK
        - posn: ['100', ',', '120']
          - x: 100
          - y: 120
        - shape: SQUARE


        shape: CIRCLE size: 50 color: BLUE posn: 50,80
        ['shape:', 'CIRCLE', 'size:', '50', 'color:', 'BLUE', 'posn:', ['50', ',', '80']]
        - color: BLUE
        - posn: ['50', ',', '80']
          - x: 50
          - y: 80
        - shape: CIRCLE
        - size: 50


        color: GREEN size: 20 shape: TRIANGLE posn: 20,40
        ['color:', 'GREEN', 'size:', '20', 'shape:', 'TRIANGLE', 'posn:', ['20', ',', '40']]
        - color: GREEN
        - posn: ['20', ',', '40']
          - x: 20
          - y: 40
        - shape: TRIANGLE
        - size: 20
    """
    def __init__( self, exprs, savelist = True ):
        super(Each,self).__init__(exprs, savelist)
        self.mayReturnEmpty = all(e.mayReturnEmpty for e in self.exprs)
        self.skipWhitespace = True
        # expression groups are built lazily on the first parseImpl call
        self.initExprGroups = True

    def parseImpl( self, instring, loc, doActions=True ):
        """Match all contained expressions, in whatever order they occur.

        A first pass uses C{tryParse} to discover the order in which the
        expressions match; a second pass re-parses them in that order to
        collect tokens.

        Raises C{ParseException} if any required expression never matched.
        """
        if self.initExprGroups:
            # map each wrapped expression back to its Optional wrapper
            self.opt1map = dict((id(e.expr),e) for e in self.exprs if isinstance(e,Optional))
            opt1 = [ e.expr for e in self.exprs if isinstance(e,Optional) ]
            opt2 = [ e for e in self.exprs if e.mayReturnEmpty and not isinstance(e,Optional)]
            self.optionals = opt1 + opt2
            self.multioptionals = [ e.expr for e in self.exprs if isinstance(e,ZeroOrMore) ]
            self.multirequired = [ e.expr for e in self.exprs if isinstance(e,OneOrMore) ]
            self.required = [ e for e in self.exprs if not isinstance(e,(Optional,ZeroOrMore,OneOrMore)) ]
            self.required += self.multirequired
            self.initExprGroups = False
        tmpLoc = loc
        tmpReqd = self.required[:]
        tmpOpt  = self.optionals[:]
        matchOrder = []

        # keep sweeping over the candidates until a full sweep matches nothing
        keepMatching = True
        while keepMatching:
            tmpExprs = tmpReqd + tmpOpt + self.multioptionals + self.multirequired
            failed = []
            for e in tmpExprs:
                try:
                    tmpLoc = e.tryParse( instring, tmpLoc )
                except ParseException:
                    failed.append(e)
                else:
                    matchOrder.append(self.opt1map.get(id(e),e))
                    if e in tmpReqd:
                        tmpReqd.remove(e)
                    elif e in tmpOpt:
                        tmpOpt.remove(e)
            if len(failed) == len(tmpExprs):
                keepMatching = False

        if tmpReqd:
            missing = ", ".join(_ustr(e) for e in tmpReqd)
            raise ParseException(instring,loc,"Missing one or more required elements (%s)" % missing )

        # add any unmatched Optionals, in case they have default values defined
        matchOrder += [e for e in self.exprs if isinstance(e,Optional) and e.expr in tmpOpt]

        resultlist = []
        for e in matchOrder:
            loc,results = e._parse(instring,loc,doActions)
            resultlist.append(results)

        finalResults = sum(resultlist, ParseResults([]))
        return loc, finalResults

    def __str__( self ):
        if hasattr(self,"name"):
            return self.name

        if self.strRepr is None:
            self.strRepr = "{" + " & ".join(_ustr(e) for e in self.exprs) + "}"

        return self.strRepr

    def checkRecursion( self, parseElementList ):
        subRecCheckList = parseElementList[:] + [ self ]
        for e in self.exprs:
            e.checkRecursion( subRecCheckList )
3674
class ParseElementEnhance(ParserElement):
    """
    Abstract subclass of C{ParserElement}, for combining and post-processing parsed tokens.
    """
    def __init__( self, expr, savelist=False ):
        """Wrap C{expr}, which may be an expression, a string (converted to
        a literal expression) or C{None}. When C{expr} is not C{None}, its
        whitespace and result-handling settings are copied onto this
        element."""
        super(ParseElementEnhance,self).__init__(savelist)
        if isinstance( expr, basestring ):
            if issubclass(ParserElement._literalStringClass, Token):
                expr = ParserElement._literalStringClass(expr)
            else:
                expr = ParserElement._literalStringClass(Literal(expr))
        self.expr = expr
        self.strRepr = None
        if expr is not None:
            self.mayIndexError = expr.mayIndexError
            self.mayReturnEmpty = expr.mayReturnEmpty
            self.setWhitespaceChars( expr.whiteChars )
            self.skipWhitespace = expr.skipWhitespace
            self.saveAsList = expr.saveAsList
            self.callPreparse = expr.callPreparse
            self.ignoreExprs.extend(expr.ignoreExprs)

    def parseImpl( self, instring, loc, doActions=True ):
        """Delegate parsing to the wrapped expression.

        Raises C{ParseException} when no expression has been set.
        """
        if self.expr is not None:
            return self.expr._parse( instring, loc, doActions, callPreParse=False )
        else:
            raise ParseException("",loc,self.errmsg,self)

    def leaveWhitespace( self ):
        """Disable whitespace skipping here and on a copy of the wrapped
        expression (if any). Returns C{self}."""
        self.skipWhitespace = False
        # copy only when there is an expression; copying first would raise
        # AttributeError on None before the guard could apply
        if self.expr is not None:
            self.expr = self.expr.copy()
            self.expr.leaveWhitespace()
        return self

    def ignore( self, other ):
        """Register C{other} as ignorable here and on the wrapped
        expression; a C{Suppress} already registered is not added twice.
        Returns C{self}."""
        if isinstance( other, Suppress ):
            if other not in self.ignoreExprs:
                super( ParseElementEnhance, self).ignore( other )
                if self.expr is not None:
                    self.expr.ignore( self.ignoreExprs[-1] )
        else:
            super( ParseElementEnhance, self).ignore( other )
            if self.expr is not None:
                self.expr.ignore( self.ignoreExprs[-1] )
        return self

    def streamline( self ):
        super(ParseElementEnhance,self).streamline()
        if self.expr is not None:
            self.expr.streamline()
        return self

    def checkRecursion( self, parseElementList ):
        """Raise C{RecursiveGrammarException} if this element already
        appears in C{parseElementList}; otherwise recurse into the wrapped
        expression."""
        if self in parseElementList:
            raise RecursiveGrammarException( parseElementList+[self] )
        subRecCheckList = parseElementList[:] + [ self ]
        if self.expr is not None:
            self.expr.checkRecursion( subRecCheckList )

    def validate( self, validateTrace=[] ):
        # the default list is only copied here, never mutated
        tmp = validateTrace[:]+[self]
        if self.expr is not None:
            self.expr.validate(tmp)
        self.checkRecursion( [] )

    def __str__( self ):
        # prefer the base-class string form; fall back to a generic one
        try:
            return super(ParseElementEnhance,self).__str__()
        except Exception:
            pass

        if self.strRepr is None and self.expr is not None:
            self.strRepr = "%s:(%s)" % ( self.__class__.__name__, _ustr(self.expr) )
        return self.strRepr
3751
class FollowedBy(ParseElementEnhance):
    """
    Lookahead matching of the given parse expression.  C{FollowedBy}
    does I{not} advance the parsing position within the input string, it only
    verifies that the specified parse expression matches at the current
    position.  C{FollowedBy} always returns a null token list.

    Example::
        # use FollowedBy to match a label only if it is followed by a ':'
        data_word = Word(alphas)
        label = data_word + FollowedBy(':')
        attr_expr = Group(label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join))

        OneOrMore(attr_expr).parseString("shape: SQUARE color: BLACK posn: upper left").pprint()
    prints::
        [['shape', 'SQUARE'], ['color', 'BLACK'], ['posn', 'upper left']]
    """
    def __init__( self, expr ):
        super(FollowedBy,self).__init__(expr)
        self.mayReturnEmpty = True

    def parseImpl( self, instring, loc, doActions=True ):
        # tryParse raises if the lookahead fails; on success the position
        # is left unchanged and no tokens are produced
        self.expr.tryParse( instring, loc )
        return loc, []
3777
class NotAny(ParseElementEnhance):
    """
    Lookahead to disallow matching with the given parse expression.  C{NotAny}
    does I{not} advance the parsing position within the input string, it only
    verifies that the specified parse expression does I{not} match at the current
    position.  Also, C{NotAny} does I{not} skip over leading whitespace. C{NotAny}
    always returns a null token list.  May be constructed using the '~' operator.
    """
    def __init__( self, expr ):
        super(NotAny,self).__init__(expr)
        #~ self.leaveWhitespace()
        self.skipWhitespace = False  # do NOT use self.leaveWhitespace(), don't want to propagate to exprs
        self.mayReturnEmpty = True
        self.errmsg = "Found unwanted token, "+_ustr(self.expr)

    def parseImpl( self, instring, loc, doActions=True ):
        """Succeed (consuming nothing) only if the wrapped expression cannot
        parse at C{loc}; raise C{ParseException} if it can."""
        if self.expr.canParseNext(instring, loc):
            raise ParseException(instring, loc, self.errmsg, self)
        return loc, []

    def __str__( self ):
        if hasattr(self,"name"):
            return self.name

        if self.strRepr is None:
            self.strRepr = "~{" + _ustr(self.expr) + "}"

        return self.strRepr
3809
class _MultipleMatch(ParseElementEnhance):
    """Shared implementation of repetition: match the wrapped expression at
    least once, then repeatedly until it fails or C{stopOn} matches."""
    def __init__( self, expr, stopOn=None):
        super(_MultipleMatch, self).__init__(expr)
        self.saveAsList = True
        ender = stopOn
        if isinstance(ender, basestring):
            ender = ParserElement._literalStringClass(ender)
        # negated sentinel: parsing it fails exactly where stopOn matches
        self.not_ender = ~ender if ender is not None else None

    def parseImpl( self, instring, loc, doActions=True ):
        """Parse one or more repetitions and return C{(loc, tokens)}.

        The first repetition is mandatory, so its exception propagates to
        the caller. Later failures (C{ParseException} or C{IndexError})
        simply end the repetition.
        """
        # bind frequently used attributes to locals for the loop below
        self_expr_parse = self.expr._parse
        self_skip_ignorables = self._skipIgnorables
        check_ender = self.not_ender is not None
        if check_ender:
            try_not_ender = self.not_ender.tryParse

        # must be at least one (but first see if we are the stopOn sentinel;
        # if so, fail)
        if check_ender:
            try_not_ender(instring, loc)
        loc, tokens = self_expr_parse( instring, loc, doActions, callPreParse=False )
        try:
            hasIgnoreExprs = (not not self.ignoreExprs)
            while 1:
                if check_ender:
                    try_not_ender(instring, loc)
                if hasIgnoreExprs:
                    preloc = self_skip_ignorables( instring, loc )
                else:
                    preloc = loc
                loc, tmptokens = self_expr_parse( instring, preloc, doActions )
                if tmptokens or tmptokens.haskeys():
                    tokens += tmptokens
        except (ParseException,IndexError):
            # no further repetition; keep what was matched so far
            pass

        return loc, tokens
3847
class OneOrMore(_MultipleMatch):
    """
    Repetition of one or more of the given expression.

    Parameters:
     - expr - expression that must match one or more times
     - stopOn - (default=C{None}) - expression for a terminating sentinel
          (only required if the sentinel would ordinarily match the repetition
          expression)

    Example::
        data_word = Word(alphas)
        label = data_word + FollowedBy(':')
        attr_expr = Group(label + Suppress(':') + OneOrMore(data_word).setParseAction(' '.join))

        text = "shape: SQUARE posn: upper left color: BLACK"
        OneOrMore(attr_expr).parseString(text).pprint()  # Fail! read 'color' as data instead of next label -> [['shape', 'SQUARE color']]

        # use stopOn attribute for OneOrMore to avoid reading label string as part of the data
        attr_expr = Group(label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join))
        OneOrMore(attr_expr).parseString(text).pprint() # Better -> [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'BLACK']]

        # could also be written as
        (attr_expr * (1,)).parseString(text).pprint()
    """

    def __str__( self ):
        if hasattr(self,"name"):
            return self.name

        if self.strRepr is None:
            self.strRepr = "{" + _ustr(self.expr) + "}..."

        return self.strRepr
3882
class ZeroOrMore(_MultipleMatch):
    """
    Optional repetition of zero or more of the given expression.

    Parameters:
     - expr - expression that must match zero or more times
     - stopOn - (default=C{None}) - expression for a terminating sentinel
          (only required if the sentinel would ordinarily match the repetition
          expression)

    Example: similar to L{OneOrMore}
    """
    def __init__( self, expr, stopOn=None):
        super(ZeroOrMore,self).__init__(expr, stopOn=stopOn)
        self.mayReturnEmpty = True

    def parseImpl( self, instring, loc, doActions=True ):
        # zero repetitions is a valid match: a failure of the mandatory
        # first match becomes an empty result at the original position
        try:
            return super(ZeroOrMore, self).parseImpl(instring, loc, doActions)
        except (ParseException,IndexError):
            return loc, []

    def __str__( self ):
        if hasattr(self,"name"):
            return self.name

        if self.strRepr is None:
            self.strRepr = "[" + _ustr(self.expr) + "]..."

        return self.strRepr
3913
class _NullToken(object):
3916
__nonzero__ = __bool__
3920
_optionalNotMatched = _NullToken()
3921
class Optional(ParseElementEnhance):
    """
    Optional matching of the given expression.

    Parameters:
     - expr - expression that may match zero or one time
     - default (optional) - value to be returned if the optional expression is not found.

    Example::
        # US postal code can be a 5-digit zip, plus optional 4-digit qualifier
        zip = Combine(Word(nums, exact=5) + Optional('-' + Word(nums, exact=4)))
        zip.runTests('''
            # traditional ZIP code
            12345

            # ZIP+4 form
            12101-0001

            # invalid ZIP
            98765-
            ''')
    prints::
        # traditional ZIP code
        12345
        ['12345']

        # ZIP+4 form
        12101-0001
        ['12101-0001']

        # invalid ZIP
        98765-
             ^
        FAIL: Expected end of text (at char 5), (line:1, col:6)
    """
    def __init__( self, expr, default=_optionalNotMatched ):
        super(Optional,self).__init__( expr, savelist=False )
        self.saveAsList = self.expr.saveAsList
        self.defaultValue = default
        self.mayReturnEmpty = True

    def parseImpl( self, instring, loc, doActions=True ):
        """Parse the wrapped expression if possible.

        When it does not match, return C{loc} unchanged together with the
        default value (also stored under the wrapped expression's results
        name, if it has one), or with an empty list when no default was
        given.
        """
        try:
            loc, tokens = self.expr._parse( instring, loc, doActions, callPreParse=False )
        except (ParseException,IndexError):
            if self.defaultValue is not _optionalNotMatched:
                if self.expr.resultsName:
                    tokens = ParseResults([ self.defaultValue ])
                    tokens[self.expr.resultsName] = self.defaultValue
                else:
                    tokens = [ self.defaultValue ]
            else:
                tokens = []
        return loc, tokens

    def __str__( self ):
        if hasattr(self,"name"):
            return self.name

        if self.strRepr is None:
            self.strRepr = "[" + _ustr(self.expr) + "]"

        return self.strRepr
3985
class SkipTo(ParseElementEnhance):
    """
    Token for skipping over all undefined text until the matched expression is found.

    Parameters:
     - expr - target expression marking the end of the data to be skipped
     - include - (default=C{False}) if True, the target expression is also parsed
          (the skipped text and target expression are returned as a 2-element list).
     - ignore - (default=C{None}) used to define grammars (typically quoted strings and
          comments) that might contain false matches to the target expression
     - failOn - (default=C{None}) define expressions that are not allowed to be
          included in the skipped text; if found before the target expression is found,
          the SkipTo is not a match

    Example::
        report = '''
            Outstanding Issues Report - 1 Jan 2000

               # | Severity | Description                               |  Days Open
            -----+----------+-------------------------------------------+-----------
             101 | Critical | Intermittent system crash                 |          6
              94 | Cosmetic | Spelling error on Login ('log|n')         |         14
              79 | Minor    | System slow when running too many reports |         47
            '''
        integer = Word(nums)
        SEP = Suppress('|')
        # use SkipTo to simply match everything up until the next SEP
        # - ignore quoted strings, so that a '|' character inside a quoted string does not match
        # - parse action will call token.strip() for each matched token, i.e., the description body
        string_data = SkipTo(SEP, ignore=quotedString)
        string_data.setParseAction(tokenMap(str.strip))
        ticket_expr = (integer("issue_num") + SEP
                      + string_data("sev") + SEP
                      + string_data("desc") + SEP
                      + integer("days_open"))

        for tkt in ticket_expr.searchString(report):
            print tkt.dump()
    prints::
        ['101', 'Critical', 'Intermittent system crash', '6']
        - days_open: 6
        - desc: Intermittent system crash
        - issue_num: 101
        - sev: Critical
        ['94', 'Cosmetic', "Spelling error on Login ('log|n')", '14']
        - days_open: 14
        - desc: Spelling error on Login ('log|n')
        - issue_num: 94
        - sev: Cosmetic
        ['79', 'Minor', 'System slow when running too many reports', '47']
        - days_open: 47
        - desc: System slow when running too many reports
        - issue_num: 79
        - sev: Minor
    """
    def __init__( self, other, include=False, ignore=None, failOn=None ):
        super( SkipTo, self ).__init__( other )
        self.ignoreExpr = ignore
        self.mayReturnEmpty = True
        self.mayIndexError = False
        self.includeMatch = include
        self.asList = False
        if isinstance(failOn, basestring):
            self.failOn = ParserElement._literalStringClass(failOn)
        else:
            self.failOn = failOn
        self.errmsg = "No match found for "+_ustr(self.expr)

    def parseImpl( self, instring, loc, doActions=True ):
        """Scan forward from C{loc} for the first position where the target
        expression parses.

        Returns the new location and a C{ParseResults} holding the skipped
        text (plus the target's tokens when C{include} was set).

        Raises C{ParseException} when the end of the input is reached
        without a match.
        """
        startloc = loc
        instrlen = len(instring)
        expr_parse = self.expr._parse
        self_failOn_canParseNext = self.failOn.canParseNext if self.failOn is not None else None
        self_ignoreExpr_tryParse = self.ignoreExpr.tryParse if self.ignoreExpr is not None else None

        tmploc = loc
        while tmploc <= instrlen:
            if self_failOn_canParseNext is not None:
                # break if failOn expression matches
                # NOTE(review): this break bypasses the while-else below, so
                # control continues to the result-building code instead of
                # raising here - confirm that is the intended behavior.
                if self_failOn_canParseNext(instring, tmploc):
                    break

            if self_ignoreExpr_tryParse is not None:
                # advance past ignore expressions
                while 1:
                    try:
                        tmploc = self_ignoreExpr_tryParse(instring, tmploc)
                    except ParseBaseException:
                        break

            try:
                expr_parse(instring, tmploc, doActions=False, callPreParse=False)
            except (ParseException, IndexError):
                # no match, advance loc in string
                tmploc += 1
            else:
                # matched skipto expr, done
                break

        else:
            # ran off the end of the input string without matching skipto expr, fail
            raise ParseException(instring, loc, self.errmsg, self)

        # build up return values
        loc = tmploc
        skiptext = instring[startloc:loc]
        skipresult = ParseResults(skiptext)

        if self.includeMatch:
            loc, mat = expr_parse(instring,loc,doActions,callPreParse=False)
            skipresult += mat

        return loc, skipresult
4100
class Forward(ParseElementEnhance):
    """
    Forward declaration of an expression to be defined later -
    used for recursive grammars, such as algebraic infix notation.
    When the expression is known, it is assigned to the C{Forward} variable using the '<<' operator.

    Note: take care when assigning to C{Forward} not to overlook precedence of operators.
    Specifically, '|' has a lower precedence than '<<', so that::
        fwdExpr << a | b | c
    will actually be evaluated as::
        (fwdExpr << a) | b | c
    thereby leaving b and c out as parseable alternatives.  It is recommended that you
    explicitly group the values inserted into the C{Forward}::
        fwdExpr << (a | b | c)
    Converting to use the '<<=' operator instead will avoid this problem.

    See L{ParseResults.pprint} for an example of a recursive parser created using
    C{Forward}.
    """
    def __init__( self, other=None ):
        super(Forward,self).__init__( other, savelist=False )

    def __lshift__( self, other ):
        """Set C{other} as the wrapped expression, copy its whitespace and
        result-handling settings onto this element, and return C{self}."""
        if isinstance( other, basestring ):
            other = ParserElement._literalStringClass(other)
        self.expr = other
        self.strRepr = None
        self.mayIndexError = self.expr.mayIndexError
        self.mayReturnEmpty = self.expr.mayReturnEmpty
        self.setWhitespaceChars( self.expr.whiteChars )
        self.skipWhitespace = self.expr.skipWhitespace
        self.saveAsList = self.expr.saveAsList
        self.ignoreExprs.extend(self.expr.ignoreExprs)
        return self

    def __ilshift__(self, other):
        return self << other

    def leaveWhitespace( self ):
        # only this element is changed; the wrapped expression is untouched
        self.skipWhitespace = False
        return self

    def streamline( self ):
        # the flag is set before descending so a self-referencing grammar
        # does not recurse forever
        if not self.streamlined:
            self.streamlined = True
            if self.expr is not None:
                self.expr.streamline()
        return self

    def validate( self, validateTrace=[] ):
        # the default list is only read and copied here, never mutated
        if self not in validateTrace:
            tmp = validateTrace[:]+[self]
            if self.expr is not None:
                self.expr.validate(tmp)
        self.checkRecursion([])

    def __str__( self ):
        if hasattr(self,"name"):
            return self.name
        # The wrapped expression is deliberately not rendered: an earlier
        # implementation that did so sat unreachable below this return,
        # marked "stubbed out for now - creates awful memory and perf
        # issues", and has been dropped.
        return self.__class__.__name__ + ": ..."

    def copy(self):
        """Return a copy; an undefined C{Forward} is copied as a new
        C{Forward} that refers back to this one."""
        if self.expr is not None:
            return super(Forward,self).copy()
        else:
            ret = Forward()
            ret <<= self
            return ret
4181
class _ForwardNoRecurse(Forward):
    # Forward variant whose string form is a fixed placeholder instead of
    # the wrapped expression's string form.
    def __str__( self ):
        return "..."
4185
class TokenConverter(ParseElementEnhance):
    """
    Abstract subclass of C{ParseElementEnhance}, for converting parsed results.
    """
    def __init__( self, expr, savelist=False ):
        # savelist is accepted but not forwarded to the base class
        super(TokenConverter,self).__init__( expr )#, savelist )
        self.saveAsList = False
4193
class Combine(TokenConverter):
    """
    Converter to concatenate all matching tokens to a single string.
    By default, the matching patterns must also be contiguous in the input string;
    this can be disabled by specifying C{'adjacent=False'} in the constructor.

    Example::
        real = Word(nums) + '.' + Word(nums)
        print(real.parseString('3.1416')) # -> ['3', '.', '1416']
        # will also erroneously match the following
        print(real.parseString('3. 1416')) # -> ['3', '.', '1416']

        real = Combine(Word(nums) + '.' + Word(nums))
        print(real.parseString('3.1416')) # -> ['3.1416']
        # no match when there are internal spaces
        print(real.parseString('3. 1416')) # -> Exception: Expected W:(0123...)
    """
    def __init__( self, expr, joinString="", adjacent=True ):
        super(Combine,self).__init__( expr )
        # suppress whitespace-stripping in contained parse expressions, but re-enable it on the Combine itself
        if adjacent:
            self.leaveWhitespace()
        self.adjacent = adjacent
        self.skipWhitespace = True
        self.joinString = joinString
        self.callPreparse = True

    def ignore( self, other ):
        # when adjacent, register the ignorable on this element only
        # (ParserElement.ignore) rather than also on the wrapped expression
        if self.adjacent:
            ParserElement.ignore(self, other)
        else:
            super( Combine, self).ignore( other )
        return self

    def postParse( self, instring, loc, tokenlist ):
        """Replace the matched tokens with one token: their string forms
        joined together."""
        # copy, then empty the list part so only the joined token remains
        retToks = tokenlist.copy()
        del retToks[:]
        retToks += ParseResults([ "".join(tokenlist._asStringList(self.joinString)) ], modal=self.modalResults)

        if self.resultsName and retToks.haskeys():
            return [ retToks ]
        else:
            return retToks
4237
class Group(TokenConverter):
    """
    Converter to return the matched tokens as a list - useful for returning tokens of C{L{ZeroOrMore}} and C{L{OneOrMore}} expressions.

    Example::
        ident = Word(alphas)
        num = Word(nums)
        term = ident | num
        func = ident + Optional(delimitedList(term))
        print(func.parseString("fn a,b,100"))  # -> ['fn', 'a', 'b', '100']

        func = ident + Group(Optional(delimitedList(term)))
        print(func.parseString("fn a,b,100"))  # -> ['fn', ['a', 'b', '100']]
    """
    def __init__( self, expr ):
        super(Group,self).__init__( expr )
        self.saveAsList = True

    def postParse( self, instring, loc, tokenlist ):
        # nest the matched tokens one level deeper
        return [ tokenlist ]
4258
class Dict(TokenConverter):
4260
Converter to return a repetitive expression as a list, but also as a dictionary.
4261
Each element can also be referenced using the first token in the expression as its key.
4262
Useful for tabular report scraping when the first column can be used as a item key.
4265
data_word = Word(alphas)
4266
label = data_word + FollowedBy(':')
4267
attr_expr = Group(label + Suppress(':') + OneOrMore(data_word).setParseAction(' '.join))
4269
text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
4270
attr_expr = (label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join))
4272
# print attributes as plain groups
4273
print(OneOrMore(attr_expr).parseString(text).dump())
4275
# instead of OneOrMore(expr), parse using Dict(OneOrMore(Group(expr))) - Dict will auto-assign names
4276
result = Dict(OneOrMore(Group(attr_expr))).parseString(text)
4277
print(result.dump())
4279
# access named fields as dict entries, or output as dict
4280
print(result['shape'])
4281
print(result.asDict())
4283
['shape', 'SQUARE', 'posn', 'upper left', 'color', 'light blue', 'texture', 'burlap']
4285
[['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
4291
{'color': 'light blue', 'posn': 'upper left', 'texture': 'burlap', 'shape': 'SQUARE'}
4292
See more examples at L{ParseResults} of accessing fields by results name.
4294
def __init__( self, expr ):
    """
    Initialize the converter around C{expr}.

    Delegates to the parent class initializer and then sets
    C{saveAsList} to C{True} on this instance.
    """
    super(Dict,self).__init__( expr )
    self.saveAsList = True
4298
def postParse( self, instring, loc, tokenlist ):
4299
for i,tok in enumerate(tokenlist):
4303
if isinstance(ikey,int):
4304
ikey = _ustr(tok[0]).strip()
4306
tokenlist[ikey] = _ParseResultsWithOffset("",i)
4307
elif len(tok)==2 and not isinstance(tok[1],ParseResults):
4308
tokenlist[ikey] = _ParseResultsWithOffset(tok[1],i)
4310
dictvalue = tok.copy() #ParseResults(i)
4312
if len(dictvalue)!= 1 or (isinstance(dictvalue,ParseResults) and dictvalue.haskeys()):
4313
tokenlist[ikey] = _ParseResultsWithOffset(dictvalue,i)
4315
tokenlist[ikey] = _ParseResultsWithOffset(dictvalue[0],i)
4317
if self.resultsName:
4318
return [ tokenlist ]
4323
class Suppress(TokenConverter):
4325
Converter for ignoring the results of a parsed expression.
4328
source = "a, b, c,d"
4330
wd_list1 = wd + ZeroOrMore(',' + wd)
4331
print(wd_list1.parseString(source))
4333
# often, delimiters that are useful during parsing are just in the
4334
# way afterward - use Suppress to keep them out of the parsed output
4335
wd_list2 = wd + ZeroOrMore(Suppress(',') + wd)
4336
print(wd_list2.parseString(source))
4338
['a', ',', 'b', ',', 'c', ',', 'd']
4339
['a', 'b', 'c', 'd']
4340
(See also L{delimitedList}.)
4342
def postParse( self, instring, loc, tokenlist ):
4345
def suppress( self ):
4349
class OnlyOnce(object):
4351
Wrapper for parse actions, to ensure they are only called once.
4353
def __init__(self, methodCall):
4354
self.callable = _trim_arity(methodCall)
4356
def __call__(self,s,l,t):
4358
results = self.callable(s,l,t)
4361
raise ParseException(s,l,"")
4365
def traceParseAction(f):
4367
Decorator for debugging parse actions.
4369
When the parse action is called, this decorator will print C{">> entering I{method-name}(line:I{current_source_line}, I{parse_location}, I{matched_tokens})".}
4370
When the parse action completes, the decorator will print C{"<<"} followed by the returned value, or any exception that the parse action raised.
4376
def remove_duplicate_chars(tokens):
4377
return ''.join(sorted(set(''.join(tokens))))
4379
wds = OneOrMore(wd).setParseAction(remove_duplicate_chars)
4380
print(wds.parseString("slkdjs sld sldd sdlf sdljf"))
4382
>>entering remove_duplicate_chars(line: 'slkdjs sld sldd sdlf sdljf', 0, (['slkdjs', 'sld', 'sldd', 'sdlf', 'sdljf'], {}))
4383
<<leaving remove_duplicate_chars (ret: 'dfjkls')
4388
thisFunc = f.__name__
4391
thisFunc = paArgs[0].__class__.__name__ + '.' + thisFunc
4392
sys.stderr.write( ">>entering %s(line: '%s', %d, %r)\n" % (thisFunc,line(l,s),l,t) )
4395
except Exception as exc:
4396
sys.stderr.write( "<<leaving %s (exception: %s)\n" % (thisFunc,exc) )
4398
sys.stderr.write( "<<leaving %s (ret: %r)\n" % (thisFunc,ret) )
4401
z.__name__ = f.__name__
4402
except AttributeError:
4409
def delimitedList( expr, delim=",", combine=False ):
4411
Helper to define a delimited list of expressions - the delimiter defaults to ','.
4412
By default, the list elements and delimiters can have intervening whitespace, and
4413
comments, but this can be overridden by passing C{combine=True} in the constructor.
4414
If C{combine} is set to C{True}, the matching tokens are returned as a single token
4415
string, with the delimiters included; otherwise, the matching tokens are returned
4416
as a list of tokens, with the delimiters suppressed.
4419
delimitedList(Word(alphas)).parseString("aa,bb,cc") # -> ['aa', 'bb', 'cc']
4420
delimitedList(Word(hexnums), delim=':', combine=True).parseString("AA:BB:CC:DD:EE") # -> ['AA:BB:CC:DD:EE']
4422
dlName = _ustr(expr)+" ["+_ustr(delim)+" "+_ustr(expr)+"]..."
4424
return Combine( expr + ZeroOrMore( delim + expr ) ).setName(dlName)
4426
return ( expr + ZeroOrMore( Suppress( delim ) + expr ) ).setName(dlName)
4428
def countedArray( expr, intExpr=None ):
4430
Helper to define a counted list of expressions.
4431
This helper defines a pattern of the form::
4432
integer expr expr expr...
4433
where the leading integer tells how many expr expressions follow.
4434
The matched tokens are returned as a list of expr tokens - the leading count token is suppressed.
4436
If C{intExpr} is specified, it should be a pyparsing expression that produces an integer value.
4439
countedArray(Word(alphas)).parseString('2 ab cd ef') # -> ['ab', 'cd']
4441
# in this parser, the leading integer value is given in binary,
4442
# '10' indicating that 2 values are in the array
4443
binaryConstant = Word('01').setParseAction(lambda t: int(t[0], 2))
4444
countedArray(Word(alphas), intExpr=binaryConstant).parseString('10 ab cd ef') # -> ['ab', 'cd']
4446
arrayExpr = Forward()
4447
def countFieldParseAction(s,l,t):
4449
arrayExpr << (n and Group(And([expr]*n)) or Group(empty))
4452
intExpr = Word(nums).setParseAction(lambda t:int(t[0]))
4454
intExpr = intExpr.copy()
4455
intExpr.setName("arrayLen")
4456
intExpr.addParseAction(countFieldParseAction, callDuringTry=True)
4457
return ( intExpr + arrayExpr ).setName('(len) ' + _ustr(expr) + '...')
4462
if isinstance(i,list):
4463
ret.extend(_flatten(i))
4468
def matchPreviousLiteral(expr):
4470
Helper to define an expression that is indirectly defined from
4471
the tokens matched in a previous expression, that is, it looks
4472
for a 'repeat' of a previous expression. For example::
4474
second = matchPreviousLiteral(first)
4475
matchExpr = first + ":" + second
4476
will match C{"1:1"}, but not C{"1:2"}. Because this matches a
4477
previous literal, will also match the leading C{"1:1"} in C{"1:10"}.
4478
If this is not desired, use C{matchPreviousExpr}.
4479
Do I{not} use with packrat parsing enabled.
4482
def copyTokenToRepeater(s,l,t):
4488
tflat = _flatten(t.asList())
4489
rep << And(Literal(tt) for tt in tflat)
4492
expr.addParseAction(copyTokenToRepeater, callDuringTry=True)
4493
rep.setName('(prev) ' + _ustr(expr))
4496
def matchPreviousExpr(expr):
4498
Helper to define an expression that is indirectly defined from
4499
the tokens matched in a previous expression, that is, it looks
4500
for a 'repeat' of a previous expression. For example::
4502
second = matchPreviousExpr(first)
4503
matchExpr = first + ":" + second
4504
will match C{"1:1"}, but not C{"1:2"}. Because this matches by
4505
expressions, will I{not} match the leading C{"1:1"} in C{"1:10"};
4506
the expressions are evaluated first, and then compared, so
4507
C{"1"} is compared with C{"10"}.
4508
Do I{not} use with packrat parsing enabled.
4513
def copyTokenToRepeater(s,l,t):
4514
matchTokens = _flatten(t.asList())
4515
def mustMatchTheseTokens(s,l,t):
4516
theseTokens = _flatten(t.asList())
4517
if theseTokens != matchTokens:
4518
raise ParseException("",0,"")
4519
rep.setParseAction( mustMatchTheseTokens, callDuringTry=True )
4520
expr.addParseAction(copyTokenToRepeater, callDuringTry=True)
4521
rep.setName('(prev) ' + _ustr(expr))
4524
def _escapeRegexRangeChars(s):
4525
#~ escape these chars: ^-]
4527
s = s.replace(c,_bslash+c)
4528
s = s.replace("\n",r"\n")
4529
s = s.replace("\t",r"\t")
4532
def oneOf( strs, caseless=False, useRegex=True ):
4534
Helper to quickly define a set of alternative Literals, and makes sure to do
4535
longest-first testing when there is a conflict, regardless of the input order,
4536
but returns a C{L{MatchFirst}} for best performance.
4539
- strs - a string of space-delimited literals, or a collection of string literals
4540
- caseless - (default=C{False}) - treat all literals as caseless
4541
- useRegex - (default=C{True}) - as an optimization, will generate a Regex
4542
object; otherwise, will generate a C{MatchFirst} object (if C{caseless=True}, or
4543
if creating a C{Regex} raises an exception)
4546
comp_oper = oneOf("< = > <= >= !=")
4550
comparison_expr = term + comp_oper + term
4551
print(comparison_expr.searchString("B = 12 AA=23 B<=AA AA>12"))
4553
[['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
4556
isequal = ( lambda a,b: a.upper() == b.upper() )
4557
masks = ( lambda a,b: b.upper().startswith(a.upper()) )
4558
parseElementClass = CaselessLiteral
4560
isequal = ( lambda a,b: a == b )
4561
masks = ( lambda a,b: b.startswith(a) )
4562
parseElementClass = Literal
4565
if isinstance(strs,basestring):
4566
symbols = strs.split()
4567
elif isinstance(strs, collections.Iterable):
4568
symbols = list(strs)
4570
warnings.warn("Invalid argument to oneOf, expected string or iterable",
4571
SyntaxWarning, stacklevel=2)
4576
while i < len(symbols)-1:
4578
for j,other in enumerate(symbols[i+1:]):
4579
if ( isequal(other, cur) ):
4582
elif ( masks(cur, other) ):
4584
symbols.insert(i,other)
4590
if not caseless and useRegex:
4591
#~ print (strs,"->", "|".join( [ _escapeRegexChars(sym) for sym in symbols] ))
4593
if len(symbols)==len("".join(symbols)):
4594
return Regex( "[%s]" % "".join(_escapeRegexRangeChars(sym) for sym in symbols) ).setName(' | '.join(symbols))
4596
return Regex( "|".join(re.escape(sym) for sym in symbols) ).setName(' | '.join(symbols))
4598
warnings.warn("Exception creating Regex for oneOf, building MatchFirst",
4599
SyntaxWarning, stacklevel=2)
4602
# last resort, just use MatchFirst
4603
return MatchFirst(parseElementClass(sym) for sym in symbols).setName(' | '.join(symbols))
4605
def dictOf( key, value ):
    """
    Helper to easily and clearly define a dictionary by specifying the respective patterns
    for the key and value.  Takes care of defining the C{L{Dict}}, C{L{ZeroOrMore}}, and C{L{Group}} tokens
    in the proper order.  The key pattern can include delimiting markers or punctuation,
    as long as they are suppressed, thereby leaving the significant key text.  The value
    pattern can include named results, so that the C{Dict} results can include named token
    fields.

    Parameters:
     - key - expression matching the key of one entry
     - value - expression matching the value of one entry

    Returns C{Dict(ZeroOrMore(Group(key + value)))}.

    Example::
        data_word = Word(alphas)
        label = data_word + FollowedBy(':')
        text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
        attr_expr = (label + Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join))
        print(OneOrMore(attr_expr).parseString(text).dump())

        attr_label = label
        attr_value = Suppress(':') + OneOrMore(data_word, stopOn=label).setParseAction(' '.join)

        # similar to Dict, but simpler call format
        result = dictOf(attr_label, attr_value).parseString(text)
        print(result.dump())
        print(result['shape'])
        print(result.shape)  # object attribute access works too
        print(result.asDict())
    prints (abridged)::
        [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
        ...
        {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'}
    """
    # each key/value pair is grouped so that Dict sees one entry per group
    entry = Group( key + value )
    return Dict( ZeroOrMore( entry ) )
4640
def originalTextFor(expr, asString=True):
4642
Helper to return the original, untokenized text for a given expression. Useful to
4643
restore the parsed fields of an HTML start tag into the raw tag text itself, or to
4644
revert separate tokens with intervening whitespace back to the original matching
4645
input text. By default, returns a string containing the original parsed text.
4647
If the optional C{asString} argument is passed as C{False}, then the return value is a
4648
C{L{ParseResults}} containing any results names that were originally matched, and a
4649
single token containing the original matched text from the input string. So if
4650
the expression passed to C{L{originalTextFor}} contains expressions with defined
4651
results names, you must set C{asString} to C{False} if you want to preserve those
4652
results name values.
4655
src = "this is test <b> bold <i>text</i> </b> normal text "
4656
for tag in ("b","i"):
4657
opener,closer = makeHTMLTags(tag)
4658
patt = originalTextFor(opener + SkipTo(closer) + closer)
4659
print(patt.searchString(src)[0])
4661
['<b> bold <i>text</i> </b>']
4664
locMarker = Empty().setParseAction(lambda s,loc,t: loc)
4665
endlocMarker = locMarker.copy()
4666
endlocMarker.callPreparse = False
4667
matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
4669
extractText = lambda s,l,t: s[t._original_start:t._original_end]
4671
def extractText(s,l,t):
4672
t[:] = [s[t.pop('_original_start'):t.pop('_original_end')]]
4673
matchExpr.setParseAction(extractText)
4674
matchExpr.ignoreExprs = expr.ignoreExprs
4679
Helper to undo pyparsing's default grouping of And expressions, even
4680
if all but one are non-empty.
4682
return TokenConverter(expr).setParseAction(lambda t:t[0])
4684
def locatedExpr(expr):
    """
    Helper to decorate a returned token with its starting and ending locations in the input string.
    This helper adds the following results names:
     - locn_start = location where matched expression begins
     - locn_end = location where matched expression ends
     - value = the actual parsed results

    Be careful if the input text contains C{<TAB>} characters, you may want to call
    C{L{ParserElement.parseWithTabs}}

    Example::
        wd = Word(alphas)
        for match in locatedExpr(wd).searchString("ljsdf123lksdjjf123lkkjj1222"):
            print(match)
    prints::
        [[0, 'ljsdf', 5]]
        [[8, 'lksdjjf', 15]]
        [[18, 'lkkjj', 23]]
    """
    # an Empty whose parse action returns the location it was invoked at
    start_marker = Empty().setParseAction(lambda s,l,t: l)
    # the end marker is a copy with leaveWhitespace() applied
    end_marker = start_marker.copy().leaveWhitespace()
    return Group(start_marker("locn_start") + expr("value") + end_marker("locn_end"))
4708
# convenience constants for positional expressions
empty = Empty().setName("empty")
lineStart = LineStart().setName("lineStart")
lineEnd = LineEnd().setName("lineEnd")
stringStart = StringStart().setName("stringStart")
stringEnd = StringEnd().setName("stringEnd")

# pieces of the grammar for bracketed character-range strings such as "[a-z0-9_]"
# a backslash followed by one punctuation character -> that character
_escapedPunc = Word( _bslash, r"\[]-*.$+^?()~ ", exact=2 ).setParseAction(lambda s,l,t:t[0][1])
# \x## / \0x## (x or X) -> the character with that hex code.  The digits are taken
# from after the x/X marker: stripping with lstrip(r'\0x') treats its argument as a
# character set, so it also removed leading '0' digits (r'\x00' became '') and left
# an upper-case 'X' in place, and int(..., 16) then raised ValueError.
_escapedHexChar = Regex(r"\\0?[xX][0-9a-fA-F]+").setParseAction(lambda s,l,t:unichr(int(re.split(r"[xX]", t[0], 1)[1],16)))
# \0### -> the character with that octal code (only the backslash is dropped)
_escapedOctChar = Regex(r"\\0[0-7]+").setParseAction(lambda s,l,t:unichr(int(t[0][1:],8)))
# one character of a range body: an escape, else any printable except '\' and ']', else a \w character
_singleChar = _escapedPunc | _escapedHexChar | _escapedOctChar | Word(printables, excludeChars=r'\]', exact=1) | Regex(r"\w", re.UNICODE)
# "a-z" style pair, grouped so it can be told apart from a lone character
_charRange = Group(_singleChar + Suppress("-") + _singleChar)
# "[" + optional "^" (results name "negate") + one or more ranges/characters (results name "body") + "]"
_reBracketExpr = Literal("[") + Optional("^").setResultsName("negate") + Group( OneOrMore( _charRange | _singleChar ) ).setResultsName("body") + "]"
4724
Helper to easily define string ranges for use in Word construction. Borrows
4725
syntax from regexp '[]' string range definitions::
4726
srange("[0-9]") -> "0123456789"
4727
srange("[a-z]") -> "abcdefghijklmnopqrstuvwxyz"
4728
srange("[a-z$_]") -> "abcdefghijklmnopqrstuvwxyz$_"
4729
The input string must be enclosed in []'s, and the returned string is the expanded
4730
character set joined into a single string.
4731
The values enclosed in the []'s may be:
4732
- a single character
4733
- an escaped character with a leading backslash (such as C{\-} or C{\]})
4734
- an escaped hex character with a leading C{'\x'} (C{\x21}, which is a C{'!'} character)
4735
(C{\0x##} is also supported for backwards compatibility)
4736
- an escaped octal character with a leading C{'\0'} (C{\041}, which is a C{'!'} character)
4737
- a range of any of the above, separated by a dash (C{'a-z'}, etc.)
4738
- any combination of the above (C{'aeiouy'}, C{'a-zA-Z0-9_$'}, etc.)
4740
_expanded = lambda p: p if not isinstance(p,ParseResults) else ''.join(unichr(c) for c in range(ord(p[0]),ord(p[1])+1))
4742
return "".join(_expanded(part) for part in _reBracketExpr.parseString(s).body)
4746
def matchOnlyAtCol(n):
4748
Helper method for defining parse actions that require matching at a specific
4749
column in the input text.
4751
def verifyCol(strg,locn,toks):
4752
if col(locn,strg) != n:
4753
raise ParseException(strg,locn,"matched token not at column %d" % n)
4756
def replaceWith(replStr):
    """
    Helper method for common parse actions that simply return a literal value.  Especially
    useful when used with C{L{transformString<ParserElement.transformString>}()}.

    The returned parse action takes the usual C{(s, l, t)} arguments, ignores all
    three, and always returns the one-element list C{[replStr]}.

    Example::
        num = Word(nums).setParseAction(lambda toks: int(toks[0]))
        na = oneOf("N/A NA").setParseAction(replaceWith(math.nan))
        term = na | num

        OneOrMore(term).parseString("324 234 N/A 234") # -> [324, 234, nan, 234]
    """
    return lambda s,l,t: [replStr]
4770
def removeQuotes(s,l,t):
4772
Helper parse action for removing quotation marks from parsed quoted strings.
4775
# by default, quotation marks are included in parsed results
4776
quotedString.parseString("'Now is the Winter of our Discontent'") # -> ["'Now is the Winter of our Discontent'"]
4778
# use removeQuotes to strip quotation marks from parsed results
4779
quotedString.setParseAction(removeQuotes)
4780
quotedString.parseString("'Now is the Winter of our Discontent'") # -> ["Now is the Winter of our Discontent"]
4784
def tokenMap(func, *args):
4786
Helper to define a parse action by mapping a function to all elements of a ParseResults list. If any additional
4787
args are passed, they are forwarded to the given function as additional arguments after
4788
the token, as in C{hex_integer = Word(hexnums).setParseAction(tokenMap(int, 16))}, which will convert the
4789
parsed data to an integer using base 16.
4791
Example (compare the last example to the one in L{ParserElement.transformString})::
4792
hex_ints = OneOrMore(Word(hexnums)).setParseAction(tokenMap(int, 16))
4793
hex_ints.runTests('''
4794
00 11 22 aa FF 0a 0d 1a
4797
upperword = Word(alphas).setParseAction(tokenMap(str.upper))
4798
OneOrMore(upperword).runTests('''
4799
my kingdom for a horse
4802
wd = Word(alphas).setParseAction(tokenMap(str.title))
4803
OneOrMore(wd).setParseAction(' '.join).runTests('''
4804
now is the winter of our discontent made glorious summer by this sun of york
4807
00 11 22 aa FF 0a 0d 1a
4808
[0, 17, 34, 170, 255, 10, 13, 26]
4810
my kingdom for a horse
4811
['MY', 'KINGDOM', 'FOR', 'A', 'HORSE']
4813
now is the winter of our discontent made glorious summer by this sun of york
4814
['Now Is The Winter Of Our Discontent Made Glorious Summer By This Sun Of York']
4817
return [func(tokn, *args) for tokn in t]
4820
func_name = getattr(func, '__name__',
4821
getattr(func, '__class__').__name__)
4823
func_name = str(func)
4824
pa.__name__ = func_name
4828
upcaseTokens = tokenMap(lambda t: _ustr(t).upper())
4829
"""(Deprecated) Helper parse action to convert tokens to upper case. Deprecated in favor of L{pyparsing_common.upcaseTokens}"""
4831
downcaseTokens = tokenMap(lambda t: _ustr(t).lower())
4832
"""(Deprecated) Helper parse action to convert tokens to lower case. Deprecated in favor of L{pyparsing_common.downcaseTokens}"""
4834
def _makeTags(tagStr, xml):
4835
"""Internal helper to construct opening and closing tag expressions, given a tag name"""
4836
if isinstance(tagStr,basestring):
4838
tagStr = Keyword(tagStr, caseless=not xml)
4840
resname = tagStr.name
4842
tagAttrName = Word(alphas,alphanums+"_-:")
4844
tagAttrValue = dblQuotedString.copy().setParseAction( removeQuotes )
4845
openTag = Suppress("<") + tagStr("tag") + \
4846
Dict(ZeroOrMore(Group( tagAttrName + Suppress("=") + tagAttrValue ))) + \
4847
Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">")
4849
printablesLessRAbrack = "".join(c for c in printables if c not in ">")
4850
tagAttrValue = quotedString.copy().setParseAction( removeQuotes ) | Word(printablesLessRAbrack)
4851
openTag = Suppress("<") + tagStr("tag") + \
4852
Dict(ZeroOrMore(Group( tagAttrName.setParseAction(downcaseTokens) + \
4853
Optional( Suppress("=") + tagAttrValue ) ))) + \
4854
Optional("/",default=[False]).setResultsName("empty").setParseAction(lambda s,l,t:t[0]=='/') + Suppress(">")
4855
closeTag = Combine(_L("</") + tagStr + ">")
4857
openTag = openTag.setResultsName("start"+"".join(resname.replace(":"," ").title().split())).setName("<%s>" % resname)
4858
closeTag = closeTag.setResultsName("end"+"".join(resname.replace(":"," ").title().split())).setName("</%s>" % resname)
4859
openTag.tag = resname
4860
closeTag.tag = resname
4861
return openTag, closeTag
4863
def makeHTMLTags(tagStr):
    """
    Helper to construct opening and closing tag expressions for HTML, given a tag name. Matches
    tags in either upper or lower case, attributes with namespaces and with quoted or unquoted values.

    Returns the C{(openTag, closeTag)} pair produced by C{_makeTags(tagStr, False)}.

    Example::
        text = '<td>More info at the <a href="http://pyparsing.wikispaces.com">pyparsing</a> wiki page</td>'
        # makeHTMLTags returns pyparsing expressions for the opening and closing tags as a 2-tuple
        a,a_end = makeHTMLTags("A")
        link_expr = a + SkipTo(a_end)("link_text") + a_end

        for link in link_expr.searchString(text):
            # attributes in the <A> tag (like "href" shown here) are also accessible as named results
            print(link.link_text, '->', link.href)
    prints::
        pyparsing -> http://pyparsing.wikispaces.com
    """
    return _makeTags( tagStr, False )
4882
def makeXMLTags(tagStr):
    """
    Helper to construct opening and closing tag expressions for XML, given a tag name. Matches
    tags only in the given upper/lower case.

    Returns the C{(openTag, closeTag)} pair produced by C{_makeTags(tagStr, True)}.

    Example: similar to L{makeHTMLTags}
    """
    return _makeTags( tagStr, True )
4891
def withAttribute(*args,**attrDict):
4893
Helper to create a validating parse action to be used with start tags created
4894
with C{L{makeXMLTags}} or C{L{makeHTMLTags}}. Use C{withAttribute} to qualify a starting tag
4895
with a required attribute value, to avoid false matches on common tags such as
4896
C{<TD>} or C{<DIV>}.
4898
Call C{withAttribute} with a series of attribute names and values. Specify the list
4899
of filter attribute names and values as:
4900
- keyword arguments, as in C{(align="right")}, or
4901
- as an explicit dict with C{**} operator, when an attribute name is also a Python
4902
reserved word, as in C{**{"class":"Customer", "align":"right"}}
4903
- a list of name-value tuples, as in ( ("ns1:class", "Customer"), ("ns2:align","right") )
4904
For attribute names with a namespace prefix, you must use the second form. Attribute
4905
names are matched insensitive to upper/lower case.
4907
If just testing for C{class} (with or without a namespace), use C{L{withClass}}.
4909
To verify that the attribute exists, but without specifying a value, pass
4910
C{withAttribute.ANY_VALUE} as the value.
4916
<div type="grid">1 4 0 1 0</div>
4917
<div type="graph">1,3 2,3 1,1</div>
4918
<div>this has no type</div>
4922
div,div_end = makeHTMLTags("div")
4924
# only match div tag having a type attribute with value "grid"
4925
div_grid = div().setParseAction(withAttribute(type="grid"))
4926
grid_expr = div_grid + SkipTo(div | div_end)("body")
4927
for grid_header in grid_expr.searchString(html):
4928
print(grid_header.body)
4930
# construct a match with any div tag having a type attribute, regardless of the value
4931
div_any_type = div().setParseAction(withAttribute(type=withAttribute.ANY_VALUE))
4932
div_expr = div_any_type + SkipTo(div | div_end)("body")
4933
for div_header in div_expr.searchString(html):
4934
print(div_header.body)
4944
attrs = attrDict.items()
4945
attrs = [(k,v) for k,v in attrs]
4947
for attrName,attrValue in attrs:
4948
if attrName not in tokens:
4949
raise ParseException(s,l,"no matching attribute " + attrName)
4950
if attrValue != withAttribute.ANY_VALUE and tokens[attrName] != attrValue:
4951
raise ParseException(s,l,"attribute '%s' has value '%s', must be '%s'" %
4952
(attrName, tokens[attrName], attrValue))
4954
withAttribute.ANY_VALUE = object()
4956
def withClass(classname, namespace=''):
    """
    Simplified version of C{L{withAttribute}} when matching on a div class - made
    difficult because C{class} is a reserved word in Python.

    Parameters:
     - classname - required value of the class attribute; pass
       C{withAttribute.ANY_VALUE} to accept any value
     - namespace - (default=C{''}) if non-empty, the attribute that is tested
       is C{"<namespace>:class"} instead of C{"class"}

    Returns the parse action created by C{withAttribute} for that single attribute.

    Example::
        html = '''
            <div class="grid">1 4 0 1 0</div>
            <div class="graph">1,3 2,3 1,1</div>
            <div>this <div> has no class</div>
        '''
        div,div_end = makeHTMLTags("div")
        div_grid = div().setParseAction(withClass("grid"))

        grid_expr = div_grid + SkipTo(div | div_end)("body")
        for grid_header in grid_expr.searchString(html):
            print(grid_header.body)

        div_any_type = div().setParseAction(withClass(withAttribute.ANY_VALUE))
        div_expr = div_any_type + SkipTo(div | div_end)("body")
        for div_header in div_expr.searchString(html):
            print(div_header.body)
    prints::
        1 4 0 1 0

        1 4 0 1 0
        1,3 2,3 1,1
    """
    if namespace:
        classattr = "%s:class" % namespace
    else:
        classattr = "class"
    # "class" cannot be written as a keyword argument, so pass it via ** expansion
    return withAttribute(**{classattr : classname})
4991
opAssoc = _Constants()
4992
opAssoc.LEFT = object()
4993
opAssoc.RIGHT = object()
4995
def infixNotation( baseExpr, opList, lpar=Suppress('('), rpar=Suppress(')') ):
4997
Helper method for constructing grammars of expressions made up of
4998
operators working in a precedence hierarchy. Operators may be unary or
4999
binary, left- or right-associative. Parse actions can also be attached
5000
to operator expressions. The generated parser will also recognize the use
5001
of parentheses to override operator precedences (see example below).
5003
Note: if you define a deep operator list, you may see performance issues
5004
when using infixNotation. See L{ParserElement.enablePackrat} for a
5005
mechanism to potentially improve your parser performance.
5008
- baseExpr - expression representing the most basic element for the nested
5009
- opList - list of tuples, one for each operator precedence level in the
5010
expression grammar; each tuple is of the form
5011
(opExpr, numTerms, rightLeftAssoc, parseAction), where:
5012
- opExpr is the pyparsing expression for the operator;
5013
may also be a string, which will be converted to a Literal;
5014
if numTerms is 3, opExpr is a tuple of two expressions, for the
5015
two operators separating the 3 terms
5016
- numTerms is the number of terms for this operator (must
5018
- rightLeftAssoc is the indicator whether the operator is
5019
right or left associative, using the pyparsing-defined
5020
constants C{opAssoc.RIGHT} and C{opAssoc.LEFT}.
5021
- parseAction is the parse action to be associated with
5022
expressions matching this operator expression (the
5023
parse action tuple member may be omitted)
5024
- lpar - expression for matching left-parentheses (default=C{Suppress('(')})
5025
- rpar - expression for matching right-parentheses (default=C{Suppress(')')})
5028
# simple example of four-function arithmetic with ints and variable names
5029
integer = pyparsing_common.signed_integer
5030
varname = pyparsing_common.identifier
5032
arith_expr = infixNotation(integer | varname,
5034
('-', 1, opAssoc.RIGHT),
5035
(oneOf('* /'), 2, opAssoc.LEFT),
5036
(oneOf('+ -'), 2, opAssoc.LEFT),
5039
arith_expr.runTests('''
5043
''', fullDump=False)
5046
[[5, '+', [3, '*', 6]]]
5049
[[[5, '+', 3], '*', 6]]
5052
[[['-', 2], '-', ['-', 11]]]
5055
lastExpr = baseExpr | ( lpar + ret + rpar )
5056
for i,operDef in enumerate(opList):
5057
opExpr,arity,rightLeftAssoc,pa = (operDef + (None,))[:4]
5058
termName = "%s term" % opExpr if arity < 3 else "%s%s term" % opExpr
5060
if opExpr is None or len(opExpr) != 2:
5061
raise ValueError("if numterms=3, opExpr must be a tuple or list of two expressions")
5062
opExpr1, opExpr2 = opExpr
5063
thisExpr = Forward().setName(termName)
5064
if rightLeftAssoc == opAssoc.LEFT:
5066
matchExpr = FollowedBy(lastExpr + opExpr) + Group( lastExpr + OneOrMore( opExpr ) )
5068
if opExpr is not None:
5069
matchExpr = FollowedBy(lastExpr + opExpr + lastExpr) + Group( lastExpr + OneOrMore( opExpr + lastExpr ) )
5071
matchExpr = FollowedBy(lastExpr+lastExpr) + Group( lastExpr + OneOrMore(lastExpr) )
5073
matchExpr = FollowedBy(lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr) + \
5074
Group( lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr )
5076
raise ValueError("operator must be unary (1), binary (2), or ternary (3)")
5077
elif rightLeftAssoc == opAssoc.RIGHT:
5079
# try to avoid LR with this extra test
5080
if not isinstance(opExpr, Optional):
5081
opExpr = Optional(opExpr)
5082
matchExpr = FollowedBy(opExpr.expr + thisExpr) + Group( opExpr + thisExpr )
5084
if opExpr is not None:
5085
matchExpr = FollowedBy(lastExpr + opExpr + thisExpr) + Group( lastExpr + OneOrMore( opExpr + thisExpr ) )
5087
matchExpr = FollowedBy(lastExpr + thisExpr) + Group( lastExpr + OneOrMore( thisExpr ) )
5089
matchExpr = FollowedBy(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr) + \
5090
Group( lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr )
5092
raise ValueError("operator must be unary (1), binary (2), or ternary (3)")
5094
raise ValueError("operator must indicate right or left associativity")
5096
matchExpr.setParseAction( pa )
5097
thisExpr <<= ( matchExpr.setName(termName) | lastExpr )
5102
operatorPrecedence = infixNotation
5103
"""(Deprecated) Former name of C{L{infixNotation}}, will be dropped in a future release."""
5105
dblQuotedString = Combine(Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*')+'"').setName("string enclosed in double quotes")
5106
sglQuotedString = Combine(Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*")+"'").setName("string enclosed in single quotes")
5107
quotedString = Combine(Regex(r'"(?:[^"\n\r\\]|(?:"")|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*')+'"'|
5108
Regex(r"'(?:[^'\n\r\\]|(?:'')|(?:\\(?:[^x]|x[0-9a-fA-F]+)))*")+"'").setName("quotedString using single or double quotes")
5109
unicodeString = Combine(_L('u') + quotedString.copy()).setName("unicode string literal")
5111
def nestedExpr(opener="(", closer=")", content=None, ignoreExpr=quotedString.copy()):
5113
Helper method for defining nested lists enclosed in opening and closing
5114
delimiters ("(" and ")" are the default).
5117
- opener - opening character for a nested list (default=C{"("}); can also be a pyparsing expression
5118
- closer - closing character for a nested list (default=C{")"}); can also be a pyparsing expression
5119
- content - expression for items within the nested lists (default=C{None})
5120
- ignoreExpr - expression for ignoring opening and closing delimiters (default=C{quotedString})
5122
If an expression is not provided for the content argument, the nested
5123
expression will capture all whitespace-delimited content between delimiters
5124
as a list of separate values.
5126
Use the C{ignoreExpr} argument to define expressions that may contain
5127
opening or closing characters that should not be treated as opening
5128
or closing characters for nesting, such as quotedString or a comment
5129
expression. Specify multiple expressions using an C{L{Or}} or C{L{MatchFirst}}.
5130
The default is L{quotedString}, but if no expressions are to be ignored,
5131
then pass C{None} for this argument.
5134
data_type = oneOf("void int short long char float double")
5135
decl_data_type = Combine(data_type + Optional(Word('*')))
5136
ident = Word(alphas+'_', alphanums+'_')
5137
number = pyparsing_common.number
5138
arg = Group(decl_data_type + ident)
5139
LPAR,RPAR = map(Suppress, "()")
5141
code_body = nestedExpr('{', '}', ignoreExpr=(quotedString | cStyleComment))
5143
c_function = (decl_data_type("type")
5145
+ LPAR + Optional(delimitedList(arg), [])("args") + RPAR
5146
+ code_body("body"))
5147
c_function.ignore(cStyleComment)
5154
int dec_to_hex(char hchar) {
5155
if (hchar >= '0' && hchar <= '9') {
5156
return (ord(hchar)-ord('0'));
5158
return (10+ord(hchar)-ord('A'));
5162
for func in c_function.searchString(source_code):
5163
print("%(name)s (%(type)s) args: %(args)s" % func)
5166
is_odd (int) args: [['int', 'x']]
5167
dec_to_hex (int) args: [['char', 'hchar']]
5169
if opener == closer:
5170
raise ValueError("opening and closing strings cannot be the same")
5172
if isinstance(opener,basestring) and isinstance(closer,basestring):
5173
if len(opener) == 1 and len(closer)==1:
5174
if ignoreExpr is not None:
5175
content = (Combine(OneOrMore(~ignoreExpr +
5176
CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS,exact=1))
5177
).setParseAction(lambda t:t[0].strip()))
5179
content = (empty.copy()+CharsNotIn(opener+closer+ParserElement.DEFAULT_WHITE_CHARS
5180
).setParseAction(lambda t:t[0].strip()))
5182
if ignoreExpr is not None:
5183
content = (Combine(OneOrMore(~ignoreExpr +
5184
~Literal(opener) + ~Literal(closer) +
5185
CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1))
5186
).setParseAction(lambda t:t[0].strip()))
5188
content = (Combine(OneOrMore(~Literal(opener) + ~Literal(closer) +
5189
CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS,exact=1))
5190
).setParseAction(lambda t:t[0].strip()))
5192
raise ValueError("opening and closing arguments must be strings if no content expression is given")
5194
if ignoreExpr is not None:
5195
ret <<= Group( Suppress(opener) + ZeroOrMore( ignoreExpr | ret | content ) + Suppress(closer) )
5197
ret <<= Group( Suppress(opener) + ZeroOrMore( ret | content ) + Suppress(closer) )
5198
ret.setName('nested %s%s expression' % (opener,closer))
5201
def indentedBlock(blockStatementExpr, indentStack, indent=True):
    """
    Helper method for defining space-delimited indentation blocks, such as
    those used to define block statements in Python source code.

    Parameters:
     - blockStatementExpr - expression defining syntax of statement that
            is repeated within the indented block
     - indentStack - list created by caller to manage indentation stack
            (multiple statementWithIndentedBlock expressions within a single grammar
            should share a common indentStack)
     - indent - boolean indicating whether block must be indented beyond
            the current level; set to False for block of left-most statements
            (default=C{True})

    A valid block must contain at least one C{blockStatement}.

    Note: as a side effect, C{blockStatementExpr} is modified in place so that
    it ignores backslash line continuations.

    Example::
        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''


        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group( "(" + Optional( delimitedList(identifier) ) + ")" ) + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group( funcDecl + func_body )

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Optional(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << ( funcDef | assignment | identifier )

        module_body = OneOrMore(stmt)

        parseTree = module_body.parseString(data)
        parseTree.pprint()
    prints::
        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    def checkPeerIndent(s,l,t):
        # parse action for PEER: the statement must start in the column on top of the stack
        if l >= len(s): return
        curCol = col(l,s)
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                # deeper than the current block without a block opener - unrecoverable
                raise ParseFatalException(s,l,"illegal nesting")
            raise ParseException(s,l,"not a peer entry")

    def checkSubIndent(s,l,t):
        # parse action for INDENT: a deeper column opens a new level on the stack
        curCol = col(l,s)
        if curCol > indentStack[-1]:
            indentStack.append( curCol )
        else:
            raise ParseException(s,l,"not a subentry")

    def checkUnindent(s,l,t):
        # parse action for UNDENT: the block ends when the column drops back to
        # (or below) the enclosing level; end of input also ends the block
        if l >= len(s): return
        curCol = col(l,s)
        if not(indentStack and curCol < indentStack[-1] and curCol <= indentStack[-2]):
            raise ParseException(s,l,"not an unindent")
        # discard the level pushed by checkSubIndent for the block being closed
        indentStack.pop()

    # newlines are matched explicitly, so only tabs and spaces are skipped as whitespace
    NL = OneOrMore(LineEnd().setWhitespaceChars("\t ").suppress())
    INDENT = (Empty() + Empty().setParseAction(checkSubIndent)).setName('INDENT')
    PEER   = Empty().setParseAction(checkPeerIndent).setName('')
    UNDENT = Empty().setParseAction(checkUnindent).setName('UNINDENT')
    if indent:
        smExpr = Group( Optional(NL) +
            #~ FollowedBy(blockStatementExpr) +
            INDENT + (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) + UNDENT)
    else:
        # left-most block: statements only need to be peers, no INDENT/UNDENT bracketing
        smExpr = Group( Optional(NL) +
            (OneOrMore( PEER + Group(blockStatementExpr) + Optional(NL) )) )
    # allow backslash-newline line continuations inside block statements
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.setName('indented block')
# character sets for 8-bit alphabetic and punctuation characters (codes 0xa1-0xff)
alphas8bit = srange(r"[\0xc0-\0xd6\0xd8-\0xf6\0xf8-\0xff]")
punc8bit = srange(r"[\0xa1-\0xbf\0xd7\0xf7]")

# expressions matching any opening / closing HTML tag
anyOpenTag,anyCloseTag = makeHTMLTags(Word(alphas,alphanums+"_:").setName('any tag'))
# entity name -> replacement character: gt '>', lt '<', amp '&', nbsp ' ', quot '"', apos "'"
_htmlEntityMap = dict(zip("gt lt amp nbsp quot apos".split(),'><& "\''))
# matches "&<name>;" for the names above; the name is captured as the 'entity' results name
commonHTMLEntity = Regex('&(?P<entity>' + '|'.join(_htmlEntityMap.keys()) +");").setName("common HTML entity")
def replaceHTMLEntity(t):
    """Helper parser action to replace common HTML entities with their special characters"""
    # t.entity is the entity name captured by the 'entity' group; unknown names yield None
    return _htmlEntityMap.get(t.entity)

# it's easy to get these comment structures wrong - they're very common, so may as well make them available
cStyleComment = Combine(Regex(r"/\*(?:[^*]|\*(?!/))*") + '*/').setName("C style comment")
"Comment of the form C{/* ... */}"

htmlComment = Regex(r"<!--[\s\S]*?-->").setName("HTML comment")
"Comment of the form C{<!-- ... -->}"

restOfLine = Regex(r".*").leaveWhitespace().setName("rest of line")
dblSlashComment = Regex(r"//(?:\\\n|[^\n])*").setName("// comment")
"Comment of the form C{// ... (to end of line)}"

cppStyleComment = Combine(Regex(r"/\*(?:[^*]|\*(?!/))*") + '*/'| dblSlashComment).setName("C++ style comment")
"Comment of either form C{L{cStyleComment}} or C{L{dblSlashComment}}"

javaStyleComment = cppStyleComment
"Same as C{L{cppStyleComment}}"

pythonStyleComment = Regex(r"#.*").setName("Python style comment")
"Comment of the form C{# ... (to end of line)}"

# one list item: printable words (no commas), optionally separated by spaces/tabs
# as long as the whitespace is not followed by a comma or the end of the line
_commasepitem = Combine(OneOrMore(Word(printables, excludeChars=',') +
                                  Optional( Word(" \t") +
                                            ~Literal(",") + ~LineEnd() ) ) ).streamline().setName("commaItem")
commaSeparatedList = delimitedList( Optional( quotedString.copy() | _commasepitem, default="") ).setName("commaSeparatedList")
"""(Deprecated) Predefined expression of 1 or more printable words or quoted strings, separated by commas.
   This expression is deprecated in favor of L{pyparsing_common.comma_separated_list}."""

# some other useful expressions - using lower-case class name since we are really using this as a namespace
class pyparsing_common:
    """
    Here are some common low-level expressions that may be useful in jump-starting parser development:
     - numeric forms (L{integers<integer>}, L{reals<real>}, L{scientific notation<sci_real>})
     - common L{programming identifiers<identifier>}
     - network addresses (L{MAC<mac_address>}, L{IPv4<ipv4_address>}, L{IPv6<ipv6_address>})
     - ISO8601 L{dates<iso8601_date>} and L{datetime<iso8601_datetime>}
     - L{UUID<uuid>}
     - L{comma-separated list<comma_separated_list>}
    Parse actions:
     - C{L{convertToInteger}}
     - C{L{convertToFloat}}
     - C{L{convertToDate}}
     - C{L{convertToDatetime}}
     - C{L{stripHTMLTags}}
     - C{L{upcaseTokens}}
     - C{L{downcaseTokens}}

    Example::
        pyparsing_common.number.runTests('''
            # any int or real number, returned as the appropriate type
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

        pyparsing_common.fnumber.runTests('''
            # any int or real number, returned as float
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

        pyparsing_common.hex_integer.runTests('''
            # hex numbers
            100
            FF
            ''')

        pyparsing_common.fraction.runTests('''
            # fractions
            1/2
            -3/4
            ''')

        pyparsing_common.mixed_integer.runTests('''
            # mixed fractions
            1
            1/2
            -3/4
            1-3/4
            ''')

        import uuid
        pyparsing_common.uuid.setParseAction(tokenMap(uuid.UUID))
        pyparsing_common.uuid.runTests('''
            # uuid
            12345678-1234-5678-1234-567812345678
            ''')
    prints::
        # any int or real number, returned as the appropriate type
        100
        [100]

        -100
        [-100]

        +100
        [100]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

        # any int or real number, returned as float
        100
        [100.0]

        -100
        [-100.0]

        +100
        [100.0]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

        # hex numbers
        100
        [256]

        FF
        [255]

        # fractions
        1/2
        [0.5]

        -3/4
        [-0.75]

        # mixed fractions
        1
        [1]

        1/2
        [0.5]

        -3/4
        [-0.75]

        1-3/4
        [1.75]

        # uuid
        12345678-1234-5678-1234-567812345678
        [UUID('12345678-1234-5678-1234-567812345678')]
    """

    convertToInteger = tokenMap(int)
    """
    Parse action for converting parsed integers to Python int
    """

    convertToFloat = tokenMap(float)
    """
    Parse action for converting parsed numbers to Python float
    """

    integer = Word(nums).setName("integer").setParseAction(convertToInteger)
    """expression that parses an unsigned integer, returns an int"""

    hex_integer = Word(hexnums).setName("hex integer").setParseAction(tokenMap(int,16))
    """expression that parses a hexadecimal integer, returns an int"""

    signed_integer = Regex(r'[+-]?\d+').setName("signed integer").setParseAction(convertToInteger)
    """expression that parses an integer with optional leading sign, returns an int"""

    # signed_integer() makes a copy, so replacing the parse action here does not affect signed_integer itself
    fraction = (signed_integer().setParseAction(convertToFloat) + '/' + signed_integer().setParseAction(convertToFloat)).setName("fraction")
    """fractional expression of an integer divided by an integer, returns a float"""
    fraction.addParseAction(lambda t: t[0]/t[-1])

    mixed_integer = (fraction | signed_integer + Optional(Optional('-').suppress() + fraction)).setName("fraction or mixed integer-fraction")
    """mixed integer of the form 'integer - fraction', with optional leading integer, returns float"""
    # the integer and fraction parts are parsed as separate tokens and added together
    mixed_integer.addParseAction(sum)

    real = Regex(r'[+-]?\d+\.\d*').setName("real number").setParseAction(convertToFloat)
    """expression that parses a floating point number and returns a float"""

    sci_real = Regex(r'[+-]?\d+([eE][+-]?\d+|\.\d*([eE][+-]?\d+)?)').setName("real number with scientific notation").setParseAction(convertToFloat)
    """expression that parses a floating point number with optional scientific notation and returns a float"""

    # streamlining this expression makes the docs nicer-looking
    number = (sci_real | real | signed_integer).streamline()
    """any numeric expression, returns the corresponding Python type"""

    fnumber = Regex(r'[+-]?\d+\.?\d*([eE][+-]?\d+)?').setName("fnumber").setParseAction(convertToFloat)
    """any int or real number, returned as float"""

    identifier = Word(alphas+'_', alphanums+'_').setName("identifier")
    """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""

    ipv4_address = Regex(r'(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}').setName("IPv4 address")
    "IPv4 address (C{0.0.0.0 - 255.255.255.255})"

    _ipv6_part = Regex(r'[0-9a-fA-F]{1,4}').setName("hex_integer")
    _full_ipv6_address = (_ipv6_part + (':' + _ipv6_part)*7).setName("full IPv6 address")
    _short_ipv6_address = (Optional(_ipv6_part + (':' + _ipv6_part)*(0,6)) + "::" + Optional(_ipv6_part + (':' + _ipv6_part)*(0,6))).setName("short IPv6 address")
    # the "::" form must stand in for at least one group, so fewer than 8 hex groups may be present
    _short_ipv6_address.addCondition(lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8)
    _mixed_ipv6_address = ("::ffff:" + ipv4_address).setName("mixed IPv6 address")
    ipv6_address = Combine((_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).setName("IPv6 address")).setName("IPv6 address")
    "IPv6 address (long, short, or mixed form)"

    # the first delimiter is captured and back-referenced so all delimiters in an address must match
    mac_address = Regex(r'[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}').setName("MAC address")
    "MAC address xx:xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"

    @staticmethod
    def convertToDate(fmt="%Y-%m-%d"):
        """
        Helper to create a parse action for converting parsed date string to Python datetime.date

        Params -
         - fmt - format to be passed to datetime.strptime (default=C{"%Y-%m-%d"})

        Example::
            date_expr = pyparsing_common.iso8601_date.copy()
            date_expr.setParseAction(pyparsing_common.convertToDate())
            print(date_expr.parseString("1999-12-31"))
        prints::
            [datetime.date(1999, 12, 31)]
        """
        def cvt_fn(s,l,t):
            try:
                return datetime.strptime(t[0], fmt).date()
            except ValueError as ve:
                # report a string that does not fit fmt as an ordinary parse failure
                raise ParseException(s, l, str(ve))
        return cvt_fn

    @staticmethod
    def convertToDatetime(fmt="%Y-%m-%dT%H:%M:%S.%f"):
        """
        Helper to create a parse action for converting parsed datetime string to Python datetime.datetime

        Params -
         - fmt - format to be passed to datetime.strptime (default=C{"%Y-%m-%dT%H:%M:%S.%f"})

        Example::
            dt_expr = pyparsing_common.iso8601_datetime.copy()
            dt_expr.setParseAction(pyparsing_common.convertToDatetime())
            print(dt_expr.parseString("1999-12-31T23:59:59.999"))
        prints::
            [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
        """
        def cvt_fn(s,l,t):
            try:
                return datetime.strptime(t[0], fmt)
            except ValueError as ve:
                # report a string that does not fit fmt as an ordinary parse failure
                raise ParseException(s, l, str(ve))
        return cvt_fn

    iso8601_date = Regex(r'(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?').setName("ISO8601 date")
    "ISO8601 date (C{yyyy-mm-dd})"

    iso8601_datetime = Regex(r'(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?').setName("ISO8601 datetime")
    "ISO8601 datetime (C{yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)}) - trailing seconds, milliseconds, and timezone optional; accepts separating C{'T'} or C{' '}"

    uuid = Regex(r'[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}').setName("UUID")
    "UUID (C{xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx})"

    # suppressing every opening and closing tag leaves only the text between them
    _html_stripper = anyOpenTag.suppress() | anyCloseTag.suppress()
    @staticmethod
    def stripHTMLTags(s, l, tokens):
        """
        Parse action to remove HTML tags from web page HTML source

        Example::
            # strip HTML links from normal text
            text = '<td>More info at the <a href="http://pyparsing.wikispaces.com">pyparsing</a> wiki page</td>'
            td,td_end = makeHTMLTags("TD")
            table_text = td + SkipTo(td_end).setParseAction(pyparsing_common.stripHTMLTags)("body") + td_end

            print(table_text.parseString(text).body) # -> 'More info at the pyparsing wiki page'
        """
        return pyparsing_common._html_stripper.transformString(tokens[0])

    _commasepitem = Combine(OneOrMore(~Literal(",") + ~LineEnd() + Word(printables, excludeChars=',')
                                        + Optional( White(" \t") ) ) ).streamline().setName("commaItem")
    comma_separated_list = delimitedList( Optional( quotedString.copy() | _commasepitem, default="") ).setName("comma separated list")
    """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""

    upcaseTokens = staticmethod(tokenMap(lambda t: _ustr(t).upper()))
    """Parse action to convert tokens to upper case."""

    downcaseTokens = staticmethod(tokenMap(lambda t: _ustr(t).lower()))
    """Parse action to convert tokens to lower case."""

if __name__ == "__main__":
5629
selectToken = CaselessLiteral("select")
5630
fromToken = CaselessLiteral("from")
5632
ident = Word(alphas, alphanums + "_$")
5634
columnName = delimitedList(ident, ".", combine=True).setParseAction(upcaseTokens)
5635
columnNameList = Group(delimitedList(columnName)).setName("columns")
5636
columnSpec = ('*' | columnNameList)
5638
tableName = delimitedList(ident, ".", combine=True).setParseAction(upcaseTokens)
5639
tableNameList = Group(delimitedList(tableName)).setName("tables")
5641
simpleSQL = selectToken("command") + columnSpec("columns") + fromToken + tableNameList("tables")
5643
# demo runTests method, including embedded comments in test string
5644
simpleSQL.runTests("""
5645
# '*' as column list and dotted table name
5646
select * from SYS.XYZZY
5648
# caseless match on "SELECT", and casts back to "select"
5649
SELECT * from XYZZY, ABC
5651
# list of column names, and mixed case SELECT keyword
5652
Select AA,BB,CC from Sys.dual
5655
Select A, B, C from Sys.dual, Table2
5657
# invalid SELECT keyword - should fail
5658
Xelect A, B, C from Sys.dual
5660
# incomplete command - should fail
5663
# invalid column name - should fail
5664
Select ^^^ frox Sys.dual
5668
pyparsing_common.number.runTests("""
5677
# any int or real number, returned as float
5678
pyparsing_common.fnumber.runTests("""
5687
pyparsing_common.hex_integer.runTests("""
5693
pyparsing_common.uuid.setParseAction(tokenMap(uuid.UUID))
5694
pyparsing_common.uuid.runTests("""
5695
12345678-1234-5678-1234-567812345678