# -*- coding: utf-8 -*-
"""
    pygments.lexer
    ~~~~~~~~~~~~~~

    Base lexer classes.

    :copyright: Copyright 2006-2010 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
import re
from pygments.filter import apply_filters, Filter
from pygments.filters import get_filter_by_name
from pygments.token import Error, Text, Other, _TokenType
from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
     make_analysator


__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'bygroups', 'using', 'this']


_default_analyse = staticmethod(lambda x: 0.0)
class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(cls, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(cls, name, bases, d)
class Lexer(object):
    """
    Lexer for a specific language.

    Basic options recognized:
    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``ensurenl``
        Make sure that the input ends with a newline (default: True). This
        is required for some lexers that consume input linewise.
        *New in Pygments 1.3.*
    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'latin1'``).
        Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
        ``'chardet'`` to use the chardet library, if it is installed.
    """

    #: Name of the lexer
    name = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: MIME types
    mimetypes = []

    __metaclass__ = LexerMeta
    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.ensurenl = get_bool_opt(options, 'ensurenl', True)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'latin1')
        # self.encoding = options.get('inencoding', None) or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)
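
    # Illustrative sketch (not part of the original module): the options above
    # are normally passed when instantiating a concrete lexer, and filters can
    # also be added afterwards by name, e.g.
    #
    #     from pygments.lexers import PythonLexer
    #     lx = PythonLexer(stripnl=False, tabsize=4)
    #     lx.add_filter('keywordcase', case='upper')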
    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__
    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)
    def analyse_text(text):
        """
        Has to return a float between ``0`` and ``1`` that indicates
        if a lexer wants to highlight this text. Used by ``guess_lexer``.
        If this method returns ``0`` it won't highlight it in any case, if
        it returns ``1`` highlighting with this lexer is guaranteed.

        The `LexerMeta` metaclass automatically wraps this function so
        that it works like a static method (no ``self`` or ``cls``
        parameter) and the return value is automatically converted to
        `float`. If the return value is an object that is boolean `False`
        it's the same as if the return value was ``0.0``.
        """
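
    # Illustrative sketch (not part of the original module): a subclass's
    # ``analyse_text`` typically checks for telltale substrings and returns a
    # confidence score between 0 and 1, e.g.
    #
    #     def analyse_text(text):
    #         if text.startswith('<?php'):
    #             return 0.9
    #         return 0.0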
    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted, and apply registered filters.
        """
        if not isinstance(text, unicode):
            if self.encoding == 'guess':
                try:
                    text = text.decode('utf-8')
                    if text.startswith(u'\ufeff'):
                        text = text[len(u'\ufeff'):]
                except UnicodeDecodeError:
                    text = text.decode('latin1')
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/')
                enc = chardet.detect(text)
                text = text.decode(enc['encoding'])
            else:
                text = text.decode(self.encoding)
        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if self.ensurenl and not text.endswith('\n'):
            text += '\n'

        def streamer():
            for i, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream
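
    # Illustrative usage sketch (not part of the original module): iterating
    # over the filtered (tokentype, value) stream of a concrete lexer:
    #
    #     from pygments.lexers import PythonLexer
    #     for ttype, value in PythonLexer().get_tokens(u'print 1\n'):
    #         print ttype, repr(value)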
    def get_tokens_unprocessed(self, text):
        """
        Return an iterable of (tokentype, value) pairs.
        In subclasses, implement this method as a generator to
        maximize effectiveness.
        """
        raise NotImplementedError
class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments. A root lexer and
    a language lexer. First everything is scanned using the language
    lexer; afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))
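
# Illustrative sketch (not part of the original module): template lexers are
# typically built by passing the two sublexers to DelegatingLexer.__init__,
# roughly like the RHTML lexer in pygments.lexers.templates:
#
#     class RhtmlLexer(DelegatingLexer):
#         def __init__(self, **options):
#             super(RhtmlLexer, self).__init__(HtmlLexer, ErbLexer, **options)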
#-------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):
    """
    Indicates that a state should include rules from another state.
    """
    pass


class combined(tuple):
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass
class _PseudoMatch(object):
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text
def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                if ctx:
                    ctx.pos = match.start(i + 1)
                for item in action(lexer, _PseudoMatch(match.start(i + 1),
                                   match.group(i + 1)), ctx):
                    if item:
                        yield item
        if ctx:
            ctx.pos = match.end()
    return callback
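
# Illustrative sketch (not part of the original module): ``bygroups`` is used
# inside a ``tokens`` rule to assign one token type per regex group, e.g.
# (``Keyword``, ``Text`` and ``Name.Function`` come from pygments.token):
#
#     (r'(def)(\s+)([a-zA-Z_]\w*)', bygroups(Keyword, Text, Name.Function)),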
class _This(object):
    """
    Special singleton used for indicating the caller class.
    """
this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)
    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback
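
# Illustrative sketch (not part of the original module): ``using`` appears as
# the action in a ``tokens`` rule, delegating the matched text either to
# another lexer or, via ``this``, back to the current lexer class, e.g.
#
#     (r'[^\n]+', using(JavascriptLexer)),          # delegate whole match
#     (r'\{\{.*?\}\}', using(this, state='expr')),  # 'expr' is a made-up state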
class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_state(cls, unprocessed, processed, state):
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = re.compile(tdef[0], rflags).match
            except Exception, err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err))

            assert type(tdef[1]) is _TokenType or callable(tdef[1]), \
                   'token type must be simple type or callable, not %r' % (tdef[1],)
            if len(tdef) == 2:
                new_state = None
            else:
                tdef2 = tdef[2]
                if isinstance(tdef2, str):
                    # an existing state
                    if tdef2 == '#pop':
                        new_state = -1
                    elif tdef2 in unprocessed:
                        new_state = (tdef2,)
                    elif tdef2 == '#push':
                        new_state = tdef2
                    elif tdef2[:5] == '#pop:':
                        new_state = -int(tdef2[5:])
                    else:
                        assert False, 'unknown new state %r' % tdef2
                elif isinstance(tdef2, combined):
                    # combine a new state from existing ones
                    new_state = '_tmp_%d' % cls._tmpname
                    cls._tmpname += 1
                    itokens = []
                    for istate in tdef2:
                        assert istate != state, 'circular state ref %r' % istate
                        itokens.extend(cls._process_state(unprocessed,
                                                          processed, istate))
                    processed[new_state] = itokens
                    new_state = (new_state,)
                elif isinstance(tdef2, tuple):
                    # push more than one state
                    for state in tdef2:
                        assert (state in unprocessed or
                                state in ('#pop', '#push')), \
                               'unknown new state ' + state
                    new_state = tdef2
                else:
                    assert False, 'unknown new state def %r' % tdef2
            tokens.append((rex, tdef[1], new_state))
        return tokens
    def process_tokendef(cls, name, tokendefs=None):
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in tokendefs.keys():
            cls._process_state(tokendefs, processed, state)
        return processed

    def __call__(cls, *args, **kwds):
        if not hasattr(cls, '_tokens'):
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.tokens)

        return type.__call__(cls, *args, **kwds)
class RegexLexer(Lexer):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """
    __metaclass__ = RegexLexerMeta

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: The initial state is 'root'.
    #: ``new_state`` can be omitted to signify no state transition.
    #: If it is a string, the state is pushed on the stack and changed.
    #: If it is a tuple of strings, all states are pushed on the stack and
    #: the current state will be the topmost.
    #: It can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}
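
    # Illustrative sketch (not part of the original module): a minimal
    # subclass could define ``tokens`` like this (state names and patterns are
    # made up; String, Comment etc. come from pygments.token):
    #
    #     tokens = {
    #         'root': [
    #             include('whitespace'),
    #             (r'#.*?$', Comment),
    #             (r'"', String, 'string'),      # push the 'string' state
    #             (r'.', Text),
    #         ],
    #         'whitespace': [
    #             (r'\s+', Text),
    #         ],
    #         'string': [
    #             (r'[^"\\]+', String),
    #             (r'\\.', String.Escape),
    #             (r'"', String, '#pop'),        # pop back to 'root'
    #         ],
    #     }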
    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (tokentype, text) pairs.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if type(action) is _TokenType:
                        yield pos, action, m.group()
                    else:
                        for item in action(self, m):
                            yield item
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop
                            del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        pos += 1
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Text, u'\n'
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break
class LexerContext(object):
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text)  # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)
class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (tokentype, text) pairs.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if type(action) is _TokenType:
                        yield ctx.pos, action, m.group()
                        ctx.pos = m.end()
                    else:
                        for item in action(self, m, ctx):
                            yield item
                        if not new_state:
                            # altered the state stack?
                            statetokens = tokendefs[ctx.stack[-1]]
                    # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            ctx.stack.extend(new_state)
                        elif isinstance(new_state, int):
                            # pop
                            del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.pos += 1
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, u'\n'
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break
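
# Illustrative sketch (not part of the original module): an ExtendedRegexLexer
# can be resumed from an explicit context object (``MyExtendedLexer`` is a
# hypothetical subclass):
#
#     ctx = LexerContext(text, 0, stack=['root'])
#     for pos, token, value in MyExtendedLexer().get_tokens_unprocessed(context=ctx):
#         ...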
def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = insertions.next()
    except StopIteration:
        # no insertions, pass the token stream through unchanged
        for item in tokens:
            yield item
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration. store the position of first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            yield realpos, t, tmpval
            realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = insertions.next()
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        yield realpos, t, v[oldi:]
        realpos += len(v) - oldi

    # leftover insertions with no normal tokens following them
    while insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
        try:
            index, itokens = insertions.next()
        except StopIteration:
            insleft = False
            break  # not strictly necessary
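
# Illustrative sketch (not part of the original module): ``do_insertions`` is
# what ``DelegatingLexer`` uses above; a console-style lexer can use it to
# splice prompt tokens into another lexer's output, e.g.
#
#     insertions = [(0, [(0, Generic.Prompt, u'>>> ')])]
#     stream = PythonLexer().get_tokens_unprocessed(u'print 1\n')
#     for pos, ttype, value in do_insertions(insertions, stream):
#         ...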