1
#=======================================================================
3
# Python Lexical Analyser
5
# Lexical Analyser Specification
7
#=======================================================================
17
# debug_flags for Lexicon constructor
23
class State:
    """
    This class is used as part of a Plex.Lexicon specification to
    introduce a user-defined state.

    Constructor:

       State(name, token_specifications)
    """

    # NOTE(review): the class header and attribute defaults were lost in this
    # garbled chunk and have been reconstructed -- confirm against upstream Plex.
    name = None      # character string naming the state
    tokens = None    # list of token definitions (pattern, action) tuples

    def __init__(self, name, tokens):
        # Simply record the state name and its token specifications; they are
        # consumed later by Lexicon.__init__ when building the NFA.
        self.name = name
        self.tokens = tokens
40
Lexicon(specification) builds a lexical analyser from the given
41
|specification|. The specification consists of a list of
42
specification items. Each specification item may be either:
44
1) A token definition, which is a tuple:
48
The |pattern| is a regular expression built using the
49
constructors defined in the Plex module.
51
The |action| is the action to be performed when this pattern
52
is recognised (see below).
54
2) A state definition:
58
where |name| is a character string naming the state,
59
and |tokens| is a list of token definitions as
60
above. The meaning and usage of states is described
66
The |action| in a token specification may be one of three things:
68
1) A function, which is called as follows:
70
function(scanner, text)
72
where |scanner| is the relevant Scanner instance, and |text|
73
is the matched text. If the function returns anything
74
other than None, that value is returned as the value of the
75
token. If it returns None, scanning continues as if the IGNORE
76
action were specified (see below).
78
2) One of the following special actions:
80
IGNORE means that the recognised characters will be treated as
81
white space and ignored. Scanning will continue until
82
the next non-ignored token is recognised before returning.
84
TEXT causes the scanned text itself to be returned as the
87
3) Any other value, which is returned as the value of the token.
92
At any given time, the scanner is in one of a number of states.
93
Associated with each state is a set of possible tokens. When scanning,
94
only tokens associated with the current state are recognised.
96
There is a default state, whose name is the empty string. Token
97
definitions which are not inside any State definition belong to
100
The initial state of the scanner is the default state. The state can
101
be changed in one of two ways:
103
1) Using Begin(state_name) as the action of a token.
105
2) Calling the begin(state_name) method of the Scanner.
107
To change back to the default state, use '' as the state name.
110
machine = None # compiled state machine; queried by get_initial_state() -- presumably set during construction, TODO confirm
111
tables = None # StateTableMachine; not referenced anywhere in this chunk -- possibly set elsewhere, TODO confirm
113
def __init__(self, specifications, debug = None, debug_flags = 7, timings = None):
    """
    Build the lexical analyser from |specifications| (see the class
    docstring for the specification format).

    debug        -- writable file to which NFA/DFA dumps are written.
    debug_flags  -- bitmask selecting dumps: 1 = NFA, 2 = DFA.
    timings      -- writable file to which construction timings are written.

    Raises Errors.InvalidScanner if |specifications| is not a list, and
    Errors.InvalidToken for a malformed specification item.
    """
    # NOTE(review): several statements of this constructor were missing from
    # the garbled chunk (token_number initialisation, timing guards, the
    # else-branch and the final self.machine assignment); they have been
    # reconstructed so the visible references (time1, total_time, ...) are
    # defined -- confirm against upstream Plex.
    if type(specifications) != types.ListType:
        raise Errors.InvalidScanner("Scanner definition is not a list")
    if timings:
        from Timing import time
        total_time = 0.0
        time1 = time()
    # Phase 1: build a non-deterministic machine from the specification.
    nfa = Machines.Machine()
    default_initial_state = nfa.new_initial_state('')
    token_number = 1
    for spec in specifications:
        if isinstance(spec, State):
            # A user-defined state: its tokens hang off a dedicated
            # initial state named after the State.
            user_initial_state = nfa.new_initial_state(spec.name)
            for token in spec.tokens:
                self.add_token_to_machine(
                    nfa, user_initial_state, token, token_number)
                token_number = token_number + 1
        elif type(spec) == types.TupleType:
            # A bare (pattern, action) token belongs to the default state.
            self.add_token_to_machine(
                nfa, default_initial_state, spec, token_number)
            token_number = token_number + 1
        else:
            raise Errors.InvalidToken(
                token_number,
                "Expected a token definition (tuple) or State instance")
    if timings:
        time2 = time()
        total_time = total_time + (time2 - time1)
        time3 = time()
    if debug and (debug_flags & 1):
        debug.write("\n============= NFA ===========\n")
        nfa.dump(debug)
    # Phase 2: convert to a deterministic machine.  The DFA dump is only
    # useful alongside the NFA dump, hence the (debug_flags & 3) == 3 test.
    dfa = DFA.nfa_to_dfa(nfa, debug = (debug_flags & 3) == 3 and debug)
    if timings:
        time4 = time()
        total_time = total_time + (time4 - time3)
    if debug and (debug_flags & 2):
        debug.write("\n============= DFA ===========\n")
        dfa.dump(debug)
    if timings:
        timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
        timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
        timings.write("TOTAL : %5.2f\n" % total_time)
    self.machine = dfa
158
def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
    """
    Compile one (pattern, action) token definition into |machine|,
    attaching its pattern between |initial_state| and a fresh final state.

    |token_number| is used both to report errors and as a (negated)
    priority so that earlier tokens win when several match.

    Re-raises any Errors.PlexError with the token number prepended.
    """
    # NOTE(review): the try/else structure and the plain-Action branch were
    # missing from the garbled chunk and have been reconstructed -- confirm
    # against upstream Plex.
    try:
        (re, action_spec) = self.parse_token_definition(token_spec)
        # Disabled this -- matching empty strings can be useful
        #if re.nullable:
        #  raise Errors.InvalidToken(
        #    token_number, "Pattern can match 0 input symbols")
        if isinstance(action_spec, Actions.Action):
            action = action_spec
        elif callable(action_spec):
            # Plain functions are wrapped so they are called with
            # (scanner, text) on a match.
            action = Actions.Call(action_spec)
        else:
            # Any other value is returned verbatim as the token value.
            action = Actions.Return(action_spec)
        final_state = machine.new_state()
        re.build_machine(machine, initial_state, final_state,
                         match_bol = 1, nocase = 0)
        # Negative priority: lower token numbers beat higher ones.
        final_state.set_action(action, priority = -token_number)
    except Errors.PlexError as e:
        # Modern `as` syntax replaces the removed `except E, e` form.
        raise e.__class__("Token number %d: %s" % (token_number, e))
178
def parse_token_definition(self, token_spec):
    """
    Validate one token specification and split it into (pattern, action).

    Raises Errors.InvalidToken if |token_spec| is not a 2-tuple or if its
    first item is not a Regexps.RE instance.
    """
    # `!=` replaces the `<>` operator, which was removed in Python 3.
    if type(token_spec) != types.TupleType:
        raise Errors.InvalidToken("Token definition is not a tuple")
    if len(token_spec) != 2:
        raise Errors.InvalidToken("Wrong number of items in token definition")
    pattern, action = token_spec
    if not isinstance(pattern, Regexps.RE):
        raise Errors.InvalidToken("Pattern is not an RE instance")
    return (pattern, action)
188
def get_initial_state(self, name):
    """Look up the compiled machine's initial state for scanner state |name|."""
    machine = self.machine
    return machine.get_initial_state(name)