1
#ifndef INC_UnicodeCharScanner_hpp__
2
#define INC_UnicodeCharScanner_hpp__
7
#include <antlr/config.hpp>
8
#include <antlr/CommonToken.hpp>
9
#include <antlr/TokenStream.hpp>
10
#include <antlr/RecognitionException.hpp>
11
#include <antlr/SemanticException.hpp>
12
#include <antlr/InputBuffer.hpp>
13
#include <antlr/BitSet.hpp>
14
#include <antlr/LexerSharedInputState.hpp>
16
#include "MismatchedUnicodeCharException.hpp"
18
/** Superclass of generated lexers
20
class UnicodeCharScanner : public antlr::TokenStream {
22
typedef antlr::RefToken (*factory_type)();
24
typedef unsigned int char_type;
25
typedef std::map<std::string,int> string_map;
27
UnicodeCharScanner( antlr::InputBuffer& cb, bool case_sensitive )
28
: saveConsumedInput(true)
29
, caseSensitive(case_sensitive)
31
, inputState(new antlr::LexerInputState(cb))
36
setTokenObjectFactory(&antlr::CommonToken::factory);
38
UnicodeCharScanner( antlr::InputBuffer* cb, bool case_sensitive )
39
: saveConsumedInput(true)
40
, caseSensitive(case_sensitive)
42
, inputState(new antlr::LexerInputState(cb))
47
setTokenObjectFactory(&antlr::CommonToken::factory);
49
UnicodeCharScanner( const antlr::LexerSharedInputState& state, bool case_sensitive )
50
: saveConsumedInput(true)
51
, caseSensitive(case_sensitive)
58
setTokenObjectFactory(&antlr::CommonToken::factory);
61
virtual ~UnicodeCharScanner()
65
virtual char_type LA(char_type i)
67
char_type c = inputState->getInput().LA(i);
71
virtual void append(char_type c)
73
if (saveConsumedInput)
75
size_t len = text.length();
77
if( (len % 256) == 0 )
78
text.reserve(len+256);
80
// This is how UTF8 is encoded
81
// +---------------------------+----------+----------+----------+----------+
82
// | Unicode scalar | 1st | 2nd | 3rd | 4th |
83
// +---------------------------+----------+----------+----------+----------+
84
// |00000000 0xxxxxxx | 0xxxxxxx | | | |
85
// |00000yyy yyxxxxxx | 110yyyyy | 10xxxxxx | | |
86
// |zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx | |
87
// |000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
88
// +---------------------------+----------+----------+----------+----------+
97
text += ( (c >> 6) | 0xC0 );
98
text += ( c & 0x3F | 0x80 );
100
else if (c < 0x10000)
102
text += ( (c >> 12) | 0xE0 );
103
text += ( ((c >> 6) & 0x3F) | 0x80 );
104
text += ( (c & 0x3F) | 0x80 );
106
else if (c < 0x200000)
108
text += ( (c >> 18) | 0xF0 ); // first 3 bits
109
text += ( (((c >> 16) & 0x3) << 4) |
110
((c >> 12) & 0xF) | 0x80 );
111
text += ( ((c >> 6) & 0x3F) | 0x80 );
112
text += ( (c & 0x3F) | 0x80 );
119
virtual void append(const std::string& s)
122
if (saveConsumedInput)
126
virtual void commit()
128
inputState->getInput().commit();
131
virtual void consume()
133
if (inputState->guessing == 0)
137
inputState->column++;
139
inputState->getInput().consume();
142
/** Consume chars until one matches the given char */
143
virtual void consumeUntil(char_type c)
147
char_type la_1 = LA(1);
148
if( static_cast<char_type>(EOF_CHAR) == la_1 || la_1 == c )
154
/** Consume chars until one matches the given set */
155
virtual void consumeUntil(const antlr::BitSet& set)
159
char_type la_1 = LA(1);
160
if( static_cast<char_type>(EOF_CHAR) == la_1 || set.member(la_1) )
166
/// Mark the current position and return an id for it
167
virtual unsigned int mark()
169
return inputState->getInput().mark();
172
/// Rewind the scanner to a previously marked position
173
virtual void rewind(unsigned int pos)
175
inputState->getInput().rewind(pos);
178
/// See if input contains character 'c'; throw MismatchedUnicodeCharException if not
179
virtual void match(char_type c)
181
char_type la_1 = LA(1);
183
throw MismatchedUnicodeCharException(la_1, c, false, this);
187
/** See if input contains element from bitset b
188
* throw MismatchedUnicodeCharException if not
190
virtual void match(const antlr::BitSet& b)
192
char_type la_1 = LA(1);
194
if ( !b.member(la_1) )
195
throw MismatchedUnicodeCharException( la_1, b, false, this );
199
/** See if input contains string 's' throw MismatchedUnicodeCharException if not
200
* @note the string cannot match EOF
202
virtual void match( const char* s )
206
// the & 0xFF is here to prevent sign extension later on
207
char_type la_1 = LA(1), c = (*s++ & 0xFF);
210
throw MismatchedUnicodeCharException(la_1, c, false, this);
215
/** See if input contains string 's' throw MismatchedUnicodeCharException if not
216
* @note the string cannot match EOF
218
virtual void match(const std::string& s)
220
size_t len = s.length();
222
for (size_t i = 0; i < len; i++)
224
// the & 0xFF is here to prevent sign extension later on
225
char_type la_1 = LA(1), c = (s[i] & 0xFF);
228
throw MismatchedUnicodeCharException(la_1, c, false, this);
233
/** See if input does not contain character 'c'
234
* throw MismatchedUnicodeCharException if not
236
virtual void matchNot(char_type c)
238
char_type la_1 = LA(1);
241
throw MismatchedUnicodeCharException(la_1, c, true, this);
245
/** See if input contains character in range c1-c2
246
* throw MismatchedUnicodeCharException if not
248
virtual void matchRange(char_type c1, char_type c2)
250
char_type la_1 = LA(1);
252
if ( la_1 < c1 || la_1 > c2 )
253
throw MismatchedUnicodeCharException(la_1, c1, c2, false, this);
258
/// Get the line the scanner currently is in (starts at 1)
259
virtual int getLine() const
261
return inputState->line;
264
/// set the line number
265
virtual void setLine(int l)
267
inputState->line = l;
270
/// Get the column the scanner currently is in (starts at 1)
271
virtual int getColumn() const
273
return inputState->column;
275
/// set the column number
276
virtual void setColumn(int c)
278
inputState->column = c;
281
/// get the filename for the file currently used
282
virtual const std::string& getFilename() const
284
return inputState->filename;
286
/// Set the filename the scanner is using (used in error messages)
287
virtual void setFilename(const std::string& f)
289
inputState->filename = f;
292
virtual bool getCommitToPath() const
297
virtual void setCommitToPath(bool commit)
299
commitToPath = commit;
302
/** return a const reference to the current text buffer */
303
virtual const std::string& getText() const
308
virtual void setText(const std::string& s)
313
virtual void resetText()
316
inputState->tokenStartColumn = inputState->column;
317
inputState->tokenStartLine = inputState->line;
320
virtual antlr::RefToken getTokenObject() const
325
///{ These need different handling in unicode case
327
virtual bool getCaseSensitiveLiterals() const=0;
329
virtual bool getCaseSensitive() const
331
return caseSensitive;
334
virtual void setCaseSensitive(bool t)
339
/** Override this method to get more specific case handling
340
* @note some platforms probably require setting the right locale for
341
* correct functioning.
343
virtual char_type toLower(char_type c) const
345
return std::tolower(c);
348
/** Used to keep track of line breaks, needs to be called from
349
* within generated lexers when a \n \r is encountered.
351
virtual void newline()
354
inputState->column = 1;
357
/** Advance the current column number by an appropriate amount according
358
* to the tabsize. This method needs to be explicitly called from the
359
* lexer rules encountering tabs.
364
int nc = ( ((c-1)/tabsize) + 1) * tabsize + 1; // calculate tab stop
367
/// set the tabsize. Returns the old tabsize
368
int setTabsize( int size )
370
int oldsize = tabsize;
374
/// Return the tabsize used by the scanner
375
int getTabSize() const
381
/** Report exception errors caught in nextToken() */
382
virtual void reportError(const antlr::RecognitionException& ex)
384
std::cerr << ex.toString().c_str() << std::endl;
387
/** Lexer error-reporting function; can be overridden in subclass */
388
virtual void reportError(const std::string& s)
390
if (getFilename() == "")
391
std::cerr << "error: " << s.c_str() << std::endl;
393
std::cerr << getFilename().c_str() << ": error: " << s.c_str() << std::endl;
396
/** Lexer warning-reporting function; can be overridden in subclass */
397
virtual void reportWarning(const std::string& s)
399
if (getFilename() == "")
400
std::cerr << "warning: " << s.c_str() << std::endl;
402
std::cerr << getFilename().c_str() << ": warning: " << s.c_str() << std::endl;
405
virtual antlr::InputBuffer& getInputBuffer()
407
return inputState->getInput();
410
virtual antlr::LexerSharedInputState getInputState()
415
/** set the input state for the lexer.
416
* @note state is a reference counted object, hence no reference */
417
virtual void setInputState(antlr::LexerSharedInputState state)
422
/// Set the factory for created tokens
423
virtual void setTokenObjectFactory(factory_type factory)
425
tokenFactory = factory;
428
/** Test the token text against the literals table
429
* Override this method to perform a different literals test
431
virtual int testLiteralsTable(int ttype) const
433
string_map::const_iterator i = literals.find(text);
434
if (i != literals.end())
439
/** Test the text passed in against the literals table
440
* Override this method to perform a different literals test
441
* This is used primarily when you want to test a portion of
444
virtual int testLiteralsTable(const std::string& text, int ttype) const
446
string_map::const_iterator i = literals.find(text);
447
if (i != literals.end())
452
/** This method is called by YourLexer::nextToken() when the lexer has
453
* hit EOF condition. EOF is NOT a character.
454
* This method is not called if EOF is reached during
455
* syntactic predicate evaluation or during evaluation
456
* of normal lexical rules, which presumably would be
457
* an IOException. This traps the "normal" EOF condition.
459
* uponEOF() is called after the complete evaluation of
460
* the previous token and only if your parser asks
461
* for another token beyond that last non-EOF token.
463
* You might want to throw token or char stream exceptions
464
* like: "Heh, premature eof" or a retry stream exception
465
* ("I found the end of this file, go back to referencing file").
467
virtual void uponEOF()
471
/// Methods used to change tracing behavior
474
for( int i = 0; i < traceDepth; i++ )
478
void traceIn(const char* rname)
482
std::cout << "> lexer " << rname
483
<< "; c==" << LA(1) << std::endl;
486
void traceOut(const char* rname)
489
std::cout << "< lexer " << rname
490
<< "; c==" << LA(1) << std::endl;
494
static const int EOF_CHAR = EOF;
496
std::string text; ///< Text of current token
497
/// flag indicating whether consume saves characters
498
bool saveConsumedInput;
499
factory_type tokenFactory; ///< Factory for tokens
500
bool caseSensitive; ///< Is this lexer case sensitive
501
string_map literals; // set by subclass
503
antlr::RefToken _returnToken; ///< used to return tokens w/o using return val
505
/// Input state, gives access to input stream, shared among different lexers
506
antlr::LexerSharedInputState inputState;
508
/** Used during filter mode to indicate that path is desired.
509
* A subsequent scan error will report an error as usual
510
* if acceptPath=true;
514
unsigned int tabsize; ///< tab size the scanner uses.
516
/// Create a new RefToken of type t
517
virtual antlr::RefToken makeToken(int t)
519
antlr::RefToken tok = tokenFactory();
520
// actually at this point you want to convert the stored lexeme text
521
// into the format you want to have it in in the backend...
523
tok->setColumn(inputState->tokenStartColumn);
524
tok->setLine(inputState->tokenStartLine);
528
/** Tracer class, used when -traceLexer is passed to antlr
532
UnicodeCharScanner* parser;
535
Tracer(const Tracer& other); // undefined
536
Tracer& operator=(const Tracer& other); // undefined
538
Tracer( UnicodeCharScanner* p, const char* t )
541
parser->traceIn(text);
545
parser->traceOut(text);
551
UnicodeCharScanner( const UnicodeCharScanner& other ); // undefined
552
UnicodeCharScanner& operator=( const UnicodeCharScanner& other ); // undefined
555
#endif //INC_UnicodeCharScanner_hpp__