// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.
// http://code.google.com/p/protobuf/
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4}
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
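//
// TEST_2D works the same way with two case arrays; the test body receives
// one parameter per array.  A sketch (not from the original comment, but
// consistent with the macro definition below):
//
// int kCases[] = {1, 2, 3, 4};
// int kFactors[] = {10, 100};
// TEST_2D(MyFixture, MyTest, kCases, kFactors) {
//   EXPECT_GT(kCases_case * kFactors_case, 0);
// }
//
// This body runs once for each of the 4 * 2 = 8 combinations.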

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
        << #CASES " case #" << i << ": " << CASES[i]);                     \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
          << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
          << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream, but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // We'll return empty buffers starting with the first buffer, and every
    // 3 and 5 buffers after that.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};
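
// Note: the ZeroCopyInputStream contract allows Next() to return a
// zero-size buffer as long as repeated calls eventually yield data, so the
// empty buffers injected above are a legal way to stress the tokenizer.
// A minimal read loop over this stream (a sketch, not part of the tests):
//
//   const void* data;
//   int size;
//   TestInputStream stream("foo bar", 7, 3);
//   while (stream.Next(&data, &size)) {
//     // size is 0 on every buffer where counter_ % 3 == 0 or % 5 == 0.
//   }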

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};
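
// With the "$0:$1: $2\n" format, an error at line 0, column 2 with message
// "Invalid escape sequence in string literal." accumulates in text_ as
// "0:2: Invalid escape sequence in string literal.\n", which is exactly the
// shape of the expected strings in kErrorCases below.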

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif  // !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
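
// Note: without set_allow_f_after_float(true), a trailing 'f' or 'F' is an
// error; kErrorCases below includes "1.0f foo" and expects
// "0:3: Need space between number and identifier.\n".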

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12 },
    { Tokenizer::TYPE_END       , ""     , 0, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3 },
    { Tokenizer::TYPE_END       , ""     , 0, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4 },
    { Tokenizer::TYPE_END       , ""   , 1,  7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16 },
    { Tokenizer::TYPE_END       , ""   , 0, 19 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0 },
    { Tokenizer::TYPE_END       , ""   , 1, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34 },
    { Tokenizer::TYPE_END       , ""   , 0, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0 },
    { Tokenizer::TYPE_END       , ""   , 1, 3 },
  }},
};
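
// Note the tab case above: columns are computed with a tab width of 8, so a
// '\t' advances the column to the next multiple of 8.  That is why "bar"
// lands at column 8 and "baz" at column 16 even though their raw character
// offsets are 4 and 9.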

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif  // !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
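
// Note: in SH_COMMENT_STYLE only "#" introduces a comment, so "//" and "/*"
// lose their special meaning; that is why they come back out of the
// tokenizer above as individual "/" and "*" symbol tokens.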

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test harness for them.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(ParseInteger("zxy"),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(ParseInteger("1.2"),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(ParseInteger("08"),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(ParseInteger("0xg"),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(ParseInteger("-1"),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
  uint64 i;
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}
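
// Note: the second argument to Tokenizer::ParseInteger() is the maximum
// value allowed for the result; as the cases above show, it returns false
// whenever the parsed value would exceed that bound.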

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  string errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo\nbar' foo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n"},

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\v", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google