2
* Javascript normalizer.
4
* Copyright (C) 2008 Sourcefire, Inc.
8
* This program is free software; you can redistribute it and/or modify
9
* it under the terms of the GNU General Public License version 2 as
10
* published by the Free Software Foundation.
12
* This program is distributed in the hope that it will be useful,
13
* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15
* GNU General Public License for more details.
17
* You should have received a copy of the GNU General Public License
18
* along with this program; if not, write to the Free Software
19
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
23
#include "clamav-config.h"
26
/* assert() only enabled with ./configure --enable-debug */
43
#include "lexglobal.h"
48
#include "jsparse/generated/operators.h"
49
#include "jsparse/generated/keywords.h"
50
#include "jsparse/textbuf.h"
52
/* ----------- tokenizer ---------------- */
53
enum tokenizer_state {
65
typedef struct scanner {
66
struct text_buffer buf;
73
enum tokenizer_state state;
74
enum tokenizer_state last_state;
77
typedef int YY_BUFFER_STATE;
79
static int yylex( YYSTYPE *lvalp, yyscan_t );
80
static void yy_delete_buffer( YY_BUFFER_STATE, yyscan_t);
81
static YY_BUFFER_STATE yy_scan_bytes( const char *, size_t, yyscan_t scanner );
82
static const char *yyget_text ( yyscan_t scanner );
83
static int yyget_leng ( yyscan_t scanner );
84
static int yylex_init ( yyscan_t * ptr_yy_globals ) ;
85
static void yyset_debug (int debug_flag ,yyscan_t yyscanner );
86
static int yylex_destroy ( yyscan_t yyscanner ) ;
87
/* ----------- tokenizer end ---------------- */
99
struct hashtable id_map;
100
struct scope *parent;/* hierarchy */
101
struct scope *nxt;/* all scopes kept in a list so we can easily free all of them */
102
enum fsm_state fsm_state;
104
unsigned int brackets;
114
/* state for the current JS file being parsed */
115
struct parser_state {
116
unsigned long var_uniq;
117
unsigned long syntax_errors;
119
struct scope *global;
120
struct scope *current;
123
struct tokens tokens;
126
static struct scope* scope_new(struct parser_state *state)
128
struct scope *parent = state->current;
129
struct scope *s = cli_calloc(1, sizeof(*s));
132
if(hashtab_init(&s->id_map, 10) < 0) {
138
s->nxt = state->list;
144
static struct scope* scope_done(struct scope *s)
146
struct scope* parent = s->parent;
147
/* TODO: have a hashtab_destroy */
148
hashtab_clear(&s->id_map);
149
free(s->id_map.htable);
155
* Base --(VAR)--> InsideVar
156
* InsideVar --(Identifier)-->InsideInitializer
157
* InsideVar --(anything_else) --> POP (to Base)
158
* InsideInitializer --(COMMA)--> POP (to InsideVar)
159
* InsideInitializer | InsideVar --(SEMICOLON) --> POP (to Base)
160
* InsideInitializer --(BRACKET_OPEN) --> WaitBrClose
161
* InsideInitializer --(PAR_OPEN) --> WaitParClose
162
* WaitBrClose --(BRACKET_OPEN) --> increase depth
163
* WaitBrClose --(BRACKET_CLOSE) --> POP
164
* WaitParClose --(PAR_CLOSE) --> POP
165
* WaitParClose --(PAR_OPEN) --> increase depth
168
/* Base --(VAR)--> PUSH, to InsideVar
169
* InsideVar --(Identifier)--> InsideInitializer
170
* InsideVar --(ELSE)--> POP, inc. syntax_errors
171
* InsideInitializer --(COMMA)--> POP (to InsideVar)
172
* --(BRACKET_OPEN)--> inc bracket_counter
173
* --(PAR_OPEN)--> inc par_counter
174
* --(BRACKET_CLOSE) --> dec bracket_counter
175
* --(PAR_CLOSE)--> dec par_counter
176
* --(VAR)--> PUSH, to InsideVar (if bracket_counter != 0 || par_counter != 0)
177
* --> POP, to InsideVar, inc. syntax_errors (if bracket_counter == 0 && par_counter == 0)
178
* POP only allowed if bracket_counter == 0 && par_counter == 0
180
* InsideInitializer acts differently, make it only a flag
181
* ....................
183
* Pushing and popping are done when entering / exiting function scopes,
184
* tracking { and function ( is done by the function scope tracker too.
186
* we only need to track brackets.
194
* ^we must not normalize member method names
198
* Variables are declared at function scope, and their initial value is
199
* undefined. At the point where the initializer is, and from there on the value
202
* { doesn't introduce a new variable scope, they are in function's scope too
205
* alert(x); -> x exists, undefined
207
* alert(x); -> x exists, =5
213
* alert(x);//error, x not declared
219
* but we can declare variables without var, only valid if we use them after
222
* function foobar() {
224
* alert(x);//x is defined, value is 5
229
* alert(x); -> x exists, undefined
231
* var x=5; -> x equals to 5
233
* alert(x); -> x is 5
237
* var x=4; -> x exists, equals to 4
238
* alert(x); -> x exists, equals to 4
240
* var x=5; -> x equals to 5
242
* alert(x); -> x is 5
258
static const char* scope_declare(struct scope *s, const char *token, const size_t len, struct parser_state *state)
260
const struct element *el = hashtab_insert(&s->id_map, token, len, state->var_uniq++);
261
/* hashtab_insert either finds an already existing entry, or allocates a
262
* new one, we return the allocated string */
263
return el ? el->key : NULL;
266
static const char* scope_use(struct scope *s, const char *token, const size_t len)
268
const struct element *el = hashtab_find(&s->id_map, token, len);
270
/* identifier already found in current scope,
271
* return here to avoid overwriting uniq id */
274
/* identifier not yet in current scope's hashtab, add with ID -1.
275
* Later if we find a declaration it will automatically assign a uniq ID
276
* to it. If not, we'll know that we have to push ID == -1 tokens to an
278
el = hashtab_insert(&s->id_map, token, len, -1);
279
return el ? el->key : NULL;
282
static long scope_lookup(struct scope *s, const char *token, const size_t len)
285
const struct element *el = hashtab_find(&s->id_map, token, len);
286
if(el && el->data != -1) {
289
/* not found in current scope, try in outer scope */
295
static int tokens_ensure_capacity(struct tokens *tokens, size_t cap)
297
if(tokens->capacity < cap) {
299
tokens->data = cli_realloc(tokens->data, cap * sizeof(*tokens->data));
302
tokens->capacity = cap;
307
static int add_token(struct parser_state *state, const yystype *token)
309
if(tokens_ensure_capacity(&state->tokens, state->tokens.cnt + 1) < 0)
311
state->tokens.data[state->tokens.cnt++] = *token;
321
static inline int buf_outc(char c, struct buf *buf)
323
if(buf->pos >= sizeof(buf->buf)) {
324
if(write(buf->outfd, buf->buf, sizeof(buf->buf)) != sizeof(buf->buf))
328
buf->buf[buf->pos++] = c;
332
static inline int buf_outs(const char *s, struct buf *buf)
334
const size_t buf_len = sizeof(buf->buf);
339
while(i < buf_len && *s) {
343
buf->buf[i++] = tolower((unsigned char)(*s));
347
if(write(buf->outfd, buf->buf, buf_len) < 0)
356
static inline void output_space(char last, char current, struct buf *out)
358
if(isalnum(last) && isalnum(current))
363
/* return class of last character */
364
static char output_token(const yystype *token, struct scope *scope, struct buf *out, char lastchar)
367
const char *s = TOKEN_GET(token, cstring);
368
/* TODO: use a local buffer, instead of FILE* */
369
switch(token->type) {
370
case TOK_StringLiteral:
371
output_space(lastchar,'"', out);
379
output_space(lastchar,'0', out);
380
snprintf(sbuf, sizeof(sbuf), "%ld", TOKEN_GET(token, ival));
383
case TOK_NumericFloat:
384
output_space(lastchar,'0', out);
385
snprintf(sbuf, sizeof(sbuf), "%g", TOKEN_GET(token, dval));
388
case TOK_IDENTIFIER_NAME:
389
output_space(lastchar,'a', out);
391
long id = scope_lookup(scope, s, strlen(s));
393
/* identifier not normalized */
396
snprintf(sbuf, sizeof(sbuf), "n%03ld",id);
402
output_space(lastchar,'a', out);
403
buf_outs("function",out);
407
const size_t len = strlen(s);
408
output_space(lastchar,s[0], out);
410
return len ? s[len-1] : '\0';
417
* We can't delete the scope as soon as we see a }, because
418
* we still need the hashmap from it.
420
* If we would normalize all the identifiers, and output when a scope is closed,
421
* then it would be impossible to normalize calls to other functions.
423
* So we need to keep all scopes in memory, to do this instead of scope_done, we
424
* simply just set current = current->parent when a scope is closed.
425
* We keep a list of all scopes created in parser_state->list. When we have parsed
426
* everything, we output everything, and then delete all scopes.
428
* We also need to know where to switch scopes on the second pass, so for
429
* TOK_FUNCTION types we will use another pointer, that points to the scope
430
* (added to yystype's union).
432
* We lookup the identifier in the scope (using scope_lookup, it looks in parent
433
* scopes too), if ID is found then output (n%3d, Id),
434
* otherwise output the identifier as is.
436
* To make it easier to match signatures, we do a transform:
437
* 'function ID1 (..'. => 'n%3d = function (...'
441
* we'll add all identifiers to the scope's map
442
* those that are not decl. will have initial ID -1
443
* if we later see a decl for it in same scope, it'll automatically get a
446
* When parsing of local scope is done, we take any ID -1 identifiers,
447
* and push them up one level (careful not to overwrite existing IDs).
449
* it would be nice if the tokens would contain a link to the entry in the
450
* hashtab, a link that automatically gets updated when the element is moved
451
* (pushed up). This would prevent subsequent lookups in the map,
452
* when we want to output the tokens.
453
* There is no easy way to do that, so we just do another lookup
458
* This actually works, redefining foo:
460
* var foo=5; alert(foo);
462
* So we can't treat function names just as any other identifier?
463
* We can, because you can no longer call foo, if you redefined it as a var.
464
* So if we rename both foo-s with same name, it will have same behaviour.
466
* This means that a new scope should begin after function, and not after
470
static void scope_free_all(struct scope *p)
480
void cli_strtokenize(char *buffer, const char delim, const size_t token_count, const char **tokens);
481
static int match_parameters(const yystype *tokens, const char ** param_names, size_t count)
484
if(tokens[0].type != TOK_PAR_OPEN)
488
const char *token_val = TOKEN_GET(&tokens[i], cstring);
489
if(tokens[i].type != TOK_IDENTIFIER_NAME ||
491
strcmp(token_val, param_names[j++]))
494
if((count && tokens[i].type != TOK_COMMA)
495
|| (!count && tokens[i].type != TOK_PAR_CLOSE))
502
static const char *de_packer_3[] = {"p","a","c","k","e","r"};
503
static const char *de_packer_2[] = {"p","a","c","k","e","d"};
507
#define MAX(a, b) ((a)>(b) ? (a) : (b))
510
static inline char *textbuffer_done(yyscan_t scanner)
512
/* free unused memory */
513
char *str = cli_realloc(scanner->buf.data, scanner->buf.pos);
515
str = scanner->buf.data;
517
scanner->yytext = str;
518
scanner->yylen = scanner->buf.pos - 1;
519
memset(&scanner->buf, 0, sizeof(scanner->buf));
523
#define MODULE "JS-Norm: "
525
static void free_token(yystype *token)
527
if(token->vtype == vtype_string) {
528
free(token->val.string);
529
token->val.string = NULL;
533
static int replace_token_range(struct tokens *dst, size_t start, size_t end, const struct tokens *with)
535
const size_t len = with ? with->cnt : 0;
537
cli_dbgmsg(MODULE "Replacing tokens %lu - %lu with %lu tokens\n",start, end, len);
538
if(start >= dst->cnt || end > dst->cnt)
540
for(i=start;i<end;i++) {
541
free_token(&dst->data[i]);
543
if(tokens_ensure_capacity(dst, dst->cnt - (end-start) + len) < 0)
545
memmove(&dst->data[start+len], &dst->data[end], (dst->cnt - end) * sizeof(dst->data[0]));
546
if(with && len > 0) {
547
memcpy(&dst->data[start], with->data, len * sizeof(dst->data[0]));
549
dst->cnt = dst->cnt - (end-start) + len;
553
static int append_tokens(struct tokens *dst, const struct tokens *src)
557
if(tokens_ensure_capacity(dst, dst->cnt + src->cnt) == -1)
559
cli_dbgmsg(MODULE "Appending %lu tokens\n", src->cnt);
560
memcpy(&dst->data[dst->cnt], src->data, src->cnt * sizeof(dst->data[0]));
561
dst->cnt += src->cnt;
565
static void decode_de(yystype *params[], struct text_buffer *txtbuf)
567
const char *p = TOKEN_GET(params[0], cstring);
568
const long a = TOKEN_GET(params[1], ival);
569
/*const char *c = params[2];*/
570
char *k = TOKEN_GET(params[3], string);
571
/*const char *r = params[5];*/
578
memset(txtbuf, 0, sizeof(*txtbuf));
581
for(o = k; *o; o++) if(*o == '|') nsplit++;
583
tokens = malloc(sizeof(char*)*nsplit);
587
cli_strtokenize(k,'|',nsplit, tokens);
590
while(*p && !isalnum(*p)) {
591
if(*p=='\\' && (p[1] == '\'' || p[1] == '\"'))
594
textbuffer_putc(txtbuf, *p++);
599
while(*p && isalnum(*p)) {
601
unsigned char v = *p++;
602
/* TODO: use a table here */
603
if(v >= 'a') x = 10+v-'a';
604
else if(v >= 'A') x = 36+v-'A';
608
if(val >= nsplit || !tokens[val] || !tokens[val][0])
610
textbuffer_putc(txtbuf, *o++);
611
else textbuffer_append(txtbuf, tokens[val]);
614
textbuffer_append(txtbuf, "\0");
617
struct decode_result {
618
struct text_buffer txtbuf;
621
unsigned append:1; /* 0: tokens are replaced with new token(s),
622
1: old tokens are deleted, new ones appended at the end */
625
static void handle_de(yystype *tokens, size_t start, const size_t cnt, const char *name, struct decode_result *res)
627
/* find function decl. end */
628
size_t i, nesting = 1, j;
629
yystype* parameters [6];
630
const size_t parameters_cnt = 6;
632
for(i=start;i < cnt; i++) {
633
if(tokens[i].type == TOK_FUNCTION) {
634
if(TOKEN_GET(&tokens[i], scope))
644
memset(parameters, 0, sizeof(parameters));
646
/* find call to function */
647
for(;i+2 < cnt; i++) {
648
const char* token_val = TOKEN_GET(&tokens[i], cstring);
649
if(tokens[i].type == TOK_IDENTIFIER_NAME &&
651
!strcmp(name, token_val) &&
652
tokens[i+1].type == TOK_PAR_OPEN) {
655
for(j = 0;j < parameters_cnt && i < cnt;j++) {
656
parameters[j] = &tokens[i++];
657
if(j != parameters_cnt-1)
658
while (tokens[i].type != TOK_COMMA && i < cnt) i++;
660
while (tokens[i].type != TOK_PAR_CLOSE && i < cnt) i++;
663
if(j == parameters_cnt)
664
decode_de(parameters, &res->txtbuf);
668
while(i<cnt && tokens[i].type != TOK_PAR_OPEN) i++;
671
/* TODO: move this v to another func */
672
for(j = 0;j < parameters_cnt && i < cnt;j++) {
673
parameters[j] = &tokens[i++];
674
if(j != parameters_cnt-1)
675
while (tokens[i].type != TOK_COMMA && i < cnt) i++;
677
while (tokens[i].type != TOK_PAR_CLOSE && i < cnt) i++;
680
if(j == parameters_cnt)
681
decode_de(parameters, &res->txtbuf);
683
if(parameters[0] && parameters[parameters_cnt-1]) {
684
res->pos_begin = parameters[0] - tokens;
685
res->pos_end = parameters[parameters_cnt-1] - tokens + 1;
686
if(tokens[res->pos_end].type == TOK_BRACKET_OPEN &&
687
tokens[res->pos_end+1].type == TOK_BRACKET_CLOSE &&
688
tokens[res->pos_end+2].type == TOK_PAR_CLOSE)
689
res->pos_end += 3; /* {}) */
691
res->pos_end++; /* ) */
695
static int handle_unescape(struct tokens *tokens, size_t start, const size_t cnt)
697
if(tokens->data[start].type == TOK_StringLiteral) {
699
struct tokens new_tokens;
702
R = cli_unescape(TOKEN_GET(&tokens->data[start], cstring));
703
tok.type = TOK_StringLiteral;
704
TOKEN_SET(&tok, string, R);
705
new_tokens.capacity = new_tokens.cnt = 1;
706
new_tokens.data = &tok;
707
if(replace_token_range(tokens, start-2, start+2, &new_tokens) < 0)
714
/* scriptasylum dot com's JS encoder */
715
static void handle_df(const yystype *tokens, size_t start, const size_t cnt, struct decode_result *res)
718
size_t len, s1_len, i;
722
if(tokens[start].type != TOK_StringLiteral)
724
str = TOKEN_GET(&tokens[start], string);
728
clast = str[len-1] - '0';
731
s1 = cli_unescape(str);
733
for(i=0;i<s1_len;i++) {
736
R = cli_unescape(s1);
738
res->pos_begin = start-2;
739
res->pos_end = start+2;
740
res->txtbuf.data = R;
741
res->txtbuf.pos = strlen(R);
747
static void handle_eval(struct tokens *tokens, size_t start, struct decode_result *res)
749
res->txtbuf.data = TOKEN_GET(&tokens->data[start], string);
750
if(res->txtbuf.data && tokens->data[start+1].type == TOK_PAR_CLOSE) {
751
TOKEN_SET(&tokens->data[start], string, NULL);
752
res->txtbuf.pos = strlen(res->txtbuf.data);
753
res->pos_begin = start-2;
754
res->pos_end = start+2;
758
static void run_folders(struct tokens *tokens)
762
for(i = 0; i < tokens->cnt; i++) {
763
const char *cstring = TOKEN_GET(&tokens->data[i], cstring);
764
if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
766
!strcmp("unescape", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {
768
handle_unescape(tokens, i+2, tokens->cnt);
773
static inline int state_update_scope(struct parser_state *state, const yystype *token)
775
if(token->type == TOK_FUNCTION) {
776
struct scope *scope = TOKEN_GET(token, scope);
778
state->current = scope;
781
/* dummy token marking function end */
782
if(state->current->parent)
783
state->current = state->current->parent;
784
/* don't output this token, it is just a dummy marker */
791
static void run_decoders(struct parser_state *state)
795
struct tokens *tokens = &state->tokens;
797
for(i = 0; i < tokens->cnt; i++) {
798
const char *cstring = TOKEN_GET(&tokens->data[i], cstring);
799
struct decode_result res;
800
res.pos_begin = res.pos_end = 0;
802
if(tokens->data[i].type == TOK_FUNCTION && i+13 < tokens->cnt) {
805
if(tokens->data[i].type == TOK_IDENTIFIER_NAME) {
806
cstring = TOKEN_GET(&tokens->data[i], cstring);
810
if(match_parameters(&tokens->data[i], de_packer_3, sizeof(de_packer_3)/sizeof(de_packer_3[0])) != -1
811
|| match_parameters(&tokens->data[i], de_packer_2, sizeof(de_packer_2)/sizeof(de_packer_2[0])) != -1) {
812
/* find function decl. end */
813
handle_de(tokens->data, i, tokens->cnt, name, &res);
815
} else if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
817
!strcmp("dF", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {
818
/* TODO: also match signature of dF function (possibly
819
* declared using unescape */
821
handle_df(tokens->data, i+2, tokens->cnt, &res);
822
} else if(i+2 < tokens->cnt && tokens->data[i].type == TOK_IDENTIFIER_NAME &&
824
!strcmp("eval", cstring) && tokens->data[i+1].type == TOK_PAR_OPEN) {
825
handle_eval(tokens, i+2, &res);
827
if(res.pos_end > res.pos_begin) {
828
struct tokens parent_tokens;
829
if(res.pos_end < tokens->cnt && tokens->data[res.pos_end].type == TOK_SEMICOLON)
831
parent_tokens = state->tokens;/* save current tokens */
832
/* initialize embedded context */
833
memset(&state->tokens, 0, sizeof(state->tokens));
834
if(++state->rec > 16)
835
cli_dbgmsg(MODULE "recursion limit reached\n");
837
cli_js_process_buffer(state, res.txtbuf.data, res.txtbuf.pos);
840
free(res.txtbuf.data);
841
/* state->tokens still refers to the embedded/nested context
844
replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, &state->tokens);
847
replace_token_range(&parent_tokens, res.pos_begin, res.pos_end, NULL);
848
append_tokens(&parent_tokens, &state->tokens);
850
/* end of embedded context, restore tokens state */
851
free(state->tokens.data);
852
state->tokens = parent_tokens;
854
state_update_scope(state, &state->tokens.data[i]);
858
void cli_js_parse_done(struct parser_state* state)
860
struct tokens * tokens = &state->tokens;
861
size_t par_balance = 0, i;
865
cli_dbgmsg(MODULE "in cli_js_parse_done()\n");
866
/* close unfinished token */
867
switch (state->scanner->state) {
876
cli_js_process_buffer(state, &end, 1);
877
/* close remaining parentheses */
878
for (i=0;i<tokens->cnt;i++) {
879
if (tokens->data[i].type == TOK_PAR_OPEN)
881
else if (tokens->data[i].type == TOK_PAR_CLOSE && par_balance > 0)
884
if (par_balance > 0) {
885
memset(&val, 0, sizeof(val));
886
val.type = TOK_PAR_CLOSE;
887
TOKEN_SET(&val, cstring, ")");
888
while (par_balance-- > 0) {
889
add_token(state, &val);
893
/* we had to close unfinished strings, parentheses,
894
* so that the folders/decoders can run properly */
895
run_folders(&state->tokens);
898
yylex_destroy(state->scanner);
899
state->scanner = NULL;
903
void cli_js_output(struct parser_state *state, const char *tempdir)
907
char lastchar = '\0';
910
snprintf(filename, 1024, "%s/javascript", tempdir);
913
buf.outfd = open(filename, O_CREAT | O_WRONLY, 0600);
915
cli_errmsg(MODULE "cannot open output file for writing: %s\n", filename);
919
if(lseek(buf.outfd, 0, SEEK_END) != 0) {
920
/* separate multiple scripts with \n */
921
buf_outc('\n', &buf);
923
buf_outs("<script>", &buf);
924
state->current = state->global;
925
for(i = 0; i < state->tokens.cnt; i++) {
926
if(state_update_scope(state, &state->tokens.data[i]))
927
lastchar = output_token(&state->tokens.data[i], state->current, &buf, lastchar);
929
/* add /script if not already there */
930
if(buf.pos < 9 || memcmp(buf.buf + buf.pos - 9, "</script>", 9))
931
buf_outs("</script>", &buf);
932
if(write(buf.outfd, buf.buf, buf.pos) < 0) {
933
cli_dbgmsg(MODULE "I/O error\n");
936
cli_dbgmsg(MODULE "dumped/appended normalized script to: %s\n",filename);
939
void cli_js_destroy(struct parser_state *state)
944
scope_free_all(state->list);
945
for(i=0;i<state->tokens.cnt;i++) {
946
free_token(&state->tokens.data[i]);
948
free(state->tokens.data);
949
/* detect use after free */
951
yylex_destroy(state->scanner);
952
memset(state, 0x55, sizeof(*state));
954
cli_dbgmsg(MODULE "cli_js_destroy() done\n");
957
/* buffer is html-normlike "chunk", if original file is bigger than buffer,
958
* we rewind to a space, so we'll know that tokens won't be broken in half at
959
* the end of a buffer. All tokens except string-literals of course.
960
* So we can assume that after the buffer there is either a space, EOF, or a
961
* chunk of text not containing whitespace at all (for which we care only if its
963
void cli_js_process_buffer(struct parser_state *state, const char *buf, size_t n)
965
struct scope* current = state->current;
971
/* this state has either not been initialized,
972
* or cli_js_parse_done() was already called on it */
973
cli_warnmsg(MODULE "invalid state\n");
976
yyb = yy_scan_bytes(buf, n, state->scanner);
977
memset(&val, 0, sizeof(val));
978
val.vtype = vtype_undefined;
979
/* on EOF yylex will return 0 */
980
while( (yv=yylex(&val, state->scanner)) != 0)
988
current->fsm_state = InsideVar;
990
case TOK_IDENTIFIER_NAME:
991
text = yyget_text(state->scanner);
992
leng = yyget_leng(state->scanner);
993
if(current->last_token == TOK_DOT) {
994
/* this is a member name, don't normalize
996
TOKEN_SET(&val, string, cli_strdup(text));
997
val.type = TOK_UNNORM_IDENTIFIER;
999
switch(current->fsm_state) {
1000
case WaitParameterList:
1001
state->syntax_errors++;
1004
case InsideInitializer:
1005
TOKEN_SET(&val, cstring, scope_use(current, text, leng));
1008
case InsideFunctionDecl:
1009
TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
1010
current->fsm_state = InsideInitializer;
1011
current->brackets = 0;
1013
case WaitFunctionName:
1014
TOKEN_SET(&val, cstring, scope_declare(current, text, leng, state));
1015
current->fsm_state = WaitParameterList;
1021
switch(current->fsm_state) {
1022
case WaitFunctionName:
1024
case WaitParameterList:
1025
current->fsm_state = InsideFunctionDecl;
1033
switch(current->fsm_state) {
1034
case WaitFunctionName:
1035
state->syntax_errors++;
1037
case WaitParameterList:
1038
current->fsm_state = Base;
1045
case TOK_CURLY_BRACE_OPEN:
1046
switch(current->fsm_state) {
1047
case WaitFunctionName:
1049
case WaitParameterList:
1050
case InsideFunctionDecl:
1051
/* in a syntactically correct
1052
* file, we would already be in
1053
* the Base state when we see a {
1055
current->fsm_state = Base;
1058
case InsideInitializer:
1059
state->syntax_errors++;
1067
case TOK_CURLY_BRACE_CLOSE:
1068
if(current->blocks > 0)
1071
state->syntax_errors++;
1072
if(!current->blocks) {
1073
if(current->parent) {
1074
/* add dummy FUNCTION token to
1075
* mark function end */
1076
TOKEN_SET(&val, cstring, "}");
1077
add_token(state, &val);
1078
TOKEN_SET(&val, scope, NULL);
1079
val.type = TOK_FUNCTION;
1081
state->current = current = current->parent;
1084
state->syntax_errors++;
1088
case TOK_BRACKET_OPEN:
1089
current->brackets++;
1091
case TOK_BRACKET_CLOSE:
1092
if(current->brackets > 0)
1093
current->brackets--;
1095
state->syntax_errors++;
1098
if (current->fsm_state == InsideInitializer && current->brackets == 0 && current->blocks == 0) {
1099
/* initializer ended only if we
1100
* encountered a comma, and [] are
1102
* This avoids switching state on:
1103
* var x = [4,y,u];*/
1104
current->fsm_state = InsideVar;
1108
if (current->brackets == 0 && current->blocks == 0) {
1109
/* avoid switching state on unbalanced []:
1110
* var x = [test;testi]; */
1111
current->fsm_state = Base;
1115
current = scope_new(state);
1116
current->fsm_state = WaitFunctionName;
1117
TOKEN_SET(&val, scope, state->current);
1119
case TOK_StringLiteral:
1120
if(state->tokens.cnt > 0 && state->tokens.data[state->tokens.cnt-1].type == TOK_PLUS) {
1121
/* see if can fold */
1122
yystype *prev_string = &state->tokens.data[state->tokens.cnt-2];
1123
if(prev_string->type == TOK_StringLiteral) {
1124
char *str = TOKEN_GET(prev_string, string);
1125
size_t str_len = strlen(str);
1127
text = yyget_text(state->scanner);
1128
leng = yyget_leng(state->scanner);
1131
/* delete TOK_PLUS */
1132
free_token(&state->tokens.data[--state->tokens.cnt]);
1134
str = cli_realloc(str, str_len + leng + 1);
1135
strncpy(str+str_len, text, leng);
1136
str[str_len + leng] = '\0';
1137
TOKEN_SET(prev_string, string, str);
1138
free(val.val.string);
1139
memset(&val, 0, sizeof(val));
1140
val.vtype = vtype_undefined;
1146
if(val.vtype == vtype_undefined) {
1147
text = yyget_text(state->scanner);
1148
TOKEN_SET(&val, string, cli_strdup(text));
1151
add_token(state, &val);
1152
current->last_token = yv;
1153
memset(&val, 0, sizeof(val));
1154
val.vtype = vtype_undefined;
1156
yy_delete_buffer(yyb, state->scanner);
1159
struct parser_state *cli_js_init(void)
1161
struct parser_state *state = cli_calloc(1, sizeof(*state));
1164
if(!scope_new(state)) {
1168
state->global = state->current;
1170
if(yylex_init(&state->scanner)) {
1171
scope_done(state->global);
1175
yyset_debug(1, state->scanner);
1176
cli_dbgmsg(MODULE "cli_js_init() done\n");
1180
/*-------------- tokenizer ---------------------*/
1189
BracketOpen = TOK_BRACKET_OPEN,
1190
BracketClose = TOK_BRACKET_CLOSE,
1192
CurlyOpen = TOK_CURLY_BRACE_OPEN,
1193
CurlyClose = TOK_CURLY_BRACE_CLOSE,
1194
ParOpen = TOK_PAR_OPEN,
1195
ParClose = TOK_PAR_CLOSE,
1197
SemiColon = TOK_SEMICOLON,
1207
#define WS Whitespace
1208
#define BO BracketOpen
1209
#define BC BracketClose
1211
#define CO CurlyOpen
1212
#define CC CurlyClose
1216
#define SC SemiColon
1219
static const enum char_class ctype[256] = {
1220
NA, NA, NA, NA, NA, NA, NA, NA, NA, WS, WS, WS, NA, WS, NA, NA,
1221
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1222
WS, OP, DQ, NA, ID, OP, OP, SQ, PO, PC, OP, OP, CM, OP, DT, SL,
1223
DG, DG, DG, DG, DG, DG, DG, DG, DG, DG, OP, SC, OP, OP, OP, OP,
1224
NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
1225
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, BO, ID, BC, OP, ID,
1226
NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
1227
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, CO, OP, CC, OP, NA,
1228
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1229
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1230
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1231
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1232
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1233
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1234
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1235
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
1238
static const enum char_class id_ctype[256] = {
1239
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1240
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1241
NA, NA, NA, NA, ID, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1242
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA, NA,
1243
NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
1244
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, OP, NA, NA, ID,
1245
NA, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID,
1246
ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, ID, NA, NA, NA, NA, NA,
1247
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1248
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1249
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1250
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1251
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1252
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1253
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1254
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1257
#define CASE_SPECIAL_CHAR(C, S) case C: TOKEN_SET(lvalp, cstring, (S)); return cClass;
1259
#define BUF_KEEP_SIZE 32768
1261
static void textbuf_clean(struct text_buffer *buf)
1263
if(buf->capacity > BUF_KEEP_SIZE) {
1264
buf->data = cli_realloc(buf->data, BUF_KEEP_SIZE);
1265
buf->capacity = BUF_KEEP_SIZE;
1270
static inline int parseString(YYSTYPE *lvalp, yyscan_t scanner, const char q,
1271
enum tokenizer_state tostate)
1274
/* look for " terminating the string */
1275
const char *start = &scanner->in[scanner->pos], *end = start;
1277
const size_t siz = &scanner->in[scanner->insize] - end;
1278
end = memchr(end, q, siz);
1279
if(end && end > start && end[-1] == '\\') {
1285
if(end && end >= start)
1288
len = scanner->insize - scanner->pos;
1289
cli_textbuffer_append_normalize(&scanner->buf, start, len);
1291
/* skip over end quote */
1292
scanner->pos += len + 1;
1293
textbuffer_putc(&scanner->buf, '\0');
1294
TOKEN_SET(lvalp, string, textbuffer_done(scanner));
1295
scanner->state = Initial;
1296
assert(lvalp->val.string);
1297
return TOK_StringLiteral;
1299
scanner->pos += len;
1300
/* unfinished string */
1301
scanner->state = tostate;
1306
static inline int parseDQString(YYSTYPE *lvalp, yyscan_t scanner)
1308
return parseString(lvalp, scanner, '"', DoubleQString);
1311
static inline int parseSQString(YYSTYPE *lvalp, yyscan_t scanner)
1313
return parseString(lvalp, scanner, '\'', SingleQString);
1316
static inline int parseNumber(YYSTYPE *lvalp, yyscan_t scanner)
1318
const unsigned char *in = (const unsigned char*)scanner->in;
1320
while(scanner->pos < scanner->insize) {
1321
unsigned char c = in[scanner->pos++];
1323
textbuffer_putc(&scanner->buf, c);
1326
if(c =='.' && !is_float) {
1328
textbuffer_putc(&scanner->buf, '.');
1331
if((c=='e' || c=='E') && is_float) {
1332
textbuffer_putc(&scanner->buf, c);
1333
if(scanner->pos < scanner->insize) {
1334
c = in[scanner->pos++];
1335
if(c == '+' || c == '-' || isdigit(c)) {
1336
textbuffer_putc(&scanner->buf, c);
1342
textbuffer_putc(&scanner->buf, '\0');
1343
scanner->state = Initial;
1345
TOKEN_SET(lvalp, dval, atof(scanner->buf.data));
1346
return TOK_NumericFloat;
1348
TOKEN_SET(lvalp, ival, atoi(scanner->buf.data));
1349
return TOK_NumericInt;
1352
scanner->state = Number;
1356
static inline int parseId(YYSTYPE *lvalp, yyscan_t scanner)
1358
const struct keyword *kw;
1359
const unsigned char *in = (const unsigned char*)scanner->in;
1360
scanner->state = Initial;
1361
while(scanner->pos < scanner->insize) {
1362
unsigned char c = in[scanner->pos++];
1363
enum char_class cClass = id_ctype[c];
1366
textbuffer_putc(&scanner->buf, c);
1369
/* the table contains OP only for \ */
1371
if(scanner->pos < scanner->insize &&
1372
in[scanner->pos++] == 'u') {
1373
textbuffer_putc(&scanner->buf, c);
1376
if(scanner->pos == scanner->insize) {
1379
/* else fallthrough */
1381
/* character is no longer part of identifier */
1382
scanner->state = Initial;
1383
textbuffer_putc(&scanner->buf, '\0');
1385
kw = in_word_set(scanner->buf.data, scanner->buf.pos-1);
1387
/* we got a keyword */
1388
TOKEN_SET(lvalp, cstring, kw->name);
1391
/* it is not a keyword, just an identifier */
1392
TOKEN_SET(lvalp, cstring, NULL);
1393
return TOK_IDENTIFIER_NAME;
1396
scanner->state = Identifier;
1401
#define MIN(a,b) ((a)<(b) ? (a):(b))
1404
static int parseOperator(YYSTYPE *lvalp, yyscan_t scanner)
1406
size_t len = MIN(5, scanner->insize - scanner->pos);
1408
const struct operator *kw = in_op_set(&scanner->in[scanner->pos], len);
1410
TOKEN_SET(lvalp, cstring, kw->name);
1411
scanner->pos += len;
1419
TOKEN_SET(lvalp, cstring, NULL);
1423
static int yylex_init(yyscan_t *scanner)
1425
*scanner = cli_calloc(1, sizeof(**scanner));
1426
return *scanner ? 0 : -1;
1429
static int yylex_destroy(yyscan_t scanner)
1431
free(scanner->buf.data);
1436
static int yy_scan_bytes(const char *p, size_t len, yyscan_t scanner)
1439
scanner->insize = len;
1441
scanner->lastpos = -1;
1442
scanner->last_state = Dummy;
1446
static void yyset_debug (int debug_flag ,yyscan_t yyscanner )
1450
static void yy_delete_buffer( YY_BUFFER_STATE yyb, yyscan_t scanner)
1454
static const char *yyget_text(yyscan_t scanner)
1456
assert(scanner->buf.data || scanner->yytext);
1457
return scanner->yytext ? scanner->yytext : scanner->buf.data;
1460
static int yyget_leng(yyscan_t scanner)
1462
/* we have a \0 too */
1463
return scanner->yylen ? scanner->yylen : (scanner->buf.pos > 0 ? scanner->buf.pos - 1 : 0);
1466
static int yylex(YYSTYPE *lvalp, yyscan_t scanner)
1468
const size_t len = scanner->insize;
1469
const unsigned char *in = (const unsigned char*)scanner->in;
1470
unsigned char lookahead;
1471
enum char_class cClass;
1473
scanner->yytext = NULL;
1475
if(scanner->pos == scanner->lastpos) {
1476
if(scanner->last_state == scanner->state) {
1477
cli_dbgmsg(MODULE "infloop detected, skipping character\n");
1480
/* it's not necessarily an infloop if it changed
1481
* state, and it shouldn't infloop between states */
1483
scanner->lastpos = scanner->pos;
1484
scanner->last_state = scanner->state;
1485
while(scanner->pos < scanner->insize) {
1486
switch(scanner->state) {
1488
textbuf_clean(&scanner->buf);
1489
cClass = ctype[in[scanner->pos++]];
1492
/* eat whitespace */
1495
if(scanner->pos < len) {
1496
lookahead = in[scanner->pos];
1499
scanner->state = MultilineComment;
1503
scanner->state = SinglelineComment;
1509
return parseOperator(lvalp, scanner);
1512
return parseOperator(lvalp, scanner);
1514
return parseDQString(lvalp, scanner);
1516
return parseSQString(lvalp, scanner);
1519
return parseNumber(lvalp, scanner);
1522
return parseId(lvalp,scanner);
1523
CASE_SPECIAL_CHAR(BracketOpen, "[");
1524
CASE_SPECIAL_CHAR(BracketClose, "]");
1525
CASE_SPECIAL_CHAR(Comma, ",");
1526
CASE_SPECIAL_CHAR(CurlyOpen, "{");
1527
CASE_SPECIAL_CHAR(CurlyClose, "}");
1528
CASE_SPECIAL_CHAR(ParOpen, "(");
1529
CASE_SPECIAL_CHAR(ParClose, ")");
1530
CASE_SPECIAL_CHAR(Dot, ".");
1531
CASE_SPECIAL_CHAR(SemiColon, ";");
1537
return parseString(lvalp, scanner, '"', DoubleQString);
1539
return parseString(lvalp, scanner, '\'', SingleQString);
1541
return parseId(lvalp, scanner);
1542
case MultilineComment:
1543
while(scanner->pos+1 < scanner->insize) {
1544
if(in[scanner->pos] == '*' && in[scanner->pos+1] == '/') {
1545
scanner->state = Initial;
1554
return parseNumber(lvalp, scanner);
1555
case SinglelineComment:
1556
while(scanner->pos < scanner->insize) {
1557
/* htmlnorm converts \n to space, so
1558
* stop on space too */
1559
if(in[scanner->pos] == '\n' || in[scanner->pos] == ' ')
1563
scanner->state = Initial;