~mirabilos/jupp/trunk

1 by tg
Initial revision
1
/*
2
 *	Syntax highlighting DFA interpreter
3
 *	Copyright
4
 *		(C) 2004 Joseph H. Allen
5
 *
6
 *	This file is part of JOE (Joe's Own Editor)
7
 */
8
9
#include "config.h"
481 by tg
more includes and warning cleanup madness
10
#include "types.h"
11
684 by tg
major overhaul of charmap handling + quelling of lots of warnings
12
__RCSID("$MirOS: contrib/code/jupp/syntax.c,v 1.26 2020/03/27 06:08:16 tg Exp $");
481 by tg
more includes and warning cleanup madness
13
1 by tg
Initial revision
14
#include <stdlib.h>
15
#include <string.h>
2 by tg
Update the non-generated files to joe-3.1jupp2
16
#include "b.h"
1 by tg
Initial revision
17
#include "scrn.h"
18
#include "utils.h"
19
#include "hash.h"
370 by tg
first cut at win32 self-relocation
20
#include "path.h"
1 by tg
Initial revision
21
#include "charmap.h"
22
#include "syntax.h"
23
356 by tg
UTF-8 syntax highlighting
24
/*
 * Octet decoder state shared by utfoctet() and octetutf().
 * parse() zeroes the whole structure before it reads a line.
 */
static struct {
	/* pending octets; 6-byte UTF-8 plus trailing NUL makes 7 octets */
	unsigned char buf[7];
	/* index of the next octet in buf to hand out (or to fill) */
	unsigned char start;
	/* end of the octets expected/held in buf; 0 means nothing pending */
	unsigned char limit;
	unsigned eaten : 1,	/* buf is complete and is being handed out */
	    ebbed : 1,		/* input ran dry in the middle of a sequence */
	    unget : 1,		/* buf[limit] keeps an octet to be re-read */
	    first : 1;		/* octet just returned starts a new character */
} utfstate;
33
34
static int
35
utfoctet(P *p)
36
{
37
	int c;
38
39
	utfstate.first = 0;
40
	if (utfstate.eaten) {
41
 ate:
42
		if (utfstate.start < utfstate.limit)
43
			return (utfstate.buf[utfstate.start++]);
44
		if (utfstate.ebbed)
45
			return (NO_MORE_DATA);
46
		utfstate.eaten = utfstate.limit = 0;
47
	}
48
	if (!utfstate.limit) {
49
		utfstate.first = 1;
50
		if (utfstate.unget) {
51
			c = utfstate.buf[utfstate.start];
52
			utfstate.unget = 0;
53
		} else
54
			c = pgetb(p);
55
		if ((c == NO_MORE_DATA) || (c < 0x80))
56
			return (c);
57
		if ((c < 0xC2) || (c >= 0xFE))
58
			return (0xFF);
59
		utfstate.start = 0;
60
		utfstate.buf[utfstate.start++] = (unsigned char)c;
61
		utfstate.limit = (c < 0xE0) ? 2 : (c < 0xF0) ? 3 :
62
		    (c < 0xF8) ? 4 : (c < 0xFC) ? 5 : 6;
63
	}
64
	while (utfstate.start < utfstate.limit) {
65
		if (((c = pgetb(p)) == NO_MORE_DATA) || ((c ^ 0x80) > 0x3F)) {
66
			/* invalid follow byte, invalidate all previous ones */
67
			utfstate.limit = 0;
68
			while (utfstate.limit < utfstate.start)
69
				utfstate.buf[utfstate.limit++] = 0xFF;
70
			/* append this as ungetch unless the well is dry */
71
			if (c == NO_MORE_DATA)
72
				utfstate.ebbed = 1;
73
			else {
74
				utfstate.buf[utfstate.limit] = (unsigned char)c;
75
				utfstate.unget = 1;
76
			}
77
			/* now return those bytes */
78
			break;
79
		}
80
		utfstate.buf[utfstate.start++] = (unsigned char)c;
81
	}
82
	utfstate.start = 0;
83
	utfstate.eaten = 1;
84
	goto ate;
85
}
86
87
static int
88
octetutf(P *p)
89
{
90
	int c;
91
397 by tg
unbreak syntax highlighting in nōn-UTF8 locales
92
	utfstate.first = 0;
356 by tg
UTF-8 syntax highlighting
93
	if (!(utfstate.start < utfstate.limit)) {
94
		if ((c = pgetb(p)) == NO_MORE_DATA)
95
			return (NO_MORE_DATA);
96
97
		utfstate.limit = utf8_encode(utfstate.buf,
684 by tg
major overhaul of charmap handling + quelling of lots of warnings
98
		    joe_to_uni(p->b->o.charmap, c));
356 by tg
UTF-8 syntax highlighting
99
		utfstate.start = 0;
397 by tg
unbreak syntax highlighting in nōn-UTF8 locales
100
		utfstate.first = 1;
356 by tg
UTF-8 syntax highlighting
101
	}
102
	return (utfstate.buf[utfstate.start++]);
103
}
104
1 by tg
Initial revision
105
/* Parse one line.  Returns new state.
106
   'syntax' is the loaded syntax definition for this buffer.
107
   'line' is advanced to start of next line.
108
   Global array 'attr_buf' ends up with coloring for each character of line.
109
   'state' is initial parser state for the line (0 is initial state).
110
*/
111
112
/* Attribute (colour) buffer filled by parse(), one entry per character */
int *attr_buf = NULL;
/* Number of entries attr_buf has room for */
int attr_size = 0;
114
356 by tg
UTF-8 syntax highlighting
115
/*
 * Run the highlighting DFA over one line.
 *
 * syntax: loaded syntax definition
 * line:   buffer pointer; octets are consumed up to and including the
 *         next '\n' (or until the input is exhausted)
 * state:  number of the state the line starts in
 *
 * The colour of every character is stored into the global attr_buf, which
 * is grown as needed.  If it cannot be grown, the remainder of the line is
 * still consumed but left uncoloured, and the state reached so far is
 * returned.
 *
 * Returns the number of the state at the end of the line.
 */
int parse(struct high_syntax *syntax, P *line, int state)
{
	struct high_state *h = syntax->states[state];	/* Current state */
	unsigned char buf[20];	/* Name buffer (trunc after 19 characters) */
	int buf_idx = 0;	/* Index into buffer */
	int buf_len = 0;	/* counts only starting characters */
	int buf_en = 0;		/* Set for name buffering */
	int *attr_end = attr_buf + attr_size;
	int *attr = attr_buf;
	int c;			/* Current character */
	int ofst = 0;	/* record length after we've stopped buffering */
	int (*getoctet)(P *) = joe_maputf(line->b->o.charmap) ?
	    utfoctet : octetutf;

	memset(&utfstate, 0, sizeof(utfstate));
	buf[0] = 0;

	/* Get next character */
	while ((c = getoctet(line)) != NO_MORE_DATA) {
		struct high_cmd *cmd, *kw_cmd;
		int x;

		/* Expand attribute array if necessary */
		if (attr == attr_end) {
			int used = attr_size > 0 ? attr_size : 0;
			int new_size = 0;
			int *grown = NULL;

			/* double the size, guarding against int overflow */
			if (used == 0)
				new_size = 1024;
			else if (used <= INT_MAX / 2)
				new_size = used * 2;
			if (new_size > 0 &&
			    (size_t)new_size <= (size_t)-1 / sizeof(int))
				grown = realloc(attr_buf,
				    sizeof(int) * (size_t)new_size);
			if (!grown) {
				/*
				 * Cannot grow; attr_buf is still valid.  Skip
				 * the rest of the line so that 'line' ends up
				 * at the start of the next one, then give up.
				 */
				while (c != '\n' &&
				    (c = getoctet(line)) != NO_MORE_DATA)
					;
				break;
			}
			attr_buf = grown;
			attr = attr_buf + used;
			attr_size = new_size;
			attr_end = attr_buf + attr_size;
		}

		/* Advance to next attribute position (note attr[-1] below) */
		if (utfstate.first)
			attr++;

		/* Loop while noeat */
		do {
			/* Color with current state */
			attr[-1] = h->color;
			/* Get command for this character */
			cmd = h->cmd[c];

			/* Keyword lookup on the buffered name, if any */
			kw_cmd = NULL;
			if (cmd->keywords) {
				if (cmd->ignore)
					kw_cmd = htfind(cmd->keywords,
					    joe_strtolower(buf));
				else
					kw_cmd = htfind(cmd->keywords, buf);
			}

			/* Determine new state */
			if (kw_cmd) {
				cmd = kw_cmd;
				h = cmd->new_state;
				/* Recolor keyword */
				for (x = -(buf_len + 1); x < -1; ++x)
					attr[x - ofst] = h->color;
			} else {
				h = cmd->new_state;
			}

			/*
			 * Recolor if necessary; clamp the start index so we
			 * never reach in front of attr_buf (the clamp is done
			 * on the index, not by forming an invalid pointer).
			 */
			x = cmd->recolor;
			if (x < (int)(attr_buf - attr))
				x = (int)(attr_buf - attr);
			while (x < 0)
				attr[x++] = h->color;

			/* Start buffering? */
			if (cmd->start_buffering) {
				buf_idx = 0;
				buf_len = 0;
				buf_en = 1;
				ofst = 0;
			}

			/* Stop buffering? */
			if (cmd->stop_buffering)
				buf_en = 0;
		} while (cmd->noeat);

		/* Save character in buffer */
		if (!buf_en)
			ofst += utfstate.first;
		else if (buf_idx < 19) {
			buf[buf_idx++] = c;
			buf[buf_idx] = 0;
			buf_len += utfstate.first;
		}

		if (c == '\n')
			break;
	}
	/* Return new state number */
	return h->no;
}
204
205
/* Subroutines for load_dfa() */
206
356 by tg
UTF-8 syntax highlighting
207
static struct high_state *find_state(struct high_syntax *syntax, const unsigned char *name)
1 by tg
Initial revision
208
{
209
	int x;
210
	struct high_state *state;
211
212
	/* Find state */
213
	for(x=0;x!=syntax->nstates;++x)
214
		if(!strcmp(syntax->states[x]->name,name))
215
			break;
216
217
	/* It doesn't exist, so create it */
218
	if(x==syntax->nstates) {
219
		int y;
539 by tg
don’t box malloc/calloc/realloc/free; don’t cast malloc result; order calloc args
220
		state = malloc(sizeof(struct high_state));
356 by tg
UTF-8 syntax highlighting
221
		state->name=(const unsigned char *)strdup((const char *)name);
1 by tg
Initial revision
222
		state->no=syntax->nstates;
223
		state->color=FG_WHITE;
224
		if(!syntax->nstates)
225
			/* We're the first state */
226
			syntax->default_cmd.new_state = state;
227
		if(syntax->nstates==syntax->szstates)
539 by tg
don’t box malloc/calloc/realloc/free; don’t cast malloc result; order calloc args
228
			syntax->states = realloc(syntax->states,
229
			   sizeof(struct high_state *) * (syntax->szstates *= 2));
1 by tg
Initial revision
230
		syntax->states[syntax->nstates++]=state;
231
		for(y=0; y!=256; ++y)
232
			state->cmd[y] = &syntax->default_cmd;
233
	} else
234
		state = syntax->states[x];
235
	return state;
236
}
237
238
/* Load syntax file */
239
240
/* Head of the linked list (via ->next) of all syntaxes load_dfa() has loaded */
struct high_syntax *syntax_list;
241
356 by tg
UTF-8 syntax highlighting
242
struct high_syntax *load_dfa(const unsigned char *name)
1 by tg
Initial revision
243
{
244
	unsigned char buf[1024];
245
	unsigned char bf[256];
246
	unsigned char bf1[256];
247
	int clist[256];
248
	unsigned char *p;
249
	int c;
2 by tg
Update the non-generated files to joe-3.1jupp2
250
	FILE *f = NULL;
1 by tg
Initial revision
251
	struct high_state *state=0;	/* Current state */
252
	struct high_syntax *syntax;	/* New syntax table */
253
	int line = 0;
481 by tg
more includes and warning cleanup madness
254
	void *np;
1 by tg
Initial revision
255
256
	if (!name)
257
		return NULL;
258
594 by tg
new ralloc(x,y) = malloc(x*y) with checks, but not, unlike calloc,
259
	if (!attr_buf)
260
		attr_buf = ralloc((size_t)(attr_size = 1024), sizeof(int));
1 by tg
Initial revision
261
262
	/* Find syntax table */
263
264
	/* Already loaded? */
265
	for(syntax=syntax_list;syntax;syntax=syntax->next)
266
		if(!strcmp(syntax->name,name))
267
			return syntax;
268
269
	/* Load it */
270
	p = (unsigned char *)getenv("HOME");
271
	if (p) {
412 by tg
use ~/.jupp/ (for now) not ~/.joe/ to not conflict with joe 4.x
272
		joe_snprintf_2((char *)buf,sizeof(buf),"%s/.jupp/syntax/%s.jsf",p,name);
1 by tg
Initial revision
273
		f = fopen((char *)buf,"r");
274
	}
275
370 by tg
first cut at win32 self-relocation
276
	if (!f && has_JOERC) {
277
		joe_snprintf_2((char *)buf,sizeof(buf),"%ssyntax/%s.jsf",get_JOERC,name);
1 by tg
Initial revision
278
		f = fopen((char *)buf,"r");
279
	}
280
	if(!f)
281
		return 0;
282
283
	/* Create new one */
393 by tg
few more malloc → calloc
284
	syntax = calloc(1, sizeof(struct high_syntax));
356 by tg
UTF-8 syntax highlighting
285
	syntax->name = (const unsigned char *)strdup((const char *)name);
1 by tg
Initial revision
286
	syntax->next = syntax_list;
287
	syntax_list = syntax;
594 by tg
new ralloc(x,y) = malloc(x*y) with checks, but not, unlike calloc,
288
	syntax->states = ralloc((size_t)(syntax->szstates = 64),
289
	    sizeof(struct high_state *));
203 by tg
use 120 sync lines by default; should fix natureshadow’s problem
290
	syntax->sync_lines = 120;
1 by tg
Initial revision
291
251 by tg
some clang-3.2/scan-build work; also checked against gcc-4.7, gcc-4.8 and
292
	memset(clist, 0, sizeof(clist));
293
1 by tg
Initial revision
294
	/* Parse file */
295
	while(fgets((char *)buf,1023,f)) {
296
		++line;
297
		p = buf;
251 by tg
some clang-3.2/scan-build work; also checked against gcc-4.7, gcc-4.8 and
298
		parse_ws(&p,'#');
1 by tg
Initial revision
299
		if(!parse_char(&p, ':')) {
300
			if(!parse_ident(&p, bf, 255)) {
301
302
				state = find_state(syntax,bf);
303
304
				parse_ws(&p,'#');
305
				if(!parse_ident(&p,bf,255)) {
306
					struct high_color *color;
307
					for(color=syntax->color;color;color=color->next)
308
						if(!strcmp(color->name,bf))
309
							break;
310
					if(color)
311
						state->color=color->color;
312
					else {
313
						state->color=0;
156 by tg
Better error messages when parsing syntax and rc files
314
						fprintf(stderr,"%s:%d: Unknown class '%s'\n", name, line, bf);
1 by tg
Initial revision
315
					}
316
				} else
156 by tg
Better error messages when parsing syntax and rc files
317
					fprintf(stderr,"%s:%d: Missing color for state definition\n", name, line);
1 by tg
Initial revision
318
			} else
156 by tg
Better error messages when parsing syntax and rc files
319
				fprintf(stderr,"%s:%d: Missing state name\n", name, line);
1 by tg
Initial revision
320
		} else if(!parse_char(&p, '=')) {
321
			if(!parse_ident(&p, bf, 255)) {
322
				struct high_color *color;
323
324
				/* Find color */
325
				for(color=syntax->color;color;color=color->next)
326
					if(!strcmp(color->name,bf))
327
						break;
328
				/* If it doesn't exist, create it */
329
				if(!color) {
393 by tg
few more malloc → calloc
330
					color = calloc(1, sizeof(struct high_color));
1 by tg
Initial revision
331
					color->name = (unsigned char *)strdup((char *)bf);
332
					color->next = syntax->color;
333
					syntax->color = color;
334
				} else {
156 by tg
Better error messages when parsing syntax and rc files
335
					fprintf(stderr,"%s:%d: Class '%s' already defined\n", name, line, bf);
1 by tg
Initial revision
336
				}
337
338
				/* Parse color definition */
339
				while(parse_ws(&p,'#'), !parse_ident(&p,bf,255)) {
340
					color->color |= meta_color(bf);
341
				}
342
			}
343
		} else if(!parse_char(&p, '-')) { /* No. sync lines */
476 by tg
banish parse_int()
344
			syntax->sync_lines = (int)ustolb(p, &np,
345
			    INT_MIN, INT_MAX, USTOL_TRIM);
346
			if (!np)
1 by tg
Initial revision
347
				syntax->sync_lines = -1;
476 by tg
banish parse_int()
348
			else
349
				p = np;
1 by tg
Initial revision
350
		} else {
351
			c = parse_ws(&p,'#');
352
353
			if (!c) {
354
			} else if (c=='"' || c=='*') {
355
				if (state) {
576 by tg
merge fixes from gitlab branch; highlights:
356
					if (!parse_field(&p, UC "*")) {
1 by tg
Initial revision
357
						int z;
358
						for(z=0;z!=256;++z)
359
							clist[z] = 1;
360
					} else {
361
						c = parse_string(&p, bf, 255);
362
						if(c)
156 by tg
Better error messages when parsing syntax and rc files
363
							fprintf(stderr,"%s:%d: Bad string\n", name, line);
1 by tg
Initial revision
364
						else {
365
							int z;
366
							int first, second;
367
							unsigned char *t = bf;
368
							for(z=0;z!=256;++z)
369
								clist[z] = 0;
370
							while(!parse_range(&t, &first, &second)) {
371
								if(first>second)
372
									second = first;
373
								while(first<=second)
374
									clist[first++] = 1;
375
							}
376
						}
377
					}
378
					/* Create command */
379
					parse_ws(&p,'#');
380
					if(!parse_ident(&p,bf,255)) {
662 by tg
introduce bunch of Coverity fixes
381
						struct high_cmd *cmd = calloc(1, sizeof(struct high_cmd));
1 by tg
Initial revision
382
						int z;
662 by tg
introduce bunch of Coverity fixes
383
1 by tg
Initial revision
384
						cmd->new_state = find_state(syntax,bf);
385
386
						/* Parse options */
387
						while (parse_ws(&p,'#'), !parse_ident(&p,bf,255))
388
							if(!strcmp(bf,"buffer")) {
389
								cmd->start_buffering = 1;
390
							} else if(!strcmp(bf,"hold")) {
391
								cmd->stop_buffering = 1;
392
							} else if(!strcmp(bf,"recolor")) {
393
								parse_ws(&p,'#');
394
								if(!parse_char(&p,'=')) {
395
									parse_ws(&p,'#');
476 by tg
banish parse_int()
396
									cmd->recolor = (int)ustolb(p, &np,
397
									    INT_MIN, INT_MAX, USTOL_TRIM);
398
									if (!np)
156 by tg
Better error messages when parsing syntax and rc files
399
										fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
476 by tg
banish parse_int()
400
									else
401
										p = np;
1 by tg
Initial revision
402
								} else
156 by tg
Better error messages when parsing syntax and rc files
403
									fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
1 by tg
Initial revision
404
							} else if(!strcmp(bf,"strings") || !strcmp(bf,"istrings")) {
405
								if (bf[0]=='i')
406
									cmd->ignore = 1;
407
								while(fgets((char *)buf,1023,f)) {
408
									++line;
409
									p = buf;
251 by tg
some clang-3.2/scan-build work; also checked against gcc-4.7, gcc-4.8 and
410
									parse_ws(&p,'#');
1 by tg
Initial revision
411
									if (*p) {
576 by tg
merge fixes from gitlab branch; highlights:
412
										if (!parse_field(&p, UC "done"))
1 by tg
Initial revision
413
											break;
414
										if(!parse_string(&p,bf,255)) {
415
											parse_ws(&p,'#');
416
											if (cmd->ignore)
305 by tg
refactor, sort, etc.
417
												joe_strtolower(bf);
1 by tg
Initial revision
418
											if(!parse_ident(&p,bf1,255)) {
393 by tg
few more malloc → calloc
419
												struct high_cmd *kw_cmd = calloc(1, sizeof(struct high_cmd));
1 by tg
Initial revision
420
												kw_cmd->noeat=1;
421
												kw_cmd->new_state = find_state(syntax,bf1);
422
												if(!cmd->keywords)
423
													cmd->keywords = htmk(64);
424
												htadd(cmd->keywords,(unsigned char *)strdup((char *)bf),kw_cmd);
425
												while (parse_ws(&p,'#'), !parse_ident(&p,bf,255))
426
													if(!strcmp(bf,"buffer")) {
427
														kw_cmd->start_buffering = 1;
428
													} else if(!strcmp(bf,"hold")) {
429
														kw_cmd->stop_buffering = 1;
430
													} else if(!strcmp(bf,"recolor")) {
431
														parse_ws(&p,'#');
432
														if(!parse_char(&p,'=')) {
433
															parse_ws(&p,'#');
476 by tg
banish parse_int()
434
															kw_cmd->recolor = (int)ustolb(p, &np,
435
															    INT_MIN, INT_MAX, USTOL_TRIM);
436
															if (!np)
156 by tg
Better error messages when parsing syntax and rc files
437
																fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
476 by tg
banish parse_int()
438
															else
439
																p = np;
1 by tg
Initial revision
440
														} else
156 by tg
Better error messages when parsing syntax and rc files
441
															fprintf(stderr,"%s:%d: Missing value for option %s\n", name, line, bf);
1 by tg
Initial revision
442
													} else
156 by tg
Better error messages when parsing syntax and rc files
443
														fprintf(stderr,"%s:%d: Unknown option '%s'\n", name, line, bf);
1 by tg
Initial revision
444
											} else
156 by tg
Better error messages when parsing syntax and rc files
445
												fprintf(stderr,"%s:%d: Missing state name\n", name, line);
1 by tg
Initial revision
446
										} else
156 by tg
Better error messages when parsing syntax and rc files
447
											fprintf(stderr,"%s:%d: Missing string\n", name, line);
1 by tg
Initial revision
448
									}
449
								}
450
							} else if(!strcmp(bf,"noeat")) {
451
								cmd->noeat = 1;
157 by tg
Ignore syntax keywords from joe-3.7 diff.jsf that are not yet implemented
452
							} else if(!strcmp(bf,"mark")) {
453
								/* not implemented yet */ ;
454
							} else if(!strcmp(bf,"markend")) {
455
								/* not implemented yet */ ;
456
							} else if(!strcmp(bf,"recolormark")) {
457
								/* not implemented yet */ ;
1 by tg
Initial revision
458
							} else
156 by tg
Better error messages when parsing syntax and rc files
459
								fprintf(stderr,"%s:%d: Unknown option '%s'\n", name, line, bf);
1 by tg
Initial revision
460
461
						/* Install command */
462
						for(z=0;z!=256;++z)
463
							if(clist[z])
464
								state->cmd[z]=cmd;
465
					} else
156 by tg
Better error messages when parsing syntax and rc files
466
						fprintf(stderr,"%s:%d: Missing jump\n", name, line);
1 by tg
Initial revision
467
				} else
156 by tg
Better error messages when parsing syntax and rc files
468
					fprintf(stderr,"%s:%d: No state\n", name, line);
1 by tg
Initial revision
469
			} else
156 by tg
Better error messages when parsing syntax and rc files
470
				fprintf(stderr,"%s:%d: Unknown character\n", name, line);
1 by tg
Initial revision
471
		}
472
	}
473
474
	fclose(f);
475
476
	return syntax;
477
}