3
% Copyright 2009-2010 Taco Hoekwater <taco@@luatex.org>
5
% This file is part of LuaTeX.
7
% LuaTeX is free software; you can redistribute it and/or modify it under
8
% the terms of the GNU General Public License as published by the Free
9
% Software Foundation; either version 2 of the License, or (at your
10
% option) any later version.
12
% LuaTeX is distributed in the hope that it will be useful, but WITHOUT
13
% ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14
% FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15
% License for more details.
17
% You should have received a copy of the GNU General Public License along
18
% with LuaTeX; if not, see <http://www.gnu.org/licenses/>.
23
/* Version-control keyword strings ($Id$ and $URL$) identifying the revision of this file */
static const char _svn_version[] =
24
"$Id: stringpool.w 3587 2010-04-03 14:32:25Z taco $ "
25
"$URL: http://foundry.supelec.fr/svn/luatex/tags/beta-0.60.1/source/texk/web2c/luatexdir/tex/stringpool.w $";
27
@ Control sequence names and diagnostic messages are variable-length strings
28
of eight-bit characters. Since PASCAL did not have a well-developed string
29
mechanism, \TeX\ did all of its string processing by homegrown methods.
31
Elaborate facilities for dynamic strings are not needed, so all of the
32
necessary operations can be handled with a simple data structure.
33
The array |str_pool| contains all of the (eight-bit) bytes of all
34
of the strings, and the array |str_start| contains indices of the starting
35
points of each string. Strings are referred to by integer numbers, so that
36
string number |s| comprises the characters |str_pool[j]| for
37
|str_start_macro(s)<=j<str_start_macro(s+1)|. Additional integer variables
38
|pool_ptr| and |str_ptr| indicate the number of entries used so far
39
in |str_pool| and |str_start|, respectively; locations
40
|str_pool[pool_ptr]| and |str_start_macro(str_ptr)| are
41
ready for the next string to be allocated.
43
String numbers 0 to |biggest_char| are reserved for strings that correspond to
44
single UNICODE characters. This is in accordance with the conventions of \.{WEB}
45
which converts single-character strings into the ASCII code number of the
46
single character involved.
49
/* Global state of the string pool and of the string currently being built */
lstring *string_pool; /* the array of strings */
50
lstring *_string_pool; /* this variable lives |STRING_OFFSET| below |string_pool|
52
|_string_pool[str_ptr] == str_string(str_ptr)| */
54
str_number str_ptr = (STRING_OFFSET + 1); /* number of the current string being created */
55
str_number init_str_ptr; /* the starting value of |str_ptr| */
57
unsigned char *cur_string; /* current string buffer */
58
unsigned cur_length; /* current index in that buffer, i.e. the number of bytes collected so far */
59
unsigned cur_string_size; /* malloced size of |cur_string| */
60
unsigned pool_size; /* occupied byte count: string lengths are added and subtracted as strings come and go */
63
@ Once a sequence of characters has been appended to |cur_string|, it
64
officially becomes a string when the function |make_string| is called.
65
This function returns the identification number of the new string as its
69
/* Give |cur_string| a fresh, zero-filled buffer of 256 bytes */
void reset_cur_string(void)
72
cur_string_size = 255; /* one less than the 256 bytes allocated below */
73
cur_string = (unsigned char *) xmalloc(256);
74
memset(cur_string, 0, 256); /* the new buffer starts out as all NUL bytes */
77
@ current string enters the pool
79
/* Register the bytes collected in |cur_string| as string number |str_ptr| */
str_number make_string(void)
81
if (str_ptr == (max_strings + STRING_OFFSET)) /* no string number is left */
82
overflow("number of strings",
83
(unsigned) (max_strings - init_str_ptr + STRING_OFFSET));
85
cur_string[cur_length] = '\0'; /* now |lstring.s| is always a valid C string */
86
str_string(str_ptr) = (unsigned char *) cur_string; /* the pool entry takes the buffer itself; nothing is copied */
87
str_length(str_ptr) = cur_length; /* the length excludes the terminating NUL */
88
pool_size += cur_length;
91
printf("Made a string: %s (s=%d)\n", (char *)str_string(str_ptr), (int)str_ptr);
98
/* Report an invalid UTF-8 sequence through |tex_error|, passing the help text |hlp| */
static void utf_error(void)
101
{ "A funny symbol that I can't read has just been (re)read.",
102
"Just continue, I'll change it to 0xFFFD.",
105
deletions_allowed = false; /* cleared only for the duration of the |tex_error| call */
106
tex_error("String contains an invalid utf-8 sequence", hlp);
107
deletions_allowed = true;
111
/* Decode the UTF-8 sequence that starts at |k| into a code point.
   |val| starts out as 0xFFFD, the Unicode replacement character, and is
   overwritten by the branches below when they accept the sequence. */
unsigned str2uni(const unsigned char *k)
114
unsigned val = 0xFFFD;
115
const unsigned char *text = k;
116
if ((ch = *text++) < 0x80) { /* lead byte below 0x80: plain ASCII */
118
} else if (ch <= 0xbf) { /* error: 0x80..0xBF is a continuation byte and cannot start a sequence */
119
} else if (ch <= 0xdf) { /* two-byte sequence: 5 payload bits in the lead, 6 in the trail */
120
if (*text >= 0x80 && *text < 0xc0)
121
val = (unsigned) (((ch & 0x1f) << 6) | (*text++ & 0x3f));
122
} else if (ch <= 0xef) { /* three-byte sequence: both trailing bytes must lie in 0x80..0xBF */
123
if (*text >= 0x80 && *text < 0xc0 && text[1] >= 0x80 && text[1] < 0xc0) {
125
(((ch & 0xf) << 12) | ((text[0] & 0x3f) << 6) |
129
int w = (((ch & 0x7) << 2) | ((text[0] & 0x30) >> 4)) - 1, w2; /* top five payload bits, minus one */
130
w = (w << 6) | ((text[0] & 0xf) << 2) | ((text[1] & 0x30) >> 4); /* append the next six payload bits */
131
w2 = ((text[1] & 0xf) << 6) | (text[2] & 0x3f); /* the low ten payload bits */
132
val = (unsigned) (w * 0x400 + w2 + 0x10000); /* recombine; adding 0x10000 undoes the minus one above */
133
if (*text < 0x80 || text[1] < 0x80 || text[2] < 0x80 || /* true when any trailing byte lies outside 0x80..0xBF */
134
*text >= 0xc0 || text[1] >= 0xc0 || text[2] >= 0xc0)
142
@ This is a very basic helper
144
/* Encode |unic| as UTF-8 into a newly allocated 5-byte buffer.
   The longest encoding written below is four bytes.
   NOTE(review): presumably the caller owns and frees the returned buffer - confirm. */
unsigned char *uni2str(unsigned unic)
146
unsigned char *buf = xmalloc(5);
147
unsigned char *pt = buf; /* write cursor into |buf| */
149
*pt++ = (unsigned char) unic; /* one byte, stored as is */
150
else if (unic < 0x800) { /* two bytes: 110xxxxx 10xxxxxx */
151
*pt++ = (unsigned char) (0xc0 | (unic >> 6));
152
*pt++ = (unsigned char) (0x80 | (unic & 0x3f));
153
} else if (unic >= 0x110000) { /* beyond the Unicode range: emit the offset from 0x110000 as one raw byte */
154
*pt++ = (unsigned char) (unic - 0x110000);
155
} else if (unic < 0x10000) { /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
156
*pt++ = (unsigned char) (0xe0 | (unic >> 12));
157
*pt++ = (unsigned char) (0x80 | ((unic >> 6) & 0x3f));
158
*pt++ = (unsigned char) (0x80 | (unic & 0x3f));
161
unsigned val = unic - 0x10000; /* four-byte case works on the offset above 0x10000 */
162
u = (int) (((val & 0xf0000) >> 16) + 1); /* the plus one restores the 0x10000 subtracted above */
163
z = (int) ((val & 0x0f000) >> 12);
164
y = (int) ((val & 0x00fc0) >> 6);
165
x = (int) (val & 0x0003f);
166
*pt++ = (unsigned char) (0xf0 | (u >> 2));
167
*pt++ = (unsigned char) (0x80 | ((u & 3) << 4) | z);
168
*pt++ = (unsigned char) (0x80 | y);
169
*pt++ = (unsigned char) (0x80 | x);
176
@ |buffer_to_unichar| converts a sequence of bytes in the |buffer|
177
into a unicode character value. It does not check for overflow
178
of the |buffer|, but it is careful to check the validity of the
182
/* Continuation-byte guard: the conditional fires when A lies outside 0x80..0xBF */
#define test_sequence_byte(A) do { \
183
if (((A)<0x80) || ((A)>=0xC0)) { \
190
/* Decode one UTF-8 sequence, dispatching on the lead byte |b|.
   The branch for b >= 0xF0 checks three following bytes with
   |test_sequence_byte|, the branch for b >= 0xE0 checks two, and the
   branch for b >= 0xC0 checks one. The branch for b >= 0xF8 and the final
   fallback pass -1 to |test_sequence_byte|; -1 is below 0x80, so that
   check always fires. |k| is presumably the starting index into |buffer| -
   confirm against the caller. */
static int buffer_to_unichar(int k)
192
int a; /* a utf char */
193
int b; /* a utf nibble */
197
} else if (b >= 0xF8) {
198
/* the 5- and 6-byte UTF-8 sequences generate integers
199
that are outside of the valid UCS range, and therefore
202
test_sequence_byte(-1);
203
} else if (b >= 0xF0) {
206
test_sequence_byte(b);
207
a = (a + (b - 128)) * 64;
209
test_sequence_byte(b);
210
a = (a + (b - 128)) * 64;
212
test_sequence_byte(b);
214
} else if (b >= 0xE0) {
217
test_sequence_byte(b);
218
a = (a + (b - 128)) * 64;
220
test_sequence_byte(b);
222
} else if (b >= 0xC0) {
225
test_sequence_byte(b);
228
/* This is an encoding error */
229
test_sequence_byte(-1);
235
/* Decode the UTF-8 sequence at |t| with |str2uni| and return the result as an int */
int pool_to_unichar(unsigned char *t)
237
return (int) str2uni(t);
242
@ The following subroutine compares string |s| with another string of the
243
same length that appears in |buffer| starting at position |k|;
244
the result is |true| if and only if the strings are equal.
245
Empirical tests indicate that |str_eq_buf| is used in such a way that
246
it tends to return |true| about 80 percent of the time.
249
/* Compare string |s| with the bytes of |buffer| that start at index |k| */
boolean str_eq_buf(str_number s, int k)
250
{ /* test equality of strings */
251
int a; /* a unicode character */
252
if (s < STRING_OFFSET) { /* |s| is below the range of pool entries: decode a character from the buffer */
253
a = buffer_to_unichar(k);
257
unsigned char *j = str_string(s); /* first byte of the pool string */
258
unsigned char *l = j + str_length(s); /* one past its last byte */
260
if (*j++ != buffer[k++]) /* byte-wise comparison, advancing both sides */
268
@ Here is a similar routine, but it compares two strings in the string pool,
269
and it does not assume that they have the same length.
272
/* Compare two string numbers; either one may lie below |STRING_OFFSET| */
boolean str_eq_str(str_number s, str_number t)
273
{ /* test equality of strings */
274
int a = 0; /* a utf char */
275
unsigned char *j, *k, *l; /* running indices */
276
if (s < STRING_OFFSET) { /* |s| is below the range of pool entries */
277
if (t >= STRING_OFFSET) {
279
if (s <= 0x7F && (str_length(t) == 1) && *k == s) /* shortcut: 7-bit value against a one-byte pool string */
281
a = pool_to_unichar(k); /* otherwise decode the pool string as UTF-8 */
288
} else if (t < STRING_OFFSET) { /* mirror case: only |t| is below |STRING_OFFSET| */
290
if (t <= 0x7F && (str_length(s) == 1) && *j == t)
292
a = pool_to_unichar(j);
296
if (str_length(s) != str_length(t)) /* both are pool entries: compare the lengths first */
300
l = j + str_length(s); /* one past the last byte of |s| */
311
/* Compare pool string |r| with the |l| bytes at |s|.
   NOTE(review): |strncmp| stops at the first NUL byte, so bytes after an
   embedded NUL are never compared - confirm whether a |memcmp| is intended. */
boolean str_eq_cstr(str_number r, const char *s, size_t l)
313
if (l != (size_t) str_length(r)) /* the lengths are compared before the contents */
315
return (strncmp((const char *) (str_string(r)), s, l) == 0);
319
@ The initial values of |str_pool|, |str_start|, |pool_ptr|,
320
and |str_ptr| are computed by the \.{INITEX} program, based in part
321
on the information that \.{WEB} has output while processing \TeX.
323
The first |string_offset| strings are single-character strings matching
324
Unicode. There is no point in generating all of these. But |str_ptr| has to be
325
initialized properly, otherwise |print_char| cannot see the difference
326
between characters and strings.
329
@ initializes the string pool, but returns |false| if something goes wrong
331
boolean get_strings_started(void)
337
@ The string recycling routines. \TeX{} uses 2
338
up to 4 {\it new\/} strings when scanning a filename in an \.{\\input},
339
\.{\\openin}, or \.{\\openout} operation. These strings are normally
340
lost because the references to them are not saved after finishing the
341
operation. |search_string| searches through the string pool for the
342
given string and returns either 0 or the found string number.
345
/* Walk the pool downwards from |search - 1|, comparing each entry with |search| */
str_number search_string(str_number search)
347
str_number s; /* running index */
348
size_t len; /* length of searched string */
349
len = str_length(search);
351
return get_nullstr();
353
s = search - 1; /* start search with newest string below |s|; |search>1|! */
354
while (s >= STRING_OFFSET) {
355
/* first |string_offset| strings depend on implementation!! */
356
if (str_length(s) == len) /* cheap length test before the full comparison */
357
if (str_eq_str(s, search))
366
/* Make a pool string from the NUL-terminated C string |s|.
   A NULL or empty |s| yields the value of |get_nullstr|. */
str_number maketexstring(const char *s)
368
if (s == NULL || *s == 0)
369
return get_nullstr();
370
return maketexlstring(s, strlen(s)); /* the length is measured here and the work is delegated */
374
/* Make a pool string from the |l| bytes at |s|.
   A NULL |s| or a zero |l| yields the value of |get_nullstr|.
   NOTE(review): the |memcpy| reads |l + 1| bytes from |s|, so |s| must have
   a readable byte at index |l|, presumably its terminating NUL - confirm
   that every caller guarantees this. */
str_number maketexlstring(const char *s, size_t l)
376
if (s == NULL || l == 0)
377
return get_nullstr();
378
str_string(str_ptr) = xmalloc((unsigned) (l + 1)); /* room for the bytes plus one more */
379
memcpy(str_string(str_ptr), s, (l + 1));
380
str_length(str_ptr) = (unsigned) l; /* the recorded length excludes that extra byte */
382
return (str_ptr - 1);
385
@ append a C string to a TeX string
387
/* Copy the bytes of |s| to the end of |cur_string|.
   NOTE(review): the incoming |l| is overwritten with |strlen(s)| below, so
   the length passed by the caller appears to be unused - confirm that
   this is intended. */
void append_string(const unsigned char *s, unsigned l)
389
if (s == NULL || *s == 0)
391
l = (unsigned) strlen((const char *) s);
393
memcpy(cur_string + cur_length, s, l); /* write starting at the current end of the buffer */
399
/* Same as |makeclstring|, except that the length stored through |&l| is not returned to the caller */
char *makecstring(int s)
402
return makeclstring(s, &l);
406
/* Return string number |s| as a newly allocated char buffer */
char *makeclstring(int s, size_t * len)
408
if (s < STRING_OFFSET) { /* |s| is below the range of pool entries: encode it as UTF-8 */
409
*len = (size_t) utf8_size(s);
410
return (char *) uni2str((unsigned) s);
412
unsigned l = (unsigned) str_length(s);
413
char *cstrbuf = xmalloc(l + 1); /* one byte more than the string length */
414
memcpy(cstrbuf, str_string(s), l);
422
/* Write the string pool with |dump_int| and |dump_things|; returns |k - STRING_OFFSET| */
int dump_string_pool(void)
427
dump_int(k - STRING_OFFSET); /* written first, before the per-string data */
428
for (j = STRING_OFFSET + 1; j < k; j++) { /* the entry at |STRING_OFFSET| itself is not visited */
429
l = (int) str_length(j);
430
if (str_string(j) == NULL) /* entry without character data */
434
dump_things(*str_string(j), str_length(j)); /* the bytes of the string */
436
return (k - STRING_OFFSET);
440
/* Rebuild the string pool array and read the strings back with |undump_things| */
int undump_string_pool(void)
445
if (max_strings < str_ptr + strings_free) /* raise |max_strings| to at least |str_ptr + strings_free| */
446
max_strings = str_ptr + strings_free;
447
str_ptr += STRING_OFFSET; /* presumably |str_ptr| held a count relative to |STRING_OFFSET| up to here - confirm */
449
libcfree(string_pool); /* drop the old array before allocating the new one */
450
init_string_pool_array((unsigned) max_strings);
451
for (j = STRING_OFFSET + 1; j < str_ptr; j++) {
454
str_length(j) = (unsigned) x;
455
pool_size += (unsigned) x;
456
str_string(j) = xmallocarray(unsigned char, (unsigned) (x + 1)); /* one extra byte for the terminator */
457
undump_things(*str_string(j), (unsigned) x);
458
*(str_string(j) + str_length(j)) = '\0'; /* NUL-terminate the restored string */
463
init_str_ptr = str_ptr; /* remember where the restored strings end */
468
/* Allocate a zeroed array of |s| |lstring| entries and set up its first entry */
void init_string_pool_array(unsigned s)
470
string_pool = xmallocarray(lstring, s);
471
_string_pool = string_pool - STRING_OFFSET; /* NOTE(review): this pointer lies before the start of the allocation, which ISO C pointer arithmetic does not define - confirm that the target platforms tolerate it */
472
memset(string_pool, 0, s * sizeof(lstring)); /* every entry starts with all bytes zero */
473
/* seed the null string */
474
string_pool[0].s = xmalloc(1);
475
string_pool[0].s[0] = '\0'; /* a one-byte buffer holding only the terminator */
478
@ To destroy an already made string, we say |flush_str|.
480
void flush_str(str_number s)
483
printf("Flushing a string: %s (s=%d,str_ptr=%d)\n", (char *)str_string(s), (int)s, (int)str_ptr);
485
if (s > STRING_OFFSET) { /* don't ever delete the null string */
486
pool_size -= (unsigned) str_length(s);
488
xfree(str_string(s));
490
while (str_string((str_ptr - 1)) == NULL)