64
78
#define S_CTRL 4 /* control char started (^) */
65
79
#define S_OCTAL2 5 /* octal digit 2 */
66
80
#define S_OCTAL3 6 /* octal digit 3 */
67
#define S_HEX1 7 /* hex digit */
68
#define S_HEX2 8 /* hex digit 2 */
81
#define S_HEX1 7 /* http hex digit */
82
#define S_HEX2 8 /* http hex digit 2 */
83
#define S_MIME1 9 /* mime hex digit 1 */
84
#define S_MIME2 10 /* mime hex digit 2 */
85
#define S_EATCRNL 11 /* mime eating CRNL */
86
#define S_AMP 12 /* seen & */
87
#define S_NUMBER 13 /* collecting number */
88
#define S_STRING 14 /* collecting string */
70
90
/* Nonzero when c, viewed as an unsigned byte, is an octal digit '0'..'7'. */
#define isoctal(c) (!(((u_char)(c)) < '0' || ((u_char)(c)) > '7'))
71
91
/*
 * xtod - numeric value of the hexadecimal digit character c (either case).
 * The result is only meaningful when c is a hex digit; the decoder checks
 * isxdigit() before using it.  The argument is parenthesized so that
 * operator expressions expand correctly, and it is converted to
 * unsigned char before being handed to the <ctype.h> classifiers, which
 * are undefined for negative values other than EOF.
 */
#define xtod(c) (isdigit((unsigned char)(c)) ? ((c) - '0') : \
    ((tolower((unsigned char)(c)) - 'a') + 10))
92
/*
 * XTOD - numeric value of the hexadecimal digit character c, accepting
 * decimal digits and upper-case 'A'..'F' only (no case folding).
 * The argument is parenthesized so that operator expressions expand
 * correctly, and it is converted to unsigned char before isdigit(),
 * which is undefined for negative values other than EOF.
 */
#define XTOD(c) (isdigit((unsigned char)(c)) ? ((c) - '0') : (((c) - 'A') + 10))
97
static const struct nv {
101
{ "AElig", 198 }, /* capital AE diphthong (ligature) */
102
{ "Aacute", 193 }, /* capital A, acute accent */
103
{ "Acirc", 194 }, /* capital A, circumflex accent */
104
{ "Agrave", 192 }, /* capital A, grave accent */
105
{ "Aring", 197 }, /* capital A, ring */
106
{ "Atilde", 195 }, /* capital A, tilde */
107
{ "Auml", 196 }, /* capital A, dieresis or umlaut mark */
108
{ "Ccedil", 199 }, /* capital C, cedilla */
109
{ "ETH", 208 }, /* capital Eth, Icelandic */
110
{ "Eacute", 201 }, /* capital E, acute accent */
111
{ "Ecirc", 202 }, /* capital E, circumflex accent */
112
{ "Egrave", 200 }, /* capital E, grave accent */
113
{ "Euml", 203 }, /* capital E, dieresis or umlaut mark */
114
{ "Iacute", 205 }, /* capital I, acute accent */
115
{ "Icirc", 206 }, /* capital I, circumflex accent */
116
{ "Igrave", 204 }, /* capital I, grave accent */
117
{ "Iuml", 207 }, /* capital I, dieresis or umlaut mark */
118
{ "Ntilde", 209 }, /* capital N, tilde */
119
{ "Oacute", 211 }, /* capital O, acute accent */
120
{ "Ocirc", 212 }, /* capital O, circumflex accent */
121
{ "Ograve", 210 }, /* capital O, grave accent */
122
{ "Oslash", 216 }, /* capital O, slash */
123
{ "Otilde", 213 }, /* capital O, tilde */
124
{ "Ouml", 214 }, /* capital O, dieresis or umlaut mark */
125
{ "THORN", 222 }, /* capital THORN, Icelandic */
126
{ "Uacute", 218 }, /* capital U, acute accent */
127
{ "Ucirc", 219 }, /* capital U, circumflex accent */
128
{ "Ugrave", 217 }, /* capital U, grave accent */
129
{ "Uuml", 220 }, /* capital U, dieresis or umlaut mark */
130
{ "Yacute", 221 }, /* capital Y, acute accent */
131
{ "aacute", 225 }, /* small a, acute accent */
132
{ "acirc", 226 }, /* small a, circumflex accent */
133
{ "acute", 180 }, /* acute accent */
134
{ "aelig", 230 }, /* small ae diphthong (ligature) */
135
{ "agrave", 224 }, /* small a, grave accent */
136
{ "amp", 38 }, /* ampersand */
137
{ "aring", 229 }, /* small a, ring */
138
{ "atilde", 227 }, /* small a, tilde */
139
{ "auml", 228 }, /* small a, dieresis or umlaut mark */
140
{ "brvbar", 166 }, /* broken (vertical) bar */
141
{ "ccedil", 231 }, /* small c, cedilla */
142
{ "cedil", 184 }, /* cedilla */
143
{ "cent", 162 }, /* cent sign */
144
{ "copy", 169 }, /* copyright sign */
145
{ "curren", 164 }, /* general currency sign */
146
{ "deg", 176 }, /* degree sign */
147
{ "divide", 247 }, /* divide sign */
148
{ "eacute", 233 }, /* small e, acute accent */
149
{ "ecirc", 234 }, /* small e, circumflex accent */
150
{ "egrave", 232 }, /* small e, grave accent */
151
{ "eth", 240 }, /* small eth, Icelandic */
152
{ "euml", 235 }, /* small e, dieresis or umlaut mark */
153
{ "frac12", 189 }, /* fraction one-half */
154
{ "frac14", 188 }, /* fraction one-quarter */
155
{ "frac34", 190 }, /* fraction three-quarters */
156
{ "gt", 62 }, /* greater than */
157
{ "iacute", 237 }, /* small i, acute accent */
158
{ "icirc", 238 }, /* small i, circumflex accent */
159
{ "iexcl", 161 }, /* inverted exclamation mark */
160
{ "igrave", 236 }, /* small i, grave accent */
161
{ "iquest", 191 }, /* inverted question mark */
162
{ "iuml", 239 }, /* small i, dieresis or umlaut mark */
163
{ "laquo", 171 }, /* angle quotation mark, left */
164
{ "lt", 60 }, /* less than */
165
{ "macr", 175 }, /* macron */
166
{ "micro", 181 }, /* micro sign */
167
{ "middot", 183 }, /* middle dot */
168
{ "nbsp", 160 }, /* no-break space */
169
{ "not", 172 }, /* not sign */
170
{ "ntilde", 241 }, /* small n, tilde */
171
{ "oacute", 243 }, /* small o, acute accent */
172
{ "ocirc", 244 }, /* small o, circumflex accent */
173
{ "ograve", 242 }, /* small o, grave accent */
174
{ "ordf", 170 }, /* ordinal indicator, feminine */
175
{ "ordm", 186 }, /* ordinal indicator, masculine */
176
{ "oslash", 248 }, /* small o, slash */
177
{ "otilde", 245 }, /* small o, tilde */
178
{ "ouml", 246 }, /* small o, dieresis or umlaut mark */
179
{ "para", 182 }, /* pilcrow (paragraph sign) */
180
{ "plusmn", 177 }, /* plus-or-minus sign */
181
{ "pound", 163 }, /* pound sterling sign */
182
{ "quot", 34 }, /* double quote */
183
{ "raquo", 187 }, /* angle quotation mark, right */
184
{ "reg", 174 }, /* registered sign */
185
{ "sect", 167 }, /* section sign */
186
{ "shy", 173 }, /* soft hyphen */
187
{ "sup1", 185 }, /* superscript one */
188
{ "sup2", 178 }, /* superscript two */
189
{ "sup3", 179 }, /* superscript three */
190
{ "szlig", 223 }, /* small sharp s, German (sz ligature) */
191
{ "thorn", 254 }, /* small thorn, Icelandic */
192
{ "times", 215 }, /* multiply sign */
193
{ "uacute", 250 }, /* small u, acute accent */
194
{ "ucirc", 251 }, /* small u, circumflex accent */
195
{ "ugrave", 249 }, /* small u, grave accent */
196
{ "uml", 168 }, /* umlaut (dieresis) */
197
{ "uuml", 252 }, /* small u, dieresis or umlaut mark */
198
{ "yacute", 253 }, /* small y, acute accent */
199
{ "yen", 165 }, /* yen sign */
200
{ "yuml", 255 }, /* small y, dieresis or umlaut mark */
74
204
* unvis - decode characters previously encoded by vis
77
unvis(cp, c, astate, flag)
207
unvis(char *cp, int c, int *astate, int flag)
82
209
unsigned char uc = (unsigned char)c;
210
unsigned char st, ia, is, lc;
213
* Bottom 8 bits of astate hold the state machine state.
214
* Top 8 bits hold the index of the current character in the http 1866 nv string decoding
216
#define GS(a) ((a) & 0xff)
217
#define SS(a, b) (((uint32_t)(a) << 24) | (b))
218
#define GI(a) ((uint32_t)(a) >> 24)
84
220
_DIAGASSERT(cp != NULL);
85
221
_DIAGASSERT(astate != NULL);
87
224
if (flag & UNVIS_END) {
88
if (*astate == S_OCTAL2 || *astate == S_OCTAL3
89
|| *astate == S_HEX2) {
229
*astate = SS(0, S_GROUND);
93
return (*astate == S_GROUND ? UNVIS_NOCHAR : UNVIS_SYNBAD);
104
if ((flag & VIS_HTTPSTYLE) && c == '%') {
242
if ((flag & VIS_NOESCAPE) == 0 && c == '\\') {
243
*astate = SS(0, S_START);
246
if ((flag & VIS_HTTP1808) && c == '%') {
247
*astate = SS(0, S_HEX1);
250
if ((flag & VIS_HTTP1866) && c == '&') {
251
*astate = SS(0, S_AMP);
254
if ((flag & VIS_MIMESTYLE) && c == '=') {
255
*astate = SS(0, S_MIME1);
109
return (UNVIS_VALID);
116
return (UNVIS_VALID);
265
*astate = SS(0, S_GROUND);
117
267
case '0': case '1': case '2': case '3':
118
268
case '4': case '5': case '6': case '7':
270
*astate = SS(0, S_OCTAL2);
123
273
*cp = (char)0200;
274
*astate = SS(0, S_META);
277
*astate = SS(0, S_CTRL);
132
return (UNVIS_VALID);
281
*astate = SS(0, S_GROUND);
136
return (UNVIS_VALID);
285
*astate = SS(0, S_GROUND);
140
return (UNVIS_VALID);
289
*astate = SS(0, S_GROUND);
144
return (UNVIS_VALID);
293
*astate = SS(0, S_GROUND);
148
return (UNVIS_VALID);
297
*astate = SS(0, S_GROUND);
152
return (UNVIS_VALID);
301
*astate = SS(0, S_GROUND);
156
return (UNVIS_VALID);
305
*astate = SS(0, S_GROUND);
160
return (UNVIS_VALID);
309
*astate = SS(0, S_GROUND);
164
return (UNVIS_VALID);
313
*astate = SS(0, S_GROUND);
170
return (UNVIS_NOCHAR);
319
*astate = SS(0, S_GROUND);
176
return (UNVIS_NOCHAR);
325
*astate = SS(0, S_GROUND);
179
return (UNVIS_SYNBAD);
332
*astate = SS(0, S_META1);
184
333
else if (c == '^')
188
return (UNVIS_SYNBAD);
334
*astate = SS(0, S_CTRL);
340
*astate = SS(0, S_GROUND);
195
return (UNVIS_VALID);
203
return (UNVIS_VALID);
349
*astate = SS(0, S_GROUND);
205
352
case S_OCTAL2: /* second possible octal digit */
206
353
if (isoctal(uc)) {
208
355
* yes - and maybe a third
210
357
*cp = (*cp << 3) + (c - '0');
358
*astate = SS(0, S_OCTAL3);
215
362
* no - done with current sequence, push back passed char
218
return (UNVIS_VALIDPUSH);
364
*astate = SS(0, S_GROUND);
365
return UNVIS_VALIDPUSH;
220
367
case S_OCTAL3: /* third possible octal digit */
368
*astate = SS(0, S_GROUND);
222
369
if (isoctal(uc)) {
223
370
*cp = (*cp << 3) + (c - '0');
224
return (UNVIS_VALID);
227
374
* we were done, push back passed char
229
return (UNVIS_VALIDPUSH);
376
return UNVIS_VALIDPUSH;
232
379
if (isxdigit(uc)) {
381
*astate = SS(0, S_HEX2);
238
385
* no - done with current sequence, push back passed char
241
return (UNVIS_VALIDPUSH);
387
*astate = SS(0, S_GROUND);
388
return UNVIS_VALIDPUSH;
244
391
*astate = S_GROUND;
245
392
if (isxdigit(uc)) {
246
393
*cp = xtod(uc) | (*cp << 4);
247
return (UNVIS_VALID);
249
return (UNVIS_VALIDPUSH);
396
return UNVIS_VALIDPUSH;
399
if (uc == '\n' || uc == '\r') {
400
*astate = SS(0, S_EATCRNL);
403
if (isxdigit(uc) && (isdigit(uc) || isupper(uc))) {
405
*astate = SS(0, S_MIME2);
411
if (isxdigit(uc) && (isdigit(uc) || isupper(uc))) {
412
*astate = SS(0, S_GROUND);
413
*cp = XTOD(uc) | (*cp << 4);
424
*astate = SS(0, S_MIME1);
428
*astate = SS(0, S_GROUND);
435
*astate = SS(0, S_NUMBER);
438
*astate = SS(0, S_STRING);
442
ia = *cp; /* index in the array */
443
is = GI(*astate); /* index in the string */
444
lc = is == 0 ? 0 : nv[ia].name[is - 1]; /* last character */
449
for (; ia < __arraycount(nv); ia++) {
450
if (is != 0 && nv[ia].name[is - 1] != lc)
452
if (nv[ia].name[is] == uc)
456
if (ia == __arraycount(nv))
461
*astate = SS(is + 1, S_STRING);
466
*astate = SS(0, S_GROUND);
474
*cp += (*cp * 10) + uc - '0';
253
480
* decoder in unknown state - (probably uninitialized)
256
return (UNVIS_SYNBAD);
482
*astate = SS(0, S_GROUND);
261
* strunvis - decode src into dst
488
* strnunvisx - decode src into dst
263
490
* Number of chars decoded into dst is returned, -1 on error.
264
491
* Dst is null terminated.
268
strunvisx(dst, src, flag)
495
strnunvisx(char *dst, size_t dlen, const char *src, int flag)
498
char t, *start = dst;
277
501
_DIAGASSERT(src != NULL);
278
502
_DIAGASSERT(dst != NULL);
503
#define CHECKSPACE() \
509
} while (/*CONSTCOND*/0)
280
511
while ((c = *src++) != '\0') {
282
switch (unvis(dst, c, &state, flag)) {
513
switch (unvis(&t, c, &state, flag)) {
283
514
case UNVIS_VALID:
286
518
case UNVIS_VALIDPUSH:
290
523
case UNVIS_NOCHAR:
296
if (unvis(dst, c, &state, UNVIS_END) == UNVIS_VALID)
534
if (unvis(&t, c, &state, UNVIS_END) == UNVIS_VALID) {
299
return (dst - start);
307
return strunvisx(dst, src, 0);
540
return (int)(dst - start);
544
strunvisx(char *dst, const char *src, int flag)
546
return strnunvisx(dst, (size_t)~0, src, flag);
550
strunvis(char *dst, const char *src)
552
return strnunvisx(dst, (size_t)~0, src, 0);
556
strnunvis(char *dst, size_t dlen, const char *src)
558
return strnunvisx(dst, dlen, src, 0);