1328
static int is_candidate(Uint cp)
1331
if (cp < 768) return 0;
1333
if (cp == 12441 || cp == 12442) return 1;
1336
index = cp / 32 - COMP_CANDIDATE_MAP_OFFSET;
1338
return !!(comp_candidate_map[index] & (1UL << pos));
1341
static int hashsearch(int *htab, int htab_size, CompEntry *cv, Uint16 c)
1343
int bucket = c % htab_size;
1344
while (htab[bucket] != -1 && cv[htab[bucket]].c != c)
1345
bucket = (bucket + 1) % htab_size;
1346
return htab[bucket];
1349
#define TRANSLATE_NO 0
1350
#define TRANSLATE_MAYBE -1
1352
/* The s array is reversed */
1353
static int translate(Uint16 *s, int slen, Uint16 *res)
1355
/* Go backwards through buffer and match against tree */
1357
CompEntry *cv = compose_tab;
1358
int *hc = hash_compose_tab;
1359
int cvs = compose_tab_size;
1361
while (pos < slen) {
1362
x = hashsearch(hc,cvs*HASH_SIZE_FACTOR,cv,s[pos]);
1364
return TRANSLATE_NO;
1370
cvs = cv[x].num_subs;
1375
return TRANSLATE_MAYBE;
1378
static void handle_first_norm(Uint16 *savepoints, int *numpointsp, Uint unipoint)
1380
/*erts_fprintf(stderr,"CP = %d, numpoints = %d\n",(int) unipoint,(int) *numpointsp);*/
1382
savepoints[0] = (Uint16) unipoint;
1385
static void cleanup_norm(Eterm **hpp, Uint16 *savepoints, int numpoints, Eterm *retp)
1392
ret = CONS(hp,make_small((Uint) savepoints[0]),ret);
1395
for (i = 1;i < numpoints;) {
1396
if(!is_candidate(savepoints[i]) ||
1397
((res = translate(savepoints+i,numpoints - i, &newpoint)) <= 0)) {
1398
ret = CONS(hp,make_small((Uint) savepoints[i]),ret);
1402
ret = CONS(hp,make_small((Uint) newpoint),ret);
1410
static void handle_potential_norm(Eterm **hpp, Uint16 *savepoints, int *numpointsp, Uint unipoint, Eterm *retp)
1413
int numpoints = *numpointsp;
1418
/* erts_fprintf(stderr,"CP = %d, numpoints = %d\n",(int) unipoint,(int) numpoints);*/
1419
if ((unipoint >> 16) == 0) { /* otherwise we're done here */
1420
savepoints[numpoints++] = (Uint16) unipoint;
1421
res = translate(savepoints,numpoints,&newpoint);
1422
if (res == TRANSLATE_NO) {
1423
ret = CONS(hp,make_small((Uint) savepoints[0]),ret);
1425
for (i = 1;i < numpoints;) {
1426
if(!is_candidate(savepoints[i]) ||
1427
((res = translate(savepoints+i,numpoints - i, &newpoint)) == 0)) {
1428
ret = CONS(hp,make_small((Uint) savepoints[i]),ret);
1431
} else if (res > 0) {
1432
ret = CONS(hp,make_small((Uint) newpoint),ret);
1435
} else { /* res < 0 */
1436
/* A "maybe", means we are not done yet */
1438
while (i < numpoints) {
1439
savepoints[j++] = savepoints[i++];
1448
} else if (res > 0) {
1450
ret = CONS(hp,make_small((Uint) newpoint),ret);
1452
} /* < 0 means go on */
1454
/* Unconditional rollup, this character is larger than 16 bits */
1455
ret = CONS(hp,make_small((Uint) savepoints[0]),ret);
1458
for (i = 1;i < numpoints;) {
1459
if(!is_candidate(savepoints[i]) ||
1460
((res = translate(savepoints+i,numpoints - i, &newpoint)) <= 0)) {
1461
ret = CONS(hp,make_small((Uint) savepoints[i]),ret);
1465
ret = CONS(hp,make_small((Uint) newpoint),ret);
1470
ret = CONS(hp,make_small(unipoint),ret);
1475
*numpointsp = numpoints;
1479
static Eterm do_utf8_to_list_normalize(Process *p, Uint num, byte *bytes, Uint sz)
1485
Uint16 savepoints[4];
1490
hp = HAlloc(p,num * 2); /* May be to much */
1491
hp_end = hp + num * 2;
1493
source = bytes + sz;
1494
while(--source >= bytes) {
1495
if (((*source) & ((byte) 0x80)) == 0) {
1496
unipoint = (Uint) *source;
1497
} else if (((*source) & ((byte) 0xE0)) == 0xC0) {
1499
(((Uint) ((*source) & ((byte) 0x1F))) << 6) |
1500
((Uint) (source[1] & ((byte) 0x3F)));
1501
} else if (((*source) & ((byte) 0xF0)) == 0xE0) {
1503
(((Uint) ((*source) & ((byte) 0xF))) << 12) |
1504
(((Uint) (source[1] & ((byte) 0x3F))) << 6) |
1505
((Uint) (source[2] & ((byte) 0x3F)));
1506
} else if (((*source) & ((byte) 0xF8)) == 0xF0) {
1508
(((Uint) ((*source) & ((byte) 0x7))) << 18) |
1509
(((Uint) (source[1] & ((byte) 0x3F))) << 12) |
1510
(((Uint) (source[2] & ((byte) 0x3F))) << 6) |
1511
((Uint) (source[3] & ((byte) 0x3F)));
1513
/* ignore 2#10XXXXXX */
1517
handle_potential_norm(&hp,savepoints,&numpoints,unipoint,&ret);
1520
/* We are not building up any normalizations yet, check whether we should start... */
1521
if (is_candidate(unipoint)) {
1522
handle_first_norm(savepoints,&numpoints,unipoint);
1525
ret = CONS(hp,make_small(unipoint),ret);
1528
/* so, we've looped to the beginning, do we have anything saved? */
1530
cleanup_norm(&hp,savepoints,numpoints,&ret);
1533
HRelease(p,hp_end,hp);
1334
1539
* The last step of characters_to_list, build a list from the buffer 'bytes' (created in the same way
1335
1540
* as for characters_to_utf8). All sizes are known in advance and most data will be held in a
1814
2019
return binary_to_atom(BIF_P, BIF_ARG_1, BIF_ARG_2, 1);
2022
/**********************************************************
2023
* Simpler non-interruptible routines for UTF-8 and
2024
* Windowish UTF-16 (restricted)
2025
**********************************************************/
2027
* This function is the heart of the Unicode support for
2028
* open_port - spawn_executable. It converts both the name
2029
* of the executable and the arguments according to the same rules
2030
* as for filename conversion. That means that if your arguments are
2031
* to be raw, you supply binaries, else unicode characters are allowed up to
2032
* the encoding maximum (256 or the unicode max).
2033
* Depending on the filename encoding standard, the vector is then
2034
* converted to whatever is used, which might mean win_utf16 if on windows.
2035
* Do not peek into the argument vector or filename with ordinary
2036
* string routines, that will certainly fail on some OS.
2039
char *erts_convert_filename_to_native(Eterm name, ErtsAlcType_t alloc_type, int allow_empty)
2041
int encoding = erts_get_native_filename_encoding();
2042
char* name_buf = NULL;
2044
if (is_atom(name) || is_list(name) || (allow_empty && is_nil(name))) {
2046
if ((need = erts_native_filename_need(name,encoding)) < 0) {
2049
if (encoding == ERL_FILENAME_WIN_WCHAR) {
2054
name_buf = (char *) erts_alloc(alloc_type, need);
2055
erts_native_filename_put(name,encoding,(byte *)name_buf);
2056
name_buf[need-1] = 0;
2057
if (encoding == ERL_FILENAME_WIN_WCHAR) {
2058
name_buf[need-2] = 0;
2060
} else if (is_binary(name)) {
2061
byte *temp_alloc = NULL;
2064
Uint size,num_chars;
2066
size = binary_size(name);
2067
bytes = erts_get_aligned_binary_bytes(name, &temp_alloc);
2068
if (encoding != ERL_FILENAME_WIN_WCHAR) {
2069
/*Add 0 termination only*/
2070
name_buf = (char *) erts_alloc(alloc_type, size+1);
2071
memcpy(name_buf,bytes,size);
2073
} else if (erts_analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != ERTS_UTF8_OK ||
2074
erts_get_user_requested_filename_encoding() == ERL_FILENAME_LATIN1) {
2076
/* What to do now? Maybe latin1, so just take byte for byte instead */
2077
name_buf = (char *) erts_alloc(alloc_type, (size+1)*2);
2078
p = (byte *) name_buf;
2085
} else { /* WIN_WCHAR and valid UTF8 */
2086
name_buf = (char *) erts_alloc(alloc_type, (num_chars+1)*2);
2087
erts_copy_utf8_to_utf16_little((byte *) name_buf, bytes, num_chars);
2088
name_buf[num_chars*2] = 0;
2089
name_buf[num_chars*2+1] = 0;
2091
erts_free_aligned_binary_bytes(temp_alloc);
2099
Sint erts_native_filename_need(Eterm ioterm, int encoding)
2103
DECLARE_ESTACK(stack);
2106
if (is_atom(ioterm)) {
2109
ap = atom_tab(atom_val(ioterm));
2111
case ERL_FILENAME_LATIN1:
2114
case ERL_FILENAME_UTF8_MAC:
2115
case ERL_FILENAME_UTF8:
2116
for (i = 0; i < ap->len; i++) {
2117
need += (ap->name[i] >= 0x80) ? 2 : 1;
2120
case ERL_FILENAME_WIN_WCHAR:
2126
DESTROY_ESTACK(stack);
2130
if (is_nil(ioterm)) {
2131
DESTROY_ESTACK(stack);
2134
if (!is_list(ioterm)) {
2135
DESTROY_ESTACK(stack);
2138
/* OK a list, needs to be processed in order, handling each flat list-level
2139
as they occur, just like io_list_to_binary would */
2140
ESTACK_PUSH(stack,ioterm);
2141
while (!ESTACK_ISEMPTY(stack)) {
2142
ioterm = ESTACK_POP(stack);
2143
if (is_nil(ioterm)) {
2144
/* ignore empty lists */
2147
if(is_list(ioterm)) {
2148
L_Again: /* Restart with sublist, old listend was pushed on stack */
2149
objp = list_val(ioterm);
2151
for(;;) { /* loop over one flat list of bytes and binaries
2152
until sublist or list end is encountered */
2153
if (is_small(obj)) { /* Always small */
2155
Uint x = unsigned_val(obj);
2157
case ERL_FILENAME_LATIN1:
2159
DESTROY_ESTACK(stack);
2164
case ERL_FILENAME_UTF8_MAC:
2165
case ERL_FILENAME_UTF8:
2168
} else if (x < 0x800) {
2170
} else if (x < 0x10000) {
2171
if ((x >= 0xD800 && x <= 0xDFFF) ||
2173
(x == 0xFFFF)) { /* Invalid unicode range */
2174
DESTROY_ESTACK(stack);
2178
} else if (x < 0x110000) {
2181
DESTROY_ESTACK(stack);
2185
case ERL_FILENAME_WIN_WCHAR:
2189
} /* else fall throug to error */
2191
DESTROY_ESTACK(stack);
2195
/* everything else will give badarg later
2196
in the process, so we don't check */
2198
if (!is_list(ioterm)) {
2201
objp = list_val(ioterm);
2206
} else if (is_nil(obj)) {
2208
if (!is_list(ioterm)) {
2211
objp = list_val(ioterm);
2213
} else if (is_list(obj)) {
2214
/* push rest of list for later processing, start
2215
again with sublist */
2216
ESTACK_PUSH(stack,CDR(objp));
2220
DESTROY_ESTACK(stack);
2223
if (is_nil(ioterm) || !is_list(ioterm)) {
2227
} /* is_list(ioterm) */
2229
if (!is_list(ioterm) && !is_nil(ioterm)) {
2230
/* improper list end */
2231
DESTROY_ESTACK(stack);
2234
} /* while not estack empty */
2235
DESTROY_ESTACK(stack);
2239
void erts_native_filename_put(Eterm ioterm, int encoding, byte *p)
2243
DECLARE_ESTACK(stack);
2245
if (is_atom(ioterm)) {
2248
ap = atom_tab(atom_val(ioterm));
2250
case ERL_FILENAME_LATIN1:
2251
for (i = 0; i < ap->len; i++) {
2255
case ERL_FILENAME_UTF8_MAC:
2256
case ERL_FILENAME_UTF8:
2257
for (i = 0; i < ap->len; i++) {
2258
if(ap->name[i] < 0x80) {
2261
*p++ = (((ap->name[i]) >> 6) | ((byte) 0xC0));
2262
*p++ = (((ap->name[i]) & 0x3F) | ((byte) 0x80));
2266
case ERL_FILENAME_WIN_WCHAR:
2267
for (i = 0; i < ap->len; i++) {
2276
DESTROY_ESTACK(stack);
2280
if (is_nil(ioterm)) {
2281
DESTROY_ESTACK(stack);
2284
ASSERT(is_list(ioterm));
2285
/* OK a list, needs to be processed in order, handling each flat list-level
2286
as they occur, just like io_list_to_binary would */
2287
ESTACK_PUSH(stack,ioterm);
2288
while (!ESTACK_ISEMPTY(stack)) {
2289
ioterm = ESTACK_POP(stack);
2290
if (is_nil(ioterm)) {
2291
/* ignore empty lists */
2294
if(is_list(ioterm)) {
2295
L_Again: /* Restart with sublist, old listend was pushed on stack */
2296
objp = list_val(ioterm);
2298
for(;;) { /* loop over one flat list of bytes and binaries
2299
until sublist or list end is encountered */
2300
if (is_small(obj)) { /* Always small */
2302
Uint x = unsigned_val(obj);
2304
case ERL_FILENAME_LATIN1:
2308
case ERL_FILENAME_UTF8_MAC:
2309
case ERL_FILENAME_UTF8:
2313
else if (x < 0x800) {
2314
*p++ = (((byte) (x >> 6)) |
2316
*p++ = (((byte) (x & 0x3F)) |
2318
} else if (x < 0x10000) {
2319
ASSERT(!((x >= 0xD800 && x <= 0xDFFF) ||
2322
*p++ = (((byte) (x >> 12)) |
2324
*p++ = ((((byte) (x >> 6)) & 0x3F) |
2326
*p++ = (((byte) (x & 0x3F)) |
2329
ASSERT(x < 0x110000);
2330
*p++ = (((byte) (x >> 18)) |
2332
*p++ = ((((byte) (x >> 12)) & 0x3F) |
2334
*p++ = ((((byte) (x >> 6)) & 0x3F) |
2336
*p++ = (((byte) (x & 0x3F)) |
2340
case ERL_FILENAME_WIN_WCHAR:
2341
ASSERT(x <= 0xFFFF);
2342
*p++ = (byte) (x & 0xFFU);
2343
*p++ = (byte) ((x >> 8) & 0xFFU);
2349
/* everything else will give badarg later
2350
in the process, so we don't check */
2352
if (!is_list(ioterm)) {
2355
objp = list_val(ioterm);
2360
} else if (is_nil(obj)) {
2362
if (!is_list(ioterm)) {
2365
objp = list_val(ioterm);
2367
} else if (is_list(obj)) {
2368
/* push rest of list for later processing, start
2369
again with sublist */
2370
ESTACK_PUSH(stack,CDR(objp));
2376
if (is_nil(ioterm) || !is_list(ioterm)) {
2380
} /* is_list(ioterm) */
2382
ASSERT(is_list(ioterm) || is_nil(ioterm));
2383
} /* while not estack empty */
2384
DESTROY_ESTACK(stack);
2387
void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars)
2391
while (num_chars--) {
2392
if (((*bytes) & ((byte) 0x80)) == 0) {
2393
unipoint = (Uint) *bytes;
2395
} else if (((*bytes) & ((byte) 0xE0)) == 0xC0) {
2397
(((Uint) ((*bytes) & ((byte) 0x1F))) << 6) |
2398
((Uint) (bytes[1] & ((byte) 0x3F)));
2400
} else if (((*bytes) & ((byte) 0xF0)) == 0xE0) {
2402
(((Uint) ((*bytes) & ((byte) 0xF))) << 12) |
2403
(((Uint) (bytes[1] & ((byte) 0x3F))) << 6) |
2404
((Uint) (bytes[2] & ((byte) 0x3F)));
2406
} else if (((*bytes) & ((byte) 0xF8)) == 0xF0) {
2408
(((Uint) ((*bytes) & ((byte) 0x7))) << 18) |
2409
(((Uint) (bytes[1] & ((byte) 0x3F))) << 12) |
2410
(((Uint) (bytes[2] & ((byte) 0x3F))) << 6) |
2411
((Uint) (bytes[3] & ((byte) 0x3F)));
2414
erl_exit(1,"Internal unicode error in prim_file:internal_name2native/1");
2416
*target++ = (byte) (unipoint & 0xFF);
2417
*target++ = (byte) ((unipoint >> 8) & 0xFF);
2422
* This internal bif converts a filename to whatever format is suitable for the file driver
2423
* It also adds zero termination so that prim_file needn't bother with the character encoding
2424
* of the file driver
2426
BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
2428
int encoding = erts_get_native_filename_encoding();
2432
/* Prim file explicitly does not allow atoms, although we could
2433
very well cope with it. Instead of letting 'file' handle them,
2434
it would probably be more efficient to handle them here. Subject to
2436
if (is_atom(BIF_ARG_1)) {
2437
BIF_ERROR(BIF_P,BADARG);
2439
if (is_binary(BIF_ARG_1)) {
2440
byte *temp_alloc = NULL;
2443
Uint size,num_chars;
2444
/* Uninterpreted encoding except if windows widechar, in which case we convert from
2445
utf8 to win_wchar */
2446
size = binary_size(BIF_ARG_1);
2447
bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc);
2448
if (encoding != ERL_FILENAME_WIN_WCHAR) {
2449
/*Add 0 termination only*/
2450
bin_term = new_binary(BIF_P, NULL, size+1);
2451
bin_p = binary_bytes(bin_term);
2452
memcpy(bin_p,bytes,size);
2454
erts_free_aligned_binary_bytes(temp_alloc);
2457
/* In a wchar world, the emulator flags only affect how
2458
binaries are interpreted when sent from the user. */
2459
/* Determine real length and create a new binary */
2460
if (erts_analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != ERTS_UTF8_OK ||
2461
erts_get_user_requested_filename_encoding() == ERL_FILENAME_LATIN1) {
2462
/* What to do now? Maybe latin1, so just take byte for byte instead */
2463
bin_term = new_binary(BIF_P, 0, (size+1)*2);
2464
bin_p = binary_bytes(bin_term);
2466
*bin_p++ = *bytes++;
2471
erts_free_aligned_binary_bytes(temp_alloc);
2474
/* OK, UTF8 ok, number of characters is in num_chars */
2475
bin_term = new_binary(BIF_P, 0, (num_chars+1)*2);
2476
bin_p = binary_bytes(bin_term);
2477
erts_copy_utf8_to_utf16_little(bin_p, bytes, num_chars);
2478
/* zero termination */
2479
bin_p[num_chars*2] = 0;
2480
bin_p[num_chars*2+1] = 0;
2481
erts_free_aligned_binary_bytes(temp_alloc);
2486
if ((need = erts_native_filename_need(BIF_ARG_1,encoding)) < 0) {
2487
BIF_ERROR(BIF_P,BADARG);
2489
if (encoding == ERL_FILENAME_WIN_WCHAR) {
2495
bin_term = new_binary(BIF_P, 0, need);
2496
bin_p = binary_bytes(bin_term);
2497
erts_native_filename_put(BIF_ARG_1,encoding,bin_p);
2499
if (encoding == ERL_FILENAME_WIN_WCHAR) {
2505
BIF_RETTYPE prim_file_internal_native2name_1(BIF_ALIST_1)
2509
Uint size,num_chars;
2513
byte *temp_alloc = NULL;
2516
Uint num_built; /* characters */
2517
Uint num_eaten; /* bytes */
2521
if (is_not_binary(BIF_ARG_1)) {
2522
BIF_ERROR(BIF_P,BADARG);
2524
size = binary_size(BIF_ARG_1);
2525
ERTS_GET_REAL_BIN(BIF_ARG_1, real_bin, offset, bitoffs, bitsize);
2527
BIF_ERROR(BIF_P,BADARG);
2532
switch (erts_get_native_filename_encoding()) {
2533
case ERL_FILENAME_LATIN1:
2534
hp = HAlloc(BIF_P, 2 * size);
2535
bytes = binary_bytes(real_bin)+offset;
2537
BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes, size, bitoffs));
2538
case ERL_FILENAME_UTF8_MAC:
2540
case ERL_FILENAME_UTF8:
2541
bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc);
2542
if (erts_analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != ERTS_UTF8_OK) {
2543
erts_free_aligned_binary_bytes(temp_alloc);
2549
ret = do_utf8_to_list_normalize(BIF_P, num_chars, bytes, size);
2551
ret = do_utf8_to_list(BIF_P, num_chars, bytes, size, num_chars, &num_built, &num_eaten, NIL);
2553
erts_free_aligned_binary_bytes(temp_alloc);
2555
case ERL_FILENAME_WIN_WCHAR:
2556
bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc);
2557
if ((size % 2) != 0) { /* Panic fixup to avoid crashing the emulator */
2559
hp = HAlloc(BIF_P, size+2);
2560
ret = CONS(hp,make_small((Uint) bytes[size]),NIL);
2563
hp = HAlloc(BIF_P, size);
2568
Uint x = ((Uint) *bytes--) << 8;
2569
x |= ((Uint) *bytes--);
2571
ret = CONS(hp,make_small(x),ret);
2574
erts_free_aligned_binary_bytes(temp_alloc);
2583
BIF_RETTYPE prim_file_internal_normalize_utf8_1(BIF_ALIST_1)
2587
Uint size,num_chars;
2591
byte *temp_alloc = NULL;
2595
if (is_not_binary(BIF_ARG_1)) {
2596
BIF_ERROR(BIF_P,BADARG);
2598
size = binary_size(BIF_ARG_1);
2599
ERTS_GET_REAL_BIN(BIF_ARG_1, real_bin, offset, bitoffs, bitsize);
2601
BIF_ERROR(BIF_P,BADARG);
2606
bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc);
2607
if (erts_analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != ERTS_UTF8_OK) {
2608
erts_free_aligned_binary_bytes(temp_alloc);
2609
BIF_ERROR(BIF_P,BADARG);
2611
ret = do_utf8_to_list_normalize(BIF_P, num_chars, bytes, size);
2612
erts_free_aligned_binary_bytes(temp_alloc);
2616
BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0)
2618
switch (erts_get_native_filename_encoding()) {
2619
case ERL_FILENAME_LATIN1:
2621
case ERL_FILENAME_UTF8_MAC:
2622
case ERL_FILENAME_UTF8:
2624
case ERL_FILENAME_WIN_WCHAR:
2625
if (erts_get_user_requested_filename_encoding() == ERL_FILENAME_LATIN1) {
2631
BIF_RET(am_undefined);