#include "tickit.h"

#include <stdint.h>

#include "unicode.h"

/*
 * Decode a single UTF-8 sequence from the start of `str`.
 *
 *   str  buffer to decode from
 *   len  number of bytes available in `str`; callers that do not know the
 *        length pass (size_t)-1 and rely on the NUL terminator instead
 *   cp   receives the decoded codepoint on success; may hold a partial value
 *        on failure
 *
 * Returns the number of bytes consumed (1 to 4), or -1 if the buffer is
 * empty, starts with a NUL, starts with a byte that cannot begin a sequence,
 * is too short for the announced sequence, or contains a trailing byte that
 * is not a continuation byte (10xxxxxx).
 */
static int next_utf8(const char *str, size_t len, uint32_t *cp)
{
  // Check the length before touching the buffer, so that an empty buffer is
  // never dereferenced
  if(!len)
    return -1;

  const unsigned char *s = (const unsigned char *)str;
  unsigned char b0 = s[0];
  int nbytes;

  if(!b0)
    return -1;
  else if(b0 < 0x80) { // ASCII
    *cp = b0;
    return 1;
  }
  else if(b0 < 0xc0) // C1 or continuation
    return -1;
  else if(b0 < 0xe0) {
    nbytes = 2;
    *cp = b0 & 0x1f;
  }
  else if(b0 < 0xf0) {
    nbytes = 3;
    *cp = b0 & 0x0f;
  }
  else if(b0 < 0xf8) {
    nbytes = 4;
    *cp = b0 & 0x07;
  }
  else
    return -1;

  if(len < (size_t)nbytes)
    return -1;

  for(int i = 1; i < nbytes; i++) {
    unsigned char b = s[i];

    // Every trailing byte must look like 10xxxxxx. This also rejects a NUL,
    // so decoding stops at the terminator and never reads beyond it.
    if((b & 0xc0) != 0x80)
      return -1;

    *cp = (*cp << 6) | (b & 0x3f);
  }

  return nbytes;
}

/*
 * Number of bytes needed to encode `codepoint` as UTF-8, using the extended
 * scheme of up to 6 bytes. Always returns a value between 1 and 6.
 */
int tickit_utf8_seqlen(long codepoint)
{
  if(codepoint < 0x0000080) return 1;
  if(codepoint < 0x0000800) return 2;
  if(codepoint < 0x0010000) return 3;
  if(codepoint < 0x0200000) return 4;
  if(codepoint < 0x4000000) return 5;
  return 6;
}

/*
 * Encode `codepoint` as UTF-8 into `str`. No NUL terminator is written.
 *
 *   str        destination buffer, or NULL to only query the required size
 *   len        space available in `str`, in bytes
 *   codepoint  value to encode
 *
 * Returns the number of bytes the encoding occupies, or (size_t)-1 if `len`
 * is too small to hold it (in which case nothing is written).
 */
size_t tickit_utf8_put(char *str, size_t len, long codepoint)
{
  int nbytes = tickit_utf8_seqlen(codepoint);

  if(!str)
    return nbytes;

  if(len < (size_t)nbytes)
    return (size_t)-1;

  // This is easier done backwards: emit the continuation bytes from the last
  // one towards the first, peeling 6 bits off the codepoint each time
  for(int b = nbytes - 1; b >= 1; b--) {
    str[b] = 0x80 | (codepoint & 0x3f);
    codepoint >>= 6;
  }

  // What remains of the codepoint goes into the leading byte, below the
  // length marker bits
  switch(nbytes) {
    case 1: str[0] =        (codepoint & 0x7f); break;
    case 2: str[0] = 0xc0 | (codepoint & 0x1f); break;
    case 3: str[0] = 0xe0 | (codepoint & 0x0f); break;
    case 4: str[0] = 0xf0 | (codepoint & 0x07); break;
    case 5: str[0] = 0xf8 | (codepoint & 0x03); break;
    case 6: str[0] = 0xfc | (codepoint & 0x01); break;
  }

  return nbytes;
}

/*
 * Count a NUL-terminated string from its start: zeroes `pos` and then behaves
 * as tickit_utf8_ncountmore() with no length bound.
 */
size_t tickit_utf8_count(const char *str, TickitStringPos *pos, const TickitStringPos *limit)
{
  tickit_stringpos_zero(pos);
  return tickit_utf8_ncountmore(str, (size_t)-1, pos, limit);
}

/*
 * Continue counting a NUL-terminated string from the position already held in
 * `pos`: tickit_utf8_ncountmore() with no length bound.
 */
size_t tickit_utf8_countmore(const char *str, TickitStringPos *pos, const TickitStringPos *limit)
{
  return tickit_utf8_ncountmore(str, (size_t)-1, pos, limit);
}

/*
 * Count a string of at most `len` bytes from its start: zeroes `pos` and then
 * behaves as tickit_utf8_ncountmore().
 */
size_t tickit_utf8_ncount(const char *str, size_t len, TickitStringPos *pos, const TickitStringPos *limit)
{
  tickit_stringpos_zero(pos);
  return tickit_utf8_ncountmore(str, len, pos, limit);
}

/*
 * Advance `pos` through `str`, accumulating bytes, codepoints, graphemes and
 * columns, until the string ends or the next step would exceed `limit`.
 *
 *   str    start of the string (not the resume point; `pos->bytes` is added
 *          to it here)
 *   len    total length of `str` in bytes, or (size_t)-1 to rely on the NUL
 *          terminator alone
 *   pos    in: position to resume from; out: position reached. It is only
 *          ever updated at a grapheme boundary, so zero-width codepoints are
 *          never split from the codepoint they follow.
 *   limit  optional upper bounds; NULL means unlimited, and a field equal to
 *          -1 leaves that particular counter unlimited
 *
 * Returns the number of bytes `pos` advanced by. Returns (size_t)-1 on
 * undecodable UTF-8, on a C0 or C1 control character, or when
 * tickit_utf8_wcwidth() reports -1 for a codepoint. Returns 0 without
 * touching `pos` if `pos->bytes` already lies beyond `len`.
 */
size_t tickit_utf8_ncountmore(const char *str, size_t len, TickitStringPos *pos, const TickitStringPos *limit)
{
  TickitStringPos here = *pos;
  size_t start_bytes = pos->bytes;

  if(len != (size_t)-1) {
    // A resume point past the end of the buffer would make the subtraction
    // below wrap around and let the scan run out of bounds
    if(pos->bytes > len)
      return 0;

    len -= pos->bytes;
  }

  str += pos->bytes;

  while(len != 0 && *str) {
    uint32_t cp;
    int bytes = next_utf8(str, len, &cp);
    if(bytes == -1)
      return -1;

    /* Abort on C0 or C1 controls */
    if(cp < 0x20 || (cp >= 0x80 && cp < 0xa0))
      return -1;

    int width = tickit_utf8_wcwidth(cp);
    if(width == -1)
      return -1;

    // A codepoint that occupies columns begins a new grapheme; a zero-width
    // one extends the grapheme before it
    int is_grapheme = (width > 0) ? 1 : 0;
    if(is_grapheme) // Commit on the previous grapheme
      *pos = here;

    // Stop before consuming this codepoint if doing so would overshoot any
    // of the requested limits
    if(limit) {
      if(limit->bytes != (size_t)-1 && here.bytes + bytes > limit->bytes)
        break;
      if(limit->codepoints != -1 && here.codepoints + 1 > limit->codepoints)
        break;
      if(limit->graphemes != -1 && here.graphemes + is_grapheme > limit->graphemes)
        break;
      if(limit->columns != -1 && here.columns + width > limit->columns)
        break;
    }

    str += bytes;
    if(len != (size_t)-1)
      len -= bytes;

    here.bytes      += bytes;
    here.codepoints += 1;
    here.graphemes  += is_grapheme;
    here.columns    += width;
  }

  // Reaching the end of the input completes the grapheme in progress.
  // After a limit break neither condition holds, so `pos` stays at the last
  // grapheme boundary.
  if(len == 0 || *str == 0) // Commit on the final grapheme
    *pos = here;

  return pos->bytes - start_bytes;
}

/*
 * Column count of a NUL-terminated string. The result of the underlying
 * count is not checked, so if counting aborts part-way the columns of the
 * graphemes committed before that point are returned.
 */
int tickit_utf8_mbswidth(const char *str)
{
  TickitStringPos pos;
  tickit_utf8_count(str, &pos, NULL);
  return pos.columns;
}

/*
 * Column reached after counting `str` with a limit built from `byte` by
 * INIT_TICKIT_STRINGPOS_LIMIT_BYTES (presumably a limit on the byte counter
 * only; the macro is defined outside this file).
 */
int tickit_utf8_byte2col(const char *str, size_t byte)
{
  TickitStringPos limit = INIT_TICKIT_STRINGPOS_LIMIT_BYTES(byte);
  TickitStringPos pos;
  tickit_utf8_count(str, &pos, &limit);
  return pos.columns;
}

/*
 * Byte offset reached after counting `str` with a limit built from `col` by
 * INIT_TICKIT_STRINGPOS_LIMIT_COLUMNS (presumably a limit on the column
 * counter only; the macro is defined outside this file).
 */
size_t tickit_utf8_col2byte(const char *str, int col)
{
  TickitStringPos limit = INIT_TICKIT_STRINGPOS_LIMIT_COLUMNS(col);
  TickitStringPos pos;
  tickit_utf8_count(str, &pos, &limit);
  return pos.bytes;
}