/*
 * Copyright (C) 2007-2013 Frank Mertens.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
9
#ifndef DRY_UTF8ENCODER_HPP
10
#define DRY_UTF8ENCODER_HPP
12
#include "ByteEncoder.hpp"
17
/** \brief Sink of Unicode code points that emits them as UTF-8 bytes.
  *
  * Every byte is forwarded to an internal ByteEncoder, which is opened either
  * on a Stream or on a caller-supplied memory buffer. Instances are obtained
  * through the static open() factories.
  */
class Utf8Encoder: public Sink<uchar_t>
{
public:
    /** Create an encoder on top of \a stream.
      * \param stream  passed unchanged to ByteEncoder::open()
      * \param bufSize passed unchanged to ByteEncoder::open(); defaults to 0x4000 (16384)
      * \return reference to the newly allocated encoder
      */
    inline static Ref<Utf8Encoder> open(Stream *stream, int bufSize = 0x4000) {
        return new Utf8Encoder(stream, bufSize);
    }

    /** Create an encoder on top of a memory buffer.
      * \param buf     passed unchanged to ByteEncoder::open()
      * \param bufSize passed unchanged to ByteEncoder::open()
      * \return reference to the newly allocated encoder
      */
    inline static Ref<Utf8Encoder> open(void *buf, int bufSize) {
        return new Utf8Encoder(buf, bufSize);
    }

    /// Encode a single code point and hand the resulting bytes to the byte encoder.
    void write(uchar_t ch);

    /// Number of bytes the UTF-8 encoding of \a ch occupies.
    static int encodedSize(uchar_t ch);

    /// Access the underlying byte encoder (no ownership is transferred).
    inline ByteEncoder *byteEncoder() const { return byteEncoder_; }

private:
    // NOTE(review): the access specifiers were not visible in the reviewed text;
    // constructors and state are assumed private because construction goes
    // through the open() factories -- confirm against the original header.
    Utf8Encoder(Stream *stream, int bufSize)
        : byteEncoder_(ByteEncoder::open(stream, bufSize))
    {}

    Utf8Encoder(void *buf, int bufSize)
        : byteEncoder_(ByteEncoder::open(buf, bufSize))
    {}

    Ref<ByteEncoder> byteEncoder_; ///< receives the encoded bytes
};
45
/** Encode the code point \a ch as UTF-8 and write the bytes to byteEncoder_.
  *
  * Code points below 0x80 produce one byte, below 0x800 two bytes, below
  * 0x10000 three bytes and below 0x110000 four bytes.
  *
  * An EncodingException is raised through DRY_THROW, before any byte of the
  * offending code point is written, for:
  *   - UTF-16 surrogates 0xD800..0xDFFF,
  *   - the non-characters 0xFDD0..0xFDEF,
  *   - code points at or above 0x10000 whose low 16 bits are 0xFFFE or 0xFFFF,
  *   - anything above 0x10FFFF.
  *
  * NOTE(review): 0xFFFE and 0xFFFF themselves fall into the three-byte branch,
  * which has no such check, so they are encoded rather than rejected. Confirm
  * whether that asymmetry is intended.
  */
inline void Utf8Encoder::write(uchar_t ch)
{
    if (ch < 0x80) { // ASCII range: 0xxxxxxx
        byteEncoder_->writeUInt8(ch);
    }
    else if (ch < 0x800) { // two-byte codes: 110yyyxx | 10xxxxxx
        byteEncoder_->writeUInt8((ch >> 6) | 0xC0);   // 0xC = (1100)2, code prefix: (110)2
        byteEncoder_->writeUInt8((ch & 0x3F) | 0x80); // 0x8 = (1000)2, code prefix: (10)2
    }
    else if (ch < 0x10000) { // three-byte codes: 1110yyyy | 10yyyyxx | 10xxxxxx
        // The throwing branches are braced so the macro's expansion can never
        // interfere with the if/else pairing.
        if ((0xD800 <= ch) && (ch <= 0xDFFF)) {
            DRY_THROW(EncodingException, "UTF-8 disallows encoding of UTF-16 surrogate pairs 0xD800..0xDFFF");
        }
        else if ((0xFDD0 <= ch) && (ch <= 0xFDEF)) {
            DRY_THROW(EncodingException, "UTF-8 disallows encoding of non-characters 0xFDD0..0xFDEF");
        }
        byteEncoder_->writeUInt8((ch >> 12) | 0xE0);         // 0xE = (1110)2, code prefix: (1110)2
        byteEncoder_->writeUInt8(((ch >> 6) & 0x3F) | 0x80); // 0x8 = (1000)2, code prefix: (10)2
        byteEncoder_->writeUInt8((ch & 0x3F) | 0x80);        // 0x8 = (1000)2, code prefix: (10)2
    }
    else if (ch < 0x110000) { // four-byte codes: 11110zzz | 10zzyyyy | 10yyyyxx | 10xxxxxx
        // The last two code points of every plane are non-characters.
        if ((ch & 0xFFFE) == 0xFFFE) {
            DRY_THROW(EncodingException, "UTF-8 disallows encoding of non-characters 0x??FFFE,0x??FFFF");
        }
        byteEncoder_->writeUInt8((ch >> 18) | 0xF0);          // 0xF = (1111)2, code prefix: (11110)2
        byteEncoder_->writeUInt8(((ch >> 12) & 0x3F) | 0x80); // 0x8 = (1000)2, code prefix: (10)2
        byteEncoder_->writeUInt8(((ch >> 6) & 0x3F) | 0x80);  // 0x8 = (1000)2, code prefix: (10)2
        byteEncoder_->writeUInt8((ch & 0x3F) | 0x80);         // 0x8 = (1000)2, code prefix: (10)2
    }
    else {
        DRY_THROW(EncodingException, "UTF-8 disallows encoding of code points above 0x10FFFF");
    }
}
76
/** Number of bytes needed to encode the code point \a ch in UTF-8.
  *
  * Uses the same range boundaries as write(): 1 byte below 0x80, 2 bytes below
  * 0x800, 3 bytes below 0x10000 and 4 bytes up to and including 0x10FFFF.
  * Unlike write(), no check for surrogates or non-characters is made here.
  *
  * \return the byte count, or 0 when \a ch lies above 0x10FFFF
  *
  * NOTE(review): the declaration of the result variable was not visible in the
  * reviewed text; an initial value of 0 is assumed -- confirm.
  */
inline int Utf8Encoder::encodedSize(uchar_t ch)
{
    int n = 0;
    if (ch < 0x80)          n = 1; // ASCII range: 0xxxxxxx
    else if (ch < 0x800)    n = 2; // two-byte codes: 110yyyxx | 10xxxxxx
    else if (ch < 0x10000)  n = 3; // three-byte codes: 1110yyyy | 10yyyyxx | 10xxxxxx
    else if (ch < 0x110000) n = 4; // four-byte codes: 11110zzz | 10zzyyyy | 10yyyyxx | 10xxxxxx
    // The bound above is 0x110000 (one past U+10FFFF). The previous value
    // 0x11000 excluded U+11000..U+10FFFF, which write() encodes as four bytes.
    return n;
}
88
#endif // DRY_UTF8ENCODER_HPP