~frankencode/drycore/trunk

« back to all changes in this revision

Viewing changes to dry/Utf8Encoder.hpp

  • Committer: Frank Mertens
  • Date: 2013-02-27 18:43:50 UTC
  • Revision ID: frank@cyblogic.de-20130227184350-ypu14rj5e2r8gwqz
Initial commit.

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
 /*
 
2
  * Copyright (C) 2007-2013 Frank Mertens.
 
3
  *
 
4
  * This program is free software; you can redistribute it and/or
 
5
  * modify it under the terms of the GNU General Public License
 
6
  * as published by the Free Software Foundation; either version
 
7
  * 2 of the License, or (at your option) any later version.
 
8
  */
 
9
#ifndef DRY_UTF8ENCODER_HPP
 
10
#define DRY_UTF8ENCODER_HPP
 
11
 
 
12
#include "ByteEncoder.hpp"
 
13
 
 
14
namespace dry
 
15
{
 
16
 
 
17
class Utf8Encoder: public Sink<uchar_t>
 
18
{
 
19
public:
 
20
        inline static Ref<Utf8Encoder> open(Stream *stream, int bufSize = 0x4000) {
 
21
                return new Utf8Encoder(stream, bufSize);
 
22
        }
 
23
 
 
24
        inline static Ref<Utf8Encoder> open(void *buf, int bufSize) {
 
25
                return new Utf8Encoder(buf, bufSize);
 
26
        }
 
27
 
 
28
        void write(uchar_t ch);
 
29
        static int encodedSize(uchar_t ch);
 
30
 
 
31
        inline ByteEncoder *byteEncoder() const { return byteEncoder_; }
 
32
 
 
33
private:
 
34
        Utf8Encoder(Stream *stream, int bufSize)
 
35
                : byteEncoder_(ByteEncoder::open(stream, bufSize))
 
36
        {}
 
37
 
 
38
        Utf8Encoder(void *buf, int bufSize)
 
39
                : byteEncoder_(ByteEncoder::open(buf, bufSize))
 
40
        {}
 
41
 
 
42
        Ref<ByteEncoder> byteEncoder_;
 
43
};
 
44
 
 
45
inline void Utf8Encoder::write(uchar_t ch)
 
46
{
 
47
        if (ch < 0x80) { // ASCII range: 0xxxxxxx
 
48
                byteEncoder_->writeUInt8(ch);
 
49
        }
 
50
        else if (ch < 0x800) { // two-byte codes: 110yyyxx | 10xxxxx
 
51
                byteEncoder_->writeUInt8((ch >> 6) | 0xC0);   // 0xC = (1100)2, code prefix: (110)2
 
52
                byteEncoder_->writeUInt8((ch & 0x3F) | 0x80); // 0x8 = (1000)2, code prefix: (10)2
 
53
        }
 
54
        else if (ch < 0x10000) { // three-byte codes: 1110yyyy | 10yyyyxx | 10xxxxxx
 
55
                if ((0xD800 <= ch) && (ch <= 0xDFFF))
 
56
                        DRY_THROW(EncodingException, "UTF-8 disallows encoding of UTF-16 surrogate pairs 0xD800..0xDFFF");
 
57
                else if ((0xFDD0 <= ch) && (ch <= 0xFDEF))
 
58
                        DRY_THROW(EncodingException, "UTF-8 disallows encoding of non-characters 0xFDD0..0xFDEF");
 
59
                byteEncoder_->writeUInt8((ch >> 12) | 0xE0);         // 0xE = (1110)2, code prefix: (1110)2
 
60
                byteEncoder_->writeUInt8(((ch >> 6) & 0x3F) | 0x80); // 0x8 = (1000)2, code prefix: (10)2
 
61
                byteEncoder_->writeUInt8((ch & 0x3F) | 0x80);        // 0x8 = (1000)2, code prefix: (10)2
 
62
        }
 
63
        else if (ch < 0x110000) { // four-byte codes: 11110zzz | 10zzyyyy | 10yyyyxx | 10xxxxxx
 
64
                if ((ch & 0xFFFE) == 0xFFFE)
 
65
                        DRY_THROW(EncodingException, "UTF-8 disallows encoding of non-characters 0x??FFFE,0x??FFFF");
 
66
                byteEncoder_->writeUInt8((ch >> 18) | 0xF0);           // 0xF = (1111)2, code prefix: (11110)2
 
67
                byteEncoder_->writeUInt8(((ch >> 12) & 0x3F) | 0x80); // 0x8 = (1000)2, code prefix: (10)2
 
68
                byteEncoder_->writeUInt8(((ch >> 6) & 0x3F) | 0x80);  // 0x8 = (1000)2, code prefix: (10)2
 
69
                byteEncoder_->writeUInt8((ch & 0x3F) | 0x80);         // 0x8 = (1000)2, code prefix: (10)2
 
70
        }
 
71
        else {
 
72
                DRY_THROW(EncodingException, "UTF-8 disallows encoding of code points above 0x10FFFF");
 
73
        }
 
74
}
 
75
 
 
76
inline int Utf8Encoder::encodedSize(uchar_t ch)
 
77
{
 
78
        int n = 0;
 
79
        if (ch < 0x80)         n = 1; // ASCII range: 0xxxxxxx
 
80
        else if (ch < 0x800)   n = 2; // two-byte codes: 110yyyxx | 10xxxxx
 
81
        else if (ch < 0x10000) n = 3; // three-byte codes: 1110yyyy | 10yyyyxx | 10xxxxxx
 
82
        else if (ch < 0x11000) n = 4; // four-byte codes: 11110zzz | 10zzyyyy | 10yyyyxx | 10xxxxxx
 
83
        return n;
 
84
}
 
85
 
 
86
} // namespace dry
 
87
 
 
88
#endif // DRY_UTF8ENCODER_HPP