11
12
-- Functions for converting lazy 'Text' values to and from lazy
12
13
-- 'ByteString', using several standard encodings.
15
-- To gain access to a much larger variety of encodings, use the
16
-- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>
15
18
module Data.Text.Lazy.Encoding
17
20
-- * Decoding ByteStrings to Text
29
-- ** Catchable failure
32
-- ** Controllable error handling
26
39
-- * Encoding Text to ByteStrings
34
import Data.ByteString.Lazy (ByteString)
35
import Data.Text.Encoding.Error (OnDecodeError, strictDecode)
36
import Data.Text.Lazy (Text)
47
import Control.Exception (evaluate, try)
48
import Data.Bits ((.&.))
49
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)
50
import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldrChunks)
51
import System.IO.Unsafe (unsafePerformIO)
52
import qualified Data.ByteString as S
53
import qualified Data.ByteString.Lazy as B
54
import qualified Data.ByteString.Lazy.Internal as B
55
import qualified Data.ByteString.Unsafe as S
56
import qualified Data.Text as T
57
import qualified Data.Text.Encoding as TE
58
import qualified Data.Text.Lazy.Encoding.Fusion as E
37
59
import qualified Data.Text.Lazy.Fusion as F
38
import qualified Data.Text.Lazy.Encoding.Fusion as E
40
decodeUtf8With :: OnDecodeError -> ByteString -> Text
41
decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)
42
{-# INLINE decodeUtf8With #-}
44
decodeUtf8 :: ByteString -> Text
63
-- All of the single-parameter functions for decoding bytestrings
64
-- encoded in one of the Unicode Transformation Formats (UTF) operate
65
-- in a /strict/ mode: each will throw an exception if given invalid
68
-- Each function has a variant, whose name is suffixed with -'With',
69
-- that gives greater control over the handling of decoding errors.
70
-- For instance, 'decodeUtf8' will throw an exception, but
71
-- 'decodeUtf8With' allows the programmer to determine what to do on a
74
-- | Decode a 'ByteString' containing 7-bit ASCII encoded text.
75
decodeASCII :: B.ByteString -> Text
76
decodeASCII bs = foldr (chunk . TE.decodeASCII) empty (B.toChunks bs)
77
{-# INLINE decodeASCII #-}
79
-- | Decode a 'ByteString' containing UTF-8 encoded text.
80
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
81
decodeUtf8With onErr bs0 = fast bs0
83
decode = TE.decodeUtf8With onErr
84
fast (B.Chunk p ps) | isComplete p = chunk (decode p) (fast ps)
85
| otherwise = chunk (decode h) (slow t ps)
86
where (h,t) = S.splitAt pivot p
91
at n = len >= n && S.unsafeIndex p (len-n) .&. 0xc0 == 0xc0
93
slow i bs = {-# SCC "decodeUtf8With'/slow" #-}
95
Just (w,bs') | isComplete i' -> chunk (decode i') (fast bs')
96
| otherwise -> slow i' bs'
98
Nothing -> case S.uncons i of
100
case onErr desc (Just j) of
101
Nothing -> slow i' bs
102
Just c -> Chunk (T.singleton c) (slow i' bs)
104
case onErr desc Nothing of
106
Just c -> Chunk (T.singleton c) empty
107
isComplete bs = {-# SCC "decodeUtf8With'/isComplete" #-}
108
ix 1 .&. 0x80 == 0 ||
109
(len >= 2 && ix 2 .&. 0xe0 == 0xc0) ||
110
(len >= 3 && ix 3 .&. 0xf0 == 0xe0) ||
111
(len >= 4 && ix 4 .&. 0xf8 == 0xf0)
112
where len = S.length bs
113
ix n = S.unsafeIndex bs (len-n)
114
desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
115
{-# INLINE[0] decodeUtf8With #-}
117
-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
120
-- If the input contains any invalid UTF-8 data, an exception will be
121
-- thrown that cannot be caught in pure code. For more control over
122
-- the handling of invalid data, use 'decodeUtf8'' or
124
decodeUtf8 :: B.ByteString -> Text
45
125
decodeUtf8 = decodeUtf8With strictDecode
46
{-# INLINE decodeUtf8 #-}
48
encodeUtf8 :: Text -> ByteString
49
encodeUtf8 txt = E.unstream (E.restreamUtf8 (F.stream txt))
50
{-# INLINE encodeUtf8 #-}
126
{-# INLINE[0] decodeUtf8 #-}
128
-- This rule seems to cause performance loss.
129
{- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]
130
forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}
132
-- | Decode a 'ByteString' containing UTF-8 encoded text..
134
-- If the input contains any invalid UTF-8 data, the relevant
135
-- exception will be returned, otherwise the decoded text.
137
-- /Note/: this function is /not/ lazy, as it must decode its entire
138
-- input before it can return a result. If you need lazy (streaming)
139
-- decoding, use 'decodeUtf8With' in lenient mode.
140
decodeUtf8' :: B.ByteString -> Either UnicodeException Text
141
decodeUtf8' bs = unsafePerformIO $ do
142
let t = decodeUtf8 bs
143
try (evaluate (rnf t `seq` t))
146
rnf (Chunk _ ts) = rnf ts
147
{-# INLINE decodeUtf8' #-}
149
encodeUtf8 :: Text -> B.ByteString
150
encodeUtf8 (Chunk c cs) = B.Chunk (TE.encodeUtf8 c) (encodeUtf8 cs)
151
encodeUtf8 Empty = B.Empty
153
-- | Decode text from little endian UTF-16 encoding.
154
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
155
decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
156
{-# INLINE decodeUtf16LEWith #-}
158
-- | Decode text from little endian UTF-16 encoding.
160
-- If the input contains any invalid little endian UTF-16 data, an
161
-- exception will be thrown. For more control over the handling of
162
-- invalid data, use 'decodeUtf16LEWith'.
163
decodeUtf16LE :: B.ByteString -> Text
164
decodeUtf16LE = decodeUtf16LEWith strictDecode
165
{-# INLINE decodeUtf16LE #-}
167
-- | Decode text from big endian UTF-16 encoding.
168
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
169
decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
170
{-# INLINE decodeUtf16BEWith #-}
172
-- | Decode text from big endian UTF-16 encoding.
174
-- If the input contains any invalid big endian UTF-16 data, an
175
-- exception will be thrown. For more control over the handling of
176
-- invalid data, use 'decodeUtf16BEWith'.
177
decodeUtf16BE :: B.ByteString -> Text
178
decodeUtf16BE = decodeUtf16BEWith strictDecode
179
{-# INLINE decodeUtf16BE #-}
181
-- | Encode text using little endian UTF-16 encoding.
182
encodeUtf16LE :: Text -> B.ByteString
183
encodeUtf16LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16LE) [] txt)
184
{-# INLINE encodeUtf16LE #-}
186
-- | Encode text using big endian UTF-16 encoding.
187
encodeUtf16BE :: Text -> B.ByteString
188
encodeUtf16BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16BE) [] txt)
189
{-# INLINE encodeUtf16BE #-}
191
-- | Decode text from little endian UTF-32 encoding.
192
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
193
decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
194
{-# INLINE decodeUtf32LEWith #-}
196
-- | Decode text from little endian UTF-32 encoding.
198
-- If the input contains any invalid little endian UTF-32 data, an
199
-- exception will be thrown. For more control over the handling of
200
-- invalid data, use 'decodeUtf32LEWith'.
201
decodeUtf32LE :: B.ByteString -> Text
202
decodeUtf32LE = decodeUtf32LEWith strictDecode
203
{-# INLINE decodeUtf32LE #-}
205
-- | Decode text from big endian UTF-32 encoding.
206
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
207
decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
208
{-# INLINE decodeUtf32BEWith #-}
210
-- | Decode text from big endian UTF-32 encoding.
212
-- If the input contains any invalid big endian UTF-32 data, an
213
-- exception will be thrown. For more control over the handling of
214
-- invalid data, use 'decodeUtf32BEWith'.
215
decodeUtf32BE :: B.ByteString -> Text
216
decodeUtf32BE = decodeUtf32BEWith strictDecode
217
{-# INLINE decodeUtf32BE #-}
219
-- | Encode text using little endian UTF-32 encoding.
220
encodeUtf32LE :: Text -> B.ByteString
221
encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt)
222
{-# INLINE encodeUtf32LE #-}
224
-- | Encode text using big endian UTF-32 encoding.
225
encodeUtf32BE :: Text -> B.ByteString
226
encodeUtf32BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32BE) [] txt)
227
{-# INLINE encodeUtf32BE #-}