~ubuntu-branches/ubuntu/precise/haskell-text/precise

« back to all changes in this revision

Viewing changes to Data/Text/Lazy/Encoding.hs

Committer: Bazaar Package Importer
Author(s): Joachim Breitner
Date: 2011-04-13 11:38:29 UTC
mfrom: (4.1.3 sid)
Revision ID: james.westby@ubuntu.com-20110413113829-f4ss61ivg720e5bu

Tags: 0.11.0.6-1

New upstream release

files added:
Data/Text/Lazy/Builder.hs

Data/Text/Lazy/Read.hs

Data/Text/Read.hs

Data/Text/Util.hs

README.markdown

tests/StdioCoverage.hs

tests/TestUtils.hs

tests/benchmarks

tests/benchmarks/Cut.hs

tests/benchmarks/DecodeUtf8.hs

tests/benchmarks/EncodeUtf8.hs

tests/benchmarks/Equality.hs

tests/benchmarks/FileIndices.hs

tests/benchmarks/FileRead.hs

tests/benchmarks/FoldLines.hs

tests/benchmarks/HtmlCombinator.hs

tests/benchmarks/Makefile

tests/benchmarks/Replace.hs

tests/benchmarks/ReplaceTags.hs

tests/benchmarks/fileread.py

tests/benchmarks/fileread_c.c

tests/cover-stdio.sh

files removed:
README

TODO

debian/libghc6-text-doc.doc-base

scripts/CaseFolding.txt

scripts/SpecialCasing.txt

files modified:
Data/Text.hs

Data/Text/Array.hs

Data/Text/Encoding.hs

Data/Text/Encoding/Error.hs

Data/Text/Encoding/Fusion.hs

Data/Text/Encoding/Fusion/Common.hs

Data/Text/Encoding/Utf16.hs

Data/Text/Encoding/Utf32.hs

Data/Text/Encoding/Utf8.hs

Data/Text/Foreign.hs

Data/Text/Fusion.hs

Data/Text/Fusion/Common.hs

Data/Text/Fusion/Internal.hs

Data/Text/Fusion/Size.hs

Data/Text/IO.hs

Data/Text/IO/Internal.hs

Data/Text/Internal.hs

Data/Text/Lazy.hs

Data/Text/Lazy/Encoding.hs

Data/Text/Lazy/Encoding/Fusion.hs

Data/Text/Lazy/Fusion.hs

Data/Text/Lazy/IO.hs

Data/Text/Lazy/Internal.hs

Data/Text/Lazy/Search.hs

Data/Text/Search.hs

Data/Text/Unsafe.hs

Data/Text/UnsafeChar.hs

Data/Text/UnsafeShift.hs

debian/changelog

debian/control

debian/copyright

scripts/ApiCompare.hs

tests/Benchmarks.hs

tests/Makefile

tests/Properties.hs

tests/QuickCheckUtils.hs

tests/SlowFunctions.hs

text.cabal

Show diffs side-by-side

added added

removed removed

Data/Text/Lazy/Encoding.hs

{-# LANGUAGE BangPatterns #-}

-- |
-- Module      : Data.Text.Lazy.Encoding
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com, rtomharper@googlemail.com,
--               duncan@haskell.org
-- Stability   : experimental
-- Portability : portable
--
-- Functions for converting lazy 'Text' values to and from lazy
-- 'ByteString', using several standard encodings.
--
-- To gain access to a much larger variety of encodings, use the
-- @text-icu@ package: <http://hackage.haskell.org/package/text-icu>

module Data.Text.Lazy.Encoding

(

-- * Decoding ByteStrings to Text

-- decodeASCII

decodeUtf8

-- $strict

decodeASCII

, decodeUtf8

, decodeUtf16LE

, decodeUtf16BE

, decodeUtf32LE

, decodeUtf32BE

-- ** Catchable failure

, decodeUtf8'

-- ** Controllable error handling

, decodeUtf8With

--, decodeUtf16LE

--, decodeUtf16BE

--, decodeUtf32LE

--, decodeUtf32BE

, decodeUtf16LEWith

, decodeUtf16BEWith

, decodeUtf32LEWith

, decodeUtf32BEWith

-- * Encoding Text to ByteStrings

, encodeUtf8

--, encodeUtf16LE

--, encodeUtf16BE

--, encodeUtf32LE

--, encodeUtf32BE

, encodeUtf16LE

, encodeUtf16BE

, encodeUtf32LE

, encodeUtf32BE

) where

import Data.ByteString.Lazy (ByteString)

import Data.Text.Encoding.Error (OnDecodeError, strictDecode)

import Data.Text.Lazy (Text)

import Control.Exception (evaluate, try)

import Data.Bits ((.&.))

import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode)

import Data.Text.Lazy.Internal (Text(..), chunk, empty, foldrChunks)

import System.IO.Unsafe (unsafePerformIO)

import qualified Data.ByteString as S

import qualified Data.ByteString.Lazy as B

import qualified Data.ByteString.Lazy.Internal as B

import qualified Data.ByteString.Unsafe as S

import qualified Data.Text as T

import qualified Data.Text.Encoding as TE

import qualified Data.Text.Lazy.Encoding.Fusion as E

import qualified Data.Text.Lazy.Fusion as F

import qualified Data.Text.Lazy.Encoding.Fusion as E

decodeUtf8With :: OnDecodeError -> ByteString -> Text

decodeUtf8With onErr bs = F.unstream (E.streamUtf8 onErr bs)

{-# INLINE decodeUtf8With #-}

decodeUtf8 :: ByteString -> Text

-- $strict
--
-- All of the single-parameter functions for decoding bytestrings
-- encoded in one of the Unicode Transformation Formats (UTF) operate
-- in a /strict/ mode: each will throw an exception if given invalid
-- input.
--
-- Each function has a variant, whose name is suffixed with -'With',
-- that gives greater control over the handling of decoding errors.
-- For instance, 'decodeUtf8' will throw an exception, but
-- 'decodeUtf8With' allows the programmer to determine what to do on a
-- decoding error.

-- | Decode a lazy 'ByteString' containing 7-bit ASCII encoded text.
--
-- The input is split into its strict chunks, each chunk is decoded on
-- its own, and the decoded pieces are joined back into a lazy 'Text'.
decodeASCII :: B.ByteString -> Text
decodeASCII bs = foldr prepend empty (B.toChunks bs)
  where
    -- Decode one strict chunk and put it in front of the remaining output.
    prepend strictBytes rest = chunk (TE.decodeASCII strictBytes) rest
{-# INLINE decodeASCII #-}

-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- The input is decoded one strict chunk at a time with the strict
-- decoder @TE.decodeUtf8With onErr@.  When the end of a chunk does not
-- look like the end of a complete UTF-8 sequence, the chunk is split:
-- the leading part is decoded immediately and the trailing bytes are
-- carried over and extended, byte by byte, with data from the
-- following chunks until they look complete.
--
-- The @onErr@ handler is passed to the strict decoder, and is also
-- called directly for carried-over bytes that are still pending when
-- the input ends.
decodeUtf8With :: OnDecodeError -> B.ByteString -> Text
decodeUtf8With onErr bs0 = fast bs0
  where
    -- Strict decoder applied to every piece of input handed on.
    decode = TE.decodeUtf8With onErr

    -- Fast path: decode a whole chunk when its tail looks complete,
    -- otherwise decode the head and carry the tail into 'slow'.
    fast (B.Chunk p ps)
      | isComplete p = chunk (decode p) (fast ps)
      | otherwise    = chunk (decode h) (slow t ps)
      where
        (h,t) = S.splitAt pivot p
        -- Split just before a byte with both top bits set (mask 0xc0)
        -- if one is found in the last two positions; otherwise hold
        -- back the final three bytes.
        pivot | at 1      = len-1
              | at 2      = len-2
              | otherwise = len-3
        len  = S.length p
        -- The length check guards the unchecked index.
        at n = len >= n && S.unsafeIndex p (len-n) .&. 0xc0 == 0xc0
    fast B.Empty = empty

    -- Slow path: @i@ holds carried-over bytes.  Append one byte at a
    -- time from the remaining input until @i@ looks complete, then
    -- decode it and return to the fast path.
    slow i bs = {-# SCC "decodeUtf8With'/slow" #-}
      case B.uncons bs of
        Just (w,bs')
          | isComplete i' -> chunk (decode i') (fast bs')
          | otherwise     -> slow i' bs'
          where i' = S.snoc i w
        -- Input exhausted: report each pending byte to the handler,
        -- then report the end of input itself with 'Nothing'.
        Nothing -> case S.uncons i of
          Just (j,i') ->
            case onErr desc (Just j) of
              Nothing -> slow i' bs
              Just c  -> Chunk (T.singleton c) (slow i' bs)
          Nothing ->
            case onErr desc Nothing of
              Nothing -> empty
              Just c  -> Chunk (T.singleton c) empty

    -- Heuristic on the trailing bytes only: the last byte has its high
    -- bit clear, or the byte 2, 3 or 4 positions from the end matches
    -- the bit pattern 110xxxxx, 1110xxxx or 11110xxx respectively.
    -- NOTE(review): @ix 1@ is read without a length check, so this
    -- presumably relies on never being called with an empty string;
    -- confirm for chunks coming from 'fast'.
    isComplete bs = {-# SCC "decodeUtf8With'/isComplete" #-}
                    ix 1 .&. 0x80 == 0 ||
                    (len >= 2 && ix 2 .&. 0xe0 == 0xc0) ||
                    (len >= 3 && ix 3 .&. 0xf0 == 0xe0) ||
                    (len >= 4 && ix 4 .&. 0xf8 == 0xf0)
      where len  = S.length bs
            ix n = S.unsafeIndex bs (len-n)

    -- Description passed to the error handler.
    desc = "Data.Text.Lazy.Encoding.decodeUtf8With: Invalid UTF-8 stream"
{-# INLINE[0] decodeUtf8With #-}

-- | Decode a 'ByteString' containing UTF-8 encoded text that is known
-- to be valid.
--
-- If the input contains any invalid UTF-8 data, an exception will be
-- thrown that cannot be caught in pure code.  For more control over
-- the handling of invalid data, use 'decodeUtf8'' or
-- 'decodeUtf8With'.
decodeUtf8 :: B.ByteString -> Text
decodeUtf8 = decodeUtf8With strictDecode

{-# INLINE decodeUtf8 #-}

encodeUtf8 :: Text -> ByteString

encodeUtf8 txt = E.unstream (E.restreamUtf8 (F.stream txt))

{-# INLINE encodeUtf8 #-}

126

{-# INLINE[0] decodeUtf8 #-}

127

128

-- This rule seems to cause performance loss.

129

{- RULES "LAZY STREAM stream/decodeUtf8' fusion" [1]

130

forall bs. F.stream (decodeUtf8' bs) = E.streamUtf8 strictDecode bs #-}

-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
-- If the input contains any invalid UTF-8 data, the relevant
-- exception will be returned, otherwise the decoded text.
--
-- /Note/: this function is /not/ lazy, as it must decode its entire
-- input before it can return a result.  If you need lazy (streaming)
-- decoding, use 'decodeUtf8With' in lenient mode.
decodeUtf8' :: B.ByteString -> Either UnicodeException Text
-- 'unsafePerformIO' is used only to run 'try' and 'evaluate' around the
-- pure call to 'decodeUtf8', so that a decoding exception is handed
-- back as a 'Left' value instead of escaping.
decodeUtf8' bs = unsafePerformIO $ do
                   let t = decodeUtf8 bs
                   try (evaluate (rnf t `seq` t))
  where
    -- Walk the chain of chunks to its end so that the whole result is
    -- produced inside 'try' rather than lazily after it has returned.
    rnf Empty        = ()
    rnf (Chunk _ ts) = rnf ts
{-# INLINE decodeUtf8' #-}

-- | Encode text using UTF-8 encoding.
--
-- Each strict chunk of the input is encoded on its own and becomes one
-- chunk of the resulting lazy 'ByteString'.
encodeUtf8 :: Text -> B.ByteString
encodeUtf8 (Chunk c cs) = B.Chunk (TE.encodeUtf8 c) (encodeUtf8 cs)
encodeUtf8 Empty        = B.Empty

-- | Decode text from little endian UTF-16 encoding.
--
-- The @onErr@ handler is passed to the underlying stream decoder
-- (@E.streamUtf16LE@).
decodeUtf16LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16LEWith onErr bs = F.unstream (E.streamUtf16LE onErr bs)
{-# INLINE decodeUtf16LEWith #-}

-- | Decode text from little endian UTF-16 encoding.
--
-- If the input contains any invalid little endian UTF-16 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16LEWith'.
decodeUtf16LE :: B.ByteString -> Text
decodeUtf16LE = decodeUtf16LEWith strictDecode
{-# INLINE decodeUtf16LE #-}

-- | Decode text from big endian UTF-16 encoding.
--
-- The @onErr@ handler is passed to the underlying stream decoder
-- (@E.streamUtf16BE@).
decodeUtf16BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf16BEWith onErr bs = F.unstream (E.streamUtf16BE onErr bs)
{-# INLINE decodeUtf16BEWith #-}

-- | Decode text from big endian UTF-16 encoding.
--
-- If the input contains any invalid big endian UTF-16 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf16BEWith'.
decodeUtf16BE :: B.ByteString -> Text
decodeUtf16BE = decodeUtf16BEWith strictDecode
{-# INLINE decodeUtf16BE #-}

-- | Encode text using little endian UTF-16 encoding.
--
-- Each strict chunk of the input is encoded on its own; the encoded
-- pieces are then assembled into a lazy 'ByteString'.
encodeUtf16LE :: Text -> B.ByteString
encodeUtf16LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16LE) [] txt)
{-# INLINE encodeUtf16LE #-}

-- | Encode text using big endian UTF-16 encoding.
--
-- Each strict chunk of the input is encoded on its own; the encoded
-- pieces are then assembled into a lazy 'ByteString'.
encodeUtf16BE :: Text -> B.ByteString
encodeUtf16BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf16BE) [] txt)
{-# INLINE encodeUtf16BE #-}

-- | Decode text from little endian UTF-32 encoding.
--
-- The @onErr@ handler is passed to the underlying stream decoder
-- (@E.streamUtf32LE@).
decodeUtf32LEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32LEWith onErr bs = F.unstream (E.streamUtf32LE onErr bs)
{-# INLINE decodeUtf32LEWith #-}

-- | Decode text from little endian UTF-32 encoding.
--
-- If the input contains any invalid little endian UTF-32 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32LEWith'.
decodeUtf32LE :: B.ByteString -> Text
decodeUtf32LE = decodeUtf32LEWith strictDecode
{-# INLINE decodeUtf32LE #-}

-- | Decode text from big endian UTF-32 encoding.
--
-- The @onErr@ handler is passed to the underlying stream decoder
-- (@E.streamUtf32BE@).
decodeUtf32BEWith :: OnDecodeError -> B.ByteString -> Text
decodeUtf32BEWith onErr bs = F.unstream (E.streamUtf32BE onErr bs)
{-# INLINE decodeUtf32BEWith #-}

-- | Decode text from big endian UTF-32 encoding.
--
-- If the input contains any invalid big endian UTF-32 data, an
-- exception will be thrown.  For more control over the handling of
-- invalid data, use 'decodeUtf32BEWith'.
decodeUtf32BE :: B.ByteString -> Text
decodeUtf32BE = decodeUtf32BEWith strictDecode
{-# INLINE decodeUtf32BE #-}

-- | Encode text using little endian UTF-32 encoding.
--
-- Each strict chunk of the input is encoded on its own; the encoded
-- pieces are then assembled into a lazy 'ByteString'.
encodeUtf32LE :: Text -> B.ByteString
encodeUtf32LE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32LE) [] txt)
{-# INLINE encodeUtf32LE #-}

-- | Encode text using big endian UTF-32 encoding.
--
-- Each strict chunk of the input is encoded on its own; the encoded
-- pieces are then assembled into a lazy 'ByteString'.
encodeUtf32BE :: Text -> B.ByteString
encodeUtf32BE txt = B.fromChunks (foldrChunks ((:) . TE.encodeUtf32BE) [] txt)
{-# INLINE encodeUtf32BE #-}

Older »