1
{-# LANGUAGE BangPatterns, CPP, OverloadedStrings #-}
2
{-# OPTIONS_GHC -fno-warn-unused-binds #-}
11
import Data.ByteString.Builder
12
(Builder, byteString, toLazyByteString, charUtf8, word8)
14
#if !MIN_VERSION_base(4,8,0)
15
import Control.Applicative ((*>), (<$>), (<*), pure)
16
import Data.Monoid (mappend, mempty)
19
import Control.Applicative (liftA2)
20
import Control.DeepSeq (NFData(..))
21
import Control.Monad (forM)
22
import Data.Attoparsec.ByteString.Char8 (Parser, char, endOfInput, scientific,
24
import Data.Bits ((.|.), shiftL)
25
import Data.ByteString (ByteString)
26
import Data.Char (chr)
27
import Data.List (sort)
28
import Data.Scientific (Scientific)
29
import Data.Text (Text)
30
import Data.Text.Encoding (decodeUtf8')
31
import Data.Vector as Vector (Vector, foldl', fromList)
32
import Data.Word (Word8)
33
import System.Directory (getDirectoryContents, doesDirectoryExist)
34
import System.FilePath ((</>), dropExtension)
35
import qualified Data.Attoparsec.ByteString as A
36
import qualified Data.Attoparsec.Lazy as L
37
import qualified Data.Attoparsec.Zepto as Z
38
import qualified Data.ByteString as B
39
import qualified Data.ByteString.Lazy as L
40
import qualified Data.ByteString.Unsafe as B
41
import qualified Data.HashMap.Strict as H
43
import Common (pathTo)
46
#define CLOSE_CURLY 125
47
#define CLOSE_SQUARE 93
49
#define DOUBLE_QUOTE 34
50
#define OPEN_CURLY 123
51
#define OPEN_SQUARE 91
61
data Result a = Error String
66
-- | A JSON \"object\" (key\/value map).
67
type Object = H.HashMap Text Value
69
-- | A JSON \"array\" (sequence).
70
type Array = Vector Value
72
-- | A JSON value represented as a Haskell value.
73
data Value = Object !Object
81
instance NFData Value where
82
rnf (Object o) = rnf o
83
rnf (Array a) = Vector.foldl' (\x y -> rnf y `seq` x) () a
84
rnf (String s) = rnf s
85
rnf (Number n) = rnf n
89
-- | Parse a top-level JSON value. This must be either an object or
90
-- an array, per RFC 4627.
92
-- The conversion of a parsed value to a Haskell value is deferred
93
-- until the Haskell value is needed. This may improve performance if
94
-- only a subset of the results of conversions are needed, but at a
95
-- cost in thunk allocation.
97
json = json_ object_ array_
99
-- | Parse a top-level JSON value. This must be either an object or
100
-- an array, per RFC 4627.
102
-- This is a strict version of 'json' which avoids building up thunks
103
-- during parsing; it performs all conversions immediately. Prefer
104
-- this version if most of the JSON data needs to be accessed.
105
json' :: Parser Value
106
json' = json_ object_' array_'
108
json_ :: Parser Value -> Parser Value -> Parser Value
110
w <- skipSpace *> A.satisfy (\w -> w == OPEN_CURLY || w == OPEN_SQUARE)
116
object_ :: Parser Value
117
object_ = {-# SCC "object_" #-} Object <$> objectValues jstring value
119
object_' :: Parser Value
120
object_' = {-# SCC "object_'" #-} do
121
!vals <- objectValues jstring' value'
128
objectValues :: Parser Text -> Parser Value -> Parser (H.HashMap Text Value)
129
objectValues str val = do
131
let pair = liftA2 (,) (str <* skipSpace) (char ':' *> skipSpace *> val)
132
H.fromList <$> commaSeparated pair CLOSE_CURLY
133
{-# INLINE objectValues #-}
135
array_ :: Parser Value
136
array_ = {-# SCC "array_" #-} Array <$> arrayValues value
138
array_' :: Parser Value
139
array_' = {-# SCC "array_'" #-} do
140
!vals <- arrayValues value'
143
commaSeparated :: Parser a -> Word8 -> Parser [a]
144
commaSeparated item endByte = do
147
then A.anyWord8 >> return []
151
v <- item <* skipSpace
152
ch <- A.satisfy $ \w -> w == COMMA || w == endByte
154
then skipSpace >> (v:) <$> loop
156
{-# INLINE commaSeparated #-}
158
arrayValues :: Parser Value -> Parser (Vector Value)
161
Vector.fromList <$> commaSeparated val CLOSE_SQUARE
162
{-# INLINE arrayValues #-}
164
-- | Parse any JSON value. You should usually 'json' in preference to
165
-- this function, as this function relaxes the object-or-array
166
-- requirement of RFC 4627.
168
-- In particular, be careful in using this function if you think your
169
-- code might interoperate with Javascript. A naïve Javascript
170
-- library that parses JSON data using @eval@ is vulnerable to attack
171
-- unless the encoded data represents an object or an array. JSON
172
-- implementations in other languages conform to that same restriction
173
-- to preserve interoperability and security.
174
value :: Parser Value
178
DOUBLE_QUOTE -> A.anyWord8 *> (String <$> jstring_)
179
OPEN_CURLY -> A.anyWord8 *> object_
180
OPEN_SQUARE -> A.anyWord8 *> array_
181
C_f -> string "false" *> pure (Bool False)
182
C_t -> string "true" *> pure (Bool True)
183
C_n -> string "null" *> pure Null
184
_ | w >= 48 && w <= 57 || w == 45
185
-> Number <$> scientific
186
| otherwise -> fail "not a valid json value"
188
-- | Strict version of 'value'. See also 'json''.
189
value' :: Parser Value
194
!s <- A.anyWord8 *> jstring_
196
OPEN_CURLY -> A.anyWord8 *> object_'
197
OPEN_SQUARE -> A.anyWord8 *> array_'
198
C_f -> string "false" *> pure (Bool False)
199
C_t -> string "true" *> pure (Bool True)
200
C_n -> string "null" *> pure Null
201
_ | w >= 48 && w <= 57 || w == 45
205
| otherwise -> fail "not a valid json value"
207
-- | Parse a quoted JSON string.
208
jstring :: Parser Text
209
jstring = A.word8 DOUBLE_QUOTE *> jstring_
211
-- | Parse a string without a leading quote.
212
jstring_ :: Parser Text
213
jstring_ = {-# SCC "jstring_" #-} do
214
s <- A.scan False $ \s c -> if s then Just False
215
else if c == DOUBLE_QUOTE
217
else Just (c == BACKSLASH)
218
_ <- A.word8 DOUBLE_QUOTE
219
s1 <- if BACKSLASH `B.elem` s
220
then case Z.parse unescape s of
225
case decodeUtf8' s1 of
227
Left err -> fail $ show err
229
{-# INLINE jstring_ #-}
231
unescape :: Z.Parser ByteString
232
unescape = toByteString <$> go mempty where
234
h <- Z.takeWhile (/=BACKSLASH)
237
let !slash = B.unsafeHead start
238
!t = B.unsafeIndex start 1
239
escape = case B.findIndex (==t) "\"\\/ntbrfu" of
242
if slash /= BACKSLASH || escape == 255
243
then fail "invalid JSON escape sequence"
245
let cont m = go (acc `mappend` byteString h `mappend` m)
248
then cont (word8 (B.unsafeIndex mapping escape))
251
if a < 0xd800 || a > 0xdfff
252
then cont (charUtf8 (chr a))
254
b <- Z.string "\\u" *> hexQuad
255
if a <= 0xdbff && b >= 0xdc00 && b <= 0xdfff
256
then let !c = ((a - 0xd800) `shiftL` 10) +
257
(b - 0xdc00) + 0x10000
258
in cont (charUtf8 (chr c))
259
else fail "invalid UTF-16 surrogates"
262
then return (acc `mappend` byteString h)
264
mapping = "\"\\/\n\t\b\r\f"
266
hexQuad :: Z.Parser Int
269
let hex n | w >= C_0 && w <= C_9 = w - C_0
270
| w >= C_a && w <= C_f = w - 87
271
| w >= C_A && w <= C_F = w - 55
273
where w = fromIntegral $ B.unsafeIndex s n
274
a = hex 0; b = hex 1; c = hex 2; d = hex 3
275
if (a .|. b .|. c .|. d) /= 255
276
then return $! d .|. (c `shiftL` 4) .|. (b `shiftL` 8) .|. (a `shiftL` 12)
277
else fail "invalid hex escape"
279
decodeWith :: Parser Value -> (Value -> Result a) -> L.ByteString -> Maybe a
282
L.Done _ v -> case to v of
286
{-# INLINE decodeWith #-}
288
decodeStrictWith :: Parser Value -> (Value -> Result a) -> B.ByteString
290
decodeStrictWith p to s =
291
case either Error to (A.parseOnly p s) of
294
{-# INLINE decodeStrictWith #-}
296
eitherDecodeWith :: Parser Value -> (Value -> Result a) -> L.ByteString
298
eitherDecodeWith p to s =
300
L.Done _ v -> case to v of
302
Error msg -> Left msg
303
L.Fail _ _ msg -> Left msg
304
{-# INLINE eitherDecodeWith #-}
306
eitherDecodeStrictWith :: Parser Value -> (Value -> Result a) -> B.ByteString
308
eitherDecodeStrictWith p to s =
309
case either Error to (A.parseOnly p s) of
311
Error msg -> Left msg
312
{-# INLINE eitherDecodeStrictWith #-}
316
-- The 'json' and 'value' parsers decouple identification from
317
-- conversion. Identification occurs immediately (so that an invalid
318
-- JSON document can be rejected as early as possible), but conversion
319
-- to a Haskell value is deferred until that value is needed.
321
-- This decoupling can be time-efficient if only a smallish subset of
322
-- elements in a JSON value need to be inspected, since the cost of
323
-- conversion is zero for uninspected elements. The trade off is an
324
-- increase in memory usage, due to allocation of thunks for values
325
-- that have not yet been converted.
329
-- The 'json'' and 'value'' parsers combine identification with
330
-- conversion. They consume more CPU cycles up front, but have a
331
-- smaller memory footprint.
333
-- | Parse a top-level JSON value followed by optional whitespace and
334
-- end-of-input. See also: 'json'.
335
jsonEOF :: Parser Value
336
jsonEOF = json <* skipSpace <* endOfInput
338
-- | Parse a top-level JSON value followed by optional whitespace and
339
-- end-of-input. See also: 'json''.
340
jsonEOF' :: Parser Value
341
jsonEOF' = json' <* skipSpace <* endOfInput
343
toByteString :: Builder -> ByteString
344
toByteString = L.toStrict . toLazyByteString
345
{-# INLINE toByteString #-}
347
aeson :: IO [Benchmark]
349
path <- pathTo "json-data"
350
names <- sort . filter (`notElem` [".", ".."]) <$> getDirectoryContents path
351
forM names $ \name -> do
352
bs <- B.readFile (path </> name)
353
return . bench ("attoparsec/" ++ dropExtension name) $ nf (A.parseOnly jsonEOF') bs
355
aesonLazy :: IO [Benchmark]
357
path <- pathTo "json-data"
358
names <- sort . filter (`notElem` [".", ".."]) <$> getDirectoryContents path
359
forM names $ \name -> do
360
bs <- L.readFile (path </> name)
361
return . bench ("attoparsec/lazy-bytestring/" ++ dropExtension name) $ nf (L.parse jsonEOF') bs