2
% (c) The University of Glasgow 2006
3
% (c) The University of Glasgow, 1997-2006
6
Buffers for scanning string input stored in external arrays.
9
{-# OPTIONS_GHC -O -funbox-strict-fields #-}
10
-- We always optimise this, otherwise performance of a non-optimised
11
-- compiler is severely affected
16
-- non-abstract for vs\/HaskellService
18
-- * Creation\/destruction
20
hGetStringBufferBlock,
30
-- * Moving and comparison
43
#include "HsVersions.h"
46
import FastString hiding ( buf )
51
import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose
56
import System.IO ( openBinaryFile )
58
-- -----------------------------------------------------------------------------
59
-- The StringBuffer type
61
-- |A StringBuffer is an internal pointer to a sized chunk of bytes.
62
-- The bytes are intended to be *immutable*. There are pure
63
-- operations to read the contents of a StringBuffer.
65
-- A StringBuffer may have a finalizer, depending on how it was
70
-- Pointer to the underlying byte storage.
buf :: {-# UNPACK #-} !(ForeignPtr Word8),
71
len :: {-# UNPACK #-} !Int, -- length of the contents in bytes (the trailing sentinel bytes are not counted)
72
cur :: {-# UNPACK #-} !Int -- current position, as a byte offset from the start of buf
74
-- The buffer is assumed to be UTF-8 encoded, and furthermore
75
-- we add three '\0' bytes to the end as sentinels so that the
76
-- decoder doesn't have to check for overflow at every single byte
77
-- of a multibyte sequence.
79
-- Textual rendering of a StringBuffer. The visible part of the expression
-- prints the prefix "<stringbuffer(" followed by the buffer's length and
-- current position separated by a comma.
-- NOTE(review): the tail of the showsPrec expression is not visible in this
-- excerpt -- confirm how the rendering is terminated.
instance Show StringBuffer where
80
showsPrec _ s = showString "<stringbuffer("
81
. shows (len s) . showString "," . shows (cur s)
84
-- -----------------------------------------------------------------------------
85
-- Creation / Destruction
87
-- | Load the file @fname@ into a new 'StringBuffer'.
--
-- The file is opened in binary read mode, a buffer of @size+3@ bytes is
-- allocated (the three extra bytes hold the sentinels written by
-- 'newUTF8StringBuffer'), and @size@ bytes are read with 'hGetBuf' unless
-- @size@ is zero. One branch of the final conditional raises the user error
-- "short read of file"; the other hands the filled buffer to
-- 'newUTF8StringBuffer'.
--
-- NOTE(review): the binding of @size_i@ and the condition guarding the
-- then/else branches are not visible in this excerpt -- presumably the file
-- size and a comparison of @r@ against @size@; confirm against the full
-- source.
hGetStringBuffer :: FilePath -> IO StringBuffer
88
hGetStringBuffer fname = do
89
h <- openBinaryFile fname ReadMode
91
let size = fromIntegral size_i
92
-- three extra bytes for the sentinels
buf <- mallocForeignPtrArray (size+3)
93
withForeignPtr buf $ \ptr -> do
94
-- hGetBuf is not called at all for an empty read
r <- if size == 0 then return 0 else hGetBuf h ptr size
97
then ioError (userError "short read of file")
98
else newUTF8StringBuffer buf ptr size
100
-- | Read a block of at most @wanted@ bytes from @handle@ into a new
-- 'StringBuffer'.
--
-- The number of bytes requested is the smaller of @wanted@ and the distance
-- from the handle's current offset ('hTell') to the file size ('hFileSize').
-- The buffer is allocated with three extra bytes for the sentinels written
-- by 'newUTF8StringBuffer'. One branch of the final conditional raises a
-- user error whose message starts with "short read of file: " and includes
-- the values @(r,size,size_i,handle)@; the other hands the filled buffer to
-- 'newUTF8StringBuffer'.
--
-- NOTE(review): the condition guarding the then/else branches is not
-- visible in this excerpt -- presumably a comparison of @r@ against @size@;
-- confirm against the full source.
hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer
101
hGetStringBufferBlock handle wanted
102
= do size_i <- hFileSize handle
103
offset_i <- hTell handle
104
-- never ask for more than what remains between the current offset and the file size
let size = min wanted (fromIntegral $ size_i-offset_i)
105
buf <- mallocForeignPtrArray (size+3)
106
withForeignPtr buf $ \ptr ->
107
do r <- if size == 0 then return 0 else hGetBuf handle ptr size
109
then ioError (userError $ "short read of file: "++show(r,size,size_i,handle))
110
else newUTF8StringBuffer buf ptr size
112
-- | Wrap an already-filled byte array as a 'StringBuffer'.
--
-- @buf@ is the foreign pointer that is stored in the result, @ptr@ is the
-- raw pointer used for writing, and @size@ is the number of content bytes.
-- Three zero bytes are written at offset @size@, so the array must have
-- room for @size+3@ bytes. The result starts at position 0, unless the
-- first decoded character is U+FEFF, in which case the buffer advanced past
-- that character is returned instead.
newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer
113
newUTF8StringBuffer buf ptr size = do
114
pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0]
115
-- sentinels for UTF-8 decoding
117
-- sb0: the whole contents, positioned at the first byte
sb0 = StringBuffer buf size 0
118
-- sb1: the same buffer positioned after the first character
(first_char, sb1) = nextChar sb0
119
-- skip the byte-order mark if there is one (see #1744)
120
-- This is better than treating #FEFF as whitespace,
121
-- because that would mess up layout. We don't have a concept
122
-- of zero-width whitespace in Haskell: all whitespace codepoints
123
-- have a width of one column.
124
return (if first_char == '\xfeff' then sb1 else sb0)
126
-- | Build a new 'StringBuffer' holding the unread part of @sb1@ (from its
-- current position to its end) followed by the unread part of @sb2@.
-- The result is freshly allocated, terminated by three zero sentinel bytes,
-- and positioned at offset 0.
appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer
appendStringBuffers sb1 sb2 = do
    dest <- mallocForeignPtrArray (total + 3)
    withForeignPtr dest $ \dst ->
      withForeignPtr (buf sb1) $ \src1 ->
        withForeignPtr (buf sb2) $ \src2 -> do
          -- unread bytes of the first buffer go to the front ...
          copyArray dst (src1 `advancePtr` cur sb1) firstLen
          -- ... followed directly by the unread bytes of the second
          copyArray (dst `advancePtr` firstLen) (src2 `advancePtr` cur sb2) secondLen
          -- sentinels for UTF-8 decoding
          pokeArray (dst `advancePtr` total) [0,0,0]
          return (StringBuffer dest total 0)
  where
    -- bytes between the current position and the end of a buffer
    remaining sb = len sb - cur sb
    firstLen     = remaining sb1
    secondLen    = remaining sb2
    total        = firstLen + secondLen
141
-- | Encode a 'String' as UTF-8 into a freshly allocated 'StringBuffer'
-- positioned at offset 0. Three zero sentinel bytes follow the encoded
-- contents and are not counted in the buffer's length.
stringToStringBuffer :: String -> IO StringBuffer
stringToStringBuffer str = do
    fptr <- mallocForeignPtrArray (nbytes + 3)
    withForeignPtr fptr $ \p -> do
      utf8EncodeString p str
      -- sentinels for UTF-8 decoding
      pokeArray (p `plusPtr` nbytes :: Ptr Word8) (replicate 3 0)
    return (StringBuffer fptr nbytes 0)
  where
    -- size of the UTF-8 encoding of the input, in bytes
    nbytes = utf8EncodedLength str
151
-- -----------------------------------------------------------------------------
154
-- Getting our fingers dirty a little here, but this is performance-critical
155
{-# INLINE nextChar #-}
156
-- | Decode the character at the buffer's current position and return it
-- together with a buffer whose position has been moved to @cur'@.
--
-- The decoder is applied to the address @a# + cur#@, and the new position
-- is computed as the difference between the address @b#@ and the base
-- address @a#@.
--
-- NOTE(review): the line wrapping the IO action and the case alternative
-- that binds @c#@ and @b#@ are not visible in this excerpt -- presumably
-- they are the decoded character and the address just past it; confirm
-- against the full source.
nextChar :: StringBuffer -> (Char,StringBuffer)
157
nextChar (StringBuffer buf len (I# cur#)) =
159
withForeignPtr buf $ \(Ptr a#) -> do
160
case utf8DecodeChar# (a# `plusAddr#` cur#) of
162
-- convert the address back into an offset relative to the buffer start
let cur' = I# (b# `minusAddr#` a#) in
163
return (C# c#, StringBuffer buf len cur')
165
-- | The character at the buffer's current position, i.e. the first
-- component of 'nextChar'; the advanced buffer is discarded.
currentChar :: StringBuffer -> Char
currentChar sb = fst (nextChar sb)
168
-- | The character immediately before the buffer's current position.
--
-- When the current position is 0 there is no previous character and the
-- supplied default @deflt@ is returned. Otherwise 'utf8PrevChar' is applied
-- to the address of the current position and the character at the
-- resulting pointer is decoded with 'utf8DecodeChar'.
--
-- NOTE(review): the line that runs this IO action to produce a pure 'Char'
-- is not visible in this excerpt -- confirm against the full source.
prevChar :: StringBuffer -> Char -> Char
169
prevChar (StringBuffer _ _ 0) deflt = deflt
170
prevChar (StringBuffer buf _ cur) _ =
172
withForeignPtr buf $ \p -> do
173
p' <- utf8PrevChar (p `plusPtr` cur)
174
-- only the decoded character is needed; the second component is dropped
return (fst (utf8DecodeChar p'))
176
-- -----------------------------------------------------------------------------
179
-- | Advance the buffer past the character at its current position,
-- i.e. the second component of 'nextChar'.
stepOn :: StringBuffer -> StringBuffer
stepOn = snd . nextChar
182
-- | Move the buffer's current position by @delta@ bytes. No bounds check
-- is performed and the underlying storage and length are left untouched.
offsetBytes :: Int -> StringBuffer -> StringBuffer
offsetBytes delta (StringBuffer bytes size pos) = StringBuffer bytes size (pos + delta)
185
-- | Signed distance in bytes from the current position of the first buffer
-- to the current position of the second.
byteDiff :: StringBuffer -> StringBuffer -> Int
byteDiff from to = cur to - cur from
188
-- | True exactly when the buffer's current position equals its length.
atEnd :: StringBuffer -> Bool
atEnd sb = len sb == cur sb
191
-- -----------------------------------------------------------------------------
194
-- | Decode @bytes@ bytes of UTF-8, starting at the buffer's current
-- position, into a 'String'. A request for 0 bytes yields the empty string
-- without touching the buffer.
--
-- NOTE(review): the line that runs this IO action to produce a pure value
-- is not visible in this excerpt -- confirm against the full source.
lexemeToString :: StringBuffer -> Int {-bytes-} -> String
195
lexemeToString _ 0 = ""
196
lexemeToString (StringBuffer buf _ cur) bytes =
198
withForeignPtr buf $ \ptr ->
199
utf8DecodeString (ptr `plusPtr` cur) bytes
201
-- | Build a 'FastString' from @len@ bytes starting at the buffer's current
-- position, via 'mkFastStringBytes'. A request for 0 bytes yields 'nilFS'
-- without touching the buffer.
--
-- NOTE(review): the line that runs this IO action to produce a pure value
-- is not visible in this excerpt -- confirm against the full source.
lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString
202
lexemeToFastString _ 0 = nilFS
203
lexemeToFastString (StringBuffer buf _ cur) len =
205
withForeignPtr buf $ \ptr ->
206
-- force the FastString before leaving the withForeignPtr scope
return $! mkFastStringBytes (ptr `plusPtr` cur) len
208
-- -----------------------------------------------------------------------------
209
-- Parsing integer strings in various bases
211
-- | Look up a 'Char' at offset @i@ relative to the buffer's current
-- position.
--
-- NOTE(review): the live statements of the body are not visible in this
-- excerpt; only older, commented-out variants remain. Those variants read
-- the single byte at @cur+i@ and convert it to a 'Char' -- presumably the
-- live code does the same; confirm against the full source.
byteOff :: StringBuffer -> Int -> Char
212
byteOff (StringBuffer buf _ cur) i =
213
inlinePerformIO $ withForeignPtr buf $ \ptr -> do
214
-- return $! cBox (indexWord8OffFastPtrAsFastChar
215
-- (pUnbox ptr) (iUnbox (cur+i)))
217
-- w <- peek (ptr `plusPtr` (cur+i))
218
-- return (unsafeChr (fromIntegral (w::Word8)))
220
-- | XXX assumes ASCII digits only (by using byteOff)
221
parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer
222
parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int
223
= inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let
224
--LOL, in implementations where the indexing needs slow unsafePerformIO,
225
--this is less (not more) efficient than using the IO monad explicitly
228
byteOff i = cBox (indexWord8OffFastPtrAsFastChar ptr' (iUnbox (cur + i)))
229
go i x | i == len = x
230
| otherwise = case byteOff i of
231
char -> go (i + 1) (x * radix + toInteger (char_to_int char))