From 21f3aae7371469beb3950a6170db6c5668379ff3 Mon Sep 17 00:00:00 2001 From: Josh Meredith Date: Thu, 20 Apr 2023 08:44:58 +0000 Subject: Use unboxed codebuffers in base Metric Decrease: encodingAllocations --- libraries/base/GHC/IO/Encoding.hs | 7 +- libraries/base/GHC/IO/Encoding/CodePage/API.hs | 15 +- libraries/base/GHC/IO/Encoding/Failure.hs | 18 +- libraries/base/GHC/IO/Encoding/Iconv.hs | 23 +- libraries/base/GHC/IO/Encoding/Latin1.hs | 188 ++++++++------- libraries/base/GHC/IO/Encoding/UTF16.hs | 314 +++++++++++++------------ libraries/base/GHC/IO/Encoding/UTF32.hs | 286 +++++++++++----------- libraries/base/GHC/IO/Encoding/UTF8.hs | 238 ++++++++++--------- libraries/base/changelog.md | 1 + 9 files changed, 597 insertions(+), 493 deletions(-) diff --git a/libraries/base/GHC/IO/Encoding.hs b/libraries/base/GHC/IO/Encoding.hs index 036a00bebc..ab2599559d 100644 --- a/libraries/base/GHC/IO/Encoding.hs +++ b/libraries/base/GHC/IO/Encoding.hs @@ -1,5 +1,6 @@ {-# LANGUAGE Trustworthy #-} {-# LANGUAGE CPP, NoImplicitPrelude #-} +{-# LANGUAGE UnboxedTuples #-} {-# OPTIONS_GHC -funbox-strict-fields #-} ----------------------------------------------------------------------------- @@ -336,11 +337,13 @@ mkTextEncoding' cfm enc = latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8) -latin1_encode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_encode input output -- unchecked, used for char8 +latin1_encode input output = IO $ \st -> case Latin1.latin1_encode input output st of + (# st', _why, input', output' #) -> (# st', (input', output') #) -- unchecked, used for char8 --latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer) -latin1_decode input output = fmap (\(_why,input',output') -> (input',output')) $ Latin1.latin1_decode input output +latin1_decode input output = IO $ \st -> case Latin1.latin1_decode input output st of + (# st', _why, input', output' #) -> (# st', (input',output') #) --latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode unknownEncodingErr :: String -> IO a diff --git a/libraries/base/GHC/IO/Encoding/CodePage/API.hs b/libraries/base/GHC/IO/Encoding/CodePage/API.hs index 6136156cdb..e8412e8356 100644 --- a/libraries/base/GHC/IO/Encoding/CodePage/API.hs +++ b/libraries/base/GHC/IO/Encoding/CodePage/API.hs @@ -1,6 +1,7 @@ {-# LANGUAGE Trustworthy #-} {-# LANGUAGE CPP, NoImplicitPrelude, NondecreasingIndentation, - RecordWildCards, ScopedTypeVariables #-} + RecordWildCards, ScopedTypeVariables, + UnboxedTuples #-} {-# OPTIONS_GHC -Wno-name-shadowing #-} module GHC.IO.Encoding.CodePage.API ( @@ -157,11 +158,15 @@ newCP rec fn cp = do utf16_native_encode' :: EncodeBuffer utf16_native_decode' :: DecodeBuffer #if defined(WORDS_BIGENDIAN) -utf16_native_encode' = utf16be_encode -utf16_native_decode' = utf16be_decode +utf16_native_encode' i o = IO $ \st -> case utf16be_encode i o st of + (# st', c, i', o' #) -> (# st', (c, i', o') #) +utf16_native_decode' i o = IO $ \st -> case utf16be_decode i o st of + (# st', c, i', o' #) -> (# st', (c, i', o') #) #else -utf16_native_encode' = utf16le_encode -utf16_native_decode' = utf16le_decode +utf16_native_encode' i o = IO $ \st -> case utf16le_encode i o st of + (# st', c, i', o' #) -> (# st', (c, i', o') #) +utf16_native_decode' i o = IO $ \st -> case utf16le_decode i o st of + (# st', c, i', o' #) -> (# st', (c, i', o') #) #endif saner :: CodeBuffer from to diff --git a/libraries/base/GHC/IO/Encoding/Failure.hs b/libraries/base/GHC/IO/Encoding/Failure.hs index fb885bd45b..d9817aa950 100644 --- a/libraries/base/GHC/IO/Encoding/Failure.hs +++ b/libraries/base/GHC/IO/Encoding/Failure.hs @@ -1,5 +1,8 @@ {-# LANGUAGE Trustworthy #-} {-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE UnboxedTuples #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE BangPatterns #-} ----------------------------------------------------------------------------- -- | @@ -18,7 +21,8 @@ module GHC.IO.Encoding.Failure ( CodingFailureMode(..), codingFailureModeSuffix, isSurrogate, - recoverDecode, recoverEncode + recoverDecode, recoverEncode, + recoverDecode#, recoverEncode#, ) where import GHC.IO @@ -142,6 +146,12 @@ unescapeRoundtripCharacterSurrogate c | otherwise = Nothing where x = ord c +recoverDecode# :: CodingFailureMode -> Buffer Word8 -> Buffer Char + -> State# RealWorld -> (# State# RealWorld, Buffer Word8, Buffer Char #) +recoverDecode# cfm input output st = + let !(# st', (bIn, bOut) #) = unIO (recoverDecode cfm input output) st + in (# st', bIn, bOut #) + recoverDecode :: CodingFailureMode -> Buffer Word8 -> Buffer Char -> IO (Buffer Word8, Buffer Char) recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } @@ -160,6 +170,12 @@ recoverDecode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } ow' <- writeCharBuf oraw ow (escapeToRoundtripCharacterSurrogate b) return (input { bufL=ir+1 }, output { bufR=ow' }) +recoverEncode# :: CodingFailureMode -> Buffer Char -> Buffer Word8 + -> State# RealWorld -> (# State# RealWorld, Buffer Char, Buffer Word8 #) +recoverEncode# cfm input output st = + let !(# st', (bIn, bOut) #) = unIO (recoverEncode cfm input output) st + in (# st', bIn, bOut #) + recoverEncode :: CodingFailureMode -> Buffer Char -> Buffer Word8 -> IO (Buffer Char, Buffer Word8) recoverEncode cfm input@Buffer{ bufRaw=iraw, bufL=ir, bufR=_ } diff --git a/libraries/base/GHC/IO/Encoding/Iconv.hs b/libraries/base/GHC/IO/Encoding/Iconv.hs index 6c120cae58..672e505e98 100644 --- a/libraries/base/GHC/IO/Encoding/Iconv.hs +++ b/libraries/base/GHC/IO/Encoding/Iconv.hs @@ -2,6 +2,8 @@ {-# LANGUAGE CPP , NoImplicitPrelude , NondecreasingIndentation + , UnboxedTuples + , MagicHash #-} {-# OPTIONS_HADDOCK not-home #-} @@ -133,19 +135,24 @@ newIConv from to rec fn = withCAString to $ \ to_str -> do iconvt <- throwErrnoIfMinus1 "mkTextEncoding" $ hs_iconv_open to_str from_str let iclose = throwErrnoIfMinus1_ "Iconv.close" $ hs_iconv_close iconvt - return BufferCodec{ - encode = fn iconvt, - recover = rec, - close = iclose, + fn_iconvt ibuf obuf st = case unIO (fn iconvt ibuf obuf) st of + (# st', (prog, ibuf', obuf') #) -> (# st', prog, ibuf', obuf' #) + return BufferCodec# { + encode# = fn_iconvt, + recover# = rec#, + close# = iclose, -- iconv doesn't supply a way to save/restore the state - getState = return (), - setState = const $ return () + getState# = return (), + setState# = const $ return () } + where + rec# ibuf obuf st = case unIO (rec ibuf obuf) st of + (# st', (ibuf', obuf') #) -> (# st', ibuf', obuf' #) -iconvDecode :: IConv -> DecodeBuffer +iconvDecode :: IConv -> Buffer Word8 -> Buffer Char -> IO (CodingProgress, Buffer Word8, Buffer Char) iconvDecode iconv_t ibuf obuf = iconvRecode iconv_t ibuf 0 obuf char_shift -iconvEncode :: IConv -> EncodeBuffer +iconvEncode :: IConv -> Buffer Char -> Buffer Word8 -> IO (CodingProgress, Buffer Char, Buffer Word8) iconvEncode iconv_t ibuf obuf = iconvRecode iconv_t ibuf char_shift obuf 0 iconvRecode :: IConv -> Buffer a -> Int -> Buffer b -> Int diff --git a/libraries/base/GHC/IO/Encoding/Latin1.hs b/libraries/base/GHC/IO/Encoding/Latin1.hs index 730379e4a0..4185a0de42 100644 --- a/libraries/base/GHC/IO/Encoding/Latin1.hs +++ b/libraries/base/GHC/IO/Encoding/Latin1.hs @@ -2,6 +2,8 @@ {-# LANGUAGE NoImplicitPrelude , BangPatterns , NondecreasingIndentation + , UnboxedTuples + , MagicHash #-} {-# OPTIONS_GHC -funbox-strict-fields #-} @@ -56,22 +58,22 @@ mkLatin1 cfm = TextEncoding { textEncodingName = "ISO-8859-1", latin1_DF :: CodingFailureMode -> IO (TextDecoder ()) latin1_DF cfm = - return (BufferCodec { - encode = latin1_decode, - recover = recoverDecode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = latin1_decode, + recover# = recoverDecode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) latin1_EF :: CodingFailureMode -> IO (TextEncoder ()) latin1_EF cfm = - return (BufferCodec { - encode = latin1_encode, - recover = recoverEncode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = latin1_encode, + recover# = recoverEncode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) latin1_checked :: TextEncoding @@ -85,12 +87,12 @@ mkLatin1_checked cfm = TextEncoding { textEncodingName = "ISO-8859-1", latin1_checked_EF :: CodingFailureMode -> IO (TextEncoder ()) latin1_checked_EF cfm = - return (BufferCodec { - encode = latin1_checked_encode, - recover = recoverEncode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = latin1_checked_encode, + recover# = recoverEncode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) -- ----------------------------------------------------------------------------- @@ -108,22 +110,22 @@ mkAscii cfm = TextEncoding { textEncodingName = "ASCII", ascii_DF :: CodingFailureMode -> IO (TextDecoder ()) ascii_DF cfm = - return (BufferCodec { - encode = ascii_decode, - recover = recoverDecode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = ascii_decode, + recover# = recoverDecode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) ascii_EF :: CodingFailureMode -> IO (TextEncoder ()) ascii_EF cfm = - return (BufferCodec { - encode = ascii_encode, - recover = recoverEncode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = ascii_encode, + recover# = recoverEncode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) @@ -134,97 +136,115 @@ ascii_EF cfm = -- TODO: Eliminate code duplication between the checked and unchecked -- versions of the decoder or encoder (but don't change the Core!) -latin1_decode :: DecodeBuffer +latin1_decode :: DecodeBuffer# latin1_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | ir >= iw = done InputUnderflow ir ow + loop :: Int -> Int -> DecodingBuffer# + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | ir >= iw = done InputUnderflow ir ow st0 | otherwise = do - c0 <- readWord8Buf iraw ir - ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0)) - loop (ir+1) ow' + let !(# st1, c0 #) = unIO (readWord8Buf iraw ir) st0 + !(# st2, ow' #) = unIO (writeCharBuf oraw ow (unsafeChr (fromIntegral c0))) st1 + loop (ir+1) ow' st2 -- lambda-lifted, to avoid thunks being built in the inner-loop: - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> DecodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL=0, bufR=0 } else input{ bufL=ir } + !ro = output{ bufR=ow } + in (# st', why, ri, ro #) in - loop ir0 ow0 + loop ir0 ow0 st -ascii_decode :: DecodeBuffer +ascii_decode :: DecodeBuffer# ascii_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | ir >= iw = done InputUnderflow ir ow + loop :: Int -> Int -> DecodingBuffer# + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | ir >= iw = done InputUnderflow ir ow st0 | otherwise = do - c0 <- readWord8Buf iraw ir - if c0 > 0x7f then invalid else do - ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0)) - loop (ir+1) ow' + let !(# st1, c0 #) = unIO (readWord8Buf iraw ir) st0 + if c0 > 0x7f then invalid st1 else do + let !(# st2, ow' #) = unIO (writeCharBuf oraw ow (unsafeChr (fromIntegral c0))) st1 + loop (ir+1) ow' st2 where - invalid = done InvalidSequence ir ow + invalid :: DecodingBuffer# + invalid st' = done InvalidSequence ir ow st' -- lambda-lifted, to avoid thunks being built in the inner-loop: - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> DecodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL=0, bufR=0 } else input{ bufL=ir } + !ro = output{ bufR=ow } + in (# st', why, ri, ro #) in - loop ir0 ow0 + loop ir0 ow0 st -latin1_encode :: EncodeBuffer +latin1_encode :: EncodeBuffer# latin1_encode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | ir >= iw = done InputUnderflow ir ow + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> EncodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL=0, bufR=0 } else input{ bufL=ir } + !ro = output{ bufR=ow } + in (# st', why, ri, ro #) + loop :: Int -> Int -> EncodingBuffer# + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | ir >= iw = done InputUnderflow ir ow st0 | otherwise = do - (c,ir') <- readCharBuf iraw ir - writeWord8Buf oraw ow (fromIntegral (ord c)) - loop ir' (ow+1) + let !(# st1, (c,ir') #) = unIO (readCharBuf iraw ir) st0 + !(# st2, () #) = unIO (writeWord8Buf oraw ow (fromIntegral (ord c))) st1 + loop ir' (ow+1) st2 in - loop ir0 ow0 + loop ir0 ow0 st -latin1_checked_encode :: EncodeBuffer +latin1_checked_encode :: EncodeBuffer# latin1_checked_encode input output = single_byte_checked_encode 0xff input output -ascii_encode :: EncodeBuffer +ascii_encode :: EncodeBuffer# ascii_encode input output = single_byte_checked_encode 0x7f input output -single_byte_checked_encode :: Int -> EncodeBuffer +single_byte_checked_encode :: Int -> EncodeBuffer# single_byte_checked_encode max_legal_char input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | ir >= iw = done InputUnderflow ir ow + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> EncodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL=0, bufR=0 } else input{ bufL=ir } + !ro = output{ bufR=ow } + in (# st', why, ri, ro #) + loop :: Int -> Int -> EncodingBuffer# + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | ir >= iw = done InputUnderflow ir ow st0 | otherwise = do - (c,ir') <- readCharBuf iraw ir - if ord c > max_legal_char then invalid else do - writeWord8Buf oraw ow (fromIntegral (ord c)) - loop ir' (ow+1) + let !(# st1, (c,ir') #) = unIO (readCharBuf iraw ir) st0 + if ord c > max_legal_char then invalid st1 else do + let !(# st2, () #) = unIO (writeWord8Buf oraw ow (fromIntegral (ord c))) st1 + loop ir' (ow+1) st2 where - invalid = done InvalidSequence ir ow + invalid :: EncodingBuffer# + invalid st' = done InvalidSequence ir ow st' in - loop ir0 ow0 + loop ir0 ow0 st {-# INLINE single_byte_checked_encode #-} diff --git a/libraries/base/GHC/IO/Encoding/UTF16.hs b/libraries/base/GHC/IO/Encoding/UTF16.hs index a0878d4fce..a592d4c47c 100644 --- a/libraries/base/GHC/IO/Encoding/UTF16.hs +++ b/libraries/base/GHC/IO/Encoding/UTF16.hs @@ -3,6 +3,7 @@ , BangPatterns , NondecreasingIndentation , MagicHash + , UnboxedTuples #-} {-# OPTIONS_GHC -funbox-strict-fields #-} @@ -61,64 +62,66 @@ mkUTF16 cfm = TextEncoding { textEncodingName = "UTF-16", mkTextDecoder = utf16_DF cfm, mkTextEncoder = utf16_EF cfm } -utf16_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer)) +utf16_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer#)) utf16_DF cfm = do seen_bom <- newIORef Nothing - return (BufferCodec { - encode = utf16_decode seen_bom, - recover = recoverDecode cfm, - close = return (), - getState = readIORef seen_bom, - setState = writeIORef seen_bom + return (BufferCodec# { + encode# = utf16_decode seen_bom, + recover# = recoverDecode# cfm, + close# = return (), + getState# = readIORef seen_bom, + setState# = writeIORef seen_bom }) utf16_EF :: CodingFailureMode -> IO (TextEncoder Bool) utf16_EF cfm = do done_bom <- newIORef False - return (BufferCodec { - encode = utf16_encode done_bom, - recover = recoverEncode cfm, - close = return (), - getState = readIORef done_bom, - setState = writeIORef done_bom + return (BufferCodec# { + encode# = utf16_encode done_bom, + recover# = recoverEncode# cfm, + close# = return (), + getState# = readIORef done_bom, + setState# = writeIORef done_bom }) -utf16_encode :: IORef Bool -> EncodeBuffer +utf16_encode :: IORef Bool -> EncodeBuffer# utf16_encode done_bom input output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } + st0 = do - b <- readIORef done_bom - if b then utf16_native_encode input output + let !(# st1, b #) = unIO (readIORef done_bom) st0 + if b then utf16_native_encode input output st1 else if os - ow < 2 - then return (OutputUnderflow,input,output) + then (# st1,OutputUnderflow,input,output #) else do - writeIORef done_bom True - writeWord8Buf oraw ow bom1 - writeWord8Buf oraw (ow+1) bom2 - utf16_native_encode input output{ bufR = ow+2 } + let !(# st2, () #) = unIO (writeIORef done_bom True) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw ow bom1) st2 + !(# st4, () #) = unIO (writeWord8Buf oraw (ow+1) bom2) st3 + utf16_native_encode input output{ bufR = ow+2 } st4 -utf16_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer +utf16_decode :: IORef (Maybe DecodeBuffer#) -> DecodeBuffer# utf16_decode seen_bom input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } output + st0 = do - mb <- readIORef seen_bom + let !(# st1, mb #) = unIO (readIORef seen_bom) st0 case mb of - Just decode -> decode input output + Just decode -> decode input output st1 Nothing -> - if iw - ir < 2 then return (InputUnderflow,input,output) else do - c0 <- readWord8Buf iraw ir - c1 <- readWord8Buf iraw (ir+1) + if iw - ir < 2 then (# st1,InputUnderflow,input,output #) else do + let !(# st2, c0 #) = unIO (readWord8Buf iraw ir ) st1 + !(# st3, c1 #) = unIO (readWord8Buf iraw (ir+1)) st2 case () of - _ | c0 == bomB && c1 == bomL -> do - writeIORef seen_bom (Just utf16be_decode) - utf16be_decode input{ bufL= ir+2 } output - | c0 == bomL && c1 == bomB -> do - writeIORef seen_bom (Just utf16le_decode) - utf16le_decode input{ bufL= ir+2 } output - | otherwise -> do - writeIORef seen_bom (Just utf16_native_decode) - utf16_native_decode input output + _ | c0 == bomB && c1 == bomL -> + let !(# st4, () #) = unIO (writeIORef seen_bom (Just utf16be_decode)) st3 + in utf16be_decode input{ bufL= ir+2 } output st4 + | c0 == bomL && c1 == bomB -> + let !(# st4, () #) = unIO (writeIORef seen_bom (Just utf16le_decode)) st3 + in utf16le_decode input{ bufL= ir+2 } output st4 + | otherwise -> + let !(# st4, () #) = unIO (writeIORef seen_bom (Just utf16_native_decode)) st3 + in utf16_native_decode input output st4 bomB, bomL, bom1, bom2 :: Word8 @@ -126,10 +129,10 @@ bomB = 0xfe bomL = 0xff -- choose UTF-16BE by default for UTF-16 output -utf16_native_decode :: DecodeBuffer +utf16_native_decode :: DecodeBuffer# utf16_native_decode = utf16be_decode -utf16_native_encode :: EncodeBuffer +utf16_native_encode :: EncodeBuffer# utf16_native_encode = utf16be_encode bom1 = bomB @@ -149,22 +152,22 @@ mkUTF16be cfm = TextEncoding { textEncodingName = "UTF-16BE", utf16be_DF :: CodingFailureMode -> IO (TextDecoder ()) utf16be_DF cfm = - return (BufferCodec { - encode = utf16be_decode, - recover = recoverDecode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf16be_decode, + recover# = recoverDecode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) utf16be_EF :: CodingFailureMode -> IO (TextEncoder ()) utf16be_EF cfm = - return (BufferCodec { - encode = utf16be_encode, - recover = recoverEncode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf16be_encode, + recover# = recoverEncode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) utf16le :: TextEncoding @@ -178,114 +181,127 @@ mkUTF16le cfm = TextEncoding { textEncodingName = "UTF16-LE", utf16le_DF :: CodingFailureMode -> IO (TextDecoder ()) utf16le_DF cfm = - return (BufferCodec { - encode = utf16le_decode, - recover = recoverDecode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf16le_decode, + recover# = recoverDecode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) utf16le_EF :: CodingFailureMode -> IO (TextEncoder ()) utf16le_EF cfm = - return (BufferCodec { - encode = utf16le_encode, - recover = recoverEncode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf16le_encode, + recover# = recoverEncode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) -utf16be_decode :: DecodeBuffer +utf16be_decode :: DecodeBuffer# utf16be_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | ir >= iw = done InputUnderflow ir ow - | ir + 1 == iw = done InputUnderflow ir ow + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | ir >= iw = done InputUnderflow ir ow st0 + | ir + 1 == iw = done InputUnderflow ir ow st0 | otherwise = do - c0 <- readWord8Buf iraw ir - c1 <- readWord8Buf iraw (ir+1) + let !(# st1, c0 #) = unIO (readWord8Buf iraw ir ) st0 + !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 let x1 = fromIntegral c0 `shiftL` 8 + fromIntegral c1 if validate1 x1 - then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1)) - loop (ir+2) ow' - else if iw - ir < 4 then done InputUnderflow ir ow else do - c2 <- readWord8Buf iraw (ir+2) - c3 <- readWord8Buf iraw (ir+3) - let x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3 - if not (validate2 x1 x2) then invalid else do - ow' <- writeCharBuf oraw ow (chr2 x1 x2) - loop (ir+4) ow' + then let !(# st3, ow' #) = unIO (writeCharBuf oraw ow (unsafeChr (fromIntegral x1))) st2 + in loop (ir+2) ow' st3 + else if iw - ir < 4 then done InputUnderflow ir ow st2 else do + let !(# st3, c2 #) = unIO (readWord8Buf iraw (ir+2)) st2 + !(# st4, c3 #) = unIO (readWord8Buf iraw (ir+3)) st3 + x2 = fromIntegral c2 `shiftL` 8 + fromIntegral c3 + if not (validate2 x1 x2) then invalid st4 else do + let !(# st5, ow' #) = unIO (writeCharBuf oraw ow (chr2 x1 x2)) st4 + loop (ir+4) ow' st5 where - invalid = done InvalidSequence ir ow + invalid :: DecodingBuffer# + invalid st' = done InvalidSequence ir ow st' -- lambda-lifted, to avoid thunks being built in the inner-loop: - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> DecodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input { bufL = 0, bufR = 0 } else input { bufL = ir } + !ro = output{ bufR = ow } + in (# st', why, ri, ro #) in - loop ir0 ow0 + loop ir0 ow0 st -utf16le_decode :: DecodeBuffer +utf16le_decode :: DecodeBuffer# utf16le_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | ir >= iw = done InputUnderflow ir ow - | ir + 1 == iw = done InputUnderflow ir ow + loop :: Int -> Int -> DecodingBuffer# + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | ir >= iw = done InputUnderflow ir ow st0 + | ir + 1 == iw = done InputUnderflow ir ow st0 | otherwise = do - c0 <- readWord8Buf iraw ir - c1 <- readWord8Buf iraw (ir+1) - let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0 + let !(# st1, c0 #) = unIO (readWord8Buf iraw ir ) st0 + !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 + x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0 if validate1 x1 - then do ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral x1)) - loop (ir+2) ow' - else if iw - ir < 4 then done InputUnderflow ir ow else do - c2 <- readWord8Buf iraw (ir+2) - c3 <- readWord8Buf iraw (ir+3) - let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2 - if not (validate2 x1 x2) then invalid else do - ow' <- writeCharBuf oraw ow (chr2 x1 x2) - loop (ir+4) ow' + then let !(# st3, ow' #) = unIO (writeCharBuf oraw ow (unsafeChr (fromIntegral x1))) st2 + in loop (ir+2) ow' st3 + else if iw - ir < 4 then done InputUnderflow ir ow st2 else do + let !(# st3, c2 #) = unIO (readWord8Buf iraw (ir+2)) st2 + !(# st4, c3 #) = unIO (readWord8Buf iraw (ir+3)) st3 + x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2 + if not (validate2 x1 x2) then invalid st4 else do + let !(# st5, ow' #) = unIO (writeCharBuf oraw ow (chr2 x1 x2)) st4 + loop (ir+4) ow' st5 where - invalid = done InvalidSequence ir ow + invalid :: DecodingBuffer# + invalid st' = done InvalidSequence ir ow st' -- lambda-lifted, to avoid thunks being built in the inner-loop: - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> DecodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL = 0, bufR = 0 } else input{ bufL = ir } + !ro = output{ bufR = ow } + in (# st', why, ri, ro #) in - loop ir0 ow0 + loop ir0 ow0 st -utf16be_encode :: EncodeBuffer +utf16be_encode :: EncodeBuffer# utf16be_encode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) - loop !ir !ow - | ir >= iw = done InputUnderflow ir ow - | os - ow < 2 = done OutputUnderflow ir ow + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> EncodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL = 0, bufR = 0 } else input{ bufL=ir } + !ro = output{ bufR=ow } + in (# st', why, ri, ro #) + loop :: Int -> Int -> EncodingBuffer# + loop !ir !ow st0 + | ir >= iw = done InputUnderflow ir ow st0 + | os - ow < 2 = done OutputUnderflow ir ow st0 | otherwise = do - (c,ir') <- readCharBuf iraw ir + let !(# st1, (c,ir') #) = unIO (readCharBuf iraw ir) st0 case ord c of - x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do - writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8)) - writeWord8Buf oraw (ow+1) (fromIntegral x) - loop ir' (ow+2) + x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow st1 else do + let !(# st2, () #) = unIO (writeWord8Buf oraw ow (fromIntegral (x `shiftR` 8))) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw (ow+1) (fromIntegral x)) st2 + loop ir' (ow+2) st3 | otherwise -> do - if os - ow < 4 then done OutputUnderflow ir ow else do + if os - ow < 4 then done OutputUnderflow ir ow st1 else do let n1 = x - 0x10000 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8) @@ -294,35 +310,39 @@ utf16be_encode c3 = fromIntegral (n2 `shiftR` 8 + 0xDC) c4 = fromIntegral n2 -- - writeWord8Buf oraw ow c1 - writeWord8Buf oraw (ow+1) c2 - writeWord8Buf oraw (ow+2) c3 - writeWord8Buf oraw (ow+3) c4 - loop ir' (ow+4) + !(# st2, () #) = unIO (writeWord8Buf oraw ow c1) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw (ow+1) c2) st2 + !(# st4, () #) = unIO (writeWord8Buf oraw (ow+2) c3) st3 + !(# st5, () #) = unIO (writeWord8Buf oraw (ow+3) c4) st4 + loop ir' (ow+4) st5 in - loop ir0 ow0 + loop ir0 ow0 st -utf16le_encode :: EncodeBuffer +utf16le_encode :: EncodeBuffer# utf16le_encode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) - loop !ir !ow - | ir >= iw = done InputUnderflow ir ow - | os - ow < 2 = done OutputUnderflow ir ow + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> EncodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL = 0, bufR = 0 } else input{ bufL = ir } + !ro = output{ bufR = ow } + in (# st', why, ri, ro #) + loop :: Int -> Int -> EncodingBuffer# + loop !ir !ow st0 + | ir >= iw = done InputUnderflow ir ow st0 + | os - ow < 2 = done OutputUnderflow ir ow st0 | otherwise = do - (c,ir') <- readCharBuf iraw ir + let !(# st1, (c,ir') #) = unIO (readCharBuf iraw ir) st0 case ord c of - x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow else do - writeWord8Buf oraw ow (fromIntegral x) - writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8)) - loop ir' (ow+2) + x | x < 0x10000 -> if isSurrogate c then done InvalidSequence ir ow st1 else do + let !(# st2, () #) = unIO (writeWord8Buf oraw ow (fromIntegral x)) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8))) st2 + loop ir' (ow+2) st3 | otherwise -> - if os - ow < 4 then done OutputUnderflow ir ow else do + if os - ow < 4 then done OutputUnderflow ir ow st1 else do let n1 = x - 0x10000 c1 = fromIntegral (n1 `shiftR` 18 + 0xD8) @@ -331,13 +351,13 @@ utf16le_encode c3 = fromIntegral (n2 `shiftR` 8 + 0xDC) c4 = fromIntegral n2 -- - writeWord8Buf oraw ow c2 - writeWord8Buf oraw (ow+1) c1 - writeWord8Buf oraw (ow+2) c4 - writeWord8Buf oraw (ow+3) c3 - loop ir' (ow+4) + !(# st2, () #) = unIO (writeWord8Buf oraw ow c2) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw (ow+1) c1) st2 + !(# st4, () #) = unIO (writeWord8Buf oraw (ow+2) c4) st3 + !(# st5, () #) = unIO (writeWord8Buf oraw (ow+3) c3) st4 + loop ir' (ow+4) st5 in - loop ir0 ow0 + loop ir0 ow0 st chr2 :: Word16 -> Word16 -> Char chr2 (W16# a#) (W16# b#) = C# (chr# (upper# +# lower# +# 0x10000#)) diff --git a/libraries/base/GHC/IO/Encoding/UTF32.hs b/libraries/base/GHC/IO/Encoding/UTF32.hs index 379f76066b..9319234ca7 100644 --- a/libraries/base/GHC/IO/Encoding/UTF32.hs +++ b/libraries/base/GHC/IO/Encoding/UTF32.hs @@ -3,6 +3,7 @@ , BangPatterns , NondecreasingIndentation , MagicHash + , UnboxedTuples #-} {-# OPTIONS_GHC -funbox-strict-fields #-} @@ -61,68 +62,70 @@ mkUTF32 cfm = TextEncoding { textEncodingName = "UTF-32", mkTextDecoder = utf32_DF cfm, mkTextEncoder = utf32_EF cfm } -utf32_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer)) +utf32_DF :: CodingFailureMode -> IO (TextDecoder (Maybe DecodeBuffer#)) utf32_DF cfm = do seen_bom <- newIORef Nothing - return (BufferCodec { - encode = utf32_decode seen_bom, - recover = recoverDecode cfm, - close = return (), - getState = readIORef seen_bom, - setState = writeIORef seen_bom + return (BufferCodec# { + encode# = utf32_decode seen_bom, + recover# = recoverDecode# cfm, + close# = return (), + getState# = readIORef seen_bom, + setState# = writeIORef seen_bom }) utf32_EF :: CodingFailureMode -> IO (TextEncoder Bool) utf32_EF cfm = do done_bom <- newIORef False - return (BufferCodec { - encode = utf32_encode done_bom, - recover = recoverEncode cfm, - close = return (), - getState = readIORef done_bom, - setState = writeIORef done_bom + return (BufferCodec# { + encode# = utf32_encode done_bom, + recover# = recoverEncode# cfm, + close# = return (), + getState# = readIORef done_bom, + setState# = writeIORef done_bom }) -utf32_encode :: IORef Bool -> EncodeBuffer +utf32_encode :: IORef Bool -> EncodeBuffer# utf32_encode done_bom input output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } + st0 = do - b <- readIORef done_bom - if b then utf32_native_encode input output + let !(# st1, b #) = unIO (readIORef done_bom) st0 + if b then utf32_native_encode input output st1 else if os - ow < 4 - then return (OutputUnderflow, input,output) + then (# st1,OutputUnderflow,input,output #) else do - writeIORef done_bom True - writeWord8Buf oraw ow bom0 - writeWord8Buf oraw (ow+1) bom1 - writeWord8Buf oraw (ow+2) bom2 - writeWord8Buf oraw (ow+3) bom3 - utf32_native_encode input output{ bufR = ow+4 } - -utf32_decode :: IORef (Maybe DecodeBuffer) -> DecodeBuffer + let !(# st2, () #) = unIO (writeIORef done_bom True) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw ow bom0) st2 + !(# st4, () #) = unIO (writeWord8Buf oraw (ow+1) bom1) st3 + !(# st5, () #) = unIO (writeWord8Buf oraw (ow+2) bom2) st4 + !(# st6, () #) = unIO (writeWord8Buf oraw (ow+3) bom3) st5 + utf32_native_encode input output{ bufR = ow+4 } st6 + +utf32_decode :: IORef (Maybe DecodeBuffer#) -> DecodeBuffer# utf32_decode seen_bom input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } output + st0 = do - mb <- readIORef seen_bom + let !(# st1, mb #) = unIO (readIORef seen_bom) st0 case mb of - Just decode -> decode input output + Just decode -> decode input output st1 Nothing -> - if iw - ir < 4 then return (InputUnderflow, input,output) else do - c0 <- readWord8Buf iraw ir - c1 <- readWord8Buf iraw (ir+1) - c2 <- readWord8Buf iraw (ir+2) - c3 <- readWord8Buf iraw (ir+3) + if iw - ir < 4 then (# st1,InputUnderflow,input,output #) else do + let !(# st2, c0 #) = unIO (readWord8Buf iraw ir ) st1 + !(# st3, c1 #) = unIO (readWord8Buf iraw (ir+1)) st2 + !(# st4, c2 #) = unIO (readWord8Buf iraw (ir+2)) st3 + !(# st5, c3 #) = unIO (readWord8Buf iraw (ir+3)) st4 case () of - _ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> do - writeIORef seen_bom (Just utf32be_decode) - utf32be_decode input{ bufL= ir+4 } output - _ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> do - writeIORef seen_bom (Just utf32le_decode) - utf32le_decode input{ bufL= ir+4 } output - | otherwise -> do - writeIORef seen_bom (Just utf32_native_decode) - utf32_native_decode input output + _ | c0 == bom0 && c1 == bom1 && c2 == bom2 && c3 == bom3 -> + let !(# st6, () #) = unIO (writeIORef seen_bom (Just utf32be_decode)) st5 + in utf32be_decode input{ bufL= ir+4 } output st6 + _ | c0 == bom3 && c1 == bom2 && c2 == bom1 && c3 == bom0 -> + let !(# st6, () #) = unIO (writeIORef seen_bom (Just utf32le_decode)) st5 + in utf32le_decode input{ bufL= ir+4 } output st6 + | otherwise -> + let !(# st6, () #) = unIO (writeIORef seen_bom (Just utf32_native_decode)) st5 + in utf32_native_decode input output st6 bom0, bom1, bom2, bom3 :: Word8 @@ -132,10 +135,10 @@ bom2 = 0xfe bom3 = 0xff -- choose UTF-32BE by default for UTF-32 output -utf32_native_decode :: DecodeBuffer +utf32_native_decode :: DecodeBuffer# utf32_native_decode = utf32be_decode -utf32_native_encode :: EncodeBuffer +utf32_native_encode :: EncodeBuffer# utf32_native_encode = utf32be_encode -- ----------------------------------------------------------------------------- @@ -152,22 +155,22 @@ mkUTF32be cfm = TextEncoding { textEncodingName = "UTF-32BE", utf32be_DF :: CodingFailureMode -> IO (TextDecoder ()) utf32be_DF cfm = - return (BufferCodec { - encode = utf32be_decode, - recover = recoverDecode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf32be_decode, + recover# = recoverDecode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) utf32be_EF :: CodingFailureMode -> IO (TextEncoder ()) utf32be_EF cfm = - return (BufferCodec { - encode = utf32be_encode, - recover = recoverEncode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf32be_encode, + recover# = recoverEncode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) @@ -182,128 +185,145 @@ mkUTF32le cfm = TextEncoding { textEncodingName = "UTF-32LE", utf32le_DF :: CodingFailureMode -> IO (TextDecoder ()) utf32le_DF cfm = - return (BufferCodec { - encode = utf32le_decode, - recover = recoverDecode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf32le_decode, + recover# = recoverDecode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) utf32le_EF :: CodingFailureMode -> IO (TextEncoder ()) utf32le_EF cfm = - return (BufferCodec { - encode = utf32le_encode, - recover = recoverEncode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf32le_encode, + recover# = recoverEncode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) -utf32be_decode :: DecodeBuffer +utf32be_decode :: DecodeBuffer# utf32be_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | iw - ir < 4 = done InputUnderflow ir ow + loop :: Int -> Int -> DecodingBuffer# + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | iw - ir < 4 = done InputUnderflow ir ow st0 | otherwise = do - c0 <- readWord8Buf iraw ir - c1 <- readWord8Buf iraw (ir+1) - c2 <- readWord8Buf iraw (ir+2) - c3 <- readWord8Buf iraw (ir+3) + let !(# st1, c0 #) = unIO (readWord8Buf iraw ir ) st0 + !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 + !(# st3, c2 #) = unIO (readWord8Buf iraw (ir+2)) st2 + !(# st4, c3 #) = unIO (readWord8Buf iraw (ir+3)) st3 let x1 = chr4 c0 c1 c2 c3 - if not (validate x1) then invalid else do - ow' <- writeCharBuf oraw ow x1 - loop (ir+4) ow' + if not (validate x1) then invalid st4 else do + let !(# st5, ow' #) = unIO (writeCharBuf oraw ow x1) st4 + loop (ir+4) ow' st5 where - invalid = done InvalidSequence ir ow + invalid :: DecodingBuffer# + invalid st' = done InvalidSequence ir ow st' -- lambda-lifted, to avoid thunks being built in the inner-loop: - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> DecodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL=0, bufR=0 } else input{ bufL=ir } + !ro = output{ bufR=ow } + in (# st', why, ri, ro #) in - loop ir0 ow0 + loop ir0 ow0 st -utf32le_decode :: DecodeBuffer +utf32le_decode :: DecodeBuffer# utf32le_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | iw - ir < 4 = done InputUnderflow ir ow + loop :: Int -> Int -> DecodingBuffer# + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | iw - ir < 4 = done InputUnderflow ir ow st0 | otherwise = do - c0 <- readWord8Buf iraw ir - c1 <- readWord8Buf iraw (ir+1) - c2 <- readWord8Buf iraw (ir+2) - c3 <- readWord8Buf iraw (ir+3) + let !(# st1, c0 #) = unIO (readWord8Buf iraw ir ) st0 + !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 + !(# st3, c2 #) = unIO (readWord8Buf iraw (ir+2)) st2 + !(# st4, c3 #) = unIO (readWord8Buf iraw (ir+3)) st3 let x1 = chr4 c3 c2 c1 c0 - if not (validate x1) then invalid else do - ow' <- writeCharBuf oraw ow x1 - loop (ir+4) ow' + if not (validate x1) then invalid st4 else do + let !(# st5, ow' #) = unIO (writeCharBuf oraw ow x1) st4 + loop (ir+4) ow' st5 where - invalid = done InvalidSequence ir ow + invalid :: DecodingBuffer# + invalid st' = done InvalidSequence ir ow st' -- lambda-lifted, to avoid thunks being built in the inner-loop: - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> DecodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL=0, bufR=0 } else input{ bufL=ir } + !ro = output{ bufR=ow } + in (# st', why, ri, ro #) in - loop ir0 ow0 + loop ir0 ow0 st -utf32be_encode :: EncodeBuffer +utf32be_encode :: EncodeBuffer# utf32be_encode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) - loop !ir !ow - | ir >= iw = done InputUnderflow ir ow - | os - ow < 4 = done OutputUnderflow ir ow + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> EncodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL=0, bufR=0 } else input{ bufL=ir } + !ro = output{ bufR=ow } + in (# st', why, ri, ro #) + loop :: Int -> Int -> EncodingBuffer# + loop !ir !ow st0 + | ir >= iw = done InputUnderflow ir ow st0 + | os - ow < 4 = done OutputUnderflow ir ow st0 | otherwise = do - (c,ir') <- readCharBuf iraw ir - if isSurrogate c then done InvalidSequence ir ow else do + let !(# st1, (c,ir') #) = unIO (readCharBuf iraw ir) st0 + if isSurrogate c then done InvalidSequence ir ow st1 else do let (c0,c1,c2,c3) = ord4 c - writeWord8Buf oraw ow c0 - writeWord8Buf oraw (ow+1) c1 - writeWord8Buf oraw (ow+2) c2 - writeWord8Buf oraw (ow+3) c3 - loop ir' (ow+4) + !(# st2, () #) = unIO (writeWord8Buf oraw ow c0) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw (ow+1) c1) st2 + !(# st4, () #) = unIO (writeWord8Buf oraw (ow+2) c2) st3 + !(# st5, () #) = unIO (writeWord8Buf oraw (ow+3) c3) st4 + loop ir' (ow+4) st5 in - loop ir0 ow0 + loop ir0 ow0 st -utf32le_encode :: EncodeBuffer +utf32le_encode :: EncodeBuffer# utf32le_encode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) - loop !ir !ow - | ir >= iw = done InputUnderflow ir ow - | os - ow < 4 = done OutputUnderflow ir ow + done :: CodingProgress -> Int -> Int -> EncodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL=0, bufR=0 } else input{ bufL=ir } + !ro = output{ bufR=ow } + in (# st', why, ri, ro #) + loop :: Int -> Int -> EncodingBuffer# + loop !ir !ow st0 + | ir >= iw = done InputUnderflow ir ow st0 + | os - ow < 4 = done OutputUnderflow ir ow st0 | otherwise = do - (c,ir') <- readCharBuf iraw ir - if isSurrogate c then done InvalidSequence ir ow else do + let !(# st1, (c,ir') #) = unIO (readCharBuf iraw ir) st0 + if isSurrogate c then done InvalidSequence ir ow st1 else do let (c0,c1,c2,c3) = ord4 c - writeWord8Buf oraw ow c3 - writeWord8Buf oraw (ow+1) c2 - writeWord8Buf oraw (ow+2) c1 - writeWord8Buf oraw (ow+3) c0 - loop ir' (ow+4) + !(# st2, () #) = unIO (writeWord8Buf oraw ow c3) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw (ow+1) c2) st2 + !(# st4, () #) = unIO (writeWord8Buf oraw (ow+2) c1) st3 + !(# st5, () #) = unIO (writeWord8Buf oraw (ow+3) c0) st4 + loop ir' (ow+4) st5 in - loop ir0 ow0 + loop ir0 ow0 st chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) = diff --git a/libraries/base/GHC/IO/Encoding/UTF8.hs b/libraries/base/GHC/IO/Encoding/UTF8.hs index a8d30d9749..f2f4b69176 100644 --- a/libraries/base/GHC/IO/Encoding/UTF8.hs +++ b/libraries/base/GHC/IO/Encoding/UTF8.hs @@ -3,6 +3,7 @@ , BangPatterns , NondecreasingIndentation , MagicHash + , UnboxedTuples #-} {-# OPTIONS_GHC -funbox-strict-fields #-} @@ -56,22 +57,22 @@ mkUTF8 cfm = TextEncoding { textEncodingName = "UTF-8", utf8_DF :: CodingFailureMode -> IO (TextDecoder ()) utf8_DF cfm = - return (BufferCodec { - encode = utf8_decode, - recover = recoverDecode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf8_decode, + recover# = recoverDecode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) utf8_EF :: CodingFailureMode -> IO (TextEncoder ()) utf8_EF cfm = - return (BufferCodec { - encode = utf8_encode, - recover = recoverEncode cfm, - close = return (), - getState = return (), - setState = const $ return () + return (BufferCodec# { + encode# = utf8_encode, + recover# = recoverEncode# cfm, + close# = return (), + getState# = return (), + setState# = const $ return () }) utf8_bom :: TextEncoding @@ -85,177 +86,188 @@ mkUTF8_bom cfm = TextEncoding { textEncodingName = "UTF-8BOM", utf8_bom_DF :: CodingFailureMode -> IO (TextDecoder Bool) utf8_bom_DF cfm = do ref <- newIORef True - return (BufferCodec { - encode = utf8_bom_decode ref, - recover = recoverDecode cfm, - close = return (), - getState = readIORef ref, - setState = writeIORef ref + return (BufferCodec# { + encode# = utf8_bom_decode ref, + recover# = recoverDecode# cfm, + close# = return (), + getState# = readIORef ref, + setState# = writeIORef ref }) utf8_bom_EF :: CodingFailureMode -> IO (TextEncoder Bool) utf8_bom_EF cfm = do ref <- newIORef True - return (BufferCodec { - encode = utf8_bom_encode ref, - recover = recoverEncode cfm, - close = return (), - getState = readIORef ref, - setState = writeIORef ref + return (BufferCodec# { + encode# = utf8_bom_encode ref, + recover# = recoverEncode# cfm, + close# = return (), + getState# = readIORef ref, + setState# = writeIORef ref }) -utf8_bom_decode :: IORef Bool -> DecodeBuffer +utf8_bom_decode :: IORef Bool -> DecodeBuffer# utf8_bom_decode ref input@Buffer{ bufRaw=iraw, bufL=ir, bufR=iw, bufSize=_ } output + st0 = do - first <- readIORef ref + let !(# st1, first #) = unIO (readIORef ref) st0 if not first - then utf8_decode input output + then utf8_decode input output st1 else do - let no_bom = do writeIORef ref False; utf8_decode input output - if iw - ir < 1 then return (InputUnderflow,input,output) else do - c0 <- readWord8Buf iraw ir + let no_bom = let !(# st', () #) = unIO (writeIORef ref False) st1 in utf8_decode input output st' + if iw - ir < 1 then (# st1,InputUnderflow,input,output #) else do + let !(# st2, c0 #) = unIO (readWord8Buf iraw ir) st1 if (c0 /= bom0) then no_bom else do - if iw - ir < 2 then return (InputUnderflow,input,output) else do - c1 <- readWord8Buf iraw (ir+1) + if iw - ir < 2 then (# st2,InputUnderflow,input,output #) else do + let !(# st3, c1 #) = unIO (readWord8Buf iraw (ir+1)) st2 if (c1 /= bom1) then no_bom else do - if iw - ir < 3 then return (InputUnderflow,input,output) else do - c2 <- readWord8Buf iraw (ir+2) + if iw - ir < 3 then (# st3,InputUnderflow,input,output #) else do + let !(# st4, c2 #) = unIO (readWord8Buf iraw (ir+2)) st3 if (c2 /= bom2) then no_bom else do -- found a BOM, ignore it and carry on - writeIORef ref False - utf8_decode input{ bufL = ir + 3 } output + let !(# st5, () #) = unIO (writeIORef ref False) st4 + utf8_decode input{ bufL = ir + 3 } output st5 -utf8_bom_encode :: IORef Bool -> EncodeBuffer +utf8_bom_encode :: IORef Bool -> EncodeBuffer# utf8_bom_encode ref input output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow, bufSize=os } + st0 = do - b <- readIORef ref - if not b then utf8_encode input output + let !(# st1, b #) = unIO (readIORef ref) st0 + if not b then utf8_encode input output st1 else if os - ow < 3 - then return (OutputUnderflow,input,output) + then (# st1,OutputUnderflow,input,output #) else do - writeIORef ref False - writeWord8Buf oraw ow bom0 - writeWord8Buf oraw (ow+1) bom1 - writeWord8Buf oraw (ow+2) bom2 - utf8_encode input output{ bufR = ow+3 } + let !(# st2, () #) = unIO (writeIORef ref False) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw ow bom0) st2 + !(# st4, () #) = unIO (writeWord8Buf oraw (ow+1) bom1) st3 + !(# st5, () #) = unIO (writeWord8Buf oraw (ow+2) bom2) st4 + utf8_encode input output{ bufR = ow+3 } st5 bom0, bom1, bom2 :: Word8 bom0 = 0xef bom1 = 0xbb bom2 = 0xbf -utf8_decode :: DecodeBuffer +utf8_decode :: DecodeBuffer# utf8_decode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | ir >= iw = done InputUnderflow ir ow + loop :: Int -> Int -> DecodingBuffer# + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | ir >= iw = done InputUnderflow ir ow st0 | otherwise = do - c0 <- readWord8Buf iraw ir + let !(# st1, c0 #) = unIO (readWord8Buf iraw ir) st0 case c0 of _ | c0 <= 0x7f -> do - ow' <- writeCharBuf oraw ow (unsafeChr (fromIntegral c0)) - loop (ir+1) ow' - | c0 >= 0xc0 && c0 <= 0xc1 -> invalid -- Overlong forms + let !(# st2, ow' #) = unIO (writeCharBuf oraw ow (unsafeChr (fromIntegral c0))) st1 + loop (ir+1) ow' st2 + | c0 >= 0xc0 && c0 <= 0xc1 -> invalid st1 -- Overlong forms | c0 >= 0xc2 && c0 <= 0xdf -> - if iw - ir < 2 then done InputUnderflow ir ow else do - c1 <- readWord8Buf iraw (ir+1) - if (c1 < 0x80 || c1 >= 0xc0) then invalid else do - ow' <- writeCharBuf oraw ow (chr2 c0 c1) - loop (ir+2) ow' + if iw - ir < 2 then done InputUnderflow ir ow st1 else do + let !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 + if (c1 < 0x80 || c1 >= 0xc0) then invalid st2 else do + let !(# st3, ow' #) = unIO (writeCharBuf oraw ow (chr2 c0 c1)) st2 + loop (ir+2) ow' st3 | c0 >= 0xe0 && c0 <= 0xef -> case iw - ir of - 1 -> done InputUnderflow ir ow + 1 -> done InputUnderflow ir ow st1 2 -> do -- check for an error even when we don't have -- the full sequence yet (#3341) - c1 <- readWord8Buf iraw (ir+1) + let !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 if not (validate3 c0 c1 0x80) - then invalid else done InputUnderflow ir ow + then invalid st2 else done InputUnderflow ir ow st2 _ -> do - c1 <- readWord8Buf iraw (ir+1) - c2 <- readWord8Buf iraw (ir+2) - if not (validate3 c0 c1 c2) then invalid else do - ow' <- writeCharBuf oraw ow (chr3 c0 c1 c2) - loop (ir+3) ow' + let !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 + let !(# st3, c2 #) = unIO (readWord8Buf iraw (ir+2)) st2 + if not (validate3 c0 c1 c2) then invalid st3 else do + let !(# st4, ow' #) = unIO (writeCharBuf oraw ow (chr3 c0 c1 c2)) st3 + loop (ir+3) ow' st4 | c0 >= 0xf0 -> case iw - ir of - 1 -> done InputUnderflow ir ow + 1 -> done InputUnderflow ir ow st1 2 -> do -- check for an error even when we don't have -- the full sequence yet (#3341) - c1 <- readWord8Buf iraw (ir+1) + let !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 if not (validate4 c0 c1 0x80 0x80) - then invalid else done InputUnderflow ir ow + then invalid st2 else done InputUnderflow ir ow st2 3 -> do - c1 <- readWord8Buf iraw (ir+1) - c2 <- readWord8Buf iraw (ir+2) + let !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 + !(# st3, c2 #) = unIO (readWord8Buf iraw (ir+2)) st2 if not (validate4 c0 c1 c2 0x80) - then invalid else done InputUnderflow ir ow + then invalid st3 else done InputUnderflow ir ow st3 _ -> do - c1 <- readWord8Buf iraw (ir+1) - c2 <- readWord8Buf iraw (ir+2) - c3 <- readWord8Buf iraw (ir+3) - if not (validate4 c0 c1 c2 c3) then invalid else do - ow' <- writeCharBuf oraw ow (chr4 c0 c1 c2 c3) - loop (ir+4) ow' + let !(# st2, c1 #) = unIO (readWord8Buf iraw (ir+1)) st1 + !(# st3, c2 #) = unIO (readWord8Buf iraw (ir+2)) st2 + !(# st4, c3 #) = unIO (readWord8Buf iraw (ir+3)) st3 + if not (validate4 c0 c1 c2 c3) then invalid st4 else do + let !(# st5, ow' #) = unIO (writeCharBuf oraw ow (chr4 c0 c1 c2 c3)) st4 + loop (ir+4) ow' st5 | otherwise -> - invalid + invalid st1 where - invalid = done InvalidSequence ir ow + invalid :: DecodingBuffer# + invalid st' = done InvalidSequence ir ow st' -- lambda-lifted, to avoid thunks being built in the inner-loop: - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> DecodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL = 0, bufR = 0} else input{ bufL = ir } + !ro = output { bufR = ow } + in (# st', why, ri, ro #) in - loop ir0 ow0 + loop ir0 ow0 st -utf8_encode :: EncodeBuffer +utf8_encode :: EncodeBuffer# utf8_encode input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } + st = let - done why !ir !ow = return (why, - if ir == iw then input{ bufL=0, bufR=0 } - else input{ bufL=ir }, - output{ bufR=ow }) - loop !ir !ow - | ow >= os = done OutputUnderflow ir ow - | ir >= iw = done InputUnderflow ir ow + {-# NOINLINE done #-} + done :: CodingProgress -> Int -> Int -> EncodingBuffer# + done why !ir !ow st' = + let !ri = if ir == iw then input{ bufL = 0, bufR = 0 } else input{ bufL = ir } + !ro = output{ bufR = ow } + in (# st', why, ri, ro #) + loop :: Int -> Int -> EncodingBuffer# + loop !ir !ow st0 + | ow >= os = done OutputUnderflow ir ow st0 + | ir >= iw = done InputUnderflow ir ow st0 | otherwise = do - (c,ir') <- readCharBuf iraw ir + let !(# st1, (c,ir') #) = unIO (readCharBuf iraw ir) st0 case ord c of x | x <= 0x7F -> do - writeWord8Buf oraw ow (fromIntegral x) - loop ir' (ow+1) + let !(# st2, () #) = unIO (writeWord8Buf oraw ow (fromIntegral x)) st1 + loop ir' (ow+1) st2 | x <= 0x07FF -> - if os - ow < 2 then done OutputUnderflow ir ow else do + if os - ow < 2 then done OutputUnderflow ir ow st1 else do let (c1,c2) = ord2 c - writeWord8Buf oraw ow c1 - writeWord8Buf oraw (ow+1) c2 - loop ir' (ow+2) - | x <= 0xFFFF -> if isSurrogate c then done InvalidSequence ir ow else do - if os - ow < 3 then done OutputUnderflow ir ow else do + !(# st2, () #) = unIO (writeWord8Buf oraw ow c1) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw (ow+1) c2) st2 + loop ir' (ow+2) st3 + | x <= 0xFFFF -> if isSurrogate c then done InvalidSequence ir ow st1 else do + if os - ow < 3 then done OutputUnderflow ir ow st1 else do let (c1,c2,c3) = ord3 c - writeWord8Buf oraw ow c1 - writeWord8Buf oraw (ow+1) c2 - writeWord8Buf oraw (ow+2) c3 - loop ir' (ow+3) + !(# st2, () #) = unIO (writeWord8Buf oraw ow c1) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw (ow+1) c2) st2 + !(# st4, () #) = unIO (writeWord8Buf oraw (ow+2) c3) st3 + loop ir' (ow+3) st4 | otherwise -> do - if os - ow < 4 then done OutputUnderflow ir ow else do + if os - ow < 4 then done OutputUnderflow ir ow st1 else do let (c1,c2,c3,c4) = ord4 c - writeWord8Buf oraw ow c1 - writeWord8Buf oraw (ow+1) c2 - writeWord8Buf oraw (ow+2) c3 - writeWord8Buf oraw (ow+3) c4 - loop ir' (ow+4) + !(# st2, () #) = unIO (writeWord8Buf oraw ow c1) st1 + !(# st3, () #) = unIO (writeWord8Buf oraw (ow+1) c2) st2 + !(# st4, () #) = unIO (writeWord8Buf oraw (ow+2) c3) st3 + !(# st5, () #) = unIO (writeWord8Buf oraw (ow+3) c4) st4 + loop ir' (ow+4) st5 in - loop ir0 ow0 + loop ir0 ow0 st -- ----------------------------------------------------------------------------- -- UTF-8 primitives, lifted from Data.Text.Fusion.Utf8 diff --git a/libraries/base/changelog.md b/libraries/base/changelog.md index a194bfad7f..26e7f36d5f 100644 --- a/libraries/base/changelog.md +++ b/libraries/base/changelog.md @@ -28,6 +28,7 @@ * Add more instances for `Compose`: `Enum`, `Bounded`, `Num`, `Real`, `Integral` ([CLC proposal #160](https://github.com/haskell/core-libraries-committee/issues/160)) * Make `(&)` representation polymorphic in the return type ([CLC proposal #158](https://github.com/haskell/core-libraries-committee/issues/158)) * Implement `GHC.IORef.atomicSwapIORef` via a new dedicated primop `atomicSwapMutVar#` ([CLC proposal #139](https://github.com/haskell/core-libraries-committee/issues/139)) + * Change codebuffers to use an unboxed implementation, while providing a compatibility layer using pattern synonyms. ([CLC proposal #134](https://github.com/haskell/core-libraries-committee/issues/134)) ## 4.18.0.0 *March 2023* * Shipped with GHC 9.6.1 -- cgit v1.2.1