summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndreas Klebinger <klebinger.andreas@gmx.at>2019-08-05 18:42:10 +0200
committerMarge Bot <ben+marge-bot@smart-cactus.org>2019-08-14 16:55:42 -0400
commita38104b4df092d2155f61e9785e92ceb268b7e5f (patch)
tree089a1af9fc83ef560b2af571a0b376322965089f
parentb1d29c6732b9cc6e4b3b88b3c1a7943d0a9af4f2 (diff)
downloadhaskell-a38104b4df092d2155f61e9785e92ceb268b7e5f.tar.gz
Rework the Binary Integer instance.
We used to serialise large integers as strings. Now they are serialized as a list of Bytes. This changes the size for a Integer in the higher 64bit range from 77 to 9 bytes when written to disk. The impact on the general case is small (<1% for interface files) as we don't use many Integers. But for code that uses many this should be a nice benefit.
-rw-r--r--compiler/utils/Binary.hs96
1 files changed, 74 insertions, 22 deletions
diff --git a/compiler/utils/Binary.hs b/compiler/utils/Binary.hs
index 5734528458..baca4be929 100644
--- a/compiler/utils/Binary.hs
+++ b/compiler/utils/Binary.hs
@@ -4,6 +4,7 @@
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE MultiWayIf #-}
+{-# LANGUAGE BangPatterns #-}
{-# OPTIONS_GHC -O2 -funbox-strict-fields #-}
-- We always optimise this, otherwise performance of a non-optimised
@@ -79,11 +80,12 @@ import qualified Data.ByteString.Unsafe as BS
import Data.IORef
import Data.Char ( ord, chr )
import Data.Time
+import Data.List (unfoldr)
import Type.Reflection
import Type.Reflection.Unsafe
import Data.Kind (Type)
import GHC.Exts (TYPE, RuntimeRep(..), VecCount(..), VecElem(..))
-import Control.Monad ( when )
+import Control.Monad ( when, (<$!>) )
import System.IO as IO
import System.IO.Unsafe ( unsafeInterleaveIO )
import System.IO.Error ( mkIOError, eofErrorType )
@@ -502,40 +504,90 @@ instance Binary DiffTime where
get bh = do r <- get bh
return $ fromRational r
---to quote binary-0.3 on this code idea,
---
--- TODO This instance is not architecture portable. GMP stores numbers as
--- arrays of machine sized words, so the byte format is not portable across
--- architectures with different endianness and word size.
---
--- This makes it hard (impossible) to make an equivalent instance
--- with code that is compilable with non-GHC. Do we need any instance
--- Binary Integer, and if so, does it have to be blazing fast? Or can
--- we just change this instance to be portable like the rest of the
--- instances? (binary package has code to steal for that)
---
--- yes, we need Binary Integer and Binary Rational in basicTypes/Literal.hs
+{-
+Finally - a reasonable portable Integer instance.
+
+We used to encode values in the Int32 range as such,
+falling back to a string of all things. In either case
+we stored a tag byte to discriminate between the two cases.
+
+This made some sense as it's highly portable but also not very
+efficient.
+
+However GHC stores a surprisingly large number off large Integer
+values. In the examples looked at between 25% and 50% of Integers
+serialized were outside of the Int32 range.
+
+Consider a valie like `2724268014499746065`, some sort of hash
+actually generated by GHC.
+In the old scheme this was encoded as a list of 19 chars. This
+gave a size of 77 Bytes, one for the length of the list and 76
+since we encod chars as Word32 as well.
+
+We can easily do better. The new plan is:
+
+* Start with a tag byte
+ * 0 => Int32 value
+ * 1 => Int64
+ * 2 => Negative large interger
+ * 3 => Positive large integer
+* Followed by the value:
+ * Int32/64 is encoded as usual
+ * Large integers are encoded as a list of bytes (Word8).
+ We use Data.Bits which defines a bit order independent of the representation.
+ Values are stored LSB first.
+
+This means our example value `2724268014499746065` is now only 10 bytes large.
+* One byte tag
+* One byte for the length of the [Word8] list.
+* 8 bytes for the actual date.
+
+The new scheme also does not depend in any way on
+architecture specific details.
+
+The instance is used for in Binary Integer and Binary Rational in basicTypes/Literal.hs
+-}
instance Binary Integer where
put_ bh i
| i >= lo32 && i <= hi32 = do
putWord8 bh 0
put_ bh (fromIntegral i :: Int32)
- | otherwise = do
+ | i >= lo64 && i <= hi64 = do
putWord8 bh 1
- put_ bh (show i)
+ put_ bh (fromIntegral i :: Int64)
+ | otherwise = do
+ if i < 0
+ then putWord8 bh 2
+ else putWord8 bh 3
+ put_ bh (unroll $ abs i)
where
lo32 = fromIntegral (minBound :: Int32)
hi32 = fromIntegral (maxBound :: Int32)
-
+ lo64 = fromIntegral (minBound :: Int64)
+ hi64 = fromIntegral (maxBound :: Int64)
get bh = do
int_kind <- getWord8 bh
case int_kind of
- 0 -> fromIntegral <$> (get bh :: IO Int32)
- _ -> do str <- get bh
- case reads str of
- [(i, "")] -> return i
- _ -> fail ("Binary integer: got " ++ show str)
+ 0 -> fromIntegral <$!> (get bh :: IO Int32)
+ 1 -> fromIntegral <$!> (get bh :: IO Int64)
+ -- Large integer
+ _ -> do
+ !i <- roll <$!> (get bh :: IO [Word8]) :: IO Integer
+ if int_kind == 2 then return $! negate i -- Negative
+ else return $! i -- Positive
+
+unroll :: (Integral a, Bits a) => a -> [Word8]
+unroll = unfoldr step
+ where
+ step 0 = Nothing
+ step i = Just (fromIntegral i, i `shiftR` 8)
+
+roll :: (Integral a, Bits a) => [Word8] -> a
+roll = foldl' unstep 0 . reverse
+ where
+ unstep a b = a `shiftL` 8 .|. fromIntegral b
+
{-
-- This code is currently commented out.