summaryrefslogtreecommitdiff
path: root/testsuite
diff options
context:
space:
mode:
authorJohan Tibell <johan.tibell@gmail.com>2014-03-13 09:35:21 +0100
committerJohan Tibell <johan.tibell@gmail.com>2014-03-22 10:32:02 +0100
commit1eece45692fb5d1a5f4ec60c1537f8068237e9c1 (patch)
treeb5d99d52c5a6ab762f9b92dfd0504105122ed62b /testsuite
parent99ef27913dbe55fa57891bbf97d131e0933733e3 (diff)
downloadhaskell-1eece45692fb5d1a5f4ec60c1537f8068237e9c1.tar.gz
codeGen: inline allocation optimization for clone array primops
The inline allocation version is 69% faster than the out-of-line version, when cloning an array of 16 unit elements on a 64-bit machine. Comparing the new and the old primop implementations isn't straightforward. The old version had a missing heap check that I discovered during the development of the new version. Comparing the old and the new version would requiring fixing the old version, which in turn means reimplementing the equivalent of MAYBE_CG in StgCmmPrim. The inline allocation threshold is configurable via -fmax-inline-alloc-size which gives the maximum array size, in bytes, to allocate inline. The size does not include the closure header size. Allowing the same primop to be either inline or out-of-line has some implication for how we lay out heap checks. We always place a heap check around out-of-line primops, as they may allocate outside of our knowledge. However, for the inline primops we only allow allocation via the standard means (i.e. virtHp). Since the clone primops might be either inline or out-of-line the heap check layout code now consults shouldInlinePrimOp to know whether a primop will be inlined.
Diffstat (limited to 'testsuite')
-rw-r--r--testsuite/tests/codeGen/should_run/cgrun064.hs79
-rw-r--r--testsuite/tests/codeGen/should_run/cgrun064.stdout8
-rw-r--r--testsuite/tests/perf/should_run/InlineCloneArrayAlloc.hs24
-rw-r--r--testsuite/tests/perf/should_run/all.T7
4 files changed, 114 insertions, 4 deletions
diff --git a/testsuite/tests/codeGen/should_run/cgrun064.hs b/testsuite/tests/codeGen/should_run/cgrun064.hs
index 24544c4382..527c6bde67 100644
--- a/testsuite/tests/codeGen/should_run/cgrun064.hs
+++ b/testsuite/tests/codeGen/should_run/cgrun064.hs
@@ -9,15 +9,20 @@ import GHC.Exts hiding (IsList(..))
import GHC.Prim
import GHC.ST
+main :: IO ()
main = putStr
(test_copyArray
++ "\n" ++ test_copyMutableArray
++ "\n" ++ test_copyMutableArrayOverlap
++ "\n" ++ test_cloneArray
+ ++ "\n" ++ test_cloneArrayStatic
++ "\n" ++ test_cloneMutableArray
++ "\n" ++ test_cloneMutableArrayEmpty
+ ++ "\n" ++ test_cloneMutableArrayStatic
++ "\n" ++ test_freezeArray
+ ++ "\n" ++ test_freezeArrayStatic
++ "\n" ++ test_thawArray
+ ++ "\n" ++ test_thawArrayStatic
++ "\n"
)
@@ -32,6 +37,10 @@ len = 130
copied :: Int
copied = len - 2
+copiedStatic :: Int
+copiedStatic = 16
+{-# INLINE copiedStatic #-} -- to make sure optimization triggers
+
------------------------------------------------------------------------
-- copyArray#
@@ -90,9 +99,20 @@ test_cloneArray =
fill src 0 len
src <- unsafeFreezeArray src
-- Don't include the first and last element.
- return $ cloneArray src 1 copied
+ return $! cloneArray src 1 copied
in shows (toList dst copied) "\n"
+-- Check that the static-size optimization works.
+test_cloneArrayStatic :: String
+test_cloneArrayStatic =
+ let dst = runST $ do
+ src <- newArray len 0
+ fill src 0 len
+ src <- unsafeFreezeArray src
+ -- Don't include the first and last element.
+ return $! cloneArray src 1 copiedStatic
+ in shows (toList dst copiedStatic) "\n"
+
------------------------------------------------------------------------
-- cloneMutableArray#
@@ -117,6 +137,17 @@ test_cloneMutableArrayEmpty =
unsafeFreezeArray dst
in shows (toList dst 0) "\n"
+-- Check that the static-size optimization works.
+test_cloneMutableArrayStatic :: String
+test_cloneMutableArrayStatic =
+ let dst = runST $ do
+ src <- newArray len 0
+ fill src 0 len
+ -- Don't include the first and last element.
+ dst <- cloneMutableArray src 1 copiedStatic
+ unsafeFreezeArray dst
+ in shows (toList dst copiedStatic) "\n"
+
------------------------------------------------------------------------
-- freezeArray#
@@ -131,6 +162,16 @@ test_freezeArray =
freezeArray src 1 copied
in shows (toList dst copied) "\n"
+-- Check that the static-size optimization works.
+test_freezeArrayStatic :: String
+test_freezeArrayStatic =
+ let dst = runST $ do
+ src <- newArray len 0
+ fill src 0 len
+ -- Don't include the first and last element.
+ freezeArray src 1 copiedStatic
+ in shows (toList dst copiedStatic) "\n"
+
------------------------------------------------------------------------
-- thawArray#
@@ -147,6 +188,18 @@ test_thawArray =
unsafeFreezeArray dst
in shows (toList dst copied) "\n"
+-- Check that the static-size optimization works.
+test_thawArrayStatic :: String
+test_thawArrayStatic =
+ let dst = runST $ do
+ src <- newArray len 0
+ fill src 0 len
+ src <- unsafeFreezeArray src
+ -- Don't include the first and last element.
+ dst <- thawArray src 1 copiedStatic
+ unsafeFreezeArray dst
+ in shows (toList dst copiedStatic) "\n"
+
------------------------------------------------------------------------
-- Test helpers
@@ -181,13 +234,27 @@ newArray (I# n#) a = ST $ \s# -> case newArray# n# a s# of
(# s2#, marr# #) -> (# s2#, MArray marr# #)
indexArray :: Array a -> Int -> a
-indexArray arr (I# i#) = case indexArray# (unArray arr) i# of
- (# a #) -> a
+indexArray arr i@(I# i#)
+ | i < 0 || i >= len =
+ error $ "bounds error, offset " ++ show i ++ ", length " ++ show len
+ | otherwise = case indexArray# (unArray arr) i# of
+ (# a #) -> a
+ where len = lengthArray arr
writeArray :: MArray s a -> Int -> a -> ST s ()
-writeArray marr (I# i#) a = ST $ \ s# ->
+writeArray marr i@(I# i#) a
+ | i < 0 || i >= len =
+ error $ "bounds error, offset " ++ show i ++ ", length " ++ show len
+ | otherwise = ST $ \ s# ->
case writeArray# (unMArray marr) i# a s# of
s2# -> (# s2#, () #)
+ where len = lengthMArray marr
+
+lengthArray :: Array a -> Int
+lengthArray arr = I# (sizeofArray# (unArray arr))
+
+lengthMArray :: MArray s a -> Int
+lengthMArray marr = I# (sizeofMutableArray# (unMArray marr))
unsafeFreezeArray :: MArray s a -> ST s (Array a)
unsafeFreezeArray marr = ST $ \ s# ->
@@ -206,21 +273,25 @@ copyMutableArray src (I# six#) dst (I# dix#) (I# n#) = ST $ \ s# ->
cloneArray :: Array a -> Int -> Int -> Array a
cloneArray src (I# six#) (I# n#) = Array (cloneArray# (unArray src) six# n#)
+{-# INLINE cloneArray #-} -- to make sure optimization triggers
cloneMutableArray :: MArray s a -> Int -> Int -> ST s (MArray s a)
cloneMutableArray src (I# six#) (I# n#) = ST $ \ s# ->
case cloneMutableArray# (unMArray src) six# n# s# of
(# s2#, marr# #) -> (# s2#, MArray marr# #)
+{-# INLINE cloneMutableArray #-} -- to make sure optimization triggers
freezeArray :: MArray s a -> Int -> Int -> ST s (Array a)
freezeArray src (I# six#) (I# n#) = ST $ \ s# ->
case freezeArray# (unMArray src) six# n# s# of
(# s2#, arr# #) -> (# s2#, Array arr# #)
+{-# INLINE freezeArray #-} -- to make sure optimization triggers
thawArray :: Array a -> Int -> Int -> ST s (MArray s a)
thawArray src (I# six#) (I# n#) = ST $ \ s# ->
case thawArray# (unArray src) six# n# s# of
(# s2#, marr# #) -> (# s2#, MArray marr# #)
+{-# INLINE thawArray #-} -- to make sure optimization triggers
toList :: Array a -> Int -> [a]
toList arr n = go 0
diff --git a/testsuite/tests/codeGen/should_run/cgrun064.stdout b/testsuite/tests/codeGen/should_run/cgrun064.stdout
index 8e741ceec6..86ad8a276c 100644
--- a/testsuite/tests/codeGen/should_run/cgrun064.stdout
+++ b/testsuite/tests/codeGen/should_run/cgrun064.stdout
@@ -6,11 +6,19 @@
[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256,289,324,361,400,441,484,529,576,625,676,729,784,841,900,961,1024,1089,1156,1225,1296,1369,1444,1521,1600,1681,1764,1849,1936,2025,2116,2209,2304,2401,2500,2601,2704,2809,2916,3025,3136,3249,3364,3481,3600,3721,3844,3969,4096,4225,4356,4489,4624,4761,4900,5041,5184,5329,5476,5625,5776,5929,6084,6241,6400,6561,6724,6889,7056,7225,7396,7569,7744,7921,8100,8281,8464,8649,8836,9025,9216,9409,9604,9801,10000,10201,10404,10609,10816,11025,11236,11449,11664,11881,12100,12321,12544,12769,12996,13225,13456,13689,13924,14161,14400,14641,14884,15129,15376,15625,15876,16129,16384]
+[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256]
+
[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256,289,324,361,400,441,484,529,576,625,676,729,784,841,900,961,1024,1089,1156,1225,1296,1369,1444,1521,1600,1681,1764,1849,1936,2025,2116,2209,2304,2401,2500,2601,2704,2809,2916,3025,3136,3249,3364,3481,3600,3721,3844,3969,4096,4225,4356,4489,4624,4761,4900,5041,5184,5329,5476,5625,5776,5929,6084,6241,6400,6561,6724,6889,7056,7225,7396,7569,7744,7921,8100,8281,8464,8649,8836,9025,9216,9409,9604,9801,10000,10201,10404,10609,10816,11025,11236,11449,11664,11881,12100,12321,12544,12769,12996,13225,13456,13689,13924,14161,14400,14641,14884,15129,15376,15625,15876,16129,16384]
[]
+[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256]
+
[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256,289,324,361,400,441,484,529,576,625,676,729,784,841,900,961,1024,1089,1156,1225,1296,1369,1444,1521,1600,1681,1764,1849,1936,2025,2116,2209,2304,2401,2500,2601,2704,2809,2916,3025,3136,3249,3364,3481,3600,3721,3844,3969,4096,4225,4356,4489,4624,4761,4900,5041,5184,5329,5476,5625,5776,5929,6084,6241,6400,6561,6724,6889,7056,7225,7396,7569,7744,7921,8100,8281,8464,8649,8836,9025,9216,9409,9604,9801,10000,10201,10404,10609,10816,11025,11236,11449,11664,11881,12100,12321,12544,12769,12996,13225,13456,13689,13924,14161,14400,14641,14884,15129,15376,15625,15876,16129,16384]
+[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256]
+
[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256,289,324,361,400,441,484,529,576,625,676,729,784,841,900,961,1024,1089,1156,1225,1296,1369,1444,1521,1600,1681,1764,1849,1936,2025,2116,2209,2304,2401,2500,2601,2704,2809,2916,3025,3136,3249,3364,3481,3600,3721,3844,3969,4096,4225,4356,4489,4624,4761,4900,5041,5184,5329,5476,5625,5776,5929,6084,6241,6400,6561,6724,6889,7056,7225,7396,7569,7744,7921,8100,8281,8464,8649,8836,9025,9216,9409,9604,9801,10000,10201,10404,10609,10816,11025,11236,11449,11664,11881,12100,12321,12544,12769,12996,13225,13456,13689,13924,14161,14400,14641,14884,15129,15376,15625,15876,16129,16384]
+[1,4,9,16,25,36,49,64,81,100,121,144,169,196,225,256]
+
diff --git a/testsuite/tests/perf/should_run/InlineCloneArrayAlloc.hs b/testsuite/tests/perf/should_run/InlineCloneArrayAlloc.hs
new file mode 100644
index 0000000000..54243fe793
--- /dev/null
+++ b/testsuite/tests/perf/should_run/InlineCloneArrayAlloc.hs
@@ -0,0 +1,24 @@
+{-# LANGUAGE MagicHash, UnboxedTuples #-}
+module Main where
+
+import GHC.Exts
+import GHC.IO
+
+main :: IO ()
+main = do
+ marr <- newArray
+ loop 10000000 (unMArray marr)
+ where
+ loop :: Int -> MutableArray# RealWorld () -> IO ()
+ loop 0 _ = return ()
+ loop i marr = freezeArray marr >> loop (i-1) marr
+
+data MArray = MArray { unMArray :: !(MutableArray# RealWorld ()) }
+
+newArray :: IO MArray
+newArray = IO $ \s -> case newArray# 16# () s of
+ (# s', marr# #) -> (# s', MArray marr# #)
+
+freezeArray :: MutableArray# RealWorld () -> IO ()
+freezeArray marr# = IO $ \s -> case freezeArray# marr# 0# 16# s of
+ (# s', _ #) -> (# s', () #)
diff --git a/testsuite/tests/perf/should_run/all.T b/testsuite/tests/perf/should_run/all.T
index 14be74ed9d..1e1b6ccba8 100644
--- a/testsuite/tests/perf/should_run/all.T
+++ b/testsuite/tests/perf/should_run/all.T
@@ -344,3 +344,10 @@ test('InlineByteArrayAlloc',
only_ways(['normal'])],
compile_and_run,
['-O2'])
+
+test('InlineCloneArrayAlloc',
+ [stats_num_field('bytes allocated',
+ [ (wordsize(64), 1600041120, 5)]),
+ only_ways(['normal'])],
+ compile_and_run,
+ ['-O2'])