59 files changed, 1471 insertions, 0 deletions
diff --git a/testsuite/tests/perf/should_run/Conversions.hs b/testsuite/tests/perf/should_run/Conversions.hs
new file mode 100644
index 0000000000..8432727965
--- /dev/null
+++ b/testsuite/tests/perf/should_run/Conversions.hs
@@ -0,0 +1,21 @@
+{-# LANGUAGE BangPatterns #-}
+
+-- | Tests that conversions between various primitive types (e.g.
+-- Word, Double, etc) doesn't allocate.
+module Main (main) where
+
+import Data.Word
+
+-- Repeatedly convert Words to Doubles
+loop :: Floating a => Word -> a
+loop n = go 0 0.0
+  where
+    go i !acc | i < n = go (i+1) (acc + fromIntegral i)
+              | otherwise = acc
+{-# SPECIALISE loop :: Word -> Float #-}
+{-# SPECIALISE loop :: Word -> Double #-}
+
+main :: IO ()
+main = do
+    print (loop 1000000 :: Float)
+    print (loop 1000000 :: Double)
diff --git a/testsuite/tests/perf/should_run/Conversions.stdout b/testsuite/tests/perf/should_run/Conversions.stdout
new file mode 100644
index 0000000000..2fe5b4dc9d
--- /dev/null
+++ b/testsuite/tests/perf/should_run/Conversions.stdout
@@ -0,0 +1,2 @@
+4.9994036e11
+4.999995e11
diff --git a/testsuite/tests/perf/should_run/Makefile b/testsuite/tests/perf/should_run/Makefile
new file mode 100644
index 0000000000..e839342f90
--- /dev/null
+++ b/testsuite/tests/perf/should_run/Makefile
@@ -0,0 +1,36 @@
+TOP=../../..
+include $(TOP)/mk/boilerplate.mk
+include $(TOP)/mk/test.mk
+
+define runT3736
+./T3736 $1 +RTS -t --machine-readable 2>&1 | grep '"bytes allocated"' | sed -e 's/.*, "//' -e 's/".*//'
+endef
+.PHONY: T3736
+T3736:
+	$(RM) -f T3736.hi T3736.o T3736
+	'$(TEST_HC)' $(TEST_HC_OPTS) -v0 -O --make T3736 -rtsopts
+# Check ALLOC1 is > 100 just to check with have sane results, and if so,
+# the test passes if the two numbers are equal. We could check that the
+# actual numbers are in the range we expect too (on the various
+# platforms), but we don't currently.
+	ALLOC1=`$(call runT3736,1)`; ALLOC2=`$(call runT3736,2)`; if [ "$$ALLOC1" -gt 100 ] && [ "$$ALLOC1" -eq "$$ALLOC2" ]; then echo Match; else echo "Mismatch: $$ALLOC1 $$ALLOC2"; fi
+
+.PHONY: T2902
+T2902:
+	$(RM) -f T2902_A    T2902_B
+	$(RM) -f T2902_A.hi T2902_B.hi
+	$(RM) -f T2902_A.o  T2902_B.o
+	$(RM) -f T2902_A_PairingSum.hi T2902_B_PairingSum.hi T2902_Sum.hi
+	$(RM) -f T2902_A_PairingSum.o  T2902_B_PairingSum.o  T2902_Sum.o
+	'$(TEST_HC)' $(TEST_HC_OPTS) -v0 -O --make T2902_A -rtsopts
+	'$(TEST_HC)' $(TEST_HC_OPTS) -v0 -O --make T2902_B -rtsopts
+	BAA=`./T2902_A +RTS -t --machine-readable 2>&1 | grep '"bytes allocated"' | sed -e 's/.*, "//' -e 's/")//'`; BAB=`./T2902_B +RTS -t --machine-readable 2>&1 | grep '"bytes allocated"' | sed -e 's/.*, "//' -e 's/")//'`; [ "$$BAA" = "" ] && echo 'T2902_A: No "bytes allocated"'; [ "$$BAA" = "$$BAB" ] || echo "T2902: Mismatch in \"bytes allocated\": $$BAA $$BAB"
+
+.PHONY: T149
+T149:
+	$(RM) -f T149_A T149_A.hi T149_A.o
+	$(RM) -f T149_B T149_B.hi T149_B.o
+	'$(TEST_HC)' $(TEST_HC_OPTS) -v0 -O --make T149_A -rtsopts
+	'$(TEST_HC)' $(TEST_HC_OPTS) -v0 -O --make T149_B -rtsopts
+	BAA=`./T149_A +RTS -t --machine-readable 2>&1 | grep '"bytes allocated"' | sed -e 's/.*, "//' -e 's/")//'`; BAB=`./T149_B +RTS -t --machine-readable 2>&1 | grep '"bytes allocated"' | sed -e 's/.*, "//' -e 's/")//'`; [ "$$BAA" = "" ] && echo 'T149_A: No "bytes allocated"'; [ "$$BAA" = "$$BAB" ] || echo "T149: Mismatch in \"bytes allocated\": $$BAA $$BAB"
+
diff --git a/testsuite/tests/perf/should_run/MethSharing.hs b/testsuite/tests/perf/should_run/MethSharing.hs
new file mode 100644
index 0000000000..fb69bd4509
--- /dev/null
+++ b/testsuite/tests/perf/should_run/MethSharing.hs
@@ -0,0 +1,97 @@
+module Main where
+
+-- This test works efficiently because the full laziness
+-- pass now floats out applications
+-- 	\x -> f y (x+1)
+-- It'll float out the (f y) if that's a redex
+
+loop :: Double -> [Int] -> Double
+{-# NOINLINE loop #-}
+loop x [] = x
+loop x (n:ns) = x `seq` loop (x ^ n) ns
+
+main = print $ loop 1 (replicate 10000000 5)
+
+----------------------------------------------------
+{-  Roman's message of May 2010
+
+I tried running nofib with -fno-method-sharing (we discussed this at some point). These are the results:
+
+--------------------------------------------------------------------------------
+        Program           Size    Allocs   Runtime   Elapsed
+--------------------------------------------------------------------------------
+
+            Min          -0.3%    -25.0%    -12.5%     -9.9%
+            Max          +0.2%   +159.1%    +90.0%    +84.7%
+ Geometric Mean          -0.0%     +2.2%     +6.8%     +5.1%
+
+This is the worst program:
+
+         simple          +0.2%   +159.1%    +65.3%    +63.9%
+
+I looked at it a bit and came up with this small example:
+
+----
+loop :: Double -> [Int] -> Double
+{-# NOINLINE loop #-}
+loop x [] = x
+loop x (n:ns) = x `seq` loop (x ^ n) ns
+
+main = print $ loop 1 (replicate 10000000 5)
+----
+
+This is over 2x slower with -fno-method-sharing. The culprit is, of
+course, (^). Here is the difference:
+
+Without -fno-method-sharing:
+
+----
+^_rVB :: GHC.Types.Double -> GHC.Types.Int -> GHC.Types.Double ^_rVB =
+  GHC.Real.^
+    @ GHC.Types.Double
+    @ GHC.Types.Int
+    GHC.Float.$fNumDouble
+    GHC.Real.$fIntegralInt
+
+Main.loop [InlPrag=NOINLINE (sat-args=2), Occ=LoopBreaker]
+  :: GHC.Types.Double -> [GHC.Types.Int] -> GHC.Types.Double Main.loop =
+  \ (x1_aat :: GHC.Types.Double) (ds_drG :: [GHC.Types.Int]) ->
+    case ds_drG of _ {
+      [] -> x1_aat;
+      : n_aav ns_aaw ->
+        case x1_aat of x2_aau { GHC.Types.D# ipv_srQ ->
+        Main.loop (^_rVB x2_aau n_aav) ns_aaw
+        }
+    }
+----
+
+With:
+
+----
+Main.loop [InlPrag=NOINLINE (sat-args=2), Occ=LoopBreaker]
+  :: GHC.Types.Double -> [GHC.Types.Int] -> GHC.Types.Double Main.loop =
+  \ (x1_aat :: GHC.Types.Double) (ds_drD :: [GHC.Types.Int]) ->
+    case ds_drD of _ {
+      [] -> x1_aat;
+      : n_aav ns_aaw ->
+        case x1_aat of x2_aau { GHC.Types.D# ipv_srN ->
+        Main.loop
+          (GHC.Real.^
+             @ GHC.Types.Double
+             @ GHC.Types.Int
+             GHC.Float.$fNumDouble
+             GHC.Real.$fIntegralInt
+             x2_aau
+             n_aav)
+          ns_aaw
+        }
+    }
+----
+
+This is a bit disappointing. I would have expected GHC to float out
+the application of (^) to the two dictionaries during full laziness
+(note that (^) has arity 2 so the application is oversaturated). Why
+doesn't that happen? SetLevels (if this is the right place to look)
+has this:
+ 
+-}
+\ No newline at end of file
diff --git a/testsuite/tests/perf/should_run/MethSharing.stdout b/testsuite/tests/perf/should_run/MethSharing.stdout
new file mode 100644
index 0000000000..d3827e75a5
--- /dev/null
+++ b/testsuite/tests/perf/should_run/MethSharing.stdout
@@ -0,0 +1 @@
+1.0
diff --git a/testsuite/tests/perf/should_run/T149_A.hs b/testsuite/tests/perf/should_run/T149_A.hs
new file mode 100644
index 0000000000..dd745460c0
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T149_A.hs
@@ -0,0 +1,25 @@
+module Main (main) where
+
+-- See Trac #149
+
+-- Curently (with GHC 7.0) the CSE works, just,
+-- but it's delicate.
+
+
+import System.CPUTime
+
+main :: IO ()
+main = print $ playerMostOccur1 [1..m]
+
+m :: Int
+m = 22
+
+playerMostOccur1 :: [Int] -> Int
+playerMostOccur1 [a] = a
+playerMostOccur1 (x:xs)
+ | numOccur x (x:xs) > numOccur (playerMostOccur1 xs) xs = x
+ | otherwise = playerMostOccur1 xs
+
+numOccur :: Int -> [Int] -> Int
+numOccur i is = length $ filter (i ==) is
+
diff --git a/testsuite/tests/perf/should_run/T149_B.hs b/testsuite/tests/perf/should_run/T149_B.hs
new file mode 100644
index 0000000000..fcc87cdf55
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T149_B.hs
@@ -0,0 +1,26 @@
+module Main (main) where
+
+-- See Trac #149
+
+-- Curently (with GHC 7.0) the CSE works, just,
+-- but it's delicate.
+
+
+import System.CPUTime
+
+main :: IO ()
+main = print $ playerMostOccur2 [1..m]
+
+m :: Int
+m = 22
+
+playerMostOccur2 :: [Int] -> Int
+playerMostOccur2 [a] = a
+playerMostOccur2 (x:xs)
+ | numOccur x (x:xs) > numOccur pmo xs = x
+ | otherwise = pmo
+    where pmo = playerMostOccur2 xs
+
+numOccur :: Int -> [Int] -> Int
+numOccur i is = length $ filter (i ==) is
+
diff --git a/testsuite/tests/perf/should_run/T2902_A.hs b/testsuite/tests/perf/should_run/T2902_A.hs
new file mode 100644
index 0000000000..c0939104f3
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T2902_A.hs
@@ -0,0 +1,18 @@
+
+{-# LANGUAGE UnicodeSyntax #-}
+
+module Main (main) where
+
+import T2902_A_PairingSum
+
+f :: Int -> PSum Int Int
+f n = unions $ fmap g [1..n]
+  where
+    g m = unions $ fmap fromList
+      [ zip [m..n] $ repeat 1
+      , zip [m,2+m..n] $ repeat 2
+      , zip [m,3+m..n] $ repeat 3
+      ]
+
+main ∷ IO ()
+main = print $ take 20 $ toList $ f 20
diff --git a/testsuite/tests/perf/should_run/T2902_A_PairingSum.hs b/testsuite/tests/perf/should_run/T2902_A_PairingSum.hs
new file mode 100644
index 0000000000..6dc5fb484e
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T2902_A_PairingSum.hs
@@ -0,0 +1,49 @@
+
+{-# LANGUAGE UnicodeSyntax, MultiParamTypeClasses, FlexibleInstances #-}
+
+module T2902_A_PairingSum (Sum(..), PSum) where
+
+import T2902_Sum
+
+data PSum a b = Empty | Tree a b [(PSum a b)]
+
+instance (Ord a, Eq b, Num b) ⇒ Sum PSum a b where
+  insert     = insertX
+  union      = unionX
+  unions     = unionsX
+  extractMin = extractMinX
+  fromList   = fromListX
+  toList     = toListX
+
+insertX ∷ (Ord a, Eq b, Num b) ⇒ a → b → PSum a b → PSum a b
+insertX v r = unionX $ Tree v r []
+
+unionX ∷ (Ord a, Eq b, Num b) ⇒ PSum a b → PSum a b → PSum a b
+unionX x Empty = x
+unionX Empty x = x
+unionX x@(Tree v r xs) y@(Tree w s ys) =
+  case compare v w of
+    LT → Tree v r (y:xs)
+    GT → Tree w s (x:ys)
+    EQ → case r + s of
+      0 → z
+      t → insertX v t z
+  where z = unionX (unionsX xs) (unionsX ys)
+
+unionsX ∷ (Ord a, Eq b, Num b) ⇒ [PSum a b] → PSum a b
+unionsX [] = Empty
+unionsX [x] = x
+unionsX (x : y : zs) = unionX (unionX x y) (unionsX zs)
+
+extractMinX ∷ (Ord a, Eq b, Num b) ⇒ PSum a b → ((a,b), PSum a b)
+extractMinX Empty = undefined
+extractMinX (Tree v r xs) = ((v,r), unionsX xs)
+
+fromListX ∷ (Ord a, Eq b, Num b) ⇒ [(a,b)] → PSum a b
+fromListX [] = Empty
+fromListX ((v,r):xs) = insertX v r $ fromListX xs
+
+toListX ∷ (Ord a, Eq b, Num b) ⇒ PSum a b → [(a,b)]
+toListX Empty = []
+toListX x = let (y, z) = extractMinX x in y : toListX z
+
diff --git a/testsuite/tests/perf/should_run/T2902_B.hs b/testsuite/tests/perf/should_run/T2902_B.hs
new file mode 100644
index 0000000000..c6558c625b
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T2902_B.hs
@@ -0,0 +1,18 @@
+
+{-# LANGUAGE UnicodeSyntax #-}
+
+module Main (main) where
+
+import T2902_B_PairingSum
+
+f :: Int -> PSum Int Int
+f n = unions $ fmap g [1..n]
+  where
+    g m = unions $ fmap fromList
+      [ zip [m..n] $ repeat 1
+      , zip [m,2+m..n] $ repeat 2
+      , zip [m,3+m..n] $ repeat 3
+      ]
+
+main ∷ IO ()
+main = print $ take 20 $ toList $ f 20
diff --git a/testsuite/tests/perf/should_run/T2902_B_PairingSum.hs b/testsuite/tests/perf/should_run/T2902_B_PairingSum.hs
new file mode 100644
index 0000000000..baf5885798
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T2902_B_PairingSum.hs
@@ -0,0 +1,37 @@
+
+{-# LANGUAGE UnicodeSyntax, MultiParamTypeClasses, FlexibleInstances #-}
+
+module T2902_B_PairingSum (Sum(..), PSum) where
+
+import T2902_Sum
+
+data PSum a b = Empty | Tree a b [PSum a b]
+
+instance (Ord a, Eq b, Num b) ⇒ Sum PSum a b where
+
+  insert v r = union $ Tree v r []
+
+  union x Empty = x
+  union Empty x = x
+  union x@(Tree v r xs) y@(Tree w s ys) =
+    case compare v w of
+      LT → Tree v r (y:xs)
+      GT → Tree w s (x:ys)
+      EQ → case r + s of
+        0 → z
+        t → insert v t z
+    where z = union (unions xs) (unions ys)
+
+  unions [] = Empty
+  unions [x] = x
+  unions (x : y : zs) = union (union x y) (unions zs)
+
+  extractMin Empty = undefined
+  extractMin (Tree v r xs) = ((v,r), unions xs)
+
+  fromList [] = Empty
+  fromList ((v,r):xs) = insert v r $ fromList xs
+
+  toList Empty = []
+  toList x = let (y, z) = extractMin x in y : toList z
+
diff --git a/testsuite/tests/perf/should_run/T2902_Sum.hs b/testsuite/tests/perf/should_run/T2902_Sum.hs
new file mode 100644
index 0000000000..9be6b10568
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T2902_Sum.hs
@@ -0,0 +1,14 @@
+
+{-# LANGUAGE UnicodeSyntax, MultiParamTypeClasses #-}
+
+module T2902_Sum (Sum(..)) where
+
+class Sum c a b where
+  insert     ∷ a → b → c a b → c a b
+  union      ∷ c a b → c a b → c a b
+  unions     ∷ [c a b] → c a b
+  extractMin ∷ c a b → ((a,b), c a b)
+
+  fromList   ∷ [(a,b)] → c a b
+  toList     ∷ c a b → [(a,b)]
+
diff --git a/testsuite/tests/perf/should_run/T3245.hs b/testsuite/tests/perf/should_run/T3245.hs
new file mode 100644
index 0000000000..d345fed38b
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T3245.hs
@@ -0,0 +1,47 @@
+{-# LANGUAGE ExistentialQuantification, DeriveDataTypeable #-}
+
+-- The second version (count2) took ages with GHC 6.12
+-- because the typeOf function was not properly memoised
+
+import Data.Typeable
+import System.CPUTime
+
+size :: Int
+size = 40000	-- This was big enough to take 5 seconds in 
+       		-- the bad case on my machine.
+
+data Any = forall a. (Typeable a) => Any a
+
+int_type, int_list_type :: TypeRep
+int_type = typeOf (undefined :: Int)
+int_list_type = typeOf (undefined :: [Int])
+
+count1 :: [Any] -> Int
+count1 [] = 0
+count1 (Any x:xs) = count1 xs + (if typeOf x == int_type then 1 else 0)
+
+doTime x = do
+  start <- getCPUTime
+  putStr "Result: "
+  print x
+  stop <- getCPUTime
+  putStr "Time(sec): "
+  print (round $ fromIntegral (stop - start) / 1e12)
+    -- The 'round' rounds to an integral number of seconds
+    -- Should be zero if things are working right!
+
+main = do
+  let list = [MkT | i <- [1..size  :: Int]]
+  putStrLn "count1"
+  let x = map Any list
+  doTime $ count1 x
+  doTime $ count1 x
+  doTime $ count1 x
+  putStrLn ""
+  putStrLn "count2"
+  let x = map (Any . (:[])) list
+  doTime $ count1 x
+  doTime $ count1 x
+  doTime $ count1 x
+
+data T = MkT deriving Typeable
diff --git a/testsuite/tests/perf/should_run/T3245.stdout b/testsuite/tests/perf/should_run/T3245.stdout
new file mode 100644
index 0000000000..bcff7f8fd1
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T3245.stdout
@@ -0,0 +1,15 @@
+count1
+Result: 0
+Time(sec): 0
+Result: 0
+Time(sec): 0
+Result: 0
+Time(sec): 0
+
+count2
+Result: 0
+Time(sec): 0
+Result: 0
+Time(sec): 0
+Result: 0
+Time(sec): 0
diff --git a/testsuite/tests/perf/should_run/T3586.hs b/testsuite/tests/perf/should_run/T3586.hs
new file mode 100644
index 0000000000..968f2eba27
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T3586.hs
@@ -0,0 +1,20 @@
+{-# LANGUAGE BangPatterns #-}
+{-# OPTIONS -fvia-C -optc-O3 -fexcess-precision -optc-msse3 #-}
+
+import Control.Monad.ST
+import Data.Array.ST
+import Data.Array.Base
+
+main = print $ runST
+          (do arr <- newArray (1,2000000) 137.0 :: ST s (STUArray s Int Double)
+              go arr 2000000 0.0 )
+
+
+go :: STUArray s Int Double -> Int -> Double -> ST s Double
+go !a i !acc
+    | i < 1     = return acc
+    | otherwise = do
+         b <- unsafeRead a i
+         unsafeWrite a i (b+3.0)
+         c <- unsafeRead a i
+         go a (i-1) (c+acc)
diff --git a/testsuite/tests/perf/should_run/T3586.stdout b/testsuite/tests/perf/should_run/T3586.stdout
new file mode 100644
index 0000000000..626282f10c
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T3586.stdout
@@ -0,0 +1 @@
+2.79999863e8
diff --git a/testsuite/tests/perf/should_run/T3736.hs b/testsuite/tests/perf/should_run/T3736.hs
new file mode 100644
index 0000000000..3a399ada0f
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T3736.hs
@@ -0,0 +1,212 @@
+{-# OPTIONS_GHC -funbox-strict-fields -O #-}
+{-# LANGUAGE ExistentialQuantification #-}
+
+{- OPTIONS_GHC -ddump-simpl -ddump-asm -}
+
+module Main (main) where
+
+import GHC.Float (float2Int, int2Float)
+
+import System.Environment
+
+import Prelude hiding           (null
+                                ,lines,unlines
+                                ,writeFile
+                                )
+
+import Control.Exception        (assert, bracket, )
+
+import Foreign.Marshal.Array    (advancePtr)
+import Foreign.Ptr              (minusPtr)
+import Foreign.Storable         (Storable(..))
+
+import Control.Monad            (when)
+
+import System.IO                (openBinaryFile, hClose,
+                                 hPutBuf,
+                                 Handle, IOMode(..))
+
+import System.IO.Unsafe         (unsafePerformIO)
+
+import Foreign.Ptr              (Ptr)
+import Foreign.ForeignPtr       (ForeignPtr, withForeignPtr, )
+import Foreign.Marshal.Array    (copyArray)
+
+import qualified Foreign.ForeignPtr as F
+
+main :: IO ()
+main = do args <- getArgs
+          case args of
+              ["1"] -> mainMonolithic1Generator
+              ["2"] -> mainMonolithic1Composed
+              _ -> error "Huh?"
+
+type Phase = (Float, Float, Float)
+
+{-# INLINE saw #-}
+saw :: Num a => a -> a
+saw t = 1-2*t
+
+{-# INLINE fraction #-}
+fraction :: Float -> Float
+fraction x = x - int2Float (float2Int x)
+
+{-# INLINE generator0Freq #-}
+generator0Freq :: Float -> Float -> Maybe (Float, Float)
+generator0Freq freq =
+   \p -> Just (saw p, fraction (p+freq))
+
+infixl 6 `mix`, `mixGen`
+
+{-# INLINE mix #-}
+mix ::
+   (Num y) =>
+   (s -> Maybe (y, s)) ->
+   (t -> Maybe (y, t)) ->
+   ((s,t) -> Maybe (y, (s,t)))
+mix f g (s0,t0) =
+   do (a,s1) <- f s0
+      (b,t1) <- g t0
+      return ((a+b), (s1,t1))
+
+data Generator a =
+   forall s.
+      Generator (s -> Maybe (a, s)) s
+
+{-# INLINE runGeneratorMonolithic #-}
+runGeneratorMonolithic :: Int -> Generator Float -> Vector Float
+runGeneratorMonolithic size' (Generator f s) =
+   fst $ unfoldrN size' f s
+
+{- SPECIALISE INLINE generator0Gen :: Float -> Float -> Generator Float -}
+{-# INLINE generator0Gen #-}
+generator0Gen :: Float -> Float -> Generator Float
+generator0Gen freq phase =
+   Generator (\p -> Just (saw p, fraction (p+freq))) phase
+
+{- SPECIALISE INLINE mixGen :: Generator Float -> Generator Float -> Generator Float -}
+{-# INLINE mixGen #-}
+mixGen ::
+   (Num y) =>
+   Generator y ->
+   Generator y ->
+   Generator y
+mixGen (Generator f s) (Generator g t) =
+   Generator (\(s0,t0) ->
+      do (a,s1) <- f s0
+         (b,t1) <- g t0
+         return ((a+b), (s1,t1))) (s,t)
+
+{-# INLINE dl #-}
+dl :: Phase
+dl = (0.01008, 0.01003, 0.00990)
+
+{-# INLINE initPhase2 #-}
+initPhase2 :: (Phase, Phase)
+initPhase2 =
+   ((0,0.7,0.1), (0.3,0.4,0.6))
+
+
+size :: Int
+size = 10000000
+
+
+mainMonolithic1Composed :: IO ()
+mainMonolithic1Composed =
+   writeFile "T3736.speed.f32"
+      (fst $ unfoldrN size
+          (let (f0,f1,f2) = dl
+           in  generator0Freq f0 `mix`
+               generator0Freq f1 `mix`
+               generator0Freq f2)
+          (let (p0,p1,p2) = fst initPhase2
+           in  ((p0,p1),p2)))
+
+mainMonolithic1Generator :: IO ()
+mainMonolithic1Generator =
+   writeFile "T3736.speed.f32"
+      (runGeneratorMonolithic size
+          (let (f0,f1,f2) = dl
+               (p0,p1,p2) = fst initPhase2
+           in  generator0Gen f0 p0 `mixGen`
+               generator0Gen f1 p1 `mixGen`
+               generator0Gen f2 p2))
+
+empty :: (Storable a) => Vector a
+empty = unsafeCreate 0 $ const $ return ()
+{-# NOINLINE empty #-}
+
+null :: Vector a -> Bool
+null (SV _ _ l) = assert (l >= 0) $ l <= 0
+{-# INLINE null #-}
+
+unfoldrN :: (Storable b) => Int -> (a -> Maybe (b, a)) -> a -> (Vector b, Maybe a)
+unfoldrN n f x0 =
+   if n <= 0
+     then (empty, Just x0)
+     else unsafePerformIO $ createAndTrim' n $ \p -> go p n x0
+       where
+          go = arguments2 $ \p i -> \x ->
+             if i == 0
+               then return (0, n-i, Just x)
+               else
+                 case f x of
+                   Nothing     -> return (0, n-i, Nothing)
+                   Just (w,x') -> do poke p w
+                                     go (incPtr p) (i-1) x'
+{-# INLINE unfoldrN #-}
+
+hPut :: (Storable a) => Handle -> Vector a -> IO ()
+hPut h v =
+   when (not (null v)) $
+      withStartPtr v $ \ ptrS l ->
+         let ptrE = advancePtr ptrS l
+         in  hPutBuf h ptrS (minusPtr ptrE ptrS)
+
+writeFile :: (Storable a) => FilePath -> Vector a -> IO ()
+writeFile f txt =
+   bracket (openBinaryFile f WriteMode) hClose
+      (\h -> hPut h txt)
+
+data Vector a = SV {-# UNPACK #-} !(ForeignPtr a)
+                   {-# UNPACK #-} !Int                -- offset
+                   {-# UNPACK #-} !Int                -- length
+
+withStartPtr :: Storable a => Vector a -> (Ptr a -> Int -> IO b) -> IO b
+withStartPtr (SV x s l) f =
+   withForeignPtr x $ \p -> f (p `advancePtr` s) l
+{-# INLINE withStartPtr #-}
+
+incPtr :: (Storable a) => Ptr a -> Ptr a
+incPtr v = advancePtr v 1
+{-# INLINE incPtr #-}
+
+unsafeCreate :: (Storable a) => Int -> (Ptr a -> IO ()) -> Vector a
+unsafeCreate l f = unsafePerformIO (create l f)
+{-# INLINE unsafeCreate #-}
+
+create :: (Storable a) => Int -> (Ptr a -> IO ()) -> IO (Vector a)
+create l f = do
+    fp <- mallocForeignPtrArray l
+    withForeignPtr fp $ \p -> f p
+    return $! SV fp 0 l
+
+createAndTrim' :: (Storable a) => Int
+                               -> (Ptr a -> IO (Int, Int, b))
+                               -> IO (Vector a, b)
+createAndTrim' l f = do
+    fp <- mallocForeignPtrArray l
+    withForeignPtr fp $ \p -> do
+        (off, l', res) <- f p
+        if assert (l' <= l) $ l' >= l
+            then return $! (SV fp 0 l, res)
+            else do ps <- create l' $ \p' -> copyArray p' (p `advancePtr` off) l'
+                    return $! (ps, res)
+
+{-# INLINE arguments2 #-}
+arguments2 :: (a -> b -> x) -> a -> b -> x
+arguments2 f = \a b -> (f $! a) $! b
+
+{-# INLINE mallocForeignPtrArray #-}
+mallocForeignPtrArray :: Storable a => Int -> IO (F.ForeignPtr a)
+mallocForeignPtrArray = F.mallocForeignPtrArray
diff --git a/testsuite/tests/perf/should_run/T3736.stdout b/testsuite/tests/perf/should_run/T3736.stdout
new file mode 100644
index 0000000000..1796dc2720
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T3736.stdout
@@ -0,0 +1 @@
+Match
diff --git a/testsuite/tests/perf/should_run/T3738.hs b/testsuite/tests/perf/should_run/T3738.hs
new file mode 100644
index 0000000000..1b3141c0f5
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T3738.hs
@@ -0,0 +1,10 @@
+
+module Main where
+
+import T3738a
+
+{-# INLINE bar #-}
+bar :: Int -> [Int]
+bar x = map (+ 2) (foo x)
+
+main = print (bar 2 !! 10000)
diff --git a/testsuite/tests/perf/should_run/T3738.stdout b/testsuite/tests/perf/should_run/T3738.stdout
new file mode 100644
index 0000000000..7ed6ff82de
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T3738.stdout
@@ -0,0 +1 @@
+5
diff --git a/testsuite/tests/perf/should_run/T3738a.hs b/testsuite/tests/perf/should_run/T3738a.hs
new file mode 100644
index 0000000000..b27451d12d
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T3738a.hs
@@ -0,0 +1,6 @@
+
+module T3738a where
+
+{-# INLINE foo #-}
+foo :: Num a => a -> [a]
+foo x = map (+ 1) (repeat x)
diff --git a/testsuite/tests/perf/should_run/T4321.hs b/testsuite/tests/perf/should_run/T4321.hs
new file mode 100644
index 0000000000..b8a0dbc4a1
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4321.hs
@@ -0,0 +1,15 @@
+
+-- In 6.13 this stack overflowed
+
+module Main (main) where
+
+main :: IO ()
+main = let n = 1000000
+       in print $ integrate n (1 / fromIntegral n)
+
+integrate :: Int -> Double -> Double
+integrate n h = h * (sum (map area [1..n]))
+    where area :: Int -> Double
+          area i = let x = h * (fromIntegral i - 0.5)
+                   in 4 / (1 + x*x)
+
diff --git a/testsuite/tests/perf/should_run/T4321.stdout b/testsuite/tests/perf/should_run/T4321.stdout
new file mode 100644
index 0000000000..ce98a198a7
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4321.stdout
@@ -0,0 +1 @@
+3.1415926535897643
diff --git a/testsuite/tests/perf/should_run/T4474a.hs b/testsuite/tests/perf/should_run/T4474a.hs
new file mode 100644
index 0000000000..ef70a6ae69
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4474a.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE BangPatterns #-}
+
+module Main where
+
+data Tree = Leaf !Int | Fork !Tree !Tree deriving Show
+
+fullTree 0 = Leaf 1
+fullTree n = let t = fullTree (n - 1) in Fork t t
+
+flatListNaive (Leaf n)   = [n]
+flatListNaive (Fork a b) = flatListNaive a ++ flatListNaive b
+
+flatListCons t = flat t []
+  where
+  flat (Leaf n)   ns = n : ns
+  flat (Fork a b) ns = flat a (flat b ns)
+
+flatListCons2 t = flat t []
+  where
+  flat (Leaf n)   = \ns -> n : ns
+  flat (Fork a b) = \ns -> flat a (flat b ns)
+
+flatListCons3 t = flat t []
+  where
+  flat (Leaf n)   = (n :)
+  flat (Fork a b) = flat a . flat b
+
+flatDList (Leaf n)   = (n :)
+flatDList (Fork a b) = flatDList a . flatDList b
+
+sumList l = loop 0 l
+  where loop !c [] = c
+        loop !c (h:t) = loop (c + h) t
+
+sumDList l = loop 0 (l [])
+  where loop !c []      = c
+        loop !c (h : t) = loop (c + h) t
+
+main = print $ sumList  $ flatListCons  $ fullTree 26
+
diff --git a/testsuite/tests/perf/should_run/T4474a.stdout b/testsuite/tests/perf/should_run/T4474a.stdout
new file mode 100644
index 0000000000..e6c68622ac
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4474a.stdout
@@ -0,0 +1 @@
+67108864
diff --git a/testsuite/tests/perf/should_run/T4474b.hs b/testsuite/tests/perf/should_run/T4474b.hs
new file mode 100644
index 0000000000..fd931b4223
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4474b.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE BangPatterns #-}
+
+module Main where
+
+data Tree = Leaf !Int | Fork !Tree !Tree deriving Show
+
+fullTree 0 = Leaf 1
+fullTree n = let t = fullTree (n - 1) in Fork t t
+
+flatListNaive (Leaf n)   = [n]
+flatListNaive (Fork a b) = flatListNaive a ++ flatListNaive b
+
+flatListCons t = flat t []
+  where
+  flat (Leaf n)   ns = n : ns
+  flat (Fork a b) ns = flat a (flat b ns)
+
+flatListCons2 t = flat t []
+  where
+  flat (Leaf n)   = \ns -> n : ns
+  flat (Fork a b) = \ns -> flat a (flat b ns)
+
+flatListCons3 t = flat t []
+  where
+  flat (Leaf n)   = (n :)
+  flat (Fork a b) = flat a . flat b
+
+flatDList (Leaf n)   = (n :)
+flatDList (Fork a b) = flatDList a . flatDList b
+
+sumList l = loop 0 l
+  where loop !c [] = c
+        loop !c (h:t) = loop (c + h) t
+
+sumDList l = loop 0 (l [])
+  where loop !c []      = c
+        loop !c (h : t) = loop (c + h) t
+
+main = print $ sumList  $ flatListCons2 $ fullTree 26
+
diff --git a/testsuite/tests/perf/should_run/T4474b.stdout b/testsuite/tests/perf/should_run/T4474b.stdout
new file mode 100644
index 0000000000..e6c68622ac
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4474b.stdout
@@ -0,0 +1 @@
+67108864
diff --git a/testsuite/tests/perf/should_run/T4474c.hs b/testsuite/tests/perf/should_run/T4474c.hs
new file mode 100644
index 0000000000..116a21322e
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4474c.hs
@@ -0,0 +1,40 @@
+{-# LANGUAGE BangPatterns #-}
+
+module Main where
+
+data Tree = Leaf !Int | Fork !Tree !Tree deriving Show
+
+fullTree 0 = Leaf 1
+fullTree n = let t = fullTree (n - 1) in Fork t t
+
+flatListNaive (Leaf n)   = [n]
+flatListNaive (Fork a b) = flatListNaive a ++ flatListNaive b
+
+flatListCons t = flat t []
+  where
+  flat (Leaf n)   ns = n : ns
+  flat (Fork a b) ns = flat a (flat b ns)
+
+flatListCons2 t = flat t []
+  where
+  flat (Leaf n)   = \ns -> n : ns
+  flat (Fork a b) = \ns -> flat a (flat b ns)
+
+flatListCons3 t = flat t []
+  where
+  flat (Leaf n)   = (n :)
+  flat (Fork a b) = flat a . flat b
+
+flatDList (Leaf n)   = (n :)
+flatDList (Fork a b) = flatDList a . flatDList b
+
+sumList l = loop 0 l
+  where loop !c [] = c
+        loop !c (h:t) = loop (c + h) t
+
+sumDList l = loop 0 (l [])
+  where loop !c []      = c
+        loop !c (h : t) = loop (c + h) t
+
+main = print $ sumList  $ flatListCons3 $ fullTree 26
+
diff --git a/testsuite/tests/perf/should_run/T4474c.stdout b/testsuite/tests/perf/should_run/T4474c.stdout
new file mode 100644
index 0000000000..e6c68622ac
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4474c.stdout
@@ -0,0 +1 @@
+67108864
diff --git a/testsuite/tests/perf/should_run/T4830.hs b/testsuite/tests/perf/should_run/T4830.hs
new file mode 100644
index 0000000000..e345ffc9cd
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4830.hs
@@ -0,0 +1,15 @@
+-- Compile with O2; SpecConstr should fire nicely
+-- and eliminate all allocation in inner loop
+
+module Main where
+
+foo :: Int -> Maybe (Double,Double) -> Double
+foo _ Nothing      = 0
+foo 0 (Just (x,y)) = x+y
+foo n (Just (x,y)) = let r = f x y in r `seq` foo (n-1) (Just r)
+  where
+    f x y | x <= y    = (x,y)
+          | otherwise = (y,x)
+
+main = print (foo 1000000 (Just (1,2)))
+
diff --git a/testsuite/tests/perf/should_run/T4830.stdout b/testsuite/tests/perf/should_run/T4830.stdout
new file mode 100644
index 0000000000..9f55b2ccb5
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4830.stdout
@@ -0,0 +1 @@
+3.0
diff --git a/testsuite/tests/perf/should_run/T4978.hs b/testsuite/tests/perf/should_run/T4978.hs
new file mode 100644
index 0000000000..b661edc483
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4978.hs
@@ -0,0 +1,126 @@
+module Main (main) where
+
+import qualified Data.ByteString as S
+import qualified Data.ByteString.Lazy as L
+import Data.ByteString.Internal (inlinePerformIO)
+import qualified Data.ByteString.Internal as S
+import Data.Monoid
+import Foreign
+import System.IO.Unsafe
+
+newtype Builder = Builder {
+        runBuilder :: (Buffer -> [S.ByteString]) -> Buffer -> [S.ByteString]
+    }
+
+instance Monoid Builder where
+    mempty  = empty
+    {-# INLINE mempty #-}
+    mappend = append
+    {-# INLINE mappend #-}
+    mconcat = foldr mappend mempty
+    {-# INLINE mconcat #-}
+
+empty :: Builder
+empty = Builder (\ k b -> b `seq` k b)
+{-# INLINE empty #-}
+
+singleton :: Word8 -> Builder
+singleton = writeN 1 . flip poke
+{-# INLINE singleton #-}
+
+append :: Builder -> Builder -> Builder
+append (Builder f) (Builder g) = Builder (f . g)
+{-# INLINE [0] append #-}
+
+-- Our internal buffer type
+data Buffer = Buffer {-# UNPACK #-} !(ForeignPtr Word8)
+                     {-# UNPACK #-} !Int                -- offset
+                     {-# UNPACK #-} !Int                -- used bytes
+                     {-# UNPACK #-} !Int                -- length left
+
+-- | /O(1)./ Pop the 'S.ByteString' we have constructed so far, if any,
+-- yielding a new chunk in the result lazy 'L.ByteString'.
+flush :: Builder
+flush = Builder $ \ k buf@(Buffer p o u l) ->
+    if u == 0
+      then k buf
+      else S.PS p o u : k (Buffer p (o+u) 0 l)
+
+-- | /O(n)./ Extract a lazy 'L.ByteString' from a 'Builder'.
+-- The construction work takes place if and when the relevant part of
+-- the lazy 'L.ByteString' is demanded.
+--
+toLazyByteString :: Builder -> L.ByteString
+toLazyByteString m = L.fromChunks $ unsafePerformIO $ do
+    buf <- newBuffer defaultSize
+    return (runBuilder (m `append` flush) (const []) buf)
+{-# INLINE toLazyByteString #-}
+
+defaultSize :: Int
+defaultSize = 32 * k - overhead
+    where k = 1024
+          overhead = 2 * sizeOf (undefined :: Int)
+
+-- | Sequence an IO operation on the buffer
+unsafeLiftIO :: (Buffer -> IO Buffer) -> Builder
+unsafeLiftIO f =  Builder $ \ k buf -> inlinePerformIO $ do
+    buf' <- f buf
+    return (k buf')
+{-# INLINE unsafeLiftIO #-}
+
+-- | Get the size of the buffer
+withSize :: (Int -> Builder) -> Builder
+withSize f = Builder $ \ k buf@(Buffer _ _ _ l) -> runBuilder (f l) k buf
+
+-- | Ensure that there are at least @n@ many bytes available.
+ensureFree :: Int -> Builder
+ensureFree n = n `seq` withSize $ \ l ->
+    if n <= l then empty else
+        flush `append` unsafeLiftIO (const (newBuffer (max n defaultSize)))
+{-# INLINE [0] ensureFree #-}
+
+-- | Ensure that @n@ many bytes are available, and then use @f@ to write some
+-- bytes into the memory.
+writeN :: Int -> (Ptr Word8 -> IO ()) -> Builder
+writeN n f = ensureFree n `append` unsafeLiftIO (writeNBuffer n f)
+{-# INLINE [0] writeN #-}
+
+writeNBuffer :: Int -> (Ptr Word8 -> IO ()) -> Buffer -> IO Buffer
+writeNBuffer n f (Buffer fp o u l) = do
+    withForeignPtr fp (\p -> f (p `plusPtr` (o+u)))
+    return (Buffer fp o (u+n) (l-n))
+{-# INLINE writeNBuffer #-}
+
+newBuffer :: Int -> IO Buffer
+newBuffer size = do
+    fp <- S.mallocByteString size
+    return $! Buffer fp 0 0 size
+{-# INLINE newBuffer #-}
+
+-- Merge buffer bounds checks.
+{-# RULES
+"append/writeN" forall a b (f::Ptr Word8 -> IO ())
+                           (g::Ptr Word8 -> IO ()) ws.
+    append (writeN a f) (append (writeN b g) ws) =
+        append (writeN (a+b) (\p -> f p >> g (p `plusPtr` a))) ws
+
+"writeN/writeN" forall a b (f::Ptr Word8 -> IO ())
+                           (g::Ptr Word8 -> IO ()).
+    append (writeN a f) (writeN b g) =
+        writeN (a+b) (\p -> f p >> g (p `plusPtr` a))
+
+"ensureFree/ensureFree" forall a b .
+    append (ensureFree a) (ensureFree b) = ensureFree (max a b)
+ #-}
+
+-- Test case
+
+-- Argument must be a multiple of 4.
+test :: Int -> Builder
+test 0 = mempty
+test n = singleton 1 `mappend`
+         (singleton 2 `mappend`
+          (singleton 3 `mappend`
+           (singleton 4 `mappend` test (n-4))))
+
+main = print $ L.length $ toLazyByteString $ test 10000000
diff --git a/testsuite/tests/perf/should_run/T4978.stdout b/testsuite/tests/perf/should_run/T4978.stdout
new file mode 100644
index 0000000000..825319e1c5
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T4978.stdout
@@ -0,0 +1 @@
+10000000
diff --git a/testsuite/tests/perf/should_run/T5113.hs b/testsuite/tests/perf/should_run/T5113.hs
new file mode 100644
index 0000000000..e87bcb6cad
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T5113.hs
@@ -0,0 +1,31 @@
+{-# LANGUAGE BangPatterns #-}
+module Main where
+
+import Data.Array.Base (unsafeRead, unsafeWrite)
+import Data.Array.ST
+import Data.Array.Unboxed
+import Control.Monad.ST
+
+main = print (divisorCounts 1000000 ! 342)
+
+isqrt :: Int -> Int
+isqrt n = floor (sqrt $ fromIntegral n)
+
+divisorCounts :: Int -> UArray Int Int
+divisorCounts n = runSTUArray $ do
+    let !rt = isqrt n
+    darr <- newArray (0,n) 1 :: ST s (STUArray s Int Int)
+    let inc i = unsafeRead darr i >>= \k -> unsafeWrite darr i (k+1)
+        note step i
+            | i > n     = return ()
+            | otherwise = do
+                inc i
+                note step (i+step)
+        count j
+            | j > rt    = return ()
+            | otherwise = do
+                note (2*j) (j*j)
+                count (j+2)
+    note 2 4
+    count 3
+    return darr
diff --git a/testsuite/tests/perf/should_run/T5113.stdout b/testsuite/tests/perf/should_run/T5113.stdout
new file mode 100644
index 0000000000..0cfbf08886
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T5113.stdout
@@ -0,0 +1 @@
+2
diff --git a/testsuite/tests/perf/should_run/T5205.hs b/testsuite/tests/perf/should_run/T5205.hs
new file mode 100644
index 0000000000..215dd42647
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T5205.hs
@@ -0,0 +1,13 @@
+
+import Control.Concurrent
+import Control.Monad
+
+main :: IO ()
+main = do t <- forkIO (x >> x)
+          threadDelay 1000000
+          killThread t
+          putStrLn "Done"
+
+x :: IO ()
+x = forever yield
+
diff --git a/testsuite/tests/perf/should_run/T5205.stdout b/testsuite/tests/perf/should_run/T5205.stdout
new file mode 100644
index 0000000000..a965a70ed4
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T5205.stdout
@@ -0,0 +1 @@
+Done
diff --git a/testsuite/tests/perf/should_run/T5237.hs b/testsuite/tests/perf/should_run/T5237.hs
new file mode 100644
index 0000000000..6a12f5ecea
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T5237.hs
@@ -0,0 +1,15 @@
+{-# LANGUAGE BangPatterns #-}
+module Main (main) where
+
+-- Test that the rewrite rules for small exponents fire (#5237).
+-- If they don't fire, this will allocate much.
+
+fun :: Double -> Double
+fun x = go 0 1.0
+  where
+    go !acc z
+      | x < z   = acc
+      | otherwise = go (acc + 1/z^4) (z+1.0)
+
+main :: IO ()
+main = print (fun 1e7)
diff --git a/testsuite/tests/perf/should_run/T5237.stdout b/testsuite/tests/perf/should_run/T5237.stdout
new file mode 100644
index 0000000000..a620a54dd7
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T5237.stdout
@@ -0,0 +1 @@
+1.082323233710861
diff --git a/testsuite/tests/perf/should_run/T5536.hs b/testsuite/tests/perf/should_run/T5536.hs
new file mode 100644
index 0000000000..3259c46675
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T5536.hs
@@ -0,0 +1,5 @@
+
+main :: IO ()
+main = do writeFile "T5536.data" (replicate 10000000 'a')
+          readFile "T5536.data" >>= putStr
+
diff --git a/testsuite/tests/perf/should_run/T5549.hs b/testsuite/tests/perf/should_run/T5549.hs
new file mode 100644
index 0000000000..cab0dc0226
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T5549.hs
@@ -0,0 +1,27 @@
+module Main where
+-- See Trac #5549
+-- The issue here is allocating integer constants inside a loop
+
+
+lcs3 :: Eq a => [a] -> [a] -> [a]
+lcs3 a b = fst $ aux (a, length a) (b, length b)
+  where
+    aux (_,0) _ = ([],0)
+    aux _ (_,0) = ([],0)
+    aux (a@(ha:as),la) (b@(hb:bs), lb)
+      | ha == hb  = let (s,n) = aux (as,la-1) (bs,lb-1) in (ha : s, n+1)
+      | otherwise =
+        let (sa,na) = aux (as,la-1) (b,lb)
+            (sb,nb) = aux (a,la) (bs,lb-1) in
+        if na > nb then (sa,na) else (sb,nb)
+
+f :: Integer -> Integer -> Integer
+f acc 0 = acc
+f acc n = g (acc + 1) (n-1)
+
+g :: Integer -> Integer -> Integer
+g acc 0 = acc
+g acc n = f (acc -1) (n-1)
+
+main = do putStrLn . show $ f 0 100000000
+          putStrLn . show $ lcs3 [1..20] [10..20]
diff --git a/testsuite/tests/perf/should_run/T5549.stdout b/testsuite/tests/perf/should_run/T5549.stdout
new file mode 100644
index 0000000000..95a4feb269
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T5549.stdout
@@ -0,0 +1,2 @@
+0
+[10,11,12,13,14,15,16,17,18,19,20]
diff --git a/testsuite/tests/perf/should_run/T7257.hs b/testsuite/tests/perf/should_run/T7257.hs
new file mode 100644
index 0000000000..ef65db3a56
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7257.hs
@@ -0,0 +1,30 @@
+{-# LANGUAGE BangPatterns #-}
+module Main where
+
+import qualified Data.ByteString as S
+import Data.IORef
+import Control.Monad
+
+makeBs :: Int -> S.ByteString
+makeBs n = S.replicate n (fromIntegral n)
+
+doStuff :: IORef [S.ByteString] -> Int -> IO ()
+doStuff ref n = do
+    let !bs = makeBs n
+    modifyIORef ref (bs:)
+{-# NOINLINE doStuff #-}
+
+undo :: IORef [S.ByteString] -> IO ()
+undo ref = do
+    h <- atomicModifyIORef ref (\(x:xs) -> (xs,x))
+    S.length h `seq` return ()
+
+main = do
+    ref <- newIORef [S.empty]
+    let fn n = do
+        doStuff ref n
+        when (rem 5 n /= 0 ) $ undo ref
+        
+    mapM_ fn (take 5000000 $ cycle [1..100])
+    var <- readIORef ref
+    print $ length var
diff --git a/testsuite/tests/perf/should_run/T7257.stdout b/testsuite/tests/perf/should_run/T7257.stdout
new file mode 100644
index 0000000000..26aece761a
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7257.stdout
@@ -0,0 +1 @@
+100001
diff --git a/testsuite/tests/perf/should_run/T7436.hs b/testsuite/tests/perf/should_run/T7436.hs
new file mode 100644
index 0000000000..9f615da9ed
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7436.hs
@@ -0,0 +1,22 @@
+{-# LANGUAGE DeriveFunctor, DeriveFoldable #-}
+module Main where
+
+import Prelude hiding (foldr)
+import Data.Foldable
+
+data List a = Nil | Cons a (List a)
+    deriving (Functor, Foldable)
+
+mkList :: Int -> List Int
+mkList 0 = Nil
+mkList n = Cons n (mkList (n-1))
+
+main :: IO ()
+main = print $ foldr (\x y -> y) "end" (mkList n)
+  where n = 40000
+  -- Increase this to increase the difference between good and bad
+  -- Eg 6000 takes a lot longer
+  -- The biggest difference is not allocation or bytes used,
+  -- but execution time!
+
+
diff --git a/testsuite/tests/perf/should_run/T7436.stdout b/testsuite/tests/perf/should_run/T7436.stdout
new file mode 100644
index 0000000000..e0deb4bed2
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7436.stdout
@@ -0,0 +1 @@
+"end"
diff --git a/testsuite/tests/perf/should_run/T7507.hs b/testsuite/tests/perf/should_run/T7507.hs
new file mode 100644
index 0000000000..04f156d871
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7507.hs
@@ -0,0 +1,14 @@
+module Main where
+
+import Data.Int
+import System.Environment
+
+bitcount x = if x > 0 
+    then let (d,m) = divMod x 2 in  bitcount d + m
+    else 0
+
+main = print $ sum  $ map bitcount 
+       [ 0 :: Int64 .. 2^20 - 1 ]
+
+
+
diff --git a/testsuite/tests/perf/should_run/T7507.stdout b/testsuite/tests/perf/should_run/T7507.stdout
new file mode 100644
index 0000000000..a6ec9d96e5
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7507.stdout
@@ -0,0 +1 @@
+10485760
diff --git a/testsuite/tests/perf/should_run/T7797.hs b/testsuite/tests/perf/should_run/T7797.hs
new file mode 100644
index 0000000000..9329a408a7
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7797.hs
@@ -0,0 +1,15 @@
+{-# LANGUAGE ExistentialQuantification #-}
+module Main where
+
+import T7797a 
+
+data Box = forall a. (Size a) => Box a a 
+
+box = Box (go 10000000) (go 10000000) where
+  go :: Int -> [Int]
+  go 0 = []
+  go n = 1 : go (n - 1)
+{-# NOINLINE box #-}
+
+main = print $ case box of
+  Box l r -> size l r
diff --git a/testsuite/tests/perf/should_run/T7797.stdout b/testsuite/tests/perf/should_run/T7797.stdout
new file mode 100644
index 0000000000..573541ac97
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7797.stdout
@@ -0,0 +1 @@
+0
diff --git a/testsuite/tests/perf/should_run/T7797a.hs b/testsuite/tests/perf/should_run/T7797a.hs
new file mode 100644
index 0000000000..d06df97a5d
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7797a.hs
@@ -0,0 +1,12 @@
+module T7797a where
+
+class Size t where
+  size :: t -> t -> Int
+  burg :: t -> t
+
+instance (Ord a, Num a) => Size [a] where
+  {-# SPECIALISE instance Size [Int] #-}
+  size (x:xs) (y:ys) | x+y > 4   = size xs ys
+                     | otherwise = size xs ys
+  size _ _ = 0
+  burg = error "urk"
diff --git a/testsuite/tests/perf/should_run/T7850.hs b/testsuite/tests/perf/should_run/T7850.hs
new file mode 100644
index 0000000000..d3124db3c2
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7850.hs
@@ -0,0 +1,11 @@
+
+{-# LANGUAGE BangPatterns #-}
+
+main :: IO ()
+main = print $ ack 4 1
+
+ack :: Int -> Int -> Int
+ack 0 !n = n+1
+ack m  0 = ack (m-1) 1
+ack m  n = ack (m-1) $ ack m (n-1)
+
diff --git a/testsuite/tests/perf/should_run/T7850.stdout b/testsuite/tests/perf/should_run/T7850.stdout
new file mode 100644
index 0000000000..b07de00606
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7850.stdout
@@ -0,0 +1 @@
+65533
diff --git a/testsuite/tests/perf/should_run/T7954.hs b/testsuite/tests/perf/should_run/T7954.hs
new file mode 100644
index 0000000000..2b86d2fc39
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7954.hs
@@ -0,0 +1,7 @@
+module Main where
+
+norm ::  [Double] -> Double
+norm = sqrt . sum . map (\x -> x*x)
+   
+main :: IO ()
+main = print (norm (enumFromTo 0 10000000) > 100) 
diff --git a/testsuite/tests/perf/should_run/T7954.stdout b/testsuite/tests/perf/should_run/T7954.stdout
new file mode 100644
index 0000000000..0ca95142bb
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T7954.stdout
@@ -0,0 +1 @@
+True
diff --git a/testsuite/tests/perf/should_run/T876.hs b/testsuite/tests/perf/should_run/T876.hs
new file mode 100644
index 0000000000..398859f86d
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T876.hs
@@ -0,0 +1,11 @@
+-- This test allocates a lot more if length is
+-- not a good consumer
+
+module Main where
+import System.Environment (getArgs)
+
+foo :: Int -> Int
+foo n = sum [ length [i..n] | i <- [1..n] ]
+
+main = do { [arg] <- getArgs
+          ; print (foo (read arg)) }
diff --git a/testsuite/tests/perf/should_run/T876.stdout b/testsuite/tests/perf/should_run/T876.stdout
new file mode 100644
index 0000000000..b9d569380c
--- /dev/null
+++ b/testsuite/tests/perf/should_run/T876.stdout
@@ -0,0 +1 @@
+50005000
diff --git a/testsuite/tests/perf/should_run/all.T b/testsuite/tests/perf/should_run/all.T
new file mode 100644
index 0000000000..b44d30755f
--- /dev/null
+++ b/testsuite/tests/perf/should_run/all.T
@@ -0,0 +1,298 @@
+# Tests that newArray/newArray_ is being optimised correctly
+
+# fortunately the values here are mostly independent of the wordsize,
+# because the test allocates an unboxed array of doubles.
+
+test('T3586',
+     [stats_num_field('peak_megabytes_allocated', (17, 1)),
+                                 # expected value: 17 (amd64/Linux)
+      stats_num_field('bytes allocated', (16835544, 5)),
+                        # expected value: 16835544 (amd64/Linux)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+
+test('T4830',
+     [stats_num_field('bytes allocated',
+          [(wordsize(64), 99264, 1),
+           #             127000 (amd64/Linux)
+           # 2013-02-07:  99264 (amd64/Linux)
+           (wordsize(32), 70646, 2)]),
+           # 2013-02-10:  69744 (x86/Windows)
+           # 2013-02-10:  71548 (x86/OSX)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O2'])
+
+test('T3245', normal, compile_and_run, ['-O'])
+
+# Test that bytestring reading/writing isn't over-allocating.  We had
+# a bug in hGetBufNonBlocking in 6.13 that triggered this.
+#
+test('lazy-bs-alloc',
+     [stats_num_field('peak_megabytes_allocated', (2, 1)),
+                                 # expected value: 2 (amd64/Linux)
+      stats_num_field('bytes allocated',
+          [(wordsize(64), 425400, 1),
+            #             489776 (amd64/Linux)
+            # 2013-02-07: 429744 (amd64/Linux)
+            # 2013-12-12: 425400 (amd64/Linux)
+           (wordsize(32), 417738, 1)]),
+            # 2013-02-10: 421296 (x86/Windows)
+            # 2013-02-10: 414180 (x86/OSX)
+      only_ways(['normal']),
+      extra_run_opts('../../numeric/should_run/arith011.stdout'),
+      ignore_output
+      ],
+      # use a suitably big file, without bloating the repo with a new one:
+     compile_and_run,
+     ['-O'])
+
+test('T876',
+     [stats_num_field('bytes allocated',
+          [(wordsize(64), 1263712 , 5),
+              # 2013-02-14: 1263712 (x86_64/Linux)
+           (wordsize(32), 663712, 5)]),
+      only_ways(['normal']),
+      extra_run_opts('10000')
+      ],
+     compile_and_run,
+     ['-O'])
+
+# Get reproducible floating-point results on x86
+if config.arch == 'i386':
+   sse2_opts = '-msse2'
+else:
+   sse2_opts = ''
+
+test('T4321',
+     omit_ways(['ghci']),
+     compile_and_run, ['-O ' + sse2_opts])
+
+test('T3736',
+     extra_clean(['T3736.speed.f32']),
+     run_command,
+     ['$MAKE -s --no-print-directory T3736'])
+test('T3738',
+     [extra_clean(['T3738a.hi', 'T3738a.o']),
+      stats_num_field('peak_megabytes_allocated', (1, 0)),
+                                 # expected value: 1 (amd64/Linux)
+      stats_num_field('bytes allocated',
+                      [(wordsize(32), 45648, 5),
+                    # expected value: 45648 (x86/Linux)
+                       (wordsize(64), 49400, 5)]),
+                    # expected value: 49400 (amd64/Linux)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+
+test('MethSharing',
+     [stats_num_field('peak_megabytes_allocated', (1, 0)),
+                                 # expected value: 1 (amd64/Linux)
+      stats_num_field('bytes allocated',
+                      [(wordsize(32), 360940756, 5),
+                    # expected value: 2685858140 (x86/OS X)
+                          # expected: 360940756 (x86/Linux)
+                       (wordsize(64), 640067672, 5)]),
+                          # expected: 640067672 (amd64/Linux)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+test('T2902',
+     extra_clean(['T2902_A',    'T2902_B',
+                  'T2902_A.hi', 'T2902_B.hi',
+                  'T2902_A.o',  'T2902_B.o',
+                  'T2902_A_PairingSum.hi', 'T2902_B_PairingSum.hi',
+                  'T2902_A_PairingSum.o',  'T2902_B_PairingSum.o',
+                  'T2902_Sum.hi',
+                  'T2902_Sum.o']),
+     run_command,
+     ['$MAKE -s --no-print-directory T2902'])
+test('T149',
+     [ # expect_broken(149),
+       # working (2 Jul 2013, x86-64/Linux)
+      extra_clean(['T149_A',    'T149_B',
+                   'T149_A.hi', 'T149_B.hi',
+                   'T149_A.o',  'T149_B.o'])],
+     run_command,
+     ['$MAKE -s --no-print-directory T149'])
+
+test('T5113',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 4000000, 5),
+                       (wordsize(64), 8000000, 5)]),
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+
+
+test('T4978',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 10000000, 5),
+                       (wordsize(64), 10137680, 5)]),
+                    # expected value: 10137680 (amd64/Linux)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O2'])
+
+test('T5205',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 47088, 5),
+                    # expected value: 47088 (x86/Darwin)
+                       (wordsize(64), 51320, 5)]),
+                    # expected value: 51320 (amd64/Linux)
+      only_ways(['normal', 'optasm'])
+      ],
+     compile_and_run,
+     [''])
+
+test('T5549',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 3362958676, 5),
+                    # expected value: 3362958676 (Windows)
+                       (wordsize(64), 6725846120, 5)]),
+                    # expected value: 6725846120 (amd64/Linux)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+
+test('T4474a',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 2405242767, 5),
+                       (wordsize(64), 4831890304, 5)]),
+                    # expected value: 4831890304 (amd64/OSX)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+test('T4474b',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 2405242767, 5),
+                       (wordsize(64), 4831890304, 5)]),
+                    # expected value: 4831890304 (amd64/OSX)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+test('T4474c',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 2405242767, 5),
+                       (wordsize(64), 4831890304, 5)]),
+                    # expected value: 4831890304 (amd64/OSX)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+
+test('T5237',
+     [stats_num_field('bytes allocated',
+                        [(wordsize(32), 78328, 5),
+                      # expected value: 78328 (i386/Linux)
+                         (wordsize(64), 104176, 5)]),
+                      # expected value: 110888 (amd64/Linux)
+                      # expected value: 104176 (amd64/Linux)
+     only_ways(['normal'])
+     ],
+    compile_and_run,
+    ['-O ' + sse2_opts])
+
+test('T5536',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 446260520, 1),
+                                   # 1246287228 (i386/Linux)
+                                    # 446328556 (i386/Windows)
+                                    # 446192484 (i386/OSX)
+                       (wordsize(64), 892399040, 5)]),
+                   # expected value: 2492589480 (amd64/Linux)
+                   # 17/1/13:         892399040 (x86_64/Linux)
+                   #                  (new demand analyser)
+     extra_clean(['T5536.data']),
+     ignore_output,
+     only_ways(['normal'])
+     ],
+    compile_and_run,
+    ['-O'])
+
+test('T7257',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 1150000000, 10),
+                    # expected value: 1246287228 (i386/Linux)
+                       (wordsize(64), 1774893760, 5)]),
+                        # 2012-09-21: 1774893760 (amd64/Linux)
+      stats_num_field('peak_megabytes_allocated',
+                      [(wordsize(32), 217, 5),
+                        # 2012-10-08: 217 (x86/Linux)
+                       (wordsize(64), 227, 5)]),
+                        # 2012-09-21: 227 (amd64/Linux)
+
+      only_ways(['normal'])
+     ],
+    compile_and_run, ['-O'])
+
+test('Conversions',
+     [stats_num_field('bytes allocated',
+                      [(wordsize(32), 78374, 2),
+                        # 2012-12-18: 55316 Guessed 64-bit value / 2
+                        # 2013-02-10: 77472 (x86/OSX)
+                        # 2013-02-10: 79276 (x86/Windows)
+                       (wordsize(64), 110632, 5)]),
+                        # 2012-12-18: 109608 (amd64/OS X)
+
+      only_ways(['normal'])
+     ],
+    compile_and_run, ['-O -msse2'])
+
+test('T7507', omit_ways(['ghci']), compile_and_run, ['-O'])
+# For 7507, stack overflow is the bad case
+
+test('T7436',
+     [stats_num_field('max_bytes_used',
+          [(wordsize(64), 60360, 1),
+           #             127000 (amd64/Linux)
+           # 2013-02-07:  60360 (amd64/Linux)
+           (wordsize(32), 58434, 1)]),
+            # 2013-02-10: 58032 (x86/Windows)
+            # 2013-02-10: 58836 (x86/OSX)
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+
+test('T7797',
+      [stats_num_field('bytes allocated',
+                      [(wordsize(32), 240044984, 5),
+                          # expected value: 2685858140 (x86/OS X)
+                          # expected: 360940756 (x86/Linux)
+                          # expected: 240044984 (x86/Windows, 64bit machine)
+                       (wordsize(64), 480050944, 5)]),
+                          # expected: 480050944 (amd64/Linux)
+      extra_clean(['T7797a.hi', 'T7797a.o']),
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+
+test('T7954',
+      [stats_num_field('bytes allocated',
+                      [(wordsize(32), 1380051408, 10),
+                       (wordsize(64), 2720051528, 10)]),
+      only_ways(['normal'])
+      ],
+     compile_and_run,
+     ['-O'])
+
+test('T7850',
+     [stats_num_field('peak_megabytes_allocated', 
+                      [(wordsize(32), 2, 10),
+                       (wordsize(64), 4, 10)]),
+      only_ways(['normal'])],
+     compile_and_run,
+     ['-O'])
+
diff --git a/testsuite/tests/perf/should_run/lazy-bs-alloc.hs b/testsuite/tests/perf/should_run/lazy-bs-alloc.hs
new file mode 100644
index 0000000000..76850c67d0
--- /dev/null
+++ b/testsuite/tests/perf/should_run/lazy-bs-alloc.hs
@@ -0,0 +1,9 @@
+module Main (main) where
+
+import System.Environment (getArgs)
+import qualified Data.ByteString.Lazy as L
+
+main :: IO ()
+main = do
+    (file : _) <- getArgs
+    L.readFile file >>= L.putStr