summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGabor Greif <ggreif@gmail.com>2017-10-20 15:45:37 +0200
committerMarge Bot <ben+marge-bot@smart-cactus.org>2019-12-06 21:20:38 -0500
commit9897e8c8ef0b19a9571ef97a1d9bb050c1ee9121 (patch)
tree0d63829cdfd660376cd600a26e8ed22d3478f669
parentf171b3582d44746bf8b08897a3b23bc97e5dbdda (diff)
downloadhaskell-9897e8c8ef0b19a9571ef97a1d9bb050c1ee9121.tar.gz
Implement pointer tagging for big families (#14373)
Formerly we punted on these and evaluated constructors always got a tag of 1. We now cascade switches because we have to check the tag first and when it is MAX_PTR_TAG then get the precise tag from the info table and switch on that. The only technically tricky part is that the default case needs (logical) duplication. To do this we emit an extra label for it and branch to that from the second switch. This avoids duplicated codegen. Here's a simple example of the new code gen: data D = D1 | D2 | D3 | D4 | D5 | D6 | D7 | D8 On a 64-bit system previously all constructors would be tagged 1. With the new code gen D7 and D8 are tagged 7: [Lib.D7_con_entry() { ... {offset c1eu: // global R1 = R1 + 7; call (P64[Sp])(R1) args: 8, res: 0, upd: 8; } }] [Lib.D8_con_entry() { ... {offset c1ez: // global R1 = R1 + 7; call (P64[Sp])(R1) args: 8, res: 0, upd: 8; } }] When switching we now look at the info table only when the tag is 7. For example, if we derive Enum for the type above, the Cmm looks like this: c2Le: _s2Js::P64 = R1; _c2Lq::P64 = _s2Js::P64 & 7; switch [1 .. 7] _c2Lq::P64 { case 1 : goto c2Lk; case 2 : goto c2Ll; case 3 : goto c2Lm; case 4 : goto c2Ln; case 5 : goto c2Lo; case 6 : goto c2Lp; case 7 : goto c2Lj; } // Read info table for tag c2Lj: _c2Lv::I64 = %MO_UU_Conv_W32_W64(I32[I64[_s2Js::P64 & (-8)] - 4]); if (_c2Lv::I64 != 6) goto c2Lu; else goto c2Lt; Generated Cmm sizes do not change too much, but binaries are very slightly larger, due to the fact that the new instructions are longer in encoded form. E.g. previously entry code for D8 above would be 00000000000001c0 <Lib_D8_con_info>: 1c0: 48 ff c3 inc %rbx 1c3: ff 65 00 jmpq *0x0(%rbp) With this patch 00000000000001d0 <Lib_D8_con_info>: 1d0: 48 83 c3 07 add $0x7,%rbx 1d4: ff 65 00 jmpq *0x0(%rbp) This is one byte longer. Secondly, reading info table directly and then switching is shorter _c1co: movq -1(%rbx),%rax movl -4(%rax),%eax // Switch on info table tag jmp *_n1d5(,%rax,8) than doing the same switch, and then for the tag 7 doing another switch: // When tag is 7 _c1ct: andq $-8,%rbx movq (%rbx),%rax movl -4(%rax),%eax // Switch on info table tag ... Some changes of binary sizes in actual programs: - In NoFib the worst case is 0.1% increase in benchmark "parser" (see NoFib results below). All programs get slightly larger. - Stage 2 compiler size does not change. - In "containers" (the library) size of all object files increases 0.0005%. Size of the test program "bitqueue-properties" increases 0.03%. nofib benchmarks kindly provided by Ă–mer (@osa1): NoFib Results ============= -------------------------------------------------------------------------------- Program Size Allocs Instrs Reads Writes -------------------------------------------------------------------------------- CS +0.0% 0.0% -0.0% -0.0% -0.0% CSD +0.0% 0.0% 0.0% +0.0% +0.0% FS +0.0% 0.0% 0.0% +0.0% 0.0% S +0.0% 0.0% -0.0% 0.0% 0.0% VS +0.0% 0.0% -0.0% +0.0% +0.0% VSD +0.0% 0.0% -0.0% +0.0% -0.0% VSM +0.0% 0.0% 0.0% 0.0% 0.0% anna +0.0% 0.0% +0.1% -0.9% -0.0% ansi +0.0% 0.0% -0.0% +0.0% +0.0% atom +0.0% 0.0% 0.0% 0.0% 0.0% awards +0.0% 0.0% -0.0% +0.0% 0.0% banner +0.0% 0.0% -0.0% +0.0% 0.0% bernouilli +0.0% 0.0% +0.0% +0.0% +0.0% binary-trees +0.0% 0.0% -0.0% -0.0% -0.0% boyer +0.0% 0.0% +0.0% 0.0% -0.0% boyer2 +0.0% 0.0% +0.0% 0.0% -0.0% bspt +0.0% 0.0% +0.0% +0.0% 0.0% cacheprof +0.0% 0.0% +0.1% -0.8% 0.0% calendar +0.0% 0.0% -0.0% +0.0% -0.0% cichelli +0.0% 0.0% +0.0% 0.0% 0.0% circsim +0.0% 0.0% -0.0% -0.1% -0.0% clausify +0.0% 0.0% +0.0% +0.0% 0.0% comp_lab_zift +0.0% 0.0% +0.0% 0.0% -0.0% compress +0.0% 0.0% +0.0% +0.0% 0.0% compress2 +0.0% 0.0% 0.0% 0.0% 0.0% constraints +0.0% 0.0% -0.0% -0.0% -0.0% cryptarithm1 +0.0% 0.0% +0.0% 0.0% 0.0% cryptarithm2 +0.0% 0.0% +0.0% -0.0% 0.0% cse +0.0% 0.0% +0.0% +0.0% 0.0% digits-of-e1 +0.0% 0.0% -0.0% -0.0% -0.0% digits-of-e2 +0.0% 0.0% +0.0% -0.0% -0.0% dom-lt +0.0% 0.0% +0.0% +0.0% 0.0% eliza +0.0% 0.0% -0.0% +0.0% 0.0% event +0.0% 0.0% -0.0% -0.0% -0.0% exact-reals +0.0% 0.0% +0.0% +0.0% +0.0% exp3_8 +0.0% 0.0% -0.0% -0.0% -0.0% expert +0.0% 0.0% +0.0% +0.0% +0.0% fannkuch-redux +0.0% 0.0% +0.0% 0.0% 0.0% fasta +0.0% 0.0% -0.0% -0.0% -0.0% fem +0.0% 0.0% +0.0% +0.0% +0.0% fft +0.0% 0.0% +0.0% -0.0% -0.0% fft2 +0.0% 0.0% +0.0% +0.0% +0.0% fibheaps +0.0% 0.0% +0.0% +0.0% 0.0% fish +0.0% 0.0% +0.0% +0.0% 0.0% fluid +0.0% 0.0% +0.0% +0.0% +0.0% fulsom +0.0% 0.0% +0.0% -0.0% +0.0% gamteb +0.0% 0.0% +0.0% -0.0% -0.0% gcd +0.0% 0.0% +0.0% +0.0% 0.0% gen_regexps +0.0% 0.0% +0.0% -0.0% -0.0% genfft +0.0% 0.0% -0.0% -0.0% -0.0% gg +0.0% 0.0% 0.0% -0.0% 0.0% grep +0.0% 0.0% +0.0% +0.0% +0.0% hidden +0.0% 0.0% +0.0% -0.0% -0.0% hpg +0.0% 0.0% +0.0% -0.1% -0.0% ida +0.0% 0.0% +0.0% -0.0% -0.0% infer +0.0% 0.0% -0.0% -0.0% -0.0% integer +0.0% 0.0% -0.0% -0.0% -0.0% integrate +0.0% 0.0% 0.0% +0.0% 0.0% k-nucleotide +0.0% 0.0% -0.0% -0.0% -0.0% kahan +0.0% 0.0% -0.0% -0.0% -0.0% knights +0.0% 0.0% +0.0% -0.0% -0.0% lambda +0.0% 0.0% +1.2% -6.1% -0.0% last-piece +0.0% 0.0% +0.0% -0.0% -0.0% lcss +0.0% 0.0% +0.0% -0.0% -0.0% life +0.0% 0.0% +0.0% -0.0% -0.0% lift +0.0% 0.0% +0.0% +0.0% 0.0% linear +0.0% 0.0% +0.0% +0.0% +0.0% listcompr +0.0% 0.0% -0.0% -0.0% -0.0% listcopy +0.0% 0.0% -0.0% -0.0% -0.0% maillist +0.0% 0.0% +0.0% -0.0% -0.0% mandel +0.0% 0.0% +0.0% +0.0% +0.0% mandel2 +0.0% 0.0% +0.0% +0.0% -0.0% mate +0.0% 0.0% +0.0% +0.0% +0.0% minimax +0.0% 0.0% -0.0% +0.0% -0.0% mkhprog +0.0% 0.0% +0.0% +0.0% +0.0% multiplier +0.0% 0.0% 0.0% +0.0% -0.0% n-body +0.0% 0.0% +0.0% -0.0% -0.0% nucleic2 +0.0% 0.0% +0.0% +0.0% -0.0% para +0.0% 0.0% +0.0% +0.0% +0.0% paraffins +0.0% 0.0% +0.0% +0.0% +0.0% parser +0.1% 0.0% +0.4% -1.7% -0.0% parstof +0.0% 0.0% -0.0% -0.0% -0.0% pic +0.0% 0.0% +0.0% 0.0% -0.0% pidigits +0.0% 0.0% -0.0% -0.0% -0.0% power +0.0% 0.0% +0.0% -0.0% -0.0% pretty +0.0% 0.0% +0.0% +0.0% +0.0% primes +0.0% 0.0% +0.0% 0.0% 0.0% primetest +0.0% 0.0% +0.0% +0.0% +0.0% prolog +0.0% 0.0% +0.0% +0.0% +0.0% puzzle +0.0% 0.0% +0.0% +0.0% +0.0% queens +0.0% 0.0% 0.0% +0.0% +0.0% reptile +0.0% 0.0% +0.0% +0.0% 0.0% reverse-complem +0.0% 0.0% -0.0% -0.0% -0.0% rewrite +0.0% 0.0% +0.0% 0.0% -0.0% rfib +0.0% 0.0% +0.0% +0.0% +0.0% rsa +0.0% 0.0% +0.0% +0.0% +0.0% scc +0.0% 0.0% +0.0% +0.0% +0.0% sched +0.0% 0.0% +0.0% +0.0% +0.0% scs +0.0% 0.0% +0.0% +0.0% 0.0% simple +0.0% 0.0% +0.0% +0.0% +0.0% solid +0.0% 0.0% +0.0% +0.0% 0.0% sorting +0.0% 0.0% +0.0% -0.0% 0.0% spectral-norm +0.0% 0.0% -0.0% -0.0% -0.0% sphere +0.0% 0.0% +0.0% -1.0% 0.0% symalg +0.0% 0.0% +0.0% +0.0% +0.0% tak +0.0% 0.0% +0.0% +0.0% +0.0% transform +0.0% 0.0% +0.4% -1.3% +0.0% treejoin +0.0% 0.0% +0.0% -0.0% 0.0% typecheck +0.0% 0.0% -0.0% +0.0% 0.0% veritas +0.0% 0.0% +0.0% -0.1% +0.0% wang +0.0% 0.0% +0.0% +0.0% +0.0% wave4main +0.0% 0.0% +0.0% 0.0% -0.0% wheel-sieve1 +0.0% 0.0% +0.0% +0.0% +0.0% wheel-sieve2 +0.0% 0.0% +0.0% +0.0% 0.0% x2n1 +0.0% 0.0% +0.0% +0.0% 0.0% -------------------------------------------------------------------------------- Min +0.0% 0.0% -0.0% -6.1% -0.0% Max +0.1% 0.0% +1.2% +0.0% +0.0% Geometric Mean +0.0% -0.0% +0.0% -0.1% -0.0% NoFib GC Results ================ -------------------------------------------------------------------------------- Program Size Allocs Instrs Reads Writes -------------------------------------------------------------------------------- circsim +0.0% 0.0% -0.0% -0.0% -0.0% constraints +0.0% 0.0% -0.0% 0.0% -0.0% fibheaps +0.0% 0.0% 0.0% -0.0% -0.0% fulsom +0.0% 0.0% 0.0% -0.6% -0.0% gc_bench +0.0% 0.0% 0.0% 0.0% -0.0% hash +0.0% 0.0% -0.0% -0.0% -0.0% lcss +0.0% 0.0% 0.0% -0.0% 0.0% mutstore1 +0.0% 0.0% 0.0% -0.0% -0.0% mutstore2 +0.0% 0.0% +0.0% -0.0% -0.0% power +0.0% 0.0% -0.0% 0.0% -0.0% spellcheck +0.0% 0.0% -0.0% -0.0% -0.0% -------------------------------------------------------------------------------- Min +0.0% 0.0% -0.0% -0.6% -0.0% Max +0.0% 0.0% +0.0% 0.0% 0.0% Geometric Mean +0.0% +0.0% +0.0% -0.1% +0.0% Fixes #14373 These performance regressions appear to be a fluke in CI. See the discussion in !1742 for details. Metric Increase: T6048 T12234 T12425 Naperian T12150 T5837 T13035
-rw-r--r--compiler/GHC/StgToCmm/Closure.hs13
-rw-r--r--compiler/GHC/StgToCmm/Expr.hs216
-rw-r--r--compiler/cmm/MkGraph.hs4
-rw-r--r--testsuite/driver/testlib.py12
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373.hs3
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373.stderr-ws-3216
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373.stderr-ws-6416
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373a.hs6
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373a.stderr3
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373b.hs8
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373b.stderr-ws-327
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373b.stderr-ws-645
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373c.hs9
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373c.stderr-ws-328
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373c.stderr-ws-649
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373d.hs19
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373d.stderr-ws-3219
-rw-r--r--testsuite/tests/codeGen/should_compile/T14373d.stderr-ws-6419
-rw-r--r--testsuite/tests/codeGen/should_compile/all.T22
19 files changed, 382 insertions, 32 deletions
diff --git a/compiler/GHC/StgToCmm/Closure.hs b/compiler/GHC/StgToCmm/Closure.hs
index 8552bfa905..07d3edab9a 100644
--- a/compiler/GHC/StgToCmm/Closure.hs
+++ b/compiler/GHC/StgToCmm/Closure.hs
@@ -362,20 +362,19 @@ type DynTag = Int -- The tag on a *pointer*
-- * big, otherwise.
--
-- Small families can have the constructor tag in the tag bits.
--- Big families only use the tag value 1 to represent evaluatedness.
+-- Big families always use the tag values 1..mAX_PTR_TAG to represent
+-- evaluatedness, the last one lumping together all overflowing ones.
-- We don't have very many tag bits: for example, we have 2 bits on
-- x86-32 and 3 bits on x86-64.
+--
+-- Also see Note [Tagging big families] in GHC.StgToCmm.Expr
isSmallFamily :: DynFlags -> Int -> Bool
isSmallFamily dflags fam_size = fam_size <= mAX_PTR_TAG dflags
tagForCon :: DynFlags -> DataCon -> DynTag
-tagForCon dflags con
- | isSmallFamily dflags fam_size = con_tag
- | otherwise = 1
- where
- con_tag = dataConTag con -- NB: 1-indexed
- fam_size = tyConFamilySize (dataConTyCon con)
+tagForCon dflags con = min (dataConTag con) (mAX_PTR_TAG dflags)
+-- NB: 1-indexed
tagForArity :: DynFlags -> RepArity -> DynTag
tagForArity dflags arity
diff --git a/compiler/GHC/StgToCmm/Expr.hs b/compiler/GHC/StgToCmm/Expr.hs
index 51058c6186..07113a4e82 100644
--- a/compiler/GHC/StgToCmm/Expr.hs
+++ b/compiler/GHC/StgToCmm/Expr.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE CPP #-}
+{-# LANGUAGE CPP, BangPatterns #-}
-----------------------------------------------------------------------------
--
@@ -32,10 +32,11 @@ import StgSyn
import MkGraph
import BlockId
-import Cmm
+import Cmm hiding ( succ )
import CmmInfo
import CoreSyn
import DataCon
+import DynFlags ( mAX_PTR_TAG )
import ForeignCall
import Id
import PrimOp
@@ -48,8 +49,9 @@ import Util
import FastString
import Outputable
-import Control.Monad (unless,void)
-import Control.Arrow (first)
+import Control.Monad ( unless, void )
+import Control.Arrow ( first )
+import Data.List ( partition )
------------------------------------------------------------------------
-- cgExpr: the main function
@@ -631,29 +633,152 @@ cgAlts gc_plan bndr (AlgAlt tycon) alts
; (mb_deflt, branches) <- cgAlgAltRhss gc_plan bndr alts
- ; let fam_sz = tyConFamilySize tycon
- bndr_reg = CmmLocal (idToReg dflags bndr)
-
- -- Is the constructor tag in the node reg?
- ; if isSmallFamily dflags fam_sz
- then do
- let -- Yes, bndr_reg has constr. tag in ls bits
- tag_expr = cmmConstrTag1 dflags (CmmReg bndr_reg)
- branches' = [(tag+1,branch) | (tag,branch) <- branches]
- emitSwitch tag_expr branches' mb_deflt 1 fam_sz
-
- else -- No, get tag from info table
- let -- Note that ptr _always_ has tag 1
- -- when the family size is big enough
- untagged_ptr = cmmRegOffB bndr_reg (-1)
- tag_expr = getConstrTag dflags (untagged_ptr)
- in emitSwitch tag_expr branches mb_deflt 0 (fam_sz - 1)
+ ; let !fam_sz = tyConFamilySize tycon
+ !bndr_reg = CmmLocal (idToReg dflags bndr)
+ !ptag_expr = cmmConstrTag1 dflags (CmmReg bndr_reg)
+ !branches' = first succ <$> branches
+ !maxpt = mAX_PTR_TAG dflags
+ (!via_ptr, !via_info) = partition ((< maxpt) . fst) branches'
+ !small = isSmallFamily dflags fam_sz
+
+ -- Is the constructor tag in the node reg?
+ -- See Note [Tagging big families]
+ ; if small || null via_info
+ then -- Yes, bndr_reg has constructor tag in ls bits
+ emitSwitch ptag_expr branches' mb_deflt 1
+ (if small then fam_sz else maxpt)
+
+ else -- No, the get exact tag from info table when mAX_PTR_TAG
+ -- See Note [Double switching for big families]
+ do
+ let !untagged_ptr = cmmUntag dflags (CmmReg bndr_reg)
+ !itag_expr = getConstrTag dflags untagged_ptr
+ !info0 = first pred <$> via_info
+ if null via_ptr then
+ emitSwitch itag_expr info0 mb_deflt 0 (fam_sz - 1)
+ else do
+ infos_lbl <- newBlockId
+ infos_scp <- getTickScope
+
+ let spillover = (maxpt, (mkBranch infos_lbl, infos_scp))
+
+ (mb_shared_deflt, mb_shared_branch) <- case mb_deflt of
+ (Just (stmts, scp)) ->
+ do lbl <- newBlockId
+ return ( Just (mkLabel lbl scp <*> stmts, scp)
+ , Just (mkBranch lbl, scp))
+ _ -> return (Nothing, Nothing)
+ -- Switch on pointer tag
+ emitSwitch ptag_expr (spillover : via_ptr) mb_shared_deflt 1 maxpt
+ join_lbl <- newBlockId
+ emit (mkBranch join_lbl)
+ -- Switch on info table tag
+ emitLabel infos_lbl
+ emitSwitch itag_expr info0 mb_shared_branch
+ (maxpt - 1) (fam_sz - 1)
+ emitLabel join_lbl
; return AssignedDirectly }
cgAlts _ _ _ _ = panic "cgAlts"
-- UbxTupAlt and PolyAlt have only one alternative
+-- Note [Double switching for big families]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- An algebraic data type can have a n >= 0 summands
+-- (or alternatives), which are identified (labeled) by
+-- constructors. In memory they are kept apart by tags
+-- (see Note [Data constructor dynamic tags] in GHC.StgToCmm.Closure).
+-- Due to the characteristics of the platform that
+-- contribute to the alignment of memory objects, there
+-- is a natural limit of information about constructors
+-- that can be encoded in the pointer tag. When the mapping
+-- of constructors to the pointer tag range 1..mAX_PTR_TAG
+-- is not injective, then we have a "big data type", also
+-- called a "big (constructor) family" in the literature.
+-- Constructor tags residing in the info table are injective,
+-- but considerably more expensive to obtain, due to additional
+-- memory access(es).
+--
+-- When doing case analysis on a value of a "big data type"
+-- we need two nested switch statements to make up for the lack
+-- of injectivity of pointer tagging, also taking the info
+-- table tag into account. The exact mechanism is described next.
+--
+-- In the general case, switching on big family alternatives
+-- is done by two nested switch statements. According to
+-- Note [Tagging big families], the outer switch
+-- looks at the pointer tag and the inner dereferences the
+-- pointer and switches on the info table tag.
+--
+-- We can handle a simple case first, namely when none
+-- of the case alternatives mention a constructor having
+-- a pointer tag of 1..mAX_PTR_TAG-1. In this case we
+-- simply emit a switch on the info table tag.
+-- Note that the other simple case is when all mentioned
+-- alternatives lie in 1..mAX_PTR_TAG-1, in which case we can
+-- switch on the ptr tag only, just like in the small family case.
+--
+-- There is a single intricacy with a nested switch:
+-- Both should branch to the same default alternative, and as such
+-- avoid duplicate codegen of potentially heavy code. The outer
+-- switch generates the actual code with a prepended fresh label,
+-- while the inner one only generates a jump to that label.
+--
+-- For example, let's assume a 64-bit architecture, so that all
+-- heap objects are 8-byte aligned, and hence the address of a
+-- heap object ends in `000` (three zero bits).
+--
+-- Then consider the following data type
+--
+-- > data Big = T0 | T1 | T2 | T3 | T4 | T5 | T6 | T7 | T8
+-- Ptr tag: 1 2 3 4 5 6 7 7 7
+-- As bits: 001 010 011 100 101 110 111 111 111
+-- Info pointer tag (zero based):
+-- 0 1 2 3 4 5 6 7 8
+--
+-- Then \case T2 -> True; T8 -> True; _ -> False
+-- will result in following code (slightly cleaned-up and
+-- commented -ddump-cmm-from-stg):
+{-
+ R1 = _sqI::P64; -- scrutinee
+ if (R1 & 7 != 0) goto cqO; else goto cqP;
+ cqP: // global -- enter
+ call (I64[R1])(R1) returns to cqO, args: 8, res: 8, upd: 8;
+ cqO: // global -- already WHNF
+ _sqJ::P64 = R1;
+ _cqX::P64 = _sqJ::P64 & 7; -- extract pointer tag
+ switch [1 .. 7] _cqX::P64 {
+ case 3 : goto cqW;
+ case 7 : goto cqR;
+ default: {goto cqS;}
+ }
+ cqR: // global
+ _cr2 = I32[I64[_sqJ::P64 & (-8)] - 4]; -- tag from info pointer
+ switch [6 .. 8] _cr2::I64 {
+ case 8 : goto cr1;
+ default: {goto cr0;}
+ }
+ cr1: // global
+ R1 = GHC.Types.True_closure+2;
+ call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8;
+ cr0: // global -- technically necessary label
+ goto cqS;
+ cqW: // global
+ R1 = GHC.Types.True_closure+2;
+ call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8;
+ cqS: // global
+ R1 = GHC.Types.False_closure+1;
+ call (P64[(old + 8)])(R1) args: 8, res: 0, upd: 8;
+-}
+--
+-- For 32-bit systems we only have 2 tag bits in the pointers at our disposal,
+-- so the performance win is dubious, especially in face of the increased code
+-- size due to double switching. But we can take the viewpoint that 32-bit
+-- architectures are not relevant for performance any more, so this can be
+-- considered as moot.
+
-- Note [alg-alt heap check]
--
@@ -675,6 +800,55 @@ cgAlts _ _ _ _ = panic "cgAlts"
-- x = R1
-- goto L1
+
+-- Note [Tagging big families]
+-- ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+--
+-- Both the big and the small constructor families are tagged,
+-- that is, greater unions which overflow the tag space of TAG_BITS
+-- (i.e. 3 on 32 resp. 7 constructors on 64 bit archs).
+--
+-- For example, let's assume a 64-bit architecture, so that all
+-- heap objects are 8-byte aligned, and hence the address of a
+-- heap object ends in `000` (three zero bits). Then consider
+-- > data Maybe a = Nothing | Just a
+-- > data Day a = Mon | Tue | Wed | Thu | Fri | Sat | Sun
+-- > data Grade = G1 | G2 | G3 | G4 | G5 | G6 | G7 | G8 | G9 | G10
+--
+-- Since `Grade` has more than 7 constructors, it counts as a
+-- "big data type" (also referred to as "big constructor family" in papers).
+-- On the other hand, `Maybe` and `Day` have 7 constructors or fewer, so they
+-- are "small data types".
+--
+-- Then
+-- * A pointer to an unevaluated thunk of type `Maybe Int`, `Day` or `Grade` will end in `000`
+-- * A tagged pointer to a `Nothing`, `Mon` or `G1` will end in `001`
+-- * A tagged pointer to a `Just x`, `Tue` or `G2` will end in `010`
+-- * A tagged pointer to `Wed` or `G3` will end in `011`
+-- ...
+-- * A tagged pointer to `Sat` or `G6` will end in `110`
+-- * A tagged pointer to `Sun` or `G7` or `G8` or `G9` or `G10` will end in `111`
+--
+-- For big families we employ a mildly clever way of combining pointer and
+-- info-table tagging. We use 1..MAX_PTR_TAG-1 as pointer-resident tags where
+-- the tags in the pointer and the info table are in a one-to-one
+-- relation, whereas tag MAX_PTR_TAG is used as "spill over", signifying
+-- we have to fall back and get the precise constructor tag from the
+-- info-table.
+--
+-- Consequently we now cascade switches, because we have to check
+-- the pointer tag first, and when it is MAX_PTR_TAG, fetch the precise
+-- tag from the info table, and switch on that. The only technically
+-- tricky part is that the default case needs (logical) duplication.
+-- To do this we emit an extra label for it and branch to that from
+-- the second switch. This avoids duplicated codegen. See Trac #14373.
+-- See note [Double switching for big families] for the mechanics
+-- involved.
+--
+-- Also see note [Data constructor dynamic tags]
+-- and the wiki https://gitlab.haskell.org/ghc/ghc/wikis/commentary/rts/haskell-execution/pointer-tagging
+--
+
-------------------
cgAlgAltRhss :: (GcPlan,ReturnKind) -> NonVoid Id -> [CgStgAlt]
-> FCode ( Maybe CmmAGraphScoped
diff --git a/compiler/cmm/MkGraph.hs b/compiler/cmm/MkGraph.hs
index 8beb01bc4e..c6e62435a2 100644
--- a/compiler/cmm/MkGraph.hs
+++ b/compiler/cmm/MkGraph.hs
@@ -151,7 +151,7 @@ flattenCmmAGraph id (stmts_t, tscope) =
catAGraphs :: [CmmAGraph] -> CmmAGraph
catAGraphs = concatOL
--- | created a sequence "goto id; id:" as an AGraph
+-- | creates a sequence "goto id; id:" as an AGraph
mkLabel :: BlockId -> CmmTickScope -> CmmAGraph
mkLabel bid scp = unitOL (CgLabel bid scp)
@@ -159,7 +159,7 @@ mkLabel bid scp = unitOL (CgLabel bid scp)
mkMiddle :: CmmNode O O -> CmmAGraph
mkMiddle middle = unitOL (CgStmt middle)
--- | created a closed AGraph from a given node
+-- | creates a closed AGraph from a given node
mkLast :: CmmNode O C -> CmmAGraph
mkLast last = unitOL (CgLast last)
diff --git a/testsuite/driver/testlib.py b/testsuite/driver/testlib.py
index b72fd4fac5..6929f7144c 100644
--- a/testsuite/driver/testlib.py
+++ b/testsuite/driver/testlib.py
@@ -1220,6 +1220,9 @@ def multimod_compile( name, way, top_mod, extra_hc_opts ):
def multimod_compile_fail( name, way, top_mod, extra_hc_opts ):
return do_compile( name, way, True, top_mod, [], extra_hc_opts )
+def multimod_compile_filter( name, way, top_mod, extra_hc_opts, filter_with, suppress_stdout=True ):
+ return do_compile( name, way, False, top_mod, [], extra_hc_opts, filter_with=filter_with, suppress_stdout=suppress_stdout )
+
def multi_compile( name, way, top_mod, extra_mods, extra_hc_opts ):
return do_compile( name, way, False, top_mod, extra_mods, extra_hc_opts)
@@ -1459,12 +1462,14 @@ def simple_build(name: Union[TestName, str],
top_mod: Optional[Path],
link: bool,
addsuf: bool,
- backpack: bool = False) -> Any:
+ backpack: bool = False,
+ suppress_stdout: bool = False,
+ filter_with: str = '') -> Any:
opts = getTestOpts()
# Redirect stdout and stderr to the same file
stdout = in_testdir(name, 'comp.stderr')
- stderr = subprocess.STDOUT
+ stderr = subprocess.STDOUT if not suppress_stdout else None
if top_mod is not None:
srcname = top_mod
@@ -1515,6 +1520,9 @@ def simple_build(name: Union[TestName, str],
'{{compiler}} {to_do} {srcname} {flags} {extra_hc_opts}'
).format(**locals())
+ if filter_with != '':
+ cmd = cmd + ' | ' + filter_with
+
exit_code = runCmd(cmd, None, stdout, stderr, opts.compile_timeout_multiplier)
actual_stderr_path = in_testdir(name, 'comp.stderr')
diff --git a/testsuite/tests/codeGen/should_compile/T14373.hs b/testsuite/tests/codeGen/should_compile/T14373.hs
new file mode 100644
index 0000000000..9ab2242c2b
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373.hs
@@ -0,0 +1,3 @@
+module T14373 where
+
+data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P deriving (Enum, Show)
diff --git a/testsuite/tests/codeGen/should_compile/T14373.stderr-ws-32 b/testsuite/tests/codeGen/should_compile/T14373.stderr-ws-32
new file mode 100644
index 0000000000..18b0975bdd
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373.stderr-ws-32
@@ -0,0 +1,16 @@
+ const T14373.A_closure+1;
+ const T14373.B_closure+2;
+ const T14373.C_closure+3;
+ const T14373.D_closure+3;
+ const T14373.E_closure+3;
+ const T14373.F_closure+3;
+ const T14373.G_closure+3;
+ const T14373.H_closure+3;
+ const T14373.I_closure+3;
+ const T14373.J_closure+3;
+ const T14373.K_closure+3;
+ const T14373.L_closure+3;
+ const T14373.M_closure+3;
+ const T14373.N_closure+3;
+ const T14373.O_closure+3;
+ const T14373.P_closure+3;
diff --git a/testsuite/tests/codeGen/should_compile/T14373.stderr-ws-64 b/testsuite/tests/codeGen/should_compile/T14373.stderr-ws-64
new file mode 100644
index 0000000000..966b3cd648
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373.stderr-ws-64
@@ -0,0 +1,16 @@
+ const T14373.A_closure+1;
+ const T14373.B_closure+2;
+ const T14373.C_closure+3;
+ const T14373.D_closure+4;
+ const T14373.E_closure+5;
+ const T14373.F_closure+6;
+ const T14373.G_closure+7;
+ const T14373.H_closure+7;
+ const T14373.I_closure+7;
+ const T14373.J_closure+7;
+ const T14373.K_closure+7;
+ const T14373.L_closure+7;
+ const T14373.M_closure+7;
+ const T14373.N_closure+7;
+ const T14373.O_closure+7;
+ const T14373.P_closure+7;
diff --git a/testsuite/tests/codeGen/should_compile/T14373a.hs b/testsuite/tests/codeGen/should_compile/T14373a.hs
new file mode 100644
index 0000000000..ed75c5ef1a
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373a.hs
@@ -0,0 +1,6 @@
+module T14373a where
+
+data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
+
+{-# NOINLINE lateSwitch #-}
+lateSwitch P = "Cool"
diff --git a/testsuite/tests/codeGen/should_compile/T14373a.stderr b/testsuite/tests/codeGen/should_compile/T14373a.stderr
new file mode 100644
index 0000000000..47caba2d8b
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373a.stderr
@@ -0,0 +1,3 @@
+ switch [0 .. 15]
+ case 15 : goto
+ default: {goto
diff --git a/testsuite/tests/codeGen/should_compile/T14373b.hs b/testsuite/tests/codeGen/should_compile/T14373b.hs
new file mode 100644
index 0000000000..5261845066
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373b.hs
@@ -0,0 +1,8 @@
+module T14373b where
+
+data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
+
+{-# NOINLINE earlySwitch #-}
+earlySwitch A = True
+earlySwitch B = False
+earlySwitch C = False
diff --git a/testsuite/tests/codeGen/should_compile/T14373b.stderr-ws-32 b/testsuite/tests/codeGen/should_compile/T14373b.stderr-ws-32
new file mode 100644
index 0000000000..91e54b27ba
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373b.stderr-ws-32
@@ -0,0 +1,7 @@
+ switch [1 .. 3]
+ case 1 : goto
+ case 2 : goto
+ case 3 : goto
+ switch [2 .. 15]
+ case 2 : goto
+ default: {goto
diff --git a/testsuite/tests/codeGen/should_compile/T14373b.stderr-ws-64 b/testsuite/tests/codeGen/should_compile/T14373b.stderr-ws-64
new file mode 100644
index 0000000000..4bbeacacda
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373b.stderr-ws-64
@@ -0,0 +1,5 @@
+ switch [1 .. 7]
+ case 1 : goto
+ case 2 : goto
+ case 3 : goto
+ default: {goto
diff --git a/testsuite/tests/codeGen/should_compile/T14373c.hs b/testsuite/tests/codeGen/should_compile/T14373c.hs
new file mode 100644
index 0000000000..92e42d3e5e
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373c.hs
@@ -0,0 +1,9 @@
+module T14373c where
+
+data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
+
+{-# NOINLINE mixedSwitch #-}
+mixedSwitch A = True
+mixedSwitch B = False
+mixedSwitch C = False
+mixedSwitch P = True
diff --git a/testsuite/tests/codeGen/should_compile/T14373c.stderr-ws-32 b/testsuite/tests/codeGen/should_compile/T14373c.stderr-ws-32
new file mode 100644
index 0000000000..d4a247f4a7
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373c.stderr-ws-32
@@ -0,0 +1,8 @@
+ switch [1 .. 3]
+ case 1 : goto
+ case 2 : goto
+ case 3 : goto
+ switch [2 .. 15]
+ case 2 : goto
+ case 15 : goto
+ default: {goto
diff --git a/testsuite/tests/codeGen/should_compile/T14373c.stderr-ws-64 b/testsuite/tests/codeGen/should_compile/T14373c.stderr-ws-64
new file mode 100644
index 0000000000..ac0f17e095
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373c.stderr-ws-64
@@ -0,0 +1,9 @@
+ switch [1 .. 7]
+ case 1 : goto
+ case 2 : goto
+ case 3 : goto
+ case 7 : goto
+ default: {goto
+ switch [6 .. 15]
+ case 15 : goto
+ default: {goto
diff --git a/testsuite/tests/codeGen/should_compile/T14373d.hs b/testsuite/tests/codeGen/should_compile/T14373d.hs
new file mode 100644
index 0000000000..decd495b0f
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373d.hs
@@ -0,0 +1,19 @@
+module T14373d where
+
+data BigFam = A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P
+
+-- check that in all cases the default bloc is not duplicated
+-- (but being jumped at)
+
+{-# NOINLINE lateDefault #-}
+lateDefault P = "Cool"
+lateDefault _ = 'L' : "ate"
+
+{-# NOINLINE earlyDefault #-}
+earlyDefault B = "Cool"
+earlyDefault _ = 'E' : "arly"
+
+{-# NOINLINE mixedDefault #-}
+mixedDefault B = "Cool"
+mixedDefault P = "Cool"
+mixedDefault _ = 'M' : "ixed"
diff --git a/testsuite/tests/codeGen/should_compile/T14373d.stderr-ws-32 b/testsuite/tests/codeGen/should_compile/T14373d.stderr-ws-32
new file mode 100644
index 0000000000..b1f34757ee
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373d.stderr-ws-32
@@ -0,0 +1,19 @@
+[T14373d.lateDefault_entry() { //
+ switch [0 .. 15]
+ case 15 : goto
+ default: {goto
+ R1 = XYZ_closure+2;
+[T14373d.earlyDefault_entry() { //
+ switch [1 .. 3]
+ case 2 : goto
+ default: {goto
+ R1 = XYZ_closure+2;
+[T14373d.mixedDefault_entry() { //
+ switch [1 .. 3]
+ case 2 : goto
+ case 3 : goto
+ default: {goto
+ switch [2 .. 15]
+ case 15 : goto
+ default: {goto
+ R1 = XYZ_closure+2;
diff --git a/testsuite/tests/codeGen/should_compile/T14373d.stderr-ws-64 b/testsuite/tests/codeGen/should_compile/T14373d.stderr-ws-64
new file mode 100644
index 0000000000..15a63c9b9c
--- /dev/null
+++ b/testsuite/tests/codeGen/should_compile/T14373d.stderr-ws-64
@@ -0,0 +1,19 @@
+[T14373d.lateDefault_entry() { //
+ switch [0 .. 15]
+ case 15 : goto
+ default: {goto
+ R1 = XYZ_closure+2;
+[T14373d.earlyDefault_entry() { //
+ switch [1 .. 7]
+ case 2 : goto
+ default: {goto
+ R1 = XYZ_closure+2;
+[T14373d.mixedDefault_entry() { //
+ switch [1 .. 7]
+ case 2 : goto
+ case 7 : goto
+ default: {goto
+ switch [6 .. 15]
+ case 15 : goto
+ default: {goto
+ R1 = XYZ_closure+2;
diff --git a/testsuite/tests/codeGen/should_compile/all.T b/testsuite/tests/codeGen/should_compile/all.T
index eeb40464da..f69986e9b8 100644
--- a/testsuite/tests/codeGen/should_compile/all.T
+++ b/testsuite/tests/codeGen/should_compile/all.T
@@ -67,3 +67,25 @@ test('T17334', [ unless(have_ncg() and (arch('x86_64') or arch('i386')), skip)
, only_ways(['normal'])
], compile, ['-O'])
+test('T14373', [],
+ multimod_compile_filter, ['T14373', '-fasm -O2 -c -ddump-cmm-from-stg',
+ 'grep -e "const T14373\.._closure+.;"'])
+
+switch_skeleton_only = 'grep -e "switch \[" -e "case " -e "default: " | sed -e "s|\] .*|\]|g" -e "s|goto .*|goto |g"'
+
+test('T14373a', [],
+ multimod_compile_filter, ['T14373a', '-fasm -O2 -c -ddump-cmm-from-stg',
+ switch_skeleton_only])
+test('T14373b', [],
+ multimod_compile_filter, ['T14373b', '-fasm -O2 -c -ddump-cmm-from-stg',
+ switch_skeleton_only])
+test('T14373c', [],
+ multimod_compile_filter, ['T14373c', '-fasm -O2 -c -ddump-cmm-from-stg',
+ switch_skeleton_only])
+
+switch_skeleton_and_entries_only = ('grep -e "switch \[" -e "case " -e "default: " -e "Default_entry(" -e "R1 = .*_closure+2;"'
+ '| sed -e "s|\] .*|\]|g" -e "s|goto .*|goto |g" -e "s|R1 = .*_closure+2;.*|R1 = XYZ_closure+2;|g" -e "s|//.*|//|g"')
+
+test('T14373d', [],
+ multimod_compile_filter, ['T14373d', '-fasm -O2 -c -ddump-cmm-from-stg',
+ switch_skeleton_and_entries_only])