30 files changed, 768 insertions, 159 deletions
diff --git a/compiler/cmm/CmmCallConv.hs b/compiler/cmm/CmmCallConv.hs
index 0e89ce79f8..c32710e1b0 100644
--- a/compiler/cmm/CmmCallConv.hs
+++ b/compiler/cmm/CmmCallConv.hs
@@ -131,9 +131,10 @@ assignStack dflags offset arg_ty args = assign_stk offset [] (reverse args)
       assign_stk offset assts (r:rs)
         = assign_stk off' ((r, StackParam off') : assts) rs
         where w    = typeWidth (arg_ty r)
-              size = (((widthInBytes w - 1) `div` word_size) + 1) * word_size
               off' = offset + size
-              word_size = wORD_SIZE dflags
+              -- Stack arguments always take a whole number of words, we never
+              -- pack them unlike constructor fields.
+              size = roundUpToWords dflags (widthInBytes w)
 
 -----------------------------------------------------------------------------
 -- Local information about the registers available
diff --git a/compiler/cmm/SMRep.hs b/compiler/cmm/SMRep.hs
index 34048fe116..1469ae1bd3 100644
--- a/compiler/cmm/SMRep.hs
+++ b/compiler/cmm/SMRep.hs
@@ -9,7 +9,7 @@ module SMRep (
         -- * Words and bytes
         WordOff, ByteOff,
         wordsToBytes, bytesToWordsRoundUp,
-        roundUpToWords,
+        roundUpToWords, roundUpTo,
 
         StgWord, fromStgWord, toStgWord,
         StgHalfWord, fromStgHalfWord, toStgHalfWord,
@@ -79,8 +79,11 @@ type ByteOff = Int
 -- | Round up the given byte count to the next byte count that's a
 -- multiple of the machine's word size.
 roundUpToWords :: DynFlags -> ByteOff -> ByteOff
-roundUpToWords dflags n =
-  (n + (wORD_SIZE dflags - 1)) .&. (complement (wORD_SIZE dflags - 1))
+roundUpToWords dflags n = roundUpTo n (wORD_SIZE dflags)
+
+-- | Round up @base@ to a multiple of @size@.
+roundUpTo :: ByteOff -> ByteOff -> ByteOff
+roundUpTo base size = (base + (size - 1)) .&. (complement (size - 1))
 
 -- | Convert the given number of words to a number of bytes.
 --
diff --git a/compiler/codeGen/StgCmmBind.hs b/compiler/codeGen/StgCmmBind.hs
index 8b2e998b5e..13f908e846 100644
--- a/compiler/codeGen/StgCmmBind.hs
+++ b/compiler/codeGen/StgCmmBind.hs
@@ -112,7 +112,7 @@ cgTopRhsClosure dflags rec id ccs _ upd_flag args body =
 
                  -- BUILD THE OBJECT, AND GENERATE INFO TABLE (IF NECESSARY)
         ; emitDataLits closure_label closure_rep
-        ; let fv_details :: [(NonVoid Id, VirtualHpOffset)]
+        ; let fv_details :: [(NonVoid Id, ByteOff)]
               (_, _, fv_details) = mkVirtHeapOffsets dflags (isLFThunk lf_info) []
         -- Don't drop the non-void args until the closure info has been made
         ; forkClosureBody (closureCodeBody True id closure_info ccs
diff --git a/compiler/codeGen/StgCmmCon.hs b/compiler/codeGen/StgCmmCon.hs
index 1540d00715..a38f7bce37 100644
--- a/compiler/codeGen/StgCmmCon.hs
+++ b/compiler/codeGen/StgCmmCon.hs
@@ -79,9 +79,16 @@ cgTopRhsCon dflags id con args =
 
         -- LAY IT OUT
         ; let
+            is_thunk = False
             (tot_wds, --  #ptr_wds + #nonptr_wds
              ptr_wds, --  #ptr_wds
-             nv_args_w_offsets) = mkVirtConstrOffsets dflags (addArgReps args)
+             nv_args_w_offsets) =
+                 mkVirtHeapOffsetsWithPadding dflags is_thunk (addArgReps args)
+
+            mk_payload (Padding len _) = return (CmmInt 0 (widthFromBytes len))
+            mk_payload (FieldOff arg _) = do
+                CmmLit lit <- getArgAmode arg
+                return lit
 
             nonptr_wds = tot_wds - ptr_wds
 
@@ -90,10 +97,8 @@ cgTopRhsCon dflags id con args =
              -- needs to poke around inside it.
             info_tbl = mkDataConInfoTable dflags con True ptr_wds nonptr_wds
 
-            get_lit (arg, _offset) = do { CmmLit lit <- getArgAmode arg
-                                        ; return lit }
 
-        ; payload <- mapM get_lit nv_args_w_offsets
+        ; payload <- mapM mk_payload nv_args_w_offsets
                 -- NB1: nv_args_w_offsets is sorted into ptrs then non-ptrs
                 -- NB2: all the amodes should be Lits!
                 --      TODO (osa): Why?
@@ -264,7 +269,7 @@ bindConArgs (DataAlt con) base args
 
            -- The binding below forces the masking out of the tag bits
            -- when accessing the constructor field.
-           bind_arg :: (NonVoid Id, VirtualHpOffset) -> FCode (Maybe LocalReg)
+           bind_arg :: (NonVoid Id, ByteOff) -> FCode (Maybe LocalReg)
            bind_arg (arg@(NonVoid b), offset)
              | isDeadBinder b =
                  -- Do not load unused fields from objects to local variables.
diff --git a/compiler/codeGen/StgCmmHeap.hs b/compiler/codeGen/StgCmmHeap.hs
index 15dcaa2d89..790453619c 100644
--- a/compiler/codeGen/StgCmmHeap.hs
+++ b/compiler/codeGen/StgCmmHeap.hs
@@ -221,24 +221,11 @@ mkStaticClosure :: DynFlags -> CLabel -> CostCentreStack -> [CmmLit]
 mkStaticClosure dflags info_lbl ccs payload padding static_link_field saved_info_field
   =  [CmmLabel info_lbl]
   ++ staticProfHdr dflags ccs
-  ++ concatMap (padLitToWord dflags) payload
+  ++ payload
   ++ padding
   ++ static_link_field
   ++ saved_info_field
 
--- JD: Simon had elided this padding, but without it the C back end asserts
--- failure. Maybe it's a bad assertion, and this padding is indeed unnecessary?
-padLitToWord :: DynFlags -> CmmLit -> [CmmLit]
-padLitToWord dflags lit = lit : padding pad_length
-  where width = typeWidth (cmmLitType dflags lit)
-        pad_length = wORD_SIZE dflags - widthInBytes width :: Int
-
-        padding n | n <= 0 = []
-                  | n `rem` 2 /= 0 = CmmInt 0 W8  : padding (n-1)
-                  | n `rem` 4 /= 0 = CmmInt 0 W16 : padding (n-2)
-                  | n `rem` 8 /= 0 = CmmInt 0 W32 : padding (n-4)
-                  | otherwise      = CmmInt 0 W64 : padding (n-8)
-
 -----------------------------------------------------------
 --              Heap overflow checking
 -----------------------------------------------------------
diff --git a/compiler/codeGen/StgCmmLayout.hs b/compiler/codeGen/StgCmmLayout.hs
index aeb01242e7..5111b93bc5 100644
--- a/compiler/codeGen/StgCmmLayout.hs
+++ b/compiler/codeGen/StgCmmLayout.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE BangPatterns #-}
 {-# LANGUAGE CPP #-}
 
 -----------------------------------------------------------------------------
@@ -17,7 +18,12 @@ module StgCmmLayout (
 
         slowCall, directCall,
 
-        mkVirtHeapOffsets, mkVirtConstrOffsets, mkVirtConstrSizes, getHpRelOffset,
+        FieldOffOrPadding(..),
+        mkVirtHeapOffsets,
+        mkVirtHeapOffsetsWithPadding,
+        mkVirtConstrOffsets,
+        mkVirtConstrSizes,
+        getHpRelOffset,
 
         ArgRep(..), toArgRep, argRepSizeW -- re-exported from StgCmmArgRep
   ) where
@@ -44,7 +50,7 @@ import CmmInfo
 import CLabel
 import StgSyn
 import Id
-import TyCon             ( PrimRep(..) )
+import TyCon             ( PrimRep(..), primRepSizeB )
 import BasicTypes        ( RepArity )
 import DynFlags
 import Module
@@ -387,26 +393,33 @@ getHpRelOffset virtual_offset
        hp_usg <- getHpUsage
        return (cmmRegOffW dflags hpReg (hpRel (realHp hp_usg) virtual_offset))
 
-mkVirtHeapOffsets
+data FieldOffOrPadding a
+    = FieldOff (NonVoid a) -- Something that needs an offset.
+               ByteOff     -- Offset in bytes.
+    | Padding ByteOff  -- Length of padding in bytes.
+              ByteOff  -- Offset in bytes.
+
+mkVirtHeapOffsetsWithPadding
   :: DynFlags
   -> Bool                     -- True <=> is a thunk
-  -> [NonVoid (PrimRep,a)]    -- Things to make offsets for
-  -> (WordOff,                -- _Total_ number of words allocated
-      WordOff,                -- Number of words allocated for *pointers*
-      [(NonVoid a, ByteOff)])
+  -> [NonVoid (PrimRep, a)]   -- Things to make offsets for
+  -> ( WordOff                -- Total number of words allocated
+     , WordOff                -- Number of words allocated for *pointers*
+     , [FieldOffOrPadding a]  -- Either an offset or padding.
+     )
 
 -- Things with their offsets from start of object in order of
 -- increasing offset; BUT THIS MAY BE DIFFERENT TO INPUT ORDER
 -- First in list gets lowest offset, which is initial offset + 1.
 --
--- mkVirtHeapOffsets always returns boxed things with smaller offsets
+-- mkVirtHeapOffsetsWithPadding always returns boxed things with smaller offsets
 -- than the unboxed things
 
-mkVirtHeapOffsets dflags is_thunk things
-  = ASSERT(not (any (isVoidRep . fst . fromNonVoid) things))
-    ( bytesToWordsRoundUp dflags tot_bytes
+mkVirtHeapOffsetsWithPadding dflags is_thunk things =
+    ASSERT(not (any (isVoidRep . fst . fromNonVoid) things))
+    ( tot_wds
     , bytesToWordsRoundUp dflags bytes_of_ptrs
-    , ptrs_w_offsets ++ non_ptrs_w_offsets
+    , concat (ptrs_w_offsets ++ non_ptrs_w_offsets) ++ final_pad
     )
   where
     hdr_words | is_thunk   = thunkHdrSize dflags
@@ -420,10 +433,58 @@ mkVirtHeapOffsets dflags is_thunk things
     (tot_bytes, non_ptrs_w_offsets) =
        mapAccumL computeOffset bytes_of_ptrs non_ptrs
 
-    computeOffset bytes_so_far nv_thing
-      = (bytes_so_far + wordsToBytes dflags (argRepSizeW dflags (toArgRep rep)),
-         (NonVoid thing, hdr_bytes + bytes_so_far))
-           where (rep,thing) = fromNonVoid nv_thing
+    tot_wds = bytesToWordsRoundUp dflags tot_bytes
+
+    final_pad_size = tot_wds * word_size - tot_bytes
+    final_pad
+        | final_pad_size > 0 = [(Padding final_pad_size
+                                         (hdr_bytes + tot_bytes))]
+        | otherwise          = []
+
+    word_size = wORD_SIZE dflags
+
+    computeOffset bytes_so_far nv_thing =
+        (new_bytes_so_far, with_padding field_off)
+      where
+        (rep, thing) = fromNonVoid nv_thing
+
+        -- Size of the field in bytes.
+        !sizeB = primRepSizeB dflags rep
+
+        -- Align the start offset (eg, 2-byte value should be 2-byte aligned).
+        -- But not more than to a word.
+        !align = min word_size sizeB
+        !start = roundUpTo bytes_so_far align
+        !padding = start - bytes_so_far
+
+        -- Final offset is:
+        --   size of header + bytes_so_far + padding
+        !final_offset = hdr_bytes + bytes_so_far + padding
+        !new_bytes_so_far = start + sizeB
+        field_off = FieldOff (NonVoid thing) final_offset
+
+        with_padding field_off
+            | padding == 0 = [field_off]
+            | otherwise    = [ Padding padding (hdr_bytes + bytes_so_far)
+                             , field_off
+                             ]
+
+
+mkVirtHeapOffsets
+  :: DynFlags
+  -> Bool                     -- True <=> is a thunk
+  -> [NonVoid (PrimRep,a)]    -- Things to make offsets for
+  -> (WordOff,                -- _Total_ number of words allocated
+      WordOff,                -- Number of words allocated for *pointers*
+      [(NonVoid a, ByteOff)])
+mkVirtHeapOffsets dflags is_thunk things =
+    ( tot_wds
+    , ptr_wds
+    , [ (field, offset) | (FieldOff field offset) <- things_offsets ]
+    )
+  where
+   (tot_wds, ptr_wds, things_offsets) =
+       mkVirtHeapOffsetsWithPadding dflags is_thunk things
 
 -- | Just like mkVirtHeapOffsets, but for constructors
 mkVirtConstrOffsets
diff --git a/compiler/coreSyn/CoreLint.hs b/compiler/coreSyn/CoreLint.hs
index 96c34852ba..20354ec530 100644
--- a/compiler/coreSyn/CoreLint.hs
+++ b/compiler/coreSyn/CoreLint.hs
@@ -1677,8 +1677,8 @@ lintCoercion co@(UnivCo prov r ty1 ty2)
        = do { dflags <- getDynFlags
             ; checkWarnL (isUnBoxed rep1 == isUnBoxed rep2)
                          (report "between unboxed and boxed value")
-            ; checkWarnL (TyCon.primRepSizeW dflags rep1
-                           == TyCon.primRepSizeW dflags rep2)
+            ; checkWarnL (TyCon.primRepSizeB dflags rep1
+                           == TyCon.primRepSizeB dflags rep2)
                          (report "between unboxed values of different size")
             ; let fl = liftM2 (==) (TyCon.primRepIsFloat rep1)
                                    (TyCon.primRepIsFloat rep2)
diff --git a/compiler/ghci/ByteCodeAsm.hs b/compiler/ghci/ByteCodeAsm.hs
index edb18df382..920bc4ac2b 100644
--- a/compiler/ghci/ByteCodeAsm.hs
+++ b/compiler/ghci/ByteCodeAsm.hs
@@ -351,6 +351,12 @@ assembleI dflags i = case i of
   PUSH_L o1                -> emit bci_PUSH_L [SmallOp o1]
   PUSH_LL o1 o2            -> emit bci_PUSH_LL [SmallOp o1, SmallOp o2]
   PUSH_LLL o1 o2 o3        -> emit bci_PUSH_LLL [SmallOp o1, SmallOp o2, SmallOp o3]
+  PUSH8 o1                 -> emit bci_PUSH8 [SmallOp o1]
+  PUSH16 o1                -> emit bci_PUSH16 [SmallOp o1]
+  PUSH32 o1                -> emit bci_PUSH32 [SmallOp o1]
+  PUSH8_W o1               -> emit bci_PUSH8_W [SmallOp o1]
+  PUSH16_W o1              -> emit bci_PUSH16_W [SmallOp o1]
+  PUSH32_W o1              -> emit bci_PUSH32_W [SmallOp o1]
   PUSH_G nm                -> do p <- ptr (BCOPtrName nm)
                                  emit bci_PUSH_G [Op p]
   PUSH_PRIMOP op           -> do p <- ptr (BCOPtrPrimOp op)
@@ -365,6 +371,15 @@ assembleI dflags i = case i of
                            -> do let ul_bco = assembleBCO dflags proto
                                  p <- ioptr (liftM BCOPtrBCO ul_bco)
                                  emit (push_alts pk) [Op p]
+  PUSH_PAD8                -> emit bci_PUSH_PAD8 []
+  PUSH_PAD16               -> emit bci_PUSH_PAD16 []
+  PUSH_PAD32               -> emit bci_PUSH_PAD32 []
+  PUSH_UBX8 lit            -> do np <- literal lit
+                                 emit bci_PUSH_UBX8 [Op np]
+  PUSH_UBX16 lit           -> do np <- literal lit
+                                 emit bci_PUSH_UBX16 [Op np]
+  PUSH_UBX32 lit           -> do np <- literal lit
+                                 emit bci_PUSH_UBX32 [Op np]
   PUSH_UBX lit nws         -> do np <- literal lit
                                  emit bci_PUSH_UBX [Op np, SmallOp nws]
 
diff --git a/compiler/ghci/ByteCodeGen.hs b/compiler/ghci/ByteCodeGen.hs
index c7b96a83a0..697dc63b43 100644
--- a/compiler/ghci/ByteCodeGen.hs
+++ b/compiler/ghci/ByteCodeGen.hs
@@ -47,8 +47,9 @@ import Unique
 import FastString
 import Panic
 import StgCmmClosure    ( NonVoid(..), fromNonVoid, nonVoidIds )
-import StgCmmLayout     ( ArgRep(..), toArgRep, argRepSizeW,
-                          mkVirtHeapOffsets, mkVirtConstrOffsets )
+import StgCmmLayout     ( ArgRep(..), FieldOffOrPadding(..),
+                          toArgRep, argRepSizeW,
+                          mkVirtHeapOffsetsWithPadding, mkVirtConstrOffsets )
 import SMRep hiding (WordOff, ByteOff, wordsToBytes)
 import Bitmap
 import OrdList
@@ -455,6 +456,9 @@ truncIntegral16 w
     | otherwise
     = fromIntegral w
 
+trunc16B :: ByteOff -> Word16
+trunc16B = truncIntegral16
+
 trunc16W :: WordOff -> Word16
 trunc16W = truncIntegral16
 
@@ -798,10 +802,13 @@ mkConAppCode orig_d _ p con args_r_to_l =
                 , not (isVoidRep prim_rep)
                 ]
             is_thunk = False
-            (_, _, args_offsets) = mkVirtHeapOffsets dflags is_thunk non_voids
+            (_, _, args_offsets) =
+                mkVirtHeapOffsetsWithPadding dflags is_thunk non_voids
 
-            do_pushery !d ((arg, _) : args) = do
-                (push, arg_bytes) <- pushAtom d p (fromNonVoid arg)
+            do_pushery !d (arg : args) = do
+                (push, arg_bytes) <- case arg of
+                    (Padding l _) -> pushPadding l
+                    (FieldOff a _) -> pushConstrAtom d p (fromNonVoid a)
                 more_push_code <- do_pushery (d + arg_bytes) args
                 return (push `appOL` more_push_code)
             do_pushery !d [] = do
@@ -926,7 +933,8 @@ doCase d s p (_,scrut) bndr alts is_unboxed_tuple
                             | otherwise = wordSize dflags
 
         -- depth of stack after the return value has been pushed
-        d_bndr = d + ret_frame_size_b + idSizeB dflags bndr
+        d_bndr =
+            d + ret_frame_size_b + wordsToBytes dflags (idSizeW dflags bndr)
 
         -- depth of stack after the extra info table for an unboxed return
         -- has been pushed, if any.  This is the stack depth at the
@@ -1127,8 +1135,7 @@ generateCCall d0 s p (CCallSpec target cconv safety) fn args_r_to_l
      code_n_reps <- pargs d0 args_r_to_l
      let
          (pushs_arg, a_reps_pushed_r_to_l) = unzip code_n_reps
-         a_reps_sizeW =
-             WordOff (sum (map (primRepSizeW dflags) a_reps_pushed_r_to_l))
+         a_reps_sizeW = sum (map (repSizeWords dflags) a_reps_pushed_r_to_l)
 
          push_args    = concatOL pushs_arg
          !d_after_args = d0 + wordsToBytes dflags a_reps_sizeW
@@ -1218,7 +1225,7 @@ generateCCall d0 s p (CCallSpec target cconv safety) fn args_r_to_l
 
          -- Push the return placeholder.  For a call returning nothing,
          -- this is a V (tag).
-         r_sizeW   = WordOff (primRepSizeW dflags r_rep)
+         r_sizeW   = repSizeWords dflags r_rep
          d_after_r = d_after_Addr + wordsToBytes dflags r_sizeW
          push_r =
              if returns_void
@@ -1472,12 +1479,20 @@ pushAtom d p (AnnVar var)
 
    | Just d_v <- lookupBCEnv_maybe var p  -- var is a local variable
    = do dflags <- getDynFlags
-        -- Currently this code assumes that @szb@ is a multiple of full words.
-        -- It'll need to change to support, e.g., sub-word constructor fields.
-        let !szb = idSizeB dflags var
-            !szw = bytesToWords dflags szb -- szb is a multiple of words
-            l = trunc16W $ bytesToWords dflags (d - d_v) + szw - 1
-        return (toOL (genericReplicate szw (PUSH_L l)), szb)
+
+        let !szb = idSizeCon dflags var
+            with_instr instr = do
+                let !off_b = trunc16B $ d - d_v
+                return (unitOL (instr off_b), wordSize dflags)
+
+        case szb of
+            1 -> with_instr PUSH8_W
+            2 -> with_instr PUSH16_W
+            4 -> with_instr PUSH32_W
+            _ -> do
+                let !szw = bytesToWords dflags szb
+                    !off_w = trunc16W $ bytesToWords dflags (d - d_v) + szw - 1
+                return (toOL (genericReplicate szw (PUSH_L off_w)), szb)
         -- d - d_v           offset from TOS to the first slot of the object
         --
         -- d - d_v + sz - 1  offset from the TOS of the last slot of the object
@@ -1492,7 +1507,7 @@ pushAtom d p (AnnVar var)
               ptrToWordPtr $ fromRemotePtr ptr
             Nothing -> do
                 dflags <- getDynFlags
-                let sz = idSizeB dflags var
+                let sz = idSizeCon dflags var
                 MASSERT( sz == wordSize dflags )
                 return (unitOL (PUSH_G (getName var)), sz)
 
@@ -1525,6 +1540,36 @@ pushAtom _ _ expr
               (pprCoreExpr (deAnnotate' expr))
 
 
+-- | Push an atom for constructor (i.e., PACK instruction) onto the stack.
+-- This is slightly different to @pushAtom@ due to the fact that we allow
+-- packing constructor fields. See also @mkConAppCode@ and @pushPadding@.
+pushConstrAtom
+    :: StackDepth -> BCEnv -> AnnExpr' Id DVarSet -> BcM (BCInstrList, ByteOff)
+
+pushConstrAtom _ _ (AnnLit lit@(MachFloat _)) =
+    return (unitOL (PUSH_UBX32 lit), 4)
+
+pushConstrAtom d p (AnnVar v)
+    | Just d_v <- lookupBCEnv_maybe v p = do  -- v is a local variable
+        dflags <- getDynFlags
+        let !szb = idSizeCon dflags v
+            done instr = do
+                let !off = trunc16B $ d - d_v
+                return (unitOL (instr off), szb)
+        case szb of
+            1 -> done PUSH8
+            2 -> done PUSH16
+            4 -> done PUSH32
+            _ -> pushAtom d p (AnnVar v)
+
+pushConstrAtom d p expr = pushAtom d p expr
+
+pushPadding :: Int -> BcM (BCInstrList, ByteOff)
+pushPadding 1 = return (unitOL (PUSH_PAD8), 1)
+pushPadding 2 = return (unitOL (PUSH_PAD16), 2)
+pushPadding 4 = return (unitOL (PUSH_PAD32), 4)
+pushPadding x = panic $ "pushPadding x=" ++ show x
+
 -- -----------------------------------------------------------------------------
 -- Given a bunch of alts code and their discrs, do the donkey work
 -- of making a multiway branch using a switch tree.
@@ -1669,8 +1714,8 @@ lookupBCEnv_maybe = Map.lookup
 idSizeW :: DynFlags -> Id -> WordOff
 idSizeW dflags = WordOff . argRepSizeW dflags . bcIdArgRep
 
-idSizeB :: DynFlags -> Id -> ByteOff
-idSizeB dflags = wordsToBytes dflags . idSizeW dflags
+idSizeCon :: DynFlags -> Id -> ByteOff
+idSizeCon dflags = ByteOff . primRepSizeB dflags . bcIdPrimRep
 
 bcIdArgRep :: Id -> ArgRep
 bcIdArgRep = toArgRep . bcIdPrimRep
@@ -1682,6 +1727,9 @@ bcIdPrimRep id
   | otherwise
   = pprPanic "bcIdPrimRep" (ppr id <+> dcolon <+> ppr (idType id))
 
+repSizeWords :: DynFlags -> PrimRep -> WordOff
+repSizeWords dflags rep = WordOff $ argRepSizeW dflags (toArgRep rep)
+
 isFollowableArg :: ArgRep -> Bool
 isFollowableArg P = True
 isFollowableArg _ = False
diff --git a/compiler/ghci/ByteCodeInstr.hs b/compiler/ghci/ByteCodeInstr.hs
index 7ef82206cb..07dcd2222a 100644
--- a/compiler/ghci/ByteCodeInstr.hs
+++ b/compiler/ghci/ByteCodeInstr.hs
@@ -62,6 +62,23 @@ data BCInstr
    | PUSH_LL   !Word16 !Word16{-2 offsets-}
    | PUSH_LLL  !Word16 !Word16 !Word16{-3 offsets-}
 
+   -- Push the specified local as a 8, 16, 32 bit value onto the stack. (i.e.,
+   -- the stack will grow by 8, 16 or 32 bits)
+   | PUSH8  !Word16
+   | PUSH16 !Word16
+   | PUSH32 !Word16
+
+   -- Push the specifiec local as a 8, 16, 32 bit value onto the stack, but the
+   -- value will take the whole word on the stack (i.e., the stack will gorw by
+   -- a word)
+   -- This is useful when extracting a packed constructor field for further use.
+   -- Currently we expect all values on the stack to take full words, except for
+   -- the ones used for PACK (i.e., actually constracting new data types, in
+   -- which case we use PUSH{8,16,32})
+   | PUSH8_W  !Word16
+   | PUSH16_W !Word16
+   | PUSH32_W !Word16
+
    -- Push a ptr  (these all map to PUSH_G really)
    | PUSH_G       Name
    | PUSH_PRIMOP  PrimOp
@@ -71,8 +88,16 @@ data BCInstr
    | PUSH_ALTS          (ProtoBCO Name)
    | PUSH_ALTS_UNLIFTED (ProtoBCO Name) ArgRep
 
+   -- Pushing 8, 16 and 32 bits of padding (for constructors).
+   | PUSH_PAD8
+   | PUSH_PAD16
+   | PUSH_PAD32
+
    -- Pushing literals
-   | PUSH_UBX  Literal Word16
+   | PUSH_UBX8  Literal
+   | PUSH_UBX16 Literal
+   | PUSH_UBX32 Literal
+   | PUSH_UBX   Literal Word16
         -- push this int/float/double/addr, on the stack. Word16
         -- is # of words to copy from literal pool.  Eitherness reflects
         -- the difficulty of dealing with MachAddr here, mostly due to
@@ -194,6 +219,12 @@ instance Outputable BCInstr where
    ppr (PUSH_L offset)       = text "PUSH_L  " <+> ppr offset
    ppr (PUSH_LL o1 o2)       = text "PUSH_LL " <+> ppr o1 <+> ppr o2
    ppr (PUSH_LLL o1 o2 o3)   = text "PUSH_LLL" <+> ppr o1 <+> ppr o2 <+> ppr o3
+   ppr (PUSH8  offset)       = text "PUSH8  " <+> ppr offset
+   ppr (PUSH16 offset)       = text "PUSH16  " <+> ppr offset
+   ppr (PUSH32 offset)       = text "PUSH32  " <+> ppr offset
+   ppr (PUSH8_W  offset)     = text "PUSH8_W  " <+> ppr offset
+   ppr (PUSH16_W offset)     = text "PUSH16_W  " <+> ppr offset
+   ppr (PUSH32_W offset)     = text "PUSH32_W  " <+> ppr offset
    ppr (PUSH_G nm)           = text "PUSH_G  " <+> ppr nm
    ppr (PUSH_PRIMOP op)      = text "PUSH_G  " <+> text "GHC.PrimopWrappers."
                                                <> ppr op
@@ -201,6 +232,13 @@ instance Outputable BCInstr where
    ppr (PUSH_ALTS bco)       = hang (text "PUSH_ALTS") 2 (ppr bco)
    ppr (PUSH_ALTS_UNLIFTED bco pk) = hang (text "PUSH_ALTS_UNLIFTED" <+> ppr pk) 2 (ppr bco)
 
+   ppr PUSH_PAD8             = text "PUSH_PAD8"
+   ppr PUSH_PAD16            = text "PUSH_PAD16"
+   ppr PUSH_PAD32            = text "PUSH_PAD32"
+
+   ppr (PUSH_UBX8  lit)      = text "PUSH_UBX8" <+> ppr lit
+   ppr (PUSH_UBX16 lit)      = text "PUSH_UBX16" <+> ppr lit
+   ppr (PUSH_UBX32 lit)      = text "PUSH_UBX32" <+> ppr lit
    ppr (PUSH_UBX lit nw)     = text "PUSH_UBX" <+> parens (ppr nw) <+> ppr lit
    ppr PUSH_APPLY_N          = text "PUSH_APPLY_N"
    ppr PUSH_APPLY_V          = text "PUSH_APPLY_V"
@@ -269,11 +307,23 @@ bciStackUse STKCHECK{}            = 0
 bciStackUse PUSH_L{}              = 1
 bciStackUse PUSH_LL{}             = 2
 bciStackUse PUSH_LLL{}            = 3
+bciStackUse PUSH8{}               = 1  -- overapproximation
+bciStackUse PUSH16{}              = 1  -- overapproximation
+bciStackUse PUSH32{}              = 1  -- overapproximation on 64bit arch
+bciStackUse PUSH8_W{}             = 1  -- takes exactly 1 word
+bciStackUse PUSH16_W{}            = 1  -- takes exactly 1 word
+bciStackUse PUSH32_W{}            = 1  -- takes exactly 1 word
 bciStackUse PUSH_G{}              = 1
 bciStackUse PUSH_PRIMOP{}         = 1
 bciStackUse PUSH_BCO{}            = 1
 bciStackUse (PUSH_ALTS bco)       = 2 + protoBCOStackUse bco
 bciStackUse (PUSH_ALTS_UNLIFTED bco _) = 2 + protoBCOStackUse bco
+bciStackUse (PUSH_PAD8)           = 1  -- overapproximation
+bciStackUse (PUSH_PAD16)          = 1  -- overapproximation
+bciStackUse (PUSH_PAD32)          = 1  -- overapproximation on 64bit arch
+bciStackUse (PUSH_UBX8 _)         = 1  -- overapproximation
+bciStackUse (PUSH_UBX16 _)        = 1  -- overapproximation
+bciStackUse (PUSH_UBX32 _)        = 1  -- overapproximation on 64bit arch
 bciStackUse (PUSH_UBX _ nw)       = fromIntegral nw
 bciStackUse PUSH_APPLY_N{}        = 1
 bciStackUse PUSH_APPLY_V{}        = 1
diff --git a/compiler/ghci/RtClosureInspect.hs b/compiler/ghci/RtClosureInspect.hs
index 63d1886b4d..b85322d60e 100644
--- a/compiler/ghci/RtClosureInspect.hs
+++ b/compiler/ghci/RtClosureInspect.hs
@@ -60,6 +60,7 @@ import GHC.Arr          ( Array(..) )
 import GHC.Char
 import GHC.Exts
 import GHC.IO ( IO(..) )
+import SMRep ( roundUpTo )
 
 import Control.Monad
 import Data.Maybe
@@ -71,6 +72,7 @@ import Data.Sequence (viewl, ViewL(..))
 import Foreign
 import System.IO.Unsafe
 
+
 ---------------------------------------------
 -- * A representation of semi evaluated Terms
 ---------------------------------------------
@@ -148,11 +150,13 @@ data ClosureType = Constr
                  | Other  Int
  deriving (Show, Eq)
 
+data ClosureNonPtrs = ClosureNonPtrs ByteArray#
+
 data Closure = Closure { tipe         :: ClosureType
                        , infoPtr      :: Ptr ()
                        , infoTable    :: StgInfoTable
                        , ptrs         :: Array Int HValue
-                       , nonPtrs      :: [Word]
+                       , nonPtrs      :: ClosureNonPtrs
                        }
 
 instance Outputable ClosureType where
@@ -184,8 +188,7 @@ getClosureData dflags a =
            let tipe = readCType (InfoTable.tipe itbl)
                elems = fromIntegral (InfoTable.ptrs itbl)
                ptrsList = Array 0 (elems - 1) elems ptrs
-               nptrs_data = [W# (indexWordArray# nptrs i)
-                            | I# i <- [0.. fromIntegral (InfoTable.nptrs itbl)-1] ]
+               nptrs_data = ClosureNonPtrs nptrs
            ASSERT(elems >= 0) return ()
            ptrsList `seq`
             return (Closure tipe iptr0 itbl ptrsList nptrs_data)
@@ -793,47 +796,75 @@ cvObtainTerm hsc_env max_depth force old_ty hval = runTR hsc_env $ do
 
 extractSubTerms :: (Type -> HValue -> TcM Term)
                 -> Closure -> [Type] -> TcM [Term]
-extractSubTerms recurse clos = liftM thdOf3 . go 0 (nonPtrs clos)
+extractSubTerms recurse clos = liftM thdOf3 . go 0 0
   where
-    go ptr_i ws [] = return (ptr_i, ws, [])
-    go ptr_i ws (ty:tys)
+    !(ClosureNonPtrs array) = nonPtrs clos
+
+    go ptr_i arr_i [] = return (ptr_i, arr_i, [])
+    go ptr_i arr_i (ty:tys)
       | Just (tc, elem_tys) <- tcSplitTyConApp_maybe ty
       , isUnboxedTupleTyCon tc
                 -- See Note [Unboxed tuple RuntimeRep vars] in TyCon
-      = do (ptr_i, ws, terms0) <- go ptr_i ws (dropRuntimeRepArgs elem_tys)
-           (ptr_i, ws, terms1) <- go ptr_i ws tys
-           return (ptr_i, ws, unboxedTupleTerm ty terms0 : terms1)
+      = do (ptr_i, arr_i, terms0) <-
+               go ptr_i arr_i (dropRuntimeRepArgs elem_tys)
+           (ptr_i, arr_i, terms1) <- go ptr_i arr_i tys
+           return (ptr_i, arr_i, unboxedTupleTerm ty terms0 : terms1)
       | otherwise
       = case typePrimRepArgs ty of
           [rep_ty] ->  do
-            (ptr_i, ws, term0)  <- go_rep ptr_i ws ty rep_ty
-            (ptr_i, ws, terms1) <- go ptr_i ws tys
-            return (ptr_i, ws, term0 : terms1)
+            (ptr_i, arr_i, term0)  <- go_rep ptr_i arr_i ty rep_ty
+            (ptr_i, arr_i, terms1) <- go ptr_i arr_i tys
+            return (ptr_i, arr_i, term0 : terms1)
           rep_tys -> do
-           (ptr_i, ws, terms0) <- go_unary_types ptr_i ws rep_tys
-           (ptr_i, ws, terms1) <- go ptr_i ws tys
-           return (ptr_i, ws, unboxedTupleTerm ty terms0 : terms1)
+           (ptr_i, arr_i, terms0) <- go_unary_types ptr_i arr_i rep_tys
+           (ptr_i, arr_i, terms1) <- go ptr_i arr_i tys
+           return (ptr_i, arr_i, unboxedTupleTerm ty terms0 : terms1)
 
-    go_unary_types ptr_i ws [] = return (ptr_i, ws, [])
-    go_unary_types ptr_i ws (rep_ty:rep_tys) = do
+    go_unary_types ptr_i arr_i [] = return (ptr_i, arr_i, [])
+    go_unary_types ptr_i arr_i (rep_ty:rep_tys) = do
       tv <- newVar liftedTypeKind
-      (ptr_i, ws, term0)  <- go_rep ptr_i ws tv rep_ty
-      (ptr_i, ws, terms1) <- go_unary_types ptr_i ws rep_tys
-      return (ptr_i, ws, term0 : terms1)
-
-    go_rep ptr_i ws ty rep
-      | isGcPtrRep rep
-      = do t <- appArr (recurse ty) (ptrs clos) ptr_i
-           return (ptr_i + 1, ws, t)
-      | otherwise
-      = do dflags <- getDynFlags
-           let (ws0, ws1) = splitAt (primRepSizeW dflags rep) ws
-           return (ptr_i, ws1, Prim ty ws0)
+      (ptr_i, arr_i, term0)  <- go_rep ptr_i arr_i tv rep_ty
+      (ptr_i, arr_i, terms1) <- go_unary_types ptr_i arr_i rep_tys
+      return (ptr_i, arr_i, term0 : terms1)
+
+    go_rep ptr_i arr_i ty rep
+      | isGcPtrRep rep = do
+          t <- appArr (recurse ty) (ptrs clos) ptr_i
+          return (ptr_i + 1, arr_i, t)
+      | otherwise = do
+          -- This is a bit involved since we allow packing multiple fields
+          -- within a single word. See also
+          -- StgCmmLayout.mkVirtHeapOffsetsWithPadding
+          dflags <- getDynFlags
+          let word_size = wORD_SIZE dflags
+              size_b = primRepSizeB dflags rep
+              -- Fields are always aligned.
+              !aligned_idx = roundUpTo arr_i size_b
+              !new_arr_i = aligned_idx + size_b
+              ws
+                  | size_b < word_size = [index size_b array aligned_idx]
+                  | otherwise =
+                      let (q, r) = size_b `quotRem` word_size
+                      in ASSERT( r == 0 )
+                         [ W# (indexWordArray# array i)
+                         | o <- [0.. q - 1]
+                         , let !(I# i) = (aligned_idx + o) `quot` word_size
+                         ]
+          return (ptr_i, new_arr_i, Prim ty ws)
 
     unboxedTupleTerm ty terms
       = Term ty (Right (tupleDataCon Unboxed (length terms)))
                 (error "unboxedTupleTerm: no HValue for unboxed tuple") terms
 
+    index item_size_b  array (I# index_b) =
+        case item_size_b of
+            -- indexWord*Array# functions take offsets dependent not in bytes,
+            -- but in multiples of an element's size.
+            1 -> W# (indexWord8Array# array index_b)
+            2 -> W# (indexWord16Array# array (index_b `quotInt#` 2#))
+            4 -> W# (indexWord32Array# array (index_b `quotInt#` 4#))
+            _ -> panic ("Weird byte-index: " ++ show (I# index_b))
+
 
 -- Fast, breadth-first Type reconstruction
 ------------------------------------------
diff --git a/compiler/main/Constants.hs b/compiler/main/Constants.hs
index 06d93128e7..7eda130917 100644
--- a/compiler/main/Constants.hs
+++ b/compiler/main/Constants.hs
@@ -38,5 +38,9 @@ mAX_SOLVER_ITERATIONS = 4
 wORD64_SIZE :: Int
 wORD64_SIZE = 8
 
+-- Size of float in bytes.
+fLOAT_SIZE :: Int
+fLOAT_SIZE = 4
+
 tARGET_MAX_CHAR :: Int
 tARGET_MAX_CHAR = 0x10ffff
diff --git a/compiler/types/TyCon.hs b/compiler/types/TyCon.hs
index 103c824bca..596c5f37ef 100644
--- a/compiler/types/TyCon.hs
+++ b/compiler/types/TyCon.hs
@@ -114,7 +114,8 @@ module TyCon(
         -- * Primitive representations of Types
         PrimRep(..), PrimElemRep(..),
         isVoidRep, isGcPtrRep,
-        primRepSizeW, primElemRepSizeB,
+        primRepSizeB,
+        primElemRepSizeB,
         primRepIsFloat,
 
         -- * Recursion breaking
@@ -1340,19 +1341,25 @@ isGcPtrRep LiftedRep   = True
 isGcPtrRep UnliftedRep = True
 isGcPtrRep _           = False
 
--- | Find the size of a 'PrimRep', in words
-primRepSizeW :: DynFlags -> PrimRep -> Int
-primRepSizeW _      IntRep           = 1
-primRepSizeW _      WordRep          = 1
-primRepSizeW dflags Int64Rep         = wORD64_SIZE `quot` wORD_SIZE dflags
-primRepSizeW dflags Word64Rep        = wORD64_SIZE `quot` wORD_SIZE dflags
-primRepSizeW _      FloatRep         = 1    -- NB. might not take a full word
-primRepSizeW dflags DoubleRep        = dOUBLE_SIZE dflags `quot` wORD_SIZE dflags
-primRepSizeW _      AddrRep          = 1
-primRepSizeW _      LiftedRep        = 1
-primRepSizeW _      UnliftedRep      = 1
-primRepSizeW _      VoidRep          = 0
-primRepSizeW dflags (VecRep len rep) = len * primElemRepSizeB rep `quot` wORD_SIZE dflags
+-- | The size of a 'PrimRep' in bytes.
+--
+-- This applies also when used in a constructor, where we allow packing the
+-- fields. For instance, in @data Foo = Foo Float# Float#@ the two fields will
+-- take only 8 bytes, which for 64-bit arch will be equal to 1 word.
+-- See also mkVirtHeapOffsetsWithPadding for details of how data fields are
+-- layed out.
+primRepSizeB :: DynFlags -> PrimRep -> Int
+primRepSizeB dflags IntRep           = wORD_SIZE dflags
+primRepSizeB dflags WordRep          = wORD_SIZE dflags
+primRepSizeB _      Int64Rep         = wORD64_SIZE
+primRepSizeB _      Word64Rep        = wORD64_SIZE
+primRepSizeB _      FloatRep         = fLOAT_SIZE
+primRepSizeB dflags DoubleRep        = dOUBLE_SIZE dflags
+primRepSizeB dflags AddrRep          = wORD_SIZE dflags
+primRepSizeB dflags LiftedRep        = wORD_SIZE dflags
+primRepSizeB dflags UnliftedRep      = wORD_SIZE dflags
+primRepSizeB _      VoidRep          = 0
+primRepSizeB _      (VecRep len rep) = len * primElemRepSizeB rep
 
 primElemRepSizeB :: PrimElemRep -> Int
 primElemRepSizeB Int8ElemRep   = 1
diff --git a/includes/rts/Bytecodes.h b/includes/rts/Bytecodes.h
index 6ca74bf36e..e5d55f694f 100644
--- a/includes/rts/Bytecodes.h
+++ b/includes/rts/Bytecodes.h
@@ -27,58 +27,70 @@
 #define bci_PUSH_L    			2
 #define bci_PUSH_LL   			3
 #define bci_PUSH_LLL  			4
-#define bci_PUSH_G    			5
-#define bci_PUSH_ALTS  			6
-#define bci_PUSH_ALTS_P			7
-#define bci_PUSH_ALTS_N			8 
-#define bci_PUSH_ALTS_F			9 
-#define bci_PUSH_ALTS_D			10
-#define bci_PUSH_ALTS_L			11
-#define bci_PUSH_ALTS_V			12
-#define bci_PUSH_UBX  			13
-#define bci_PUSH_APPLY_N		14
-#define bci_PUSH_APPLY_F		15
-#define bci_PUSH_APPLY_D		16
-#define bci_PUSH_APPLY_L		17
-#define bci_PUSH_APPLY_V		18
-#define bci_PUSH_APPLY_P		19
-#define bci_PUSH_APPLY_PP		20
-#define bci_PUSH_APPLY_PPP		21
-#define bci_PUSH_APPLY_PPPP		22
-#define bci_PUSH_APPLY_PPPPP		23
-#define bci_PUSH_APPLY_PPPPPP		24
-/* #define bci_PUSH_APPLY_PPPPPPP		25 */
-#define bci_SLIDE     			26
-#define bci_ALLOC_AP   			27
-#define bci_ALLOC_AP_NOUPD		28
-#define bci_ALLOC_PAP  			29
-#define bci_MKAP      			30
-#define bci_MKPAP      			31
-#define bci_UNPACK    			32
-#define bci_PACK      			33
-#define bci_TESTLT_I   			34
-#define bci_TESTEQ_I  			35
-#define bci_TESTLT_F  			36
-#define bci_TESTEQ_F  			37
-#define bci_TESTLT_D  			38
-#define bci_TESTEQ_D  			39
-#define bci_TESTLT_P  			40
-#define bci_TESTEQ_P  			41
-#define bci_CASEFAIL  			42
-#define bci_JMP       			43
-#define bci_CCALL     			44
-#define bci_SWIZZLE   			45
-#define bci_ENTER     			46
-#define bci_RETURN    			47
-#define bci_RETURN_P 			48
-#define bci_RETURN_N 			49
-#define bci_RETURN_F 			50
-#define bci_RETURN_D 			51
-#define bci_RETURN_L 			52
-#define bci_RETURN_V 			53
-#define bci_BRK_FUN			54
-#define bci_TESTLT_W   			55
-#define bci_TESTEQ_W  			56
+#define bci_PUSH8                       5
+#define bci_PUSH16                      6
+#define bci_PUSH32                      7
+#define bci_PUSH8_W                     8
+#define bci_PUSH16_W                    9
+#define bci_PUSH32_W                    10
+#define bci_PUSH_G    			11
+#define bci_PUSH_ALTS  			12
+#define bci_PUSH_ALTS_P			13
+#define bci_PUSH_ALTS_N			14
+#define bci_PUSH_ALTS_F			15
+#define bci_PUSH_ALTS_D			16
+#define bci_PUSH_ALTS_L			17
+#define bci_PUSH_ALTS_V			18
+#define bci_PUSH_PAD8                   19
+#define bci_PUSH_PAD16                  20
+#define bci_PUSH_PAD32                  21
+#define bci_PUSH_UBX8                   22
+#define bci_PUSH_UBX16                  23
+#define bci_PUSH_UBX32                  24
+#define bci_PUSH_UBX  			25
+#define bci_PUSH_APPLY_N		26
+#define bci_PUSH_APPLY_F		27
+#define bci_PUSH_APPLY_D		28
+#define bci_PUSH_APPLY_L		29
+#define bci_PUSH_APPLY_V		30
+#define bci_PUSH_APPLY_P		31
+#define bci_PUSH_APPLY_PP		32
+#define bci_PUSH_APPLY_PPP		33
+#define bci_PUSH_APPLY_PPPP		34
+#define bci_PUSH_APPLY_PPPPP		35
+#define bci_PUSH_APPLY_PPPPPP		36
+/* #define bci_PUSH_APPLY_PPPPPPP		37 */
+#define bci_SLIDE     			38
+#define bci_ALLOC_AP   			39
+#define bci_ALLOC_AP_NOUPD		40
+#define bci_ALLOC_PAP  			41
+#define bci_MKAP      			42
+#define bci_MKPAP      			43
+#define bci_UNPACK    			44
+#define bci_PACK      			45
+#define bci_TESTLT_I   			46
+#define bci_TESTEQ_I  			47
+#define bci_TESTLT_F  			48
+#define bci_TESTEQ_F  			49
+#define bci_TESTLT_D  			50
+#define bci_TESTEQ_D  			51
+#define bci_TESTLT_P  			52
+#define bci_TESTEQ_P  			53
+#define bci_CASEFAIL  			54
+#define bci_JMP       			55
+#define bci_CCALL     			56
+#define bci_SWIZZLE   			57
+#define bci_ENTER     			58
+#define bci_RETURN    			59
+#define bci_RETURN_P 			60
+#define bci_RETURN_N 			61
+#define bci_RETURN_F 			62
+#define bci_RETURN_D 			63
+#define bci_RETURN_L 			64
+#define bci_RETURN_V 			65
+#define bci_BRK_FUN			66
+#define bci_TESTLT_W   			67
+#define bci_TESTEQ_W  			68
 /* If you need to go past 255 then you will run into the flags */
 
 /* If you need to go below 0x0100 then you will run into the instructions */
diff --git a/includes/stg/Types.h b/includes/stg/Types.h
index af6a51791c..91ad446993 100644
--- a/includes/stg/Types.h
+++ b/includes/stg/Types.h
@@ -68,6 +68,8 @@ typedef uint8_t                  StgWord8;
 #define STG_INT8_MAX             INT8_MAX
 #define STG_WORD8_MAX            UINT8_MAX
 
+#define FMT_Word8                PRIu8
+
 typedef int16_t                  StgInt16;
 typedef uint16_t                 StgWord16;
 
@@ -75,6 +77,8 @@ typedef uint16_t                 StgWord16;
 #define STG_INT16_MAX            INT16_MAX
 #define STG_WORD16_MAX           UINT16_MAX
 
+#define FMT_Word16               PRIu16
+
 typedef int32_t                  StgInt32;
 typedef uint32_t                 StgWord32;
 
diff --git a/rts/Disassembler.c b/rts/Disassembler.c
index e133e3a6ff..8c84e13ef3 100644
--- a/rts/Disassembler.c
+++ b/rts/Disassembler.c
@@ -94,11 +94,28 @@ disInstr ( StgBCO *bco, int pc )
          debugBelch("PUSH_LLL %d %d %d\n", instrs[pc], instrs[pc+1],
                                                             instrs[pc+2] );
          pc += 3; break;
+      case bci_PUSH8:
+         debugBelch("PUSH8    %d\n", instrs[pc] );
+         pc += 1; break;
+      case bci_PUSH16:
+         debugBelch("PUSH16   %d\n", instrs[pc] );
+         pc += 1; break;
+      case bci_PUSH32:
+         debugBelch("PUSH32   %d\n", instrs[pc] );
+         pc += 1; break;
+      case bci_PUSH8_W:
+         debugBelch("PUSH8_W  %d\n", instrs[pc] );
+         pc += 1; break;
+      case bci_PUSH16_W:
+         debugBelch("PUSH16_W %d\n", instrs[pc] );
+         pc += 1; break;
+      case bci_PUSH32_W:
+         debugBelch("PUSH32_W %d\n", instrs[pc] );
+         pc += 1; break;
       case bci_PUSH_G:
          debugBelch("PUSH_G   " ); printPtr( ptrs[instrs[pc]] );
          debugBelch("\n" );
          pc += 1; break;
-
       case bci_PUSH_ALTS:
          debugBelch("PUSH_ALTS  " ); printPtr( ptrs[instrs[pc]] );
          debugBelch("\n");
@@ -127,7 +144,33 @@ disInstr ( StgBCO *bco, int pc )
          debugBelch("PUSH_ALTS_V  " ); printPtr( ptrs[instrs[pc]] );
          debugBelch("\n");
          pc += 1; break;
-
+      case bci_PUSH_PAD8:
+         debugBelch("PUSH_PAD8\n");
+         pc += 1; break;
+      case bci_PUSH_PAD16:
+         debugBelch("PUSH_PAD16\n");
+         pc += 1; break;
+      case bci_PUSH_PAD32:
+         debugBelch("PUSH_PAD32\n");
+         pc += 1; break;
+      case bci_PUSH_UBX8:
+         debugBelch(
+             "PUSH_UBX8 0x%" FMT_Word8 " ",
+             (StgWord8) literals[instrs[pc]] );
+         debugBelch("\n");
+         pc += 1; break;
+      case bci_PUSH_UBX16:
+         debugBelch(
+             "PUSH_UBX16 0x%" FMT_Word16 " ",
+             (StgWord16) literals[instrs[pc]] );
+         debugBelch("\n");
+         pc += 1; break;
+      case bci_PUSH_UBX32:
+         debugBelch(
+             "PUSH_UBX32 0x%" FMT_Word32 " ",
+             (StgWord32) literals[instrs[pc]] );
+         debugBelch("\n");
+         pc += 1; break;
       case bci_PUSH_UBX:
          debugBelch("PUSH_UBX ");
          for (i = 0; i < instrs[pc+1]; i++)
diff --git a/rts/Interpreter.c b/rts/Interpreter.c
index 165511b24c..0e80593d07 100644
--- a/rts/Interpreter.c
+++ b/rts/Interpreter.c
@@ -1181,6 +1181,48 @@ run_BCO:
             goto nextInsn;
         }
 
+        case bci_PUSH8: {
+            int off = BCO_NEXT;
+            Sp_subB(1);
+            *(StgWord8*)Sp = *(StgWord8*)(Sp_plusB(off+1));
+            goto nextInsn;
+        }
+
+        case bci_PUSH16: {
+            int off = BCO_NEXT;
+            Sp_subB(2);
+            *(StgWord16*)Sp = *(StgWord16*)(Sp_plusB(off+2));
+            goto nextInsn;
+        }
+
+        case bci_PUSH32: {
+            int off = BCO_NEXT;
+            Sp_subB(4);
+            *(StgWord32*)Sp = *(StgWord32*)(Sp_plusB(off+4));
+            goto nextInsn;
+        }
+
+        case bci_PUSH8_W: {
+            int off = BCO_NEXT;
+            *(StgWord*)(Sp_minusW(1)) = *(StgWord8*)(Sp_plusB(off));
+            Sp_subW(1);
+            goto nextInsn;
+        }
+
+        case bci_PUSH16_W: {
+            int off = BCO_NEXT;
+            *(StgWord*)(Sp_minusW(1)) = *(StgWord16*)(Sp_plusB(off));
+            Sp_subW(1);
+            goto nextInsn;
+        }
+
+        case bci_PUSH32_W: {
+            int off = BCO_NEXT;
+            *(StgWord*)(Sp_minusW(1)) = *(StgWord32*)(Sp_plusB(off));
+            Sp_subW(1);
+            goto nextInsn;
+        }
+
         case bci_PUSH_G: {
             int o1 = BCO_GET_LARGE_ARG;
             SpW(-1) = BCO_PTR(o1);
@@ -1313,6 +1355,45 @@ run_BCO:
             Sp_subW(1); SpW(0) = (W_)&stg_ap_pppppp_info;
             goto nextInsn;
 
+        case bci_PUSH_PAD8: {
+            Sp_subB(1);
+            *(StgWord8*)Sp = 0;
+            goto nextInsn;
+        }
+
+        case bci_PUSH_PAD16: {
+            Sp_subB(2);
+            *(StgWord16*)Sp = 0;
+            goto nextInsn;
+        }
+
+        case bci_PUSH_PAD32: {
+            Sp_subB(4);
+            *(StgWord32*)Sp = 0;
+            goto nextInsn;
+        }
+
+        case bci_PUSH_UBX8: {
+            int o_lit = BCO_GET_LARGE_ARG;
+            Sp_subB(1);
+            *(StgWord8*)Sp = *(StgWord8*)(literals+o_lit);
+            goto nextInsn;
+        }
+
+        case bci_PUSH_UBX16: {
+            int o_lit = BCO_GET_LARGE_ARG;
+            Sp_subB(2);
+            *(StgWord16*)Sp = *(StgWord16*)(literals+o_lit);
+            goto nextInsn;
+        }
+
+        case bci_PUSH_UBX32: {
+            int o_lit = BCO_GET_LARGE_ARG;
+            Sp_subB(4);
+            *(StgWord32*)Sp = *(StgWord32*)(literals+o_lit);
+            goto nextInsn;
+        }
+
         case bci_PUSH_UBX: {
             int i;
             int o_lits = BCO_GET_LARGE_ARG;
diff --git a/testsuite/tests/codeGen/should_run/T13825-unit.hs b/testsuite/tests/codeGen/should_run/T13825-unit.hs
new file mode 100644
index 0000000000..bd3d7fbb33
--- /dev/null
+++ b/testsuite/tests/codeGen/should_run/T13825-unit.hs
@@ -0,0 +1,78 @@
+module Main where
+
+import DynFlags
+import RepType
+import SMRep
+import StgCmmLayout
+import StgCmmClosure
+import GHC
+import GhcMonad
+import System.Environment
+import Platform
+
+main :: IO ()
+main = do
+    [libdir] <- getArgs
+    runGhc (Just libdir) tests
+
+
+-- How to read tests:
+--   F(a,8) = field a at offset 8
+--   P(4,8) = 4 bytes of padding at offset 8
+tests :: Ghc ()
+tests = do
+      (_, _, off) <- runTest [("a", FloatRep), ("b", DoubleRep)]
+      assert_32_64 (map fmt off)
+          ["F(a,4)", "F(b,8)"]
+          ["F(a,8)", "P(4,12)", "F(b,16)"]
+
+      (_, _, off) <- runTest [("a", FloatRep), ("b", FloatRep)]
+      assert_32_64 (map fmt off)
+          ["F(a,4)", "F(b,8)"]
+          ["F(a,8)", "F(b,12)"]
+
+      (_, _, off) <- runTest [("a", FloatRep), ("b", FloatRep), ("c", FloatRep)]
+      assert_32_64 (map fmt off)
+          ["F(a,4)", "F(b,8)", "F(c,12)"]
+          ["F(a,8)", "F(b,12)", "F(c,16)", "P(4,20)"]
+
+      (_, _, off) <- runTest [("a", FloatRep), ("b", FloatRep), ("c", Int64Rep)]
+      assert_32_64 (map fmt off)
+          ["F(a,4)", "F(b,8)", "F(c,12)"]
+          ["F(a,8)", "F(b,12)", "F(c,16)"]
+
+      (_, _, off) <- runTest [("a", Int64Rep), ("b", FloatRep), ("c", FloatRep)]
+      assert_32_64 (map fmt off)
+          ["F(a,4)", "F(b,12)", "F(c,16)"]
+          ["F(a,8)", "F(b,16)", "F(c,20)"]
+
+      (_, _, off) <- runTest [("a", Int64Rep), ("b", FloatRep), ("c", Int64Rep)]
+      assert_32_64 (map fmt off)
+          ["F(a,4)", "F(b,12)", "F(c,16)"]
+          ["F(a,8)", "F(b,16)", "P(4,20)", "F(c,24)"]
+
+
+assert_32_64 :: (Eq a, Show a) => a -> a -> a -> Ghc ()
+assert_32_64 actual expected32 expected64 = do
+    dflags <- getDynFlags
+    let
+      expected
+          | word_size == 4 = expected32
+          | word_size == 8 = expected64
+      word_size = wORD_SIZE dflags
+    case actual == expected of
+        True -> return ()
+        False ->
+            error $ "Expected:\n" ++ show expected
+                 ++ "\nBut got:\n" ++ show actual
+
+runTest :: [(a, PrimRep)] -> Ghc (WordOff , WordOff, [FieldOffOrPadding a])
+runTest prim_reps = do
+    dflags <- getDynFlags
+    return $ mkVirtHeapOffsetsWithPadding dflags False (mkNonVoids prim_reps)
+  where
+    mkNonVoids = map (\(a, prim_rep) -> NonVoid (prim_rep, a))
+
+fmt :: FieldOffOrPadding String -> String
+fmt (FieldOff (NonVoid id) off) = "F(" ++ id ++ "," ++ show off ++ ")"
+fmt (Padding len off) = "P(" ++ show len ++ "," ++ show off ++ ")"
diff --git a/testsuite/tests/codeGen/should_run/all.T b/testsuite/tests/codeGen/should_run/all.T
index 6aacea5fa3..214a9d5704 100644
--- a/testsuite/tests/codeGen/should_run/all.T
+++ b/testsuite/tests/codeGen/should_run/all.T
@@ -159,3 +159,7 @@ test('T9577', [ unless(arch('x86_64') or arch('i386'),skip),
 
 test('T13425', normal, compile_and_run, ['-O'])
 test('castFloatWord', normal, compile_and_run, ['-dcmm-lint'])
+test('T13825-unit',
+     extra_run_opts('"' + config.libdir + '"'),
+     compile_and_run,
+     ['-package ghc'])
diff --git a/testsuite/tests/ghci.debugger/scripts/T13825-debugger.hs b/testsuite/tests/ghci.debugger/scripts/T13825-debugger.hs
new file mode 100644
index 0000000000..0c3a1de219
--- /dev/null
+++ b/testsuite/tests/ghci.debugger/scripts/T13825-debugger.hs
@@ -0,0 +1,33 @@
+{-# LANGUAGE MagicHash #-}
+module T13825 where
+
+import GHC.Exts
+import Data.Word
+import Data.Int
+
+data Packed1 = Packed1 Float# Float# Int# Float#
+    deriving Show
+
+data Packed2 =
+    Packed2
+        {-# UNPACK #-} !Float
+        {-# UNPACK #-} !Float
+        {-# UNPACK #-} !Int
+        {-# UNPACK #-} !Float
+    deriving Show
+
+data Packed3 =
+    Packed3
+        {-# UNPACK #-} !Word8
+        {-# UNPACK #-} !Int8
+        {-# UNPACK #-} !Int64
+        {-# UNPACK #-} !Word16
+        {-# UNPACK #-} !Word64
+        {-# UNPACK #-} !Word32
+        {-# UNPACK #-} !Float
+        {-# UNPACK #-} !Double
+    deriving Show
+
+packed1 = Packed1 12.34# 56.78# 42# 99.99#
+packed2 = Packed2 12.34 56.78 42 99.99
+packed3 = Packed3 1 2 3 4 5 6 7.8 9.0
diff --git a/testsuite/tests/ghci.debugger/scripts/T13825-debugger.script b/testsuite/tests/ghci.debugger/scripts/T13825-debugger.script
new file mode 100644
index 0000000000..fc55ffc5dd
--- /dev/null
+++ b/testsuite/tests/ghci.debugger/scripts/T13825-debugger.script
@@ -0,0 +1,7 @@
+:l T13825-debugger.hs
+packed1
+:print packed1
+packed2
+:print packed2
+packed3
+:print packed3
diff --git a/testsuite/tests/ghci.debugger/scripts/T13825-debugger.stdout b/testsuite/tests/ghci.debugger/scripts/T13825-debugger.stdout
new file mode 100644
index 0000000000..6d3dc2f560
--- /dev/null
+++ b/testsuite/tests/ghci.debugger/scripts/T13825-debugger.stdout
@@ -0,0 +1,8 @@
+Packed1 12.34# 56.78# 42# 99.99#
+packed1 = Packed1 12.34 56.78 42 99.99
+Packed2 12.34 56.78 42 99.99
+packed2 = Packed2 12.34 56.78 42 99.99
+Packed3 1 2 3 4 5 6 7.8 9.0
+packed3 = Packed3
+            (GHC.Word.W8# 1) (GHC.Int.I8# 2) (GHC.Int.I64# 3) (GHC.Word.W16# 4)
+            (GHC.Word.W64# 5) (GHC.Word.W32# 6) 7.8 9.0
diff --git a/testsuite/tests/ghci.debugger/scripts/all.T b/testsuite/tests/ghci.debugger/scripts/all.T
index 00a39d704e..de3e7e37b2 100644
--- a/testsuite/tests/ghci.debugger/scripts/all.T
+++ b/testsuite/tests/ghci.debugger/scripts/all.T
@@ -95,3 +95,4 @@ test('getargs', extra_files(['../getargs.hs']), ghci_script, ['getargs.script'])
 test('T7386', normal, ghci_script, ['T7386.script'])
 test('T8557', normal, ghci_script, ['T8557.script'])
 test('T12458', normal, ghci_script, ['T12458.script'])
+test('T13825-debugger', normal, ghci_script, ['T13825-debugger.script'])
diff --git a/testsuite/tests/ghci/should_run/T13825-ghci.hs b/testsuite/tests/ghci/should_run/T13825-ghci.hs
new file mode 100644
index 0000000000..959cc7dc5b
--- /dev/null
+++ b/testsuite/tests/ghci/should_run/T13825-ghci.hs
@@ -0,0 +1,38 @@
+module T13825 where
+
+import Data.Int
+import Data.Word
+
+data Packed =
+    Packed
+      {-# UNPACK #-} !Float
+      {-# UNPACK #-} !Float
+      {-# UNPACK #-} !Int8
+      {-# UNPACK #-} !Word16
+      {-# UNPACK #-} !Float
+      {-# UNPACK #-} !Int
+  deriving (Show)
+
+-- Test a top-level constant
+packed :: Packed
+packed = Packed 1.0 2.0 3 4 5 6
+
+packedAll :: [Packed]
+packedAll =
+    packed :
+    [ Packed
+        (fromIntegral i)
+        (fromIntegral (i + 1))
+        (fromIntegral (i + 2))
+        (fromIntegral (i + 3))
+        (fromIntegral (i + 3))
+        (fromIntegral (i + 4))
+    | i <- [1.. 4]
+    ]
+
+addOne :: Packed -> Packed
+addOne (Packed a b c d e f) =
+    Packed (a + 1.0) (b + 1.0) (c + 1) (d + 1) (e + 1.0) (f + 1)
+
+mapAddOne :: [Packed] -> [Packed]
+mapAddOne = map addOne
diff --git a/testsuite/tests/ghci/should_run/T13825-ghci.script b/testsuite/tests/ghci/should_run/T13825-ghci.script
new file mode 100644
index 0000000000..6cd22d9a1c
--- /dev/null
+++ b/testsuite/tests/ghci/should_run/T13825-ghci.script
@@ -0,0 +1,13 @@
+:l T13825-ghci
+let ghciPacked = Packed 1.0 2.0 3 4 5 6
+map addOne (ghciPacked : packedAll)
+let ghciAddOne (Packed a b c d e f) = Packed (a + 1.0) (b + 1.0) (c + 1) (d + 1) (e + 1.0) (f + 1)
+map ghciAddOne (ghciPacked : packedAll)
+
+:set -fobject-code
+:l T13825-ghci
+:set -fbyte-code
+let ghciPacked = Packed 1.0 2.0 3 4 5 6
+map addOne (ghciPacked : packedAll)
+let ghciAddOne (Packed a b c d e f) = Packed (a + 1.0) (b + 1.0) (c + 1) (d + 1) (e + 1.0) (f + 1)
+map ghciAddOne (ghciPacked : packedAll)
diff --git a/testsuite/tests/ghci/should_run/T13825-ghci.stdout b/testsuite/tests/ghci/should_run/T13825-ghci.stdout
new file mode 100644
index 0000000000..4edee56c11
--- /dev/null
+++ b/testsuite/tests/ghci/should_run/T13825-ghci.stdout
@@ -0,0 +1,4 @@
+[Packed 2.0 3.0 4 5 6.0 7,Packed 2.0 3.0 4 5 6.0 7,Packed 2.0 3.0 4 5 5.0 6,Packed 3.0 4.0 5 6 6.0 7,Packed 4.0 5.0 6 7 7.0 8,Packed 5.0 6.0 7 8 8.0 9]
+[Packed 2.0 3.0 4 5 6.0 7,Packed 2.0 3.0 4 5 6.0 7,Packed 2.0 3.0 4 5 5.0 6,Packed 3.0 4.0 5 6 6.0 7,Packed 4.0 5.0 6 7 7.0 8,Packed 5.0 6.0 7 8 8.0 9]
+[Packed 2.0 3.0 4 5 6.0 7,Packed 2.0 3.0 4 5 6.0 7,Packed 2.0 3.0 4 5 5.0 6,Packed 3.0 4.0 5 6 6.0 7,Packed 4.0 5.0 6 7 7.0 8,Packed 5.0 6.0 7 8 8.0 9]
+[Packed 2.0 3.0 4 5 6.0 7,Packed 2.0 3.0 4 5 6.0 7,Packed 2.0 3.0 4 5 5.0 6,Packed 3.0 4.0 5 6 6.0 7,Packed 4.0 5.0 6 7 7.0 8,Packed 5.0 6.0 7 8 8.0 9]
diff --git a/testsuite/tests/ghci/should_run/all.T b/testsuite/tests/ghci/should_run/all.T
index da20149b56..c64b0e7026 100644
--- a/testsuite/tests/ghci/should_run/all.T
+++ b/testsuite/tests/ghci/should_run/all.T
@@ -29,3 +29,4 @@ test('T12456',     just_ghci, ghci_script, ['T12456.script'])
 test('T12549',     just_ghci, ghci_script, ['T12549.script'])
 test('BinaryArray', normal, compile_and_run, [''])
 test('T14125a',    just_ghci, ghci_script, ['T14125a.script'])
+test('T13825-ghci',just_ghci, ghci_script, ['T13825-ghci.script'])
diff --git a/testsuite/tests/primops/should_run/T13825-compile.hs b/testsuite/tests/primops/should_run/T13825-compile.hs
new file mode 100644
index 0000000000..04a72b38e9
--- /dev/null
+++ b/testsuite/tests/primops/should_run/T13825-compile.hs
@@ -0,0 +1,66 @@
+{-# LANGUAGE MagicHash #-}
+module Main where
+
+import GHC.Exts
+import Data.Word
+import Data.Int
+
+data Packed1 = Packed1 Float# Float# Int# Float#
+    deriving Show
+
+data Packed2 =
+    Packed2
+        {-# UNPACK #-} !Float
+        {-# UNPACK #-} !Float
+        {-# UNPACK #-} !Int
+        {-# UNPACK #-} !Float
+    deriving Show
+
+data Packed3 =
+    Packed3
+        {-# UNPACK #-} !Word8
+        {-# UNPACK #-} !Int8
+        {-# UNPACK #-} !Int64
+        {-# UNPACK #-} !Float
+        {-# UNPACK #-} !Word64
+        {-# UNPACK #-} !Word32
+        {-# UNPACK #-} !Float
+        {-# UNPACK #-} !Double
+    deriving Show
+
+packed1 = go 0.0# 1.0# 2# 3.0#
+  where
+    go a b c d =
+        Packed1 a b c d
+            : go (a `plusFloat#` 1.0#)
+                 (b `plusFloat#` 1.0#)
+               (c +# 1#)
+               (d `plusFloat#` 1.0#)
+
+packed2 =
+    [ Packed2
+        (fromIntegral i)
+        (fromIntegral (i + 1))
+        (fromIntegral (i + 2))
+        (fromIntegral (i + 3))
+    | i <- [0..]
+    ]
+
+packed3 =
+    [ Packed3
+        (fromIntegral i)
+        (fromIntegral (i + 1))
+        (fromIntegral (i + 2))
+        (fromIntegral (i + 3))
+        (fromIntegral (i + 4))
+        (fromIntegral (i + 5))
+        (fromIntegral (i + 6))
+        (fromIntegral (i + 6))
+    | i <- [0..]
+    ]
+
+main :: IO ()
+main = do
+    print (take 3 packed1)
+    print (take 3 packed2)
+    print (take 3 packed3)
diff --git a/testsuite/tests/primops/should_run/T13825-compile.stdout b/testsuite/tests/primops/should_run/T13825-compile.stdout
new file mode 100644
index 0000000000..41a5fb1368
--- /dev/null
+++ b/testsuite/tests/primops/should_run/T13825-compile.stdout
@@ -0,0 +1,3 @@
+[Packed1 0.0# 1.0# 2# 3.0#,Packed1 1.0# 2.0# 3# 4.0#,Packed1 2.0# 3.0# 4# 5.0#]
+[Packed2 0.0 1.0 2 3.0,Packed2 1.0 2.0 3 4.0,Packed2 2.0 3.0 4 5.0]
+[Packed3 0 1 2 3.0 4 5 6.0 6.0,Packed3 1 2 3 4.0 5 6 7.0 7.0,Packed3 2 3 4 5.0 6 7 8.0 8.0]
diff --git a/testsuite/tests/primops/should_run/all.T b/testsuite/tests/primops/should_run/all.T
index 68a2d5609f..30e871ac11 100644
--- a/testsuite/tests/primops/should_run/all.T
+++ b/testsuite/tests/primops/should_run/all.T
@@ -13,3 +13,4 @@ test('T10678',
      ],
      compile_and_run, ['-O'])
 test('T11296', normal, compile_and_run, [''])
+test('T13825-compile', normal, compile_and_run, [''])