9 files changed, 310 insertions, 49 deletions
diff --git a/libraries/base/GHC/Base.hs b/libraries/base/GHC/Base.hs
index cd0c7e2c15..dae86a9484 100644
--- a/libraries/base/GHC/Base.hs
+++ b/libraries/base/GHC/Base.hs
@@ -1637,7 +1637,13 @@ a `iShiftRL#` b | isTrue# (b >=# WORD_SIZE_IN_BITS#) = 0#
 "unpack-list"  [1]  forall a   . unpackFoldrCString# a (:) [] = unpackCString# a
 "unpack-append"     forall a n . unpackFoldrCString# a (:) n  = unpackAppendCString# a n
 
--- There's a built-in rule (in PrelRules.hs) for
+"unpack-utf8"       [~1] forall a   . unpackCStringUtf8# a             = build (unpackFoldrCStringUtf8# a)
+"unpack-list-utf8"  [1]  forall a   . unpackFoldrCStringUtf8# a (:) [] = unpackCStringUtf8# a
+"unpack-append-utf8"     forall a n . unpackFoldrCStringUtf8# a (:) n  = unpackAppendCStringUtf8# a n
+
+-- There's a built-in rule (in GHC.Core.Op.ConstantFold) for
 --      unpackFoldr "foo" c (unpackFoldr "baz" c n)  =  unpackFoldr "foobaz" c n
 
+-- See also the Note [String literals in GHC] in CString.hs
+
   #-}
diff --git a/libraries/base/GHC/List.hs b/libraries/base/GHC/List.hs
index 6f6d9d670a..f75b9426fb 100644
--- a/libraries/base/GHC/List.hs
+++ b/libraries/base/GHC/List.hs
@@ -860,7 +860,7 @@ elem _ []       = False
 elem x (y:ys)   = x==y || elem x ys
 {-# NOINLINE [1] elem #-}
 {-# RULES
-"elem/build"    forall x (g :: forall b . Eq a => (a -> b -> b) -> b -> b)
+"elem/build"    forall x (g :: forall b . (a -> b -> b) -> b -> b)
    . elem x (build g) = g (\ y r -> (x == y) || r) False
  #-}
 #endif
@@ -874,7 +874,7 @@ notElem _ []    =  True
 notElem x (y:ys)=  x /= y && notElem x ys
 {-# NOINLINE [1] notElem #-}
 {-# RULES
-"notElem/build" forall x (g :: forall b . Eq a => (a -> b -> b) -> b -> b)
+"notElem/build" forall x (g :: forall b . (a -> b -> b) -> b -> b)
    . notElem x (build g) = g (\ y r -> (x /= y) && r) True
  #-}
 #endif
diff --git a/libraries/base/changelog.md b/libraries/base/changelog.md
index 953f80740d..a760f97970 100644
--- a/libraries/base/changelog.md
+++ b/libraries/base/changelog.md
@@ -1,5 +1,18 @@
 # Changelog for [`base` package](http://hackage.haskell.org/package/base)
 
+## 4.14.2.0 *Mar 2021*
+
+  * Add `Generic` instances to `Fingerprint`, `GiveGCStats`, `GCFlags`,
+    `ConcFlags`, `DebugFlags`, `CCFlags`, `DoHeapProfile`, `ProfFlags`,
+    `DoTrace`, `TraceFlags`, `TickyFlags`, `ParFlags`, `RTSFlags`, `RTSStats`,
+    `GCStats`, `ByteOrder`, `GeneralCategory`, `SrcLoc`
+
+  * Add rules `unpackUtf8`, `unpack-listUtf8` and `unpack-appendUtf8` to `GHC.Base`.
+    They correspond to their ascii versions and hopefully make it easier
+    for libraries to handle utf8 encoded strings efficiently.
+
+  * An issue with list fusion and `elem` was fixed. `elem` applied to known
+    small lists will now compile to a simple case statement more often.
 ## 4.14.1.0 *Jul 2020*
 
   * Bundled with GHC 8.10.2
diff --git a/libraries/base/tests/perf/Makefile b/libraries/base/tests/perf/Makefile
new file mode 100644
index 0000000000..28840dc0b9
--- /dev/null
+++ b/libraries/base/tests/perf/Makefile
@@ -0,0 +1,15 @@
+# This Makefile runs the tests using GHC's testsuite framework.  It
+# assumes the package is part of a GHC build tree with the testsuite
+# installed in ../../../testsuite.
+
+TOP=../../../../testsuite
+include $(TOP)/mk/boilerplate.mk
+include $(TOP)/mk/test.mk
+
+
+T17752:
+	'$(TEST_HC)' $(TEST_HC_OPTS) -O --make T17752 -rtsopts -ddump-simpl -ddump-to-file -dsuppress-uniques -dsuppress-all
+	# All occurences of elem should be optimized away.
+	# For strings these should result in loops after inlining foldCString.
+	# For lists it should result in a case expression.
+	echo $$(cat T17752.dump-simpl | grep "elem" -A4 )
diff --git a/libraries/base/tests/perf/T17752.hs b/libraries/base/tests/perf/T17752.hs
new file mode 100644
index 0000000000..f7f136e1f4
--- /dev/null
+++ b/libraries/base/tests/perf/T17752.hs
@@ -0,0 +1,18 @@
+module T17752 where
+
+-- All occurences of elem should be optimized away.
+-- For strings these should result in loops after inlining foldCString.
+-- For lists it should result in a case expression.
+
+-- Should compile to a pattern match if the rules fire
+isElemList    x = x `elem` ['a','b','c']
+isNotElemList x = x `elem` ['x','y','z']
+
+isOneOfThese x = x `elem` [1,2,3,4,5::Int]
+isNotOneOfThese x = x `notElem` [1,2,3,4,5::Int]
+
+isElemString x = elem x "foo"
+isNotElemString x = notElem x "bar"
+
+isElemStringUtf x = elem x "foö"
+isNotElemStringUtf x = notElem x "bär"
diff --git a/libraries/base/tests/perf/T17752.stdout b/libraries/base/tests/perf/T17752.stdout
new file mode 100644
index 0000000000..4d70e27e02
--- /dev/null
+++ b/libraries/base/tests/perf/T17752.stdout
@@ -0,0 +1,2 @@
+[1 of 1] Compiling T17752           ( T17752.hs, T17752.o )
+
diff --git a/libraries/base/tests/perf/all.T b/libraries/base/tests/perf/all.T
new file mode 100644
index 0000000000..61f42f5420
--- /dev/null
+++ b/libraries/base/tests/perf/all.T
@@ -0,0 +1,5 @@
+#--------------------------------------
+# Check specialization of elem via rules
+#--------------------------------------
+
+test('T17752', [only_ways(['normal'])] , makefile_test, ['T17752'])
diff --git a/libraries/ghc-prim/GHC/CString.hs b/libraries/ghc-prim/GHC/CString.hs
index 0e6199f30f..fd0e3f6625 100644
--- a/libraries/ghc-prim/GHC/CString.hs
+++ b/libraries/ghc-prim/GHC/CString.hs
@@ -16,13 +16,78 @@
 -----------------------------------------------------------------------------
 
 module GHC.CString (
+        -- * Ascii variants
         unpackCString#, unpackAppendCString#, unpackFoldrCString#,
-        unpackCStringUtf8#, unpackNBytes#
+        cstringLength#,
+
+        -- * Utf variants
+        unpackCStringUtf8#, unpackAppendCStringUtf8#, unpackFoldrCStringUtf8#,
+
+        -- * Other
+        unpackNBytes#,
     ) where
 
 import GHC.Types
 import GHC.Prim
 
+{-
+Note [String literals in GHC]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+String literals get quite a bit of special handling in GHC.  This Note
+summarises the moving parts.
+
+* Desugaring: see GHC.HsToCore.Match.Literal.dsLit, which in
+  turn calls GHC.Core.Make.mkStringExprFS.
+
+  The desugarer desugars the Haskell literal "foo" into Core
+     GHC.CString.unpackCString# "foo"#
+  where "foo"# is primitive string literal (of type Addr#).
+
+  When the string cannot be encoded as a C string, we use UTF8:
+     GHC.CString.unpackCStringUtf8# "foo"#
+
+* The library module ghc-prim:GHC.CString has a bunch of functions that
+  work over primitive strings, including GHC.CString.unpackCString#
+
+* GHC.Core.Op.ConstantFold has some RULES that optimise certain string
+  operations on literal strings. For example:
+
+    + Constant folding the desugared form of ("foo" ++ "bar")
+      into ("foobar")
+    + Comparing strings
+    + and more
+
+* GHC.Base has a number of regular rules for String literals.
+
+  + a rule "eqString": (==) @String = eqString
+    where GHC.Base.eqString :: String -> String -> Bool
+
+    ConstantFold has a RULE for eqString on literals:
+     eqString (Lit "foo"#) (Lit "bar"#) --> False
+
+    This allows compile time evaluation of things like "foo" == "bar"
+
+  + A bunch of rules to promote fusion:
+
+    "unpack"       [~1] forall a   . unpackCString# a             = build (unpackFoldrCString# a)
+    "unpack-list"  [1]  forall a   . unpackFoldrCString# a (:) [] = unpackCString# a
+    "unpack-append"     forall a n . unpackFoldrCString# a (:) n  = unpackAppendCString# a n
+
+    And UTF8 variants of these rules.
+
+* We allow primitive (unlifted) literal strings to be top-level
+  bindings, breaking out usual rule.  See GHC.Core
+  Note [Core top-level string literals]
+
+* TODO: There is work on a special code-gen path for top-level boxed strings
+     str :: [Char]
+     str = unpackCString# "foo"#
+  so that they can all share a common code pointer
+
+  There is a WIP MR on gitlab for this: !3012
+
+-}
+
 -----------------------------------------------------------------------------
 -- Unpacking C strings
 -----------------------------------------------------------------------------
@@ -70,6 +135,49 @@ Moreover, we want to make it CONLIKE, so that:
 All of this goes for unpackCStringUtf8# too.
 -}
 
+{- Note [Inlining of unpackFoldrCString]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Usually the unpack-list rule turns unpackFoldrCString# into unpackCString#
+It also has a BuiltInRule in PrelRules.hs:
+     unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n)
+       =  unpackFoldrCString# "foobaz" c n
+
+We use NOINLINE [0] on the grounds that, unlike
+unpackCString#, there *is* some point in inlining
+unpackFoldrCString#, because we get better code for the
+higher-order function call.
+
+This can cause a code size increase but it was minimal
+when looking at nofib.
+
+This is especially important for elem which then results in an
+allocation free loop.
+
+  Note [unpackCString# iterating over addr]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+When unpacking unpackCString# and friends repeatedly return a cons cell
+containing:
+* The current character we just unpacked.
+* A thunk to unpack the rest of the string.
+
+In order to minimize the size of the thunk we do not index of
+the start of the string, offsetting into it, but instead increment
+the addr and always use offset 0#.
+
+This works since these two expressions will read from the same address.
+* `indexCharOffAddr# a i`
+* `indexCharOffAddr (a `plusAddr#` i) 0#`
+
+This way we avoid the need for the thunks to close over both the start of
+the string and the current offset, saving a word for each character unpacked.
+
+This has the additional advantage the we can guarantee that only the
+increment will happen in the loop.
+
+-}
+
 unpackCString# :: Addr# -> [Char]
 {-# NOINLINE CONLIKE unpackCString# #-}
 unpackCString# addr
@@ -93,60 +201,63 @@ unpackAppendCString# addr rest
       where
         !ch = indexCharOffAddr# addr nh
 
-unpackFoldrCString# :: Addr# -> (Char  -> a -> a) -> a -> a
-
--- Usually the unpack-list rule turns unpackFoldrCString# into unpackCString#
-
--- It also has a BuiltInRule in PrelRules.hs:
---      unpackFoldrCString# "foo" c (unpackFoldrCString# "baz" c n)
---        =  unpackFoldrCString# "foobaz" c n
-
-{-# NOINLINE unpackFoldrCString# #-}
--- At one stage I had NOINLINE [0] on the grounds that, unlike
--- unpackCString#, there *is* some point in inlining
--- unpackFoldrCString#, because we get better code for the
--- higher-order function call.  BUT there may be a lot of
--- literal strings, and making a separate 'unpack' loop for
--- each is highly gratuitous.  See nofib/real/anna/PrettyPrint.
-
-unpackFoldrCString# addr f z
-  = unpack 0#
+-- Usually the unpack-list rule turns unpackFoldrCString# into unpackCString#.
+-- See Note [String literals in GHC] for more details.
+-- See [Inlining of unpackFoldrCString]
+{-# NOINLINE[0] unpackFoldrCString# #-}
+unpackFoldrCString# :: Addr# -> (Char -> a -> a) -> a -> a
+unpackFoldrCString# str f z_init = go str z_init
   where
-    unpack nh
+    go addr z
       | isTrue# (ch `eqChar#` '\0'#) = z
-      | True                         = C# ch `f` unpack (nh +# 1#)
+      | True                         = C# ch `f` go (addr `plusAddr#` 1#) z
       where
-        !ch = indexCharOffAddr# addr nh
+        -- See Note [unpackCString# iterating over addr]
+        !ch = indexCharOffAddr# addr 0#
 
 -- There's really no point in inlining this for the same reasons as
 -- unpackCString. See Note [Inlining unpackCString#] above for details.
 unpackCStringUtf8# :: Addr# -> [Char]
 {-# NOINLINE CONLIKE unpackCStringUtf8# #-}
 unpackCStringUtf8# addr
-  = unpack 0#
+    | isTrue# (ch `eqChar#` '\0'#  ) = []
+    | True =
+        let !byte_count = getByteCount ch
+            !utf_ch = unpackUtf8Char# byte_count ch addr
+            !addr' = addr `plusBytes` byte_count
+        in  C# utf_ch : unpackCStringUtf8# addr'
+      where
+        -- See Note [unpackCString# iterating over addr]
+        !ch = indexCharOffAddr# addr 0#
+
+
+unpackAppendCStringUtf8# :: Addr# -> [Char] -> [Char]
+{-# NOINLINE unpackAppendCStringUtf8# #-}
+     -- See the NOINLINE note on unpackCString#
+unpackAppendCStringUtf8# addr rest
+    | isTrue# (ch `eqChar#` '\0'#) = rest
+    | True =
+        let !byte_count = getByteCount ch
+            !utf_ch = unpackUtf8Char# byte_count ch addr
+            !addr' = (addr `plusBytes` byte_count)
+        in  C# utf_ch : unpackAppendCStringUtf8# addr' rest
+      where
+        -- See Note [unpackCString# iterating over addr]
+        !ch = indexCharOffAddr# addr 0#
+
+-- See Note [Inlining of unpackFoldrCString]
+{-# NOINLINE[0] unpackFoldrCStringUtf8# #-}
+unpackFoldrCStringUtf8# :: Addr# -> (Char -> a -> a) -> a -> a
+unpackFoldrCStringUtf8# addr_init f z_init
+  = go addr_init z_init
   where
-    -- We take care to strictly evaluate the character decoding as
-    -- indexCharOffAddr# is marked with the can_fail flag and
-    -- consequently GHC won't evaluate the expression unless it is absolutely
-    -- needed.
-    unpack nh
-      | isTrue# (ch `eqChar#` '\0'#  ) = []
-      | isTrue# (ch `leChar#` '\x7F'#) = C# ch : unpack (nh +# 1#)
-      | isTrue# (ch `leChar#` '\xDF'#) =
-          let !c = C# (chr# (((ord# ch                                  -# 0xC0#) `uncheckedIShiftL#`  6#) +#
-                              (ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#)))
-          in c : unpack (nh +# 2#)
-      | isTrue# (ch `leChar#` '\xEF'#) =
-          let !c = C# (chr# (((ord# ch                                  -# 0xE0#) `uncheckedIShiftL#` 12#) +#
-                             ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#`  6#) +#
-                              (ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#)))
-          in c : unpack (nh +# 3#)
-      | True                           =
-          let !c = C# (chr# (((ord# ch                                  -# 0xF0#) `uncheckedIShiftL#` 18#) +#
-                             ((ord# (indexCharOffAddr# addr (nh +# 1#)) -# 0x80#) `uncheckedIShiftL#` 12#) +#
-                             ((ord# (indexCharOffAddr# addr (nh +# 2#)) -# 0x80#) `uncheckedIShiftL#`  6#) +#
-                              (ord# (indexCharOffAddr# addr (nh +# 3#)) -# 0x80#)))
-          in c : unpack (nh +# 4#)
+    go addr z
+      | isTrue# (ch `eqChar#` '\0'#) = z
+      | True =
+          let !byte_count = getByteCount ch
+              !utf_ch = unpackUtf8Char# byte_count ch addr
+              !addr' = (addr `plusBytes` byte_count)
+          in C# utf_ch `f` go addr' z
       where
         !ch = indexCharOffAddr# addr nh
 
@@ -163,3 +274,75 @@ unpackNBytes#  addr len# = unpack [] (len# -# 1#)
          case indexCharOffAddr# addr i# of
             ch -> unpack (C# ch : acc) (i# -# 1#)
 
+-- The return type is not correct here. We really want CSize,
+-- but that type is defined in base. However, CSize should always
+-- match the size of a machine word (I hope), so this is probably
+-- alright on all platforms that GHC supports.
+foreign import ccall unsafe "strlen" c_strlen :: Addr# -> Int#
+
+-- | Compute the length of a NUL-terminated string. This address
+-- must refer to immutable memory. GHC includes a built-in rule for
+-- constant folding when the argument is a statically-known literal.
+-- That is, a core-to-core pass reduces the expression
+-- @cstringLength# "hello"#@ to the constant @5#@.
+cstringLength# :: Addr# -> Int#
+{-# INLINE[0] cstringLength# #-}
+cstringLength# = c_strlen
+
+
+------------------------------
+--- UTF8 decoding utilities
+------------------------------
+--
+-- These functions make explicit the logic that was originally
+-- part of unpackCStringUtf8. Since we want the same support for ascii
+-- and non-ascii a variety of functions needs the same logic. Instead
+-- of C&P'in the decoding logic all over we have it here once, and then
+-- force GHC to inline it.
+--
+-- All the overhead of the Bytes argument and calls goes away once all is
+-- said and done. And what remains is readable code in Haskell land and
+-- performant code in the resulting binary.
+
+data Bytes = One | Two | Three | Four
+
+{-# INLINE getByteCount #-}
+getByteCount :: Char# -> Bytes
+getByteCount ch
+    | isTrue# (ch `leChar#` '\x7F'#) = One
+    | isTrue# (ch `leChar#` '\xDF'#) = Two
+    | isTrue# (ch `leChar#` '\xEF'#) = Three
+    | True                           = Four
+
+{-# INLINE plusBytes #-}
+plusBytes :: Addr# -> Bytes -> Addr#
+plusBytes addr bytes =
+  case bytes of
+    One   -> addr `plusAddr#` 1#
+    Two   -> addr `plusAddr#` 2#
+    Three -> addr `plusAddr#` 3#
+    Four  -> addr `plusAddr#` 4#
+
+-- | Take the current address, read unicode char of the given size.
+-- We obviously want the number of bytes, but we have to read one
+-- byte to determine the number of bytes for the current codepoint
+-- so we might as well reuse it and avoid a read.
+--
+-- Side Note: We don't dare to decode all 4 possibilities at once.
+-- Reading past the end of the addr might trigger an exception.
+-- For this reason we really have to check the width first and only
+-- decode after.
+{-# INLINE unpackUtf8Char# #-}
+unpackUtf8Char# :: Bytes -> Char# -> Addr# -> Char#
+unpackUtf8Char# bytes ch addr =
+  case bytes of
+    One -> ch
+    Two ->   (chr# (((ord# ch                                           -# 0xC0#) `uncheckedIShiftL#`  6#) +#
+                     (ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#)))
+    Three -> (chr# (((ord# ch                                           -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+                    ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#`  6#) +#
+                     (ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#)))
+    Four ->  (chr# (((ord# ch                                           -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+                    ((ord# (indexCharOffAddr# (addr `plusAddr#` 1#) 0#) -# 0x80#) `uncheckedIShiftL#` 12#) +#
+                    ((ord# (indexCharOffAddr# (addr `plusAddr#` 2#) 0#) -# 0x80#) `uncheckedIShiftL#`  6#) +#
+                     (ord# (indexCharOffAddr# (addr `plusAddr#` 3#) 0#) -# 0x80#)))
diff --git a/libraries/ghc-prim/changelog.md b/libraries/ghc-prim/changelog.md
index 96c648e620..1b52f06260 100644
--- a/libraries/ghc-prim/changelog.md
+++ b/libraries/ghc-prim/changelog.md
@@ -1,4 +1,23 @@
-## 0.6.1
+## 0.6.2 (edit as necessary)
+
+- Shipped with GHC 8.12.1
+
+- Add known-key `cstringLength#` to `GHC.CString`. This is just the
+  C function `strlen`, but a built-in rewrite rule allows GHC to
+  compute the result at compile time when the argument is known.
+
+- In order to support unicode better the following functions in `GHC.CString`
+  gained UTF8 counterparts:
+
+        unpackAppendCStringUtf8# :: Addr# -> [Char] -> [Char]
+        unpackFoldrCStringUtf8# :: Addr# -> (Char -> a -> a) -> a -> a
+
+- unpackFoldrCString* variants can now inline in phase [0].
+
+  If the folding function is known this allows for unboxing of the
+  Char argument resulting in much faster code.
+
+## 0.6.1 (edit as necessary)
 
 - Shipped with GHC 8.10.1