summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Pickering <matthewtpickering@gmail.com>2021-06-11 10:48:25 +0100
committerMarge Bot <ben+marge-bot@smart-cactus.org>2021-06-23 02:58:35 -0400
commit7f6454fb8cd92b2b2ad4e88fa6d81e34d43edb9a (patch)
tree22dbe8c64e1761856913450fa297e2797c905fa2
parent87f57ecf2523e83d8dd9cad919a6f2010f630ad0 (diff)
downloadhaskell-7f6454fb8cd92b2b2ad4e88fa6d81e34d43edb9a.tar.gz
Optimiser: Correctly deal with strings starting with unicode characters in exprConApp_maybe
For example: "\0" is encoded to "C0 80", then the rule would correct use a decoding function to work out the first character was "C0 80" but then just used BS.tail so the rest of the string was "80". This resulted in "\0" being transformed into '\C0\80' : unpackCStringUTF8# "80" Which is obviously bogus. I rewrote the function to call utf8UnconsByteString directly and avoid the roundtrip through Faststring so now the head/tail is computed by the same call. Fixes #19976
-rw-r--r--compiler/GHC/Core/SimpleOpt.hs38
-rw-r--r--libraries/ghc-boot/GHC/Utils/Encoding.hs9
-rw-r--r--testsuite/tests/simplCore/should_compile/T9400.hs4
-rw-r--r--testsuite/tests/simplCore/should_compile/T9400.stderr26
4 files changed, 50 insertions, 27 deletions
diff --git a/compiler/GHC/Core/SimpleOpt.hs b/compiler/GHC/Core/SimpleOpt.hs
index ffec34ff17..67dc4609c3 100644
--- a/compiler/GHC/Core/SimpleOpt.hs
+++ b/compiler/GHC/Core/SimpleOpt.hs
@@ -48,12 +48,12 @@ import GHC.Builtin.Types
import GHC.Builtin.Names
import GHC.Types.Basic
import GHC.Unit.Module ( Module )
+import GHC.Utils.Encoding
import GHC.Utils.Outputable
import GHC.Utils.Panic
import GHC.Utils.Panic.Plain
import GHC.Utils.Misc
import GHC.Data.Maybe ( orElse )
-import GHC.Data.FastString
import Data.List (mapAccumL)
import qualified Data.ByteString as BS
@@ -874,9 +874,8 @@ calls to unpackCString# and returns:
Just (':', [Char], ['a', unpackCString# "bc"]).
-We need to be careful about UTF8 strings here. ""# contains a ByteString, so
-we must parse it back into a FastString to split off the first character.
-That way we can treat unpackCString# and unpackCStringUtf8# in the same way.
+We need to be careful about UTF8 strings here. ""# contains an encoded ByteString, so
+we call utf8UnconsByteString to correctly deal with the encoding and splitting.
We must also be careful about
lvl = "foo"#
@@ -885,6 +884,8 @@ to ensure that we see through the let-binding for 'lvl'. Hence the
(exprIsLiteral_maybe .. arg) in the guard before the call to
dealWithStringLiteral.
+The tests for this function are in T9400.
+
Note [Push coercions in exprIsConApp_maybe]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In #13025 I found a case where we had
@@ -1237,23 +1238,18 @@ dealWithStringLiteral :: Var -> BS.ByteString -> Coercion
-- This is not possible with user-supplied empty literals, GHC.Core.Make.mkStringExprFS
-- turns those into [] automatically, but just in case something else in GHC
-- generates a string literal directly.
-dealWithStringLiteral _ str co
- | BS.null str
- = pushCoDataCon nilDataCon [Type charTy] co
-
-dealWithStringLiteral fun str co
- = let strFS = mkFastStringByteString str
-
- char = mkConApp charDataCon [mkCharLit (headFS strFS)]
- charTail = BS.tail (bytesFS strFS)
-
- -- In singleton strings, just add [] instead of unpackCstring# ""#.
- rest = if BS.null charTail
- then mkConApp nilDataCon [Type charTy]
- else App (Var fun)
- (Lit (LitString charTail))
-
- in pushCoDataCon consDataCon [Type charTy, char, rest] co
+dealWithStringLiteral fun str co =
+ case utf8UnconsByteString str of
+ Nothing -> pushCoDataCon nilDataCon [Type charTy] co
+ Just (char, charTail) ->
+ let char_expr = mkConApp charDataCon [mkCharLit char]
+ -- In singleton strings, just add [] instead of unpackCstring# ""#.
+ rest = if BS.null charTail
+ then mkConApp nilDataCon [Type charTy]
+ else App (Var fun)
+ (Lit (LitString charTail))
+
+ in pushCoDataCon consDataCon [Type charTy, char_expr, rest] co
{-
Note [Unfolding DFuns]
diff --git a/libraries/ghc-boot/GHC/Utils/Encoding.hs b/libraries/ghc-boot/GHC/Utils/Encoding.hs
index ba07784b0d..5eb3779b3b 100644
--- a/libraries/ghc-boot/GHC/Utils/Encoding.hs
+++ b/libraries/ghc-boot/GHC/Utils/Encoding.hs
@@ -22,6 +22,7 @@ module GHC.Utils.Encoding (
utf8CharStart,
utf8DecodeChar,
utf8DecodeByteString,
+ utf8UnconsByteString,
utf8DecodeShortByteString,
utf8CompareShortByteString,
utf8DecodeStringLazy,
@@ -169,6 +170,14 @@ utf8DecodeByteString :: ByteString -> [Char]
utf8DecodeByteString (BS.PS fptr offset len)
= utf8DecodeStringLazy fptr offset len
+utf8UnconsByteString :: ByteString -> Maybe (Char, ByteString)
+utf8UnconsByteString (BS.PS _ _ 0) = Nothing
+utf8UnconsByteString (BS.PS fptr offset len)
+ = unsafeDupablePerformIO $
+ withForeignPtr fptr $ \ptr -> do
+ let (c,n) = utf8DecodeChar (ptr `plusPtr` offset)
+ return $ Just (c, BS.PS fptr (offset + n) (len - n))
+
utf8DecodeStringLazy :: ForeignPtr Word8 -> Int -> Int -> [Char]
utf8DecodeStringLazy fp offset (I# len#)
= unsafeDupablePerformIO $ do
diff --git a/testsuite/tests/simplCore/should_compile/T9400.hs b/testsuite/tests/simplCore/should_compile/T9400.hs
index 4e9cb72cb9..85aad51564 100644
--- a/testsuite/tests/simplCore/should_compile/T9400.hs
+++ b/testsuite/tests/simplCore/should_compile/T9400.hs
@@ -16,3 +16,7 @@ main = do
(x:xs) -> putStrLn xs
case "ab" of
"" -> putStrLn "fail"
+ case "\0abc" of
+ (x:xs) -> putStrLn xs
+ case "zażółćz" of
+ (x:xs) -> putStrLn xs
diff --git a/testsuite/tests/simplCore/should_compile/T9400.stderr b/testsuite/tests/simplCore/should_compile/T9400.stderr
index c4584f460e..511a481d96 100644
--- a/testsuite/tests/simplCore/should_compile/T9400.stderr
+++ b/testsuite/tests/simplCore/should_compile/T9400.stderr
@@ -9,7 +9,7 @@ T9400.hs:18:9: warning: [-Woverlapping-patterns (in -Wdefault)]
==================== Tidy Core ====================
Result size of Tidy Core
- = {terms: 38, types: 22, coercions: 0, joins: 0/0}
+ = {terms: 48, types: 28, coercions: 0, joins: 0/0}
-- RHS size: {terms: 1, types: 0, coercions: 0, joins: 0/0}
$trModule1 :: Addr#
@@ -36,7 +36,7 @@ T9400.$trModule :: Module
[GblId, Unf=OtherCon []]
T9400.$trModule = GHC.Types.Module $trModule2 $trModule4
--- RHS size: {terms: 23, types: 15, coercions: 0, joins: 0/0}
+-- RHS size: {terms: 33, types: 21, coercions: 0, joins: 0/0}
main :: IO ()
[GblId]
main
@@ -64,10 +64,24 @@ main
@()
@()
(putStrLn (unpackCString# "efg"#))
- (case Control.Exception.Base.patError
- @LiftedRep @() "T9400.hs:(17,5)-(18,29)|case"#
- of wild {
- }))))
+ (>>
+ @IO
+ GHC.Base.$fMonadIO
+ @()
+ @()
+ (case Control.Exception.Base.patError
+ @LiftedRep @() "T9400.hs:(17,5)-(18,29)|case"#
+ of wild {
+ })
+ (>>
+ @IO
+ GHC.Base.$fMonadIO
+ @()
+ @()
+ (putStrLn (unpackCStringUtf8# "abc"#))
+ (putStrLn
+ (unpackCStringUtf8#
+ "a\\197\\188\\195\\179\\197\\130\\196\\135z"#)))))))