summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Gamari <ben@smart-cactus.org>2021-10-12 16:08:51 -0400
committerBen Gamari <ben@smart-cactus.org>2021-10-12 16:08:51 -0400
commit2d5a6417743c4db23a7f532fa6ba4fcb76b48085 (patch)
tree3c2fd323391c9fa5362f37e203475af67813a2b4
parent8ffb96d393097ee94adf43b003637412afdb753f (diff)
downloadhaskell-2d5a6417743c4db23a7f532fa6ba4fcb76b48085.tar.gz
Testing
-rw-r--r--libraries/base/GHC/Conc/Sync.hs66
-rw-r--r--libraries/ghc-boot/GHC/Utils/Encoding.hs7
2 files changed, 70 insertions, 3 deletions
diff --git a/libraries/base/GHC/Conc/Sync.hs b/libraries/base/GHC/Conc/Sync.hs
index d5367617b0..f5fea1ddb8 100644
--- a/libraries/base/GHC/Conc/Sync.hs
+++ b/libraries/base/GHC/Conc/Sync.hs
@@ -634,7 +634,7 @@ threadLabel :: ThreadId -> IO (Maybe String)
threadLabel (ThreadId t) = IO $ \s ->
case threadLabel# t s of
(# s', 1#, lbl #) ->
- let lbl' = Just (unpackCStringUtf8# lbl)
+ let lbl' = Just (utf8DecodeCharByteArray# lbl 0)
in (# s', lbl' #)
(# s', 0#, _ #) -> (# s', Nothing #)
@@ -965,3 +965,67 @@ setUncaughtExceptionHandler = writeIORef uncaughtExceptionHandler
getUncaughtExceptionHandler :: IO (SomeException -> IO ())
getUncaughtExceptionHandler = readIORef uncaughtExceptionHandler
+
+-- We can't write the decoder as efficiently as we'd like without
+-- resorting to unboxed extensions, unfortunately. I tried to write
+-- an IO version of this function, but GHC can't eliminate boxed
+-- results from an IO-returning function.
+--
+-- We assume we can ignore overflow when parsing a multibyte character here.
+-- To make this safe, we add extra sentinel bytes to unparsed UTF-8 sequences
+-- before decoding them (see "GHC.Data.StringBuffer").
+
+{-# INLINE utf8DecodeChar# #-}
+utf8DecodeChar# :: (Int# -> Word#) -> (# Char#, Int# #)
+utf8DecodeChar# indexWord8# =
+ let !ch0 = word2Int# (indexWord8# 0#) in
+ case () of
+ _ | isTrue# (ch0 <=# 0x7F#) -> (# chr# ch0, 1# #)
+
+ | isTrue# ((ch0 >=# 0xC0#) `andI#` (ch0 <=# 0xDF#)) ->
+ let !ch1 = word2Int# (indexWord8# 1#) in
+ if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+ (# chr# (((ch0 -# 0xC0#) `uncheckedIShiftL#` 6#) +#
+ (ch1 -# 0x80#)),
+ 2# #)
+
+ | isTrue# ((ch0 >=# 0xE0#) `andI#` (ch0 <=# 0xEF#)) ->
+ let !ch1 = word2Int# (indexWord8# 1#) in
+ if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+ let !ch2 = word2Int# (indexWord8# 2#) in
+ if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+ (# chr# (((ch0 -# 0xE0#) `uncheckedIShiftL#` 12#) +#
+ ((ch1 -# 0x80#) `uncheckedIShiftL#` 6#) +#
+ (ch2 -# 0x80#)),
+ 3# #)
+
+ | isTrue# ((ch0 >=# 0xF0#) `andI#` (ch0 <=# 0xF8#)) ->
+ let !ch1 = word2Int# (indexWord8# 1#) in
+ if isTrue# ((ch1 <# 0x80#) `orI#` (ch1 >=# 0xC0#)) then fail 1# else
+ let !ch2 = word2Int# (indexWord8# 2#) in
+ if isTrue# ((ch2 <# 0x80#) `orI#` (ch2 >=# 0xC0#)) then fail 2# else
+ let !ch3 = word2Int# (indexWord8# 3#) in
+ if isTrue# ((ch3 <# 0x80#) `orI#` (ch3 >=# 0xC0#)) then fail 3# else
+ (# chr# (((ch0 -# 0xF0#) `uncheckedIShiftL#` 18#) +#
+ ((ch1 -# 0x80#) `uncheckedIShiftL#` 12#) +#
+ ((ch2 -# 0x80#) `uncheckedIShiftL#` 6#) +#
+ (ch3 -# 0x80#)),
+ 4# #)
+
+ | otherwise -> fail 1#
+ where
+ -- all invalid sequences end up here:
+ fail :: Int# -> (# Char#, Int# #)
+ fail nBytes# = (# '\0'#, nBytes# #)
+ -- '\xFFFD' would be the usual replacement character, but
+ -- that's a valid symbol in Haskell, so will result in a
+ -- confusing parse error later on. Instead we use '\0' which
+ -- will signal a lexer error immediately.
+
+utf8DecodeCharByteArray# :: ByteArray# -> Int# -> (# Char#, Int# #)
+utf8DecodeCharByteArray# ba# off# =
+#if !MIN_VERSION_base(4,16,0)
+ utf8DecodeChar# (\i# -> indexWord8Array# ba# (i# +# off#))
+#else
+ utf8DecodeChar# (\i# -> word8ToWord# (indexWord8Array# ba# (i# +# off#)))
+#endif
diff --git a/libraries/ghc-boot/GHC/Utils/Encoding.hs b/libraries/ghc-boot/GHC/Utils/Encoding.hs
index 5eb3779b3b..a4190d35a2 100644
--- a/libraries/ghc-boot/GHC/Utils/Encoding.hs
+++ b/libraries/ghc-boot/GHC/Utils/Encoding.hs
@@ -233,12 +233,15 @@ utf8CompareShortByteString (SBS a1) (SBS a2) = go 0# 0#
| isTrue# (b1_1 `ltWord#` b2_1) -> LT
| otherwise -> go (off1 +# 1#) (off2 +# 1#)
-utf8DecodeShortByteString :: ShortByteString -> [Char]
-utf8DecodeShortByteString (SBS ba#)
+utf8DecodeByteArray :: ByteArray# -> [Char]
+utf8DecodeByteArray ba#
= unsafeDupablePerformIO $
let len# = sizeofByteArray# ba# in
utf8DecodeLazy# (return ()) (utf8DecodeCharByteArray# ba#) len#
+utf8DecodeShortByteString :: ShortByteString -> [Char]
+utf8DecodeShortByteString (SBS ba#) = utf8DecodeByteArray ba#
+
countUTF8Chars :: ShortByteString -> IO Int
countUTF8Chars (SBS ba) = go 0# 0#
where