summaryrefslogtreecommitdiff
path: root/compiler/GHC/Data
diff options
context:
space:
mode:
authorZubin Duggal <zubin.duggal@gmail.com>2021-10-12 18:25:41 +0530
committerMarge Bot <ben+marge-bot@smart-cactus.org>2021-10-26 12:36:24 -0400
commit0255ef38b1bb0d4f3608bf92ebc8a93955ccb30a (patch)
tree718653d95e2d43388f6e20f7fa2057aee1282baf /compiler/GHC/Data
parent0f7541dc37d25d8a1056586bbeb57bf0dd2826a0 (diff)
downloadhaskell-0255ef38b1bb0d4f3608bf92ebc8a93955ccb30a.tar.gz
Warn if unicode bidirectional formatting characters are found in the source (#20263)
Diffstat (limited to 'compiler/GHC/Data')
-rw-r--r--compiler/GHC/Data/StringBuffer.hs58
1 files changed, 57 insertions, 1 deletions
diff --git a/compiler/GHC/Data/StringBuffer.hs b/compiler/GHC/Data/StringBuffer.hs
index 749f64b09e..03d720eb37 100644
--- a/compiler/GHC/Data/StringBuffer.hs
+++ b/compiler/GHC/Data/StringBuffer.hs
@@ -47,7 +47,11 @@ module GHC.Data.StringBuffer
-- * Parsing integers
parseUnsignedInteger,
- ) where
+
+ -- * Checking for bi-directional format characters
+ containsBidirectionalFormatChar,
+ bidirectionalFormatChars
+ ) where
import GHC.Prelude
@@ -214,6 +218,58 @@ nextChar (StringBuffer buf len (I# cur#)) =
let cur' = I# (cur# +# nBytes#) in
return (C# c#, StringBuffer buf len cur')
+
+bidirectionalFormatChars :: [(Char,String)]
+bidirectionalFormatChars =
+ [ ('\x202a' , "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)")
+ , ('\x202b' , "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)")
+ , ('\x202c' , "U+202C POP DIRECTIONAL FORMATTING (PDF)")
+ , ('\x202d' , "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)")
+ , ('\x202e' , "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)")
+ , ('\x2066' , "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)")
+ , ('\x2067' , "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)")
+ , ('\x2068' , "U+2068 FIRST STRONG ISOLATE (FSI)")
+ , ('\x2069' , "U+2069 POP DIRECTIONAL ISOLATE (PDI)")
+ ]
+
+{-| Returns true if the buffer contains Unicode bi-directional formatting
+characters.
+
+https://www.unicode.org/reports/tr9/#Bidirectional_Character_Types
+
+Bidirectional format characters are one of
+'\x202a' : "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)"
+'\x202b' : "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)"
+'\x202c' : "U+202C POP DIRECTIONAL FORMATTING (PDF)"
+'\x202d' : "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)"
+'\x202e' : "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)"
+'\x2066' : "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)"
+'\x2067' : "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)"
+'\x2068' : "U+2068 FIRST STRONG ISOLATE (FSI)"
+'\x2069' : "U+2069 POP DIRECTIONAL ISOLATE (PDI)"
+
+This list is encoded in 'bidirectionalFormatChars'
+
+-}
+{-# INLINE containsBidirectionalFormatChar #-}
+containsBidirectionalFormatChar :: StringBuffer -> Bool
+containsBidirectionalFormatChar (StringBuffer buf (I# len#) (I# cur#))
+ = inlinePerformIO $ unsafeWithForeignPtr buf $ \(Ptr a#) -> do
+ let go :: Int# -> Bool
+ go i | isTrue# (i >=# len#) = False
+ | otherwise = case utf8DecodeCharAddr# a# i of
+ (# '\x202a'# , _ #) -> True
+ (# '\x202b'# , _ #) -> True
+ (# '\x202c'# , _ #) -> True
+ (# '\x202d'# , _ #) -> True
+ (# '\x202e'# , _ #) -> True
+ (# '\x2066'# , _ #) -> True
+ (# '\x2067'# , _ #) -> True
+ (# '\x2068'# , _ #) -> True
+ (# '\x2069'# , _ #) -> True
+ (# _, bytes #) -> go (i +# bytes)
+ pure $! go cur#
+
-- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
-- to 'Data.List.head'). __Warning:__ The behavior is undefined if the
-- 'StringBuffer' is empty. Similar to 'utf8DecodeChar', if the character