summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorZubin Duggal <zubin.duggal@gmail.com>2021-10-12 18:25:41 +0530
committerZubin Duggal <zubin.duggal@gmail.com>2021-12-16 16:21:09 +0530
commitcf37552894879524c023f5f2a460ee6bc130322d (patch)
tree130039964a1901ef7cac2bcf9f8805d4d842c7e2
parent543547fc82da56c8447e2bb7c1422069d959adf8 (diff)
downloadhaskell-cf37552894879524c023f5f2a460ee6bc130322d.tar.gz
Warn if unicode bidirectional formatting characters are found in the source (#20263)
(cherry picked from commit 628c88a3e1fe2a037634518b018616cf63260786)
-rw-r--r--compiler/GHC/Data/StringBuffer.hs58
-rw-r--r--compiler/GHC/Driver/Flags.hs1
-rw-r--r--compiler/GHC/Driver/Main.hs45
-rw-r--r--compiler/GHC/Driver/Session.hs6
-rw-r--r--docs/users_guide/using-warnings.rst18
-rw-r--r--testsuite/tests/warnings/should_fail/T20263.hs2
-rw-r--r--testsuite/tests/warnings/should_fail/T20263.stderr8
-rw-r--r--testsuite/tests/warnings/should_fail/all.T1
8 files changed, 136 insertions, 3 deletions
diff --git a/compiler/GHC/Data/StringBuffer.hs b/compiler/GHC/Data/StringBuffer.hs
index 767a4111f3..1c9a0fe349 100644
--- a/compiler/GHC/Data/StringBuffer.hs
+++ b/compiler/GHC/Data/StringBuffer.hs
@@ -42,7 +42,11 @@ module GHC.Data.StringBuffer
-- * Parsing integers
parseUnsignedInteger,
- ) where
+
+ -- * Checking for bi-directional format characters
+ containsBidirectionalFormatChar,
+ bidirectionalFormatChars
+ ) where
#include "HsVersions.h"
@@ -211,6 +215,58 @@ nextChar (StringBuffer buf len (I# cur#)) =
let cur' = I# (cur# +# nBytes#) in
return (C# c#, StringBuffer buf len cur')
+
+bidirectionalFormatChars :: [(Char,String)]
+bidirectionalFormatChars =
+ [ ('\x202a' , "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)")
+ , ('\x202b' , "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)")
+ , ('\x202c' , "U+202C POP DIRECTIONAL FORMATTING (PDF)")
+ , ('\x202d' , "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)")
+ , ('\x202e' , "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)")
+ , ('\x2066' , "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)")
+ , ('\x2067' , "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)")
+ , ('\x2068' , "U+2068 FIRST STRONG ISOLATE (FSI)")
+ , ('\x2069' , "U+2069 POP DIRECTIONAL ISOLATE (PDI)")
+ ]
+
+{-| Returns true if the buffer contains Unicode bi-directional formatting
+characters.
+
+https://www.unicode.org/reports/tr9/#Bidirectional_Character_Types
+
+Bidirectional format characters are one of
+'\x202a' : "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)"
+'\x202b' : "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)"
+'\x202c' : "U+202C POP DIRECTIONAL FORMATTING (PDF)"
+'\x202d' : "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)"
+'\x202e' : "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)"
+'\x2066' : "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)"
+'\x2067' : "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)"
+'\x2068' : "U+2068 FIRST STRONG ISOLATE (FSI)"
+'\x2069' : "U+2069 POP DIRECTIONAL ISOLATE (PDI)"
+
+This list is encoded in 'bidirectionalFormatChars'
+
+-}
+{-# INLINE containsBidirectionalFormatChar #-}
+containsBidirectionalFormatChar :: StringBuffer -> Bool
+containsBidirectionalFormatChar (StringBuffer buf (I# len#) (I# cur#))
+ = inlinePerformIO $ unsafeWithForeignPtr buf $ \(Ptr a#) -> do
+ let go :: Int# -> Bool
+ go i | isTrue# (i >=# len#) = False
+ | otherwise = case utf8DecodeCharAddr# a# i of
+ (# '\x202a'# , _ #) -> True
+ (# '\x202b'# , _ #) -> True
+ (# '\x202c'# , _ #) -> True
+ (# '\x202d'# , _ #) -> True
+ (# '\x202e'# , _ #) -> True
+ (# '\x2066'# , _ #) -> True
+ (# '\x2067'# , _ #) -> True
+ (# '\x2068'# , _ #) -> True
+ (# '\x2069'# , _ #) -> True
+ (# _, bytes #) -> go (i +# bytes)
+ pure $! go cur#
+
-- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous
-- to 'Data.List.head'). __Warning:__ The behavior is undefined if the
-- 'StringBuffer' is empty. Similar to 'utf8DecodeChar', if the character
diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs
index aeab013d0e..1bc657fa66 100644
--- a/compiler/GHC/Driver/Flags.hs
+++ b/compiler/GHC/Driver/Flags.hs
@@ -498,6 +498,7 @@ data WarningFlag =
| Opt_WarnCompatUnqualifiedImports -- Since 8.10
| Opt_WarnDerivingDefaults
| Opt_WarnInvalidHaddock -- Since 8.12
+ | Opt_WarnUnicodeBidirectionalFormatCharacters -- Since 9.0.2
deriving (Eq, Show, Enum)
-- | Used when outputting warnings: if a reason is given, it is
diff --git a/compiler/GHC/Driver/Main.hs b/compiler/GHC/Driver/Main.hs
index 832e1a4f22..6febfe173a 100644
--- a/compiler/GHC/Driver/Main.hs
+++ b/compiler/GHC/Driver/Main.hs
@@ -181,6 +181,7 @@ import GHC.Iface.Ext.Ast ( mkHieFile )
import GHC.Iface.Ext.Types ( getAsts, hie_asts, hie_module )
import GHC.Iface.Ext.Binary ( readHieFile, writeHieFile , hie_file_result, NameCacheUpdater(..))
import GHC.Iface.Ext.Debug ( diffFile, validateScopes )
+import Data.List.NonEmpty (NonEmpty ((:|)))
#include "HsVersions.h"
@@ -347,6 +348,25 @@ hscParse' mod_summary
Nothing -> liftIO $ hGetStringBuffer src_filename
let loc = mkRealSrcLoc (mkFastString src_filename) 1 1
+ when (wopt Opt_WarnUnicodeBidirectionalFormatCharacters dflags) $ do
+ case checkBidirectionFormatChars (PsLoc loc (BufPos 0)) buf of
+ Nothing -> pure ()
+ Just ((loc,chr,desc) :| xs) ->
+ let span = mkSrcSpanPs $ mkPsSpan loc (advancePsLoc loc chr)
+ warn = makeIntoWarning (Reason Opt_WarnUnicodeBidirectionalFormatCharacters) $ mkLongWarnMsg dflags span neverQualify msg empty
+ msg = text "A unicode bidirectional formatting character" <+> parens (text desc)
+ $$ text "was found at offset" <+> ppr (bufPos (psBufPos loc)) <+> text "in the file"
+ $$ (case xs of
+ [] -> empty
+ xs -> text "along with further bidirectional formatting characters at" <+> pprChars xs
+ where
+ pprChars [] = empty
+ pprChars ((loc,_,desc):xs) = text "offset" <+> ppr (bufPos (psBufPos loc)) <> text ":" <+> text desc
+ $$ pprChars xs)
+ $$ text "Bidirectional formatting characters may be rendered misleadingly in certain editors"
+
+ in liftIO $ printOrThrowWarnings dflags (unitBag warn)
+
let parseMod | HsigFile == ms_hsc_src mod_summary
= parseSignature
| otherwise = parseModule
@@ -408,9 +428,34 @@ hscParse' mod_summary
= parsedResultAction p opts mod_summary
withPlugins dflags applyPluginAction res
+checkBidirectionFormatChars :: PsLoc -> StringBuffer -> Maybe (NonEmpty (PsLoc, Char, String))
+checkBidirectionFormatChars start_loc sb
+ | containsBidirectionalFormatChar sb = Just $ go start_loc sb
+ | otherwise = Nothing
+ where
+ go :: PsLoc -> StringBuffer -> NonEmpty (PsLoc, Char, String)
+ go loc sb
+ | atEnd sb = panic "checkBidirectionFormatChars: no char found"
+ | otherwise = case nextChar sb of
+ (chr, sb)
+ | Just desc <- lookup chr bidirectionalFormatChars ->
+ (loc, chr, desc) :| go1 (advancePsLoc loc chr) sb
+ | otherwise -> go (advancePsLoc loc chr) sb
+
+ go1 :: PsLoc -> StringBuffer -> [(PsLoc, Char, String)]
+ go1 loc sb
+ | atEnd sb = []
+ | otherwise = case nextChar sb of
+ (chr, sb)
+ | Just desc <- lookup chr bidirectionalFormatChars ->
+ (loc, chr, desc) : go1 (advancePsLoc loc chr) sb
+ | otherwise -> go1 (advancePsLoc loc chr) sb
+
-- -----------------------------------------------------------------------------
-- | If the renamed source has been kept, extract it. Dump it if requested.
+
+
extract_renamed_stuff :: ModSummary -> TcGblEnv -> Hsc RenamedStuff
extract_renamed_stuff mod_summary tc_result = do
let rn_info = getRenamedStuff tc_result
diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs
index abc36d2825..bdfc75c84c 100644
--- a/compiler/GHC/Driver/Session.hs
+++ b/compiler/GHC/Driver/Session.hs
@@ -3509,7 +3509,8 @@ wWarningFlagsDeps = [
Opt_WarnPrepositiveQualifiedModule,
flagSpec "unused-packages" Opt_WarnUnusedPackages,
flagSpec "compat-unqualified-imports" Opt_WarnCompatUnqualifiedImports,
- flagSpec "invalid-haddock" Opt_WarnInvalidHaddock
+ flagSpec "invalid-haddock" Opt_WarnInvalidHaddock,
+ flagSpec "unicode-bidirectional-format-characters" Opt_WarnUnicodeBidirectionalFormatCharacters
]
-- | These @-\<blah\>@ flags can all be reversed with @-no-\<blah\>@
@@ -4252,7 +4253,8 @@ standardWarnings -- see Note [Documenting warning flags]
Opt_WarnSimplifiableClassConstraints,
Opt_WarnStarBinder,
Opt_WarnInaccessibleCode,
- Opt_WarnSpaceAfterBang
+ Opt_WarnSpaceAfterBang,
+ Opt_WarnUnicodeBidirectionalFormatCharacters
]
-- | Things you get with -W
diff --git a/docs/users_guide/using-warnings.rst b/docs/users_guide/using-warnings.rst
index 0bfb5e5da3..eda295e385 100644
--- a/docs/users_guide/using-warnings.rst
+++ b/docs/users_guide/using-warnings.rst
@@ -45,6 +45,7 @@ generally likely to indicate bugs in your program. These are:
* :ghc-flag:`-Wunrecognised-warning-flags`
* :ghc-flag:`-Winaccessible-code`
* :ghc-flag:`-Wstar-binder`
+ * :ghc-flag:`-Wunicode-bidirectional-format-characters`
The following flags are simple ways to select standard "packages" of warnings:
@@ -1795,6 +1796,23 @@ of ``-W(no-)*``.
Since GHC 7.10, ``Typeable`` is automatically derived for all types.
Thus, deriving ``Typeable`` yourself is redundant.
+.. ghc-flag:: -Wunicode-bidirectional-format-characters
+ :shortdesc: warn about the usage of unicode bidirectional layout override characters
+ :type: dynamic
+ :category:
+
+ Explicit unicode bidirectional formatting characters can cause source code
+ to be rendered misleadingly in many viewers. We warn if any such character
+ is present in the source.
+
+ Specifically, the characters disallowed by this warning
+ are those which are a part of the 'Explicit Formatting`
+ category of the `Unicode Bidirectional Character Type Listing
+ <https://www.unicode.org/reports/tr9/#Bidirectional_Character_Types>`_
+
+ :since: 9.0.2
+
+
If you're feeling really paranoid, the :ghc-flag:`-dcore-lint` option is a good choice.
It turns on heavyweight intra-pass sanity-checking within GHC. (It checks GHC's
sanity, not yours.)
diff --git a/testsuite/tests/warnings/should_fail/T20263.hs b/testsuite/tests/warnings/should_fail/T20263.hs
new file mode 100644
index 0000000000..c04928e9a1
--- /dev/null
+++ b/testsuite/tests/warnings/should_fail/T20263.hs
@@ -0,0 +1,2 @@
+main :: IO ()
+main = print {- ⁧⁦"Hello" ⁩⁦-} "Goodbye"
diff --git a/testsuite/tests/warnings/should_fail/T20263.stderr b/testsuite/tests/warnings/should_fail/T20263.stderr
new file mode 100644
index 0000000000..5d7b260959
--- /dev/null
+++ b/testsuite/tests/warnings/should_fail/T20263.stderr
@@ -0,0 +1,8 @@
+
+T20263.hs:2:17: error: [-Wunicode-bidirectional-format-characters (in -Wdefault), -Werror=unicode-bidirectional-format-characters]
+ A unicode bidirectional formatting character (U+2067 RIGHT-TO-LEFT ISOLATE (RLI))
+ was found at offset 30 in the file
+ along with further bidirectional formatting characters at offset 31: U+2066 LEFT-TO-RIGHT ISOLATE (LRI)
+ offset 40: U+2069 POP DIRECTIONAL ISOLATE (PDI)
+ offset 41: U+2066 LEFT-TO-RIGHT ISOLATE (LRI)
+ Bidirectional formatting characters may be rendered misleadingly in certain editors
diff --git a/testsuite/tests/warnings/should_fail/all.T b/testsuite/tests/warnings/should_fail/all.T
index 7d0dc4295f..2e67c119c2 100644
--- a/testsuite/tests/warnings/should_fail/all.T
+++ b/testsuite/tests/warnings/should_fail/all.T
@@ -15,3 +15,4 @@ test('CaretDiagnostics1',
compile_fail,
['-fdiagnostics-show-caret -ferror-spans'])
test('CaretDiagnostics2', normal, compile_fail, ['-fdiagnostics-show-caret'])
+test('T20263', normal, compile_fail, ['-Wunicode-bidirectional-format-characters -Werror'])