diff options
author | Zubin Duggal <zubin.duggal@gmail.com> | 2021-10-12 18:25:41 +0530 |
---|---|---|
committer | Zubin Duggal <zubin.duggal@gmail.com> | 2021-12-16 16:21:09 +0530 |
commit | cf37552894879524c023f5f2a460ee6bc130322d (patch) | |
tree | 130039964a1901ef7cac2bcf9f8805d4d842c7e2 | |
parent | 543547fc82da56c8447e2bb7c1422069d959adf8 (diff) | |
download | haskell-cf37552894879524c023f5f2a460ee6bc130322d.tar.gz |
Warn if unicode bidirectional formatting characters are found in the source (#20263)
(cherry picked from commit 628c88a3e1fe2a037634518b018616cf63260786)
-rw-r--r-- | compiler/GHC/Data/StringBuffer.hs | 58 | ||||
-rw-r--r-- | compiler/GHC/Driver/Flags.hs | 1 | ||||
-rw-r--r-- | compiler/GHC/Driver/Main.hs | 45 | ||||
-rw-r--r-- | compiler/GHC/Driver/Session.hs | 6 | ||||
-rw-r--r-- | docs/users_guide/using-warnings.rst | 18 | ||||
-rw-r--r-- | testsuite/tests/warnings/should_fail/T20263.hs | 2 | ||||
-rw-r--r-- | testsuite/tests/warnings/should_fail/T20263.stderr | 8 | ||||
-rw-r--r-- | testsuite/tests/warnings/should_fail/all.T | 1 |
8 files changed, 136 insertions, 3 deletions
diff --git a/compiler/GHC/Data/StringBuffer.hs b/compiler/GHC/Data/StringBuffer.hs index 767a4111f3..1c9a0fe349 100644 --- a/compiler/GHC/Data/StringBuffer.hs +++ b/compiler/GHC/Data/StringBuffer.hs @@ -42,7 +42,11 @@ module GHC.Data.StringBuffer -- * Parsing integers parseUnsignedInteger, - ) where + + -- * Checking for bi-directional format characters + containsBidirectionalFormatChar, + bidirectionalFormatChars + ) where #include "HsVersions.h" @@ -211,6 +215,58 @@ nextChar (StringBuffer buf len (I# cur#)) = let cur' = I# (cur# +# nBytes#) in return (C# c#, StringBuffer buf len cur') + +bidirectionalFormatChars :: [(Char,String)] +bidirectionalFormatChars = + [ ('\x202a' , "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)") + , ('\x202b' , "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)") + , ('\x202c' , "U+202C POP DIRECTIONAL FORMATTING (PDF)") + , ('\x202d' , "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)") + , ('\x202e' , "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)") + , ('\x2066' , "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)") + , ('\x2067' , "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)") + , ('\x2068' , "U+2068 FIRST STRONG ISOLATE (FSI)") + , ('\x2069' , "U+2069 POP DIRECTIONAL ISOLATE (PDI)") + ] + +{-| Returns true if the buffer contains Unicode bi-directional formatting +characters. + +https://www.unicode.org/reports/tr9/#Bidirectional_Character_Types + +Bidirectional format characters are one of +'\x202a' : "U+202A LEFT-TO-RIGHT EMBEDDING (LRE)" +'\x202b' : "U+202B RIGHT-TO-LEFT EMBEDDING (RLE)" +'\x202c' : "U+202C POP DIRECTIONAL FORMATTING (PDF)" +'\x202d' : "U+202D LEFT-TO-RIGHT OVERRIDE (LRO)" +'\x202e' : "U+202E RIGHT-TO-LEFT OVERRIDE (RLO)" +'\x2066' : "U+2066 LEFT-TO-RIGHT ISOLATE (LRI)" +'\x2067' : "U+2067 RIGHT-TO-LEFT ISOLATE (RLI)" +'\x2068' : "U+2068 FIRST STRONG ISOLATE (FSI)" +'\x2069' : "U+2069 POP DIRECTIONAL ISOLATE (PDI)" + +This list is encoded in 'bidirectionalFormatChars' + +-} +{-# INLINE containsBidirectionalFormatChar #-} +containsBidirectionalFormatChar :: StringBuffer -> Bool +containsBidirectionalFormatChar (StringBuffer buf (I# len#) (I# cur#)) + = inlinePerformIO $ unsafeWithForeignPtr buf $ \(Ptr a#) -> do + let go :: Int# -> Bool + go i | isTrue# (i >=# len#) = False + | otherwise = case utf8DecodeCharAddr# a# i of + (# '\x202a'# , _ #) -> True + (# '\x202b'# , _ #) -> True + (# '\x202c'# , _ #) -> True + (# '\x202d'# , _ #) -> True + (# '\x202e'# , _ #) -> True + (# '\x2066'# , _ #) -> True + (# '\x2067'# , _ #) -> True + (# '\x2068'# , _ #) -> True + (# '\x2069'# , _ #) -> True + (# _, bytes #) -> go (i +# bytes) + pure $! go cur# + -- | Return the first UTF-8 character of a nonempty 'StringBuffer' (analogous -- to 'Data.List.head'). __Warning:__ The behavior is undefined if the -- 'StringBuffer' is empty. Similar to 'utf8DecodeChar', if the character diff --git a/compiler/GHC/Driver/Flags.hs b/compiler/GHC/Driver/Flags.hs index aeab013d0e..1bc657fa66 100644 --- a/compiler/GHC/Driver/Flags.hs +++ b/compiler/GHC/Driver/Flags.hs @@ -498,6 +498,7 @@ data WarningFlag = | Opt_WarnCompatUnqualifiedImports -- Since 8.10 | Opt_WarnDerivingDefaults | Opt_WarnInvalidHaddock -- Since 8.12 + | Opt_WarnUnicodeBidirectionalFormatCharacters -- Since 9.0.2 deriving (Eq, Show, Enum) -- | Used when outputting warnings: if a reason is given, it is diff --git a/compiler/GHC/Driver/Main.hs b/compiler/GHC/Driver/Main.hs index 832e1a4f22..6febfe173a 100644 --- a/compiler/GHC/Driver/Main.hs +++ b/compiler/GHC/Driver/Main.hs @@ -181,6 +181,7 @@ import GHC.Iface.Ext.Ast ( mkHieFile ) import GHC.Iface.Ext.Types ( getAsts, hie_asts, hie_module ) import GHC.Iface.Ext.Binary ( readHieFile, writeHieFile , hie_file_result, NameCacheUpdater(..)) import GHC.Iface.Ext.Debug ( diffFile, validateScopes ) +import Data.List.NonEmpty (NonEmpty ((:|))) #include "HsVersions.h" @@ -347,6 +348,25 @@ hscParse' mod_summary Nothing -> liftIO $ hGetStringBuffer src_filename let loc = mkRealSrcLoc (mkFastString src_filename) 1 1 + when (wopt Opt_WarnUnicodeBidirectionalFormatCharacters dflags) $ do + case checkBidirectionFormatChars (PsLoc loc (BufPos 0)) buf of + Nothing -> pure () + Just ((loc,chr,desc) :| xs) -> + let span = mkSrcSpanPs $ mkPsSpan loc (advancePsLoc loc chr) + warn = makeIntoWarning (Reason Opt_WarnUnicodeBidirectionalFormatCharacters) $ mkLongWarnMsg dflags span neverQualify msg empty + msg = text "A unicode bidirectional formatting character" <+> parens (text desc) + $$ text "was found at offset" <+> ppr (bufPos (psBufPos loc)) <+> text "in the file" + $$ (case xs of + [] -> empty + xs -> text "along with further bidirectional formatting characters at" <+> pprChars xs + where + pprChars [] = empty + pprChars ((loc,_,desc):xs) = text "offset" <+> ppr (bufPos (psBufPos loc)) <> text ":" <+> text desc + $$ pprChars xs) + $$ text "Bidirectional formatting characters may be rendered misleadingly in certain editors" + + in liftIO $ printOrThrowWarnings dflags (unitBag warn) + let parseMod | HsigFile == ms_hsc_src mod_summary = parseSignature | otherwise = parseModule @@ -408,9 +428,34 @@ hscParse' mod_summary = parsedResultAction p opts mod_summary withPlugins dflags applyPluginAction res +checkBidirectionFormatChars :: PsLoc -> StringBuffer -> Maybe (NonEmpty (PsLoc, Char, String)) +checkBidirectionFormatChars start_loc sb + | containsBidirectionalFormatChar sb = Just $ go start_loc sb + | otherwise = Nothing + where + go :: PsLoc -> StringBuffer -> NonEmpty (PsLoc, Char, String) + go loc sb + | atEnd sb = panic "checkBidirectionFormatChars: no char found" + | otherwise = case nextChar sb of + (chr, sb) + | Just desc <- lookup chr bidirectionalFormatChars -> + (loc, chr, desc) :| go1 (advancePsLoc loc chr) sb + | otherwise -> go (advancePsLoc loc chr) sb + + go1 :: PsLoc -> StringBuffer -> [(PsLoc, Char, String)] + go1 loc sb + | atEnd sb = [] + | otherwise = case nextChar sb of + (chr, sb) + | Just desc <- lookup chr bidirectionalFormatChars -> + (loc, chr, desc) : go1 (advancePsLoc loc chr) sb + | otherwise -> go1 (advancePsLoc loc chr) sb + -- ----------------------------------------------------------------------------- -- | If the renamed source has been kept, extract it. Dump it if requested. + + extract_renamed_stuff :: ModSummary -> TcGblEnv -> Hsc RenamedStuff extract_renamed_stuff mod_summary tc_result = do let rn_info = getRenamedStuff tc_result diff --git a/compiler/GHC/Driver/Session.hs b/compiler/GHC/Driver/Session.hs index abc36d2825..bdfc75c84c 100644 --- a/compiler/GHC/Driver/Session.hs +++ b/compiler/GHC/Driver/Session.hs @@ -3509,7 +3509,8 @@ wWarningFlagsDeps = [ Opt_WarnPrepositiveQualifiedModule, flagSpec "unused-packages" Opt_WarnUnusedPackages, flagSpec "compat-unqualified-imports" Opt_WarnCompatUnqualifiedImports, - flagSpec "invalid-haddock" Opt_WarnInvalidHaddock + flagSpec "invalid-haddock" Opt_WarnInvalidHaddock, + flagSpec "unicode-bidirectional-format-characters" Opt_WarnUnicodeBidirectionalFormatCharacters ] -- | These @-\<blah\>@ flags can all be reversed with @-no-\<blah\>@ @@ -4252,7 +4253,8 @@ standardWarnings -- see Note [Documenting warning flags] Opt_WarnSimplifiableClassConstraints, Opt_WarnStarBinder, Opt_WarnInaccessibleCode, - Opt_WarnSpaceAfterBang + Opt_WarnSpaceAfterBang, + Opt_WarnUnicodeBidirectionalFormatCharacters ] -- | Things you get with -W diff --git a/docs/users_guide/using-warnings.rst b/docs/users_guide/using-warnings.rst index 0bfb5e5da3..eda295e385 100644 --- a/docs/users_guide/using-warnings.rst +++ b/docs/users_guide/using-warnings.rst @@ -45,6 +45,7 @@ generally likely to indicate bugs in your program. These are: * :ghc-flag:`-Wunrecognised-warning-flags` * :ghc-flag:`-Winaccessible-code` * :ghc-flag:`-Wstar-binder` + * :ghc-flag:`-Wunicode-bidirectional-format-characters` The following flags are simple ways to select standard "packages" of warnings: @@ -1795,6 +1796,23 @@ of ``-W(no-)*``. Since GHC 7.10, ``Typeable`` is automatically derived for all types. Thus, deriving ``Typeable`` yourself is redundant. +.. ghc-flag:: -Wunicode-bidirectional-format-characters + :shortdesc: warn about the usage of unicode bidirectional layout override characters + :type: dynamic + :category: + + Explicit unicode bidirectional formatting characters can cause source code + to be rendered misleadingly in many viewers. We warn if any such character + is present in the source. + + Specifically, the characters disallowed by this warning + are those which are a part of the 'Explicit Formatting` + category of the `Unicode Bidirectional Character Type Listing + <https://www.unicode.org/reports/tr9/#Bidirectional_Character_Types>`_ + + :since: 9.0.2 + + If you're feeling really paranoid, the :ghc-flag:`-dcore-lint` option is a good choice. It turns on heavyweight intra-pass sanity-checking within GHC. (It checks GHC's sanity, not yours.) diff --git a/testsuite/tests/warnings/should_fail/T20263.hs b/testsuite/tests/warnings/should_fail/T20263.hs new file mode 100644 index 0000000000..c04928e9a1 --- /dev/null +++ b/testsuite/tests/warnings/should_fail/T20263.hs @@ -0,0 +1,2 @@ +main :: IO () +main = print {- "Hello" -} "Goodbye" diff --git a/testsuite/tests/warnings/should_fail/T20263.stderr b/testsuite/tests/warnings/should_fail/T20263.stderr new file mode 100644 index 0000000000..5d7b260959 --- /dev/null +++ b/testsuite/tests/warnings/should_fail/T20263.stderr @@ -0,0 +1,8 @@ + +T20263.hs:2:17: error: [-Wunicode-bidirectional-format-characters (in -Wdefault), -Werror=unicode-bidirectional-format-characters] + A unicode bidirectional formatting character (U+2067 RIGHT-TO-LEFT ISOLATE (RLI)) + was found at offset 30 in the file + along with further bidirectional formatting characters at offset 31: U+2066 LEFT-TO-RIGHT ISOLATE (LRI) + offset 40: U+2069 POP DIRECTIONAL ISOLATE (PDI) + offset 41: U+2066 LEFT-TO-RIGHT ISOLATE (LRI) + Bidirectional formatting characters may be rendered misleadingly in certain editors diff --git a/testsuite/tests/warnings/should_fail/all.T b/testsuite/tests/warnings/should_fail/all.T index 7d0dc4295f..2e67c119c2 100644 --- a/testsuite/tests/warnings/should_fail/all.T +++ b/testsuite/tests/warnings/should_fail/all.T @@ -15,3 +15,4 @@ test('CaretDiagnostics1', compile_fail, ['-fdiagnostics-show-caret -ferror-spans']) test('CaretDiagnostics2', normal, compile_fail, ['-fdiagnostics-show-caret']) +test('T20263', normal, compile_fail, ['-Wunicode-bidirectional-format-characters -Werror']) |