diff options
-rw-r--r-- | compiler/GHC/Parser/Errors/Ppr.hs | 12 | ||||
-rw-r--r-- | compiler/GHC/Parser/Errors/Types.hs | 5 | ||||
-rw-r--r-- | compiler/GHC/Parser/Lexer.x | 97 | ||||
-rw-r--r-- | compiler/GHC/Types/Error/Codes.hs | 1 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843a.hs | 3 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843a.stderr | 4 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843b.hs | 3 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843b.stderr | 3 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843c.hs | 3 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843c.stderr | 6 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843d.hs | 4 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843d.stderr | 3 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843e.hs | 3 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843e.stderr | 3 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843f.hs | 3 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/T21843f.stderr | 3 | ||||
-rw-r--r-- | testsuite/tests/parser/should_fail/all.T | 6 |
17 files changed, 146 insertions, 16 deletions
diff --git a/compiler/GHC/Parser/Errors/Ppr.hs b/compiler/GHC/Parser/Errors/Ppr.hs index 65e2695f66..aadb2a0a79 100644 --- a/compiler/GHC/Parser/Errors/Ppr.hs +++ b/compiler/GHC/Parser/Errors/Ppr.hs @@ -513,6 +513,16 @@ instance Diagnostic PsMessage where , nest 2 $ text "but" <+> quotes (ppr tycon) <+> text "has" <+> speakN n ] , text "In the newtype declaration for" <+> quotes (ppr tycon) ] + PsErrUnicodeCharLooksLike bad_char looks_like_char looks_like_char_name + -> mkSimpleDecorated $ + hsep [ text "Unicode character" + -- purposefully not using `quotes (text [bad_char])`, because the quotes function adds smart quotes, + -- and smart quotes may be the topic of this error message + , text "'" <> text [bad_char] <> text "' (" <> text (show bad_char) <> text ")" + , text "looks like" + , text "'" <> text [looks_like_char] <> text "' (" <> text looks_like_char_name <> text ")" <> comma + , text "but it is not" ] + diagnosticReason = \case PsUnknownMessage m -> diagnosticReason m PsHeaderMessage m -> psHeaderMessageReason m @@ -630,6 +640,7 @@ instance Diagnostic PsMessage where PsErrIllegalGadtRecordMultiplicity{} -> ErrorWithoutFlag PsErrInvalidCApiImport {} -> ErrorWithoutFlag PsErrMultipleConForNewtype {} -> ErrorWithoutFlag + PsErrUnicodeCharLooksLike{} -> ErrorWithoutFlag diagnosticHints = \case PsUnknownMessage m -> diagnosticHints m @@ -800,6 +811,7 @@ instance Diagnostic PsMessage where PsErrIllegalGadtRecordMultiplicity{} -> noHints PsErrInvalidCApiImport {} -> noHints PsErrMultipleConForNewtype {} -> noHints + PsErrUnicodeCharLooksLike{} -> noHints diagnosticCode = constructorCode diff --git a/compiler/GHC/Parser/Errors/Types.hs b/compiler/GHC/Parser/Errors/Types.hs index 19851babd1..87f7f8d509 100644 --- a/compiler/GHC/Parser/Errors/Types.hs +++ b/compiler/GHC/Parser/Errors/Types.hs @@ -466,6 +466,11 @@ data PsMessage | PsErrMultipleConForNewtype !RdrName !Int + | PsErrUnicodeCharLooksLike + Char -- ^ the problematic character + Char -- ^ the character it looks like + String -- ^ the name of the character that it looks like + deriving Generic -- | Extra details about a parse error, which helps diff --git a/compiler/GHC/Parser/Lexer.x b/compiler/GHC/Parser/Lexer.x index 26f0de2873..b216ffeda8 100644 --- a/compiler/GHC/Parser/Lexer.x +++ b/compiler/GHC/Parser/Lexer.x @@ -274,6 +274,9 @@ $tab { warnTab } "-- " / { atEOL } { lineCommentToken } +-- Everywhere: check for smart quotes--they are not allowed outside of strings +$unigraphic / { isSmartQuote } { smart_quote_error } + -- 'bol' state: beginning of a line. Slurp up all the whitespace (including -- blank lines) until we find a non-whitespace character, then do layout -- processing. @@ -2037,7 +2040,8 @@ lex_string_tok span buf _len _buf2 = do lex_quoted_label :: Action lex_quoted_label span _buf _len _buf2 = do - s <- lex_string_helper "" + start <- getInput + s <- lex_string_helper "" start (AI end _) <- getInput let token = ITlabelvarid (mkFastString s) @@ -2050,7 +2054,8 @@ data LexedString = LexedRegularString String | LexedPrimString String lex_string :: P LexedString lex_string = do - s <- lex_string_helper "" + start <- getInput + s <- lex_string_helper "" start magicHash <- getBit MagicHashBit if magicHash then do @@ -2070,8 +2075,8 @@ lex_string = do return $ LexedRegularString s -lex_string_helper :: String -> P String -lex_string_helper s = do +lex_string_helper :: String -> AlexInput -> P String +lex_string_helper s start = do i <- getInput case alexGetChar' i of Nothing -> lit_error i @@ -2082,26 +2087,36 @@ lex_string_helper s = do Just ('\\',i) | Just ('&',i) <- next -> do - setInput i; lex_string_helper s + setInput i; lex_string_helper s start | Just (c,i) <- next, c <= '\x7f' && is_space c -> do -- is_space only works for <= '\x7f' (#3751, #5425) - setInput i; lex_stringgap s + setInput i; lex_stringgap s start where next = alexGetChar' i Just (c, i1) -> do case c of - '\\' -> do setInput i1; c' <- lex_escape; lex_string_helper (c':s) - c | isAny c -> do setInput i1; lex_string_helper (c:s) + '\\' -> do setInput i1; c' <- lex_escape; lex_string_helper (c':s) start + c | isAny c -> do setInput i1; lex_string_helper (c:s) start + _other | any isDoubleSmartQuote s -> do + -- if the built-up string s contains a smart double quote character, it was + -- likely the reason why the string literal was not lexed correctly + setInput start -- rewind to the first character in the string literal + -- so we can find the smart quote character's location + advance_to_smart_quote_character + i2@(AI loc _) <- getInput + case alexGetChar' i2 of + Just (c, _) -> do add_nonfatal_smart_quote_error c loc; lit_error i + Nothing -> lit_error i -- should never get here _other -> lit_error i -lex_stringgap :: String -> P String -lex_stringgap s = do +lex_stringgap :: String -> AlexInput -> P String +lex_stringgap s start = do i <- getInput c <- getCharOrFail i case c of - '\\' -> lex_string_helper s - c | c <= '\x7f' && is_space c -> lex_stringgap s + '\\' -> lex_string_helper s start + c | c <= '\x7f' && is_space c -> lex_stringgap s start -- is_space only works for <= '\x7f' (#3751, #5425) _other -> lit_error i @@ -2123,15 +2138,16 @@ lex_char_tok span buf _len _buf2 = do -- We've seen ' setInput i2 return (L (mkPsSpan loc end2) ITtyQuote) - Just ('\\', i2@(AI _end2 _)) -> do -- We've seen 'backslash + Just ('\\', i2@(AI end2 _)) -> do -- We've seen 'backslash setInput i2 lit_ch <- lex_escape i3 <- getInput mc <- getCharOrFail i3 -- Trailing quote if mc == '\'' then finish_char_tok buf loc lit_ch - else lit_error i3 + else if isSingleSmartQuote mc then add_smart_quote_error mc end2 + else lit_error i3 - Just (c, i2@(AI _end2 _)) + Just (c, i2@(AI end2 _)) | not (isAny c) -> lit_error i1 | otherwise -> @@ -2141,6 +2157,7 @@ lex_char_tok span buf _len _buf2 = do -- We've seen ' Just ('\'', i3) -> do -- We've seen 'x' setInput i3 finish_char_tok buf loc c + Just (c, _) | isSingleSmartQuote c -> add_smart_quote_error c end2 _other -> do -- We've seen 'x not followed by quote -- (including the possibility of EOF) -- Just parse the quote only @@ -2171,7 +2188,7 @@ isAny c | c > '\x7f' = isPrint c lex_escape :: P Char lex_escape = do - i0 <- getInput + i0@(AI loc _) <- getInput c <- getCharOrFail i0 case c of 'a' -> return '\a' @@ -2184,6 +2201,11 @@ lex_escape = do '\\' -> return '\\' '"' -> return '\"' '\'' -> return '\'' + -- the next two patterns build up a Unicode smart quote error (#21843) + smart_double_quote | isDoubleSmartQuote smart_double_quote -> + add_smart_quote_error smart_double_quote loc + smart_single_quote | isSingleSmartQuote smart_single_quote -> + add_smart_quote_error smart_single_quote loc '^' -> do i1 <- getInput c <- getCharOrFail i1 if c >= '@' && c <= '_' @@ -2339,6 +2361,49 @@ quasiquote_error start = do (\k srcLoc -> mkPlainErrorMsgEnvelope srcLoc (PsErrLexer LexUnterminatedQQ k)) -- ----------------------------------------------------------------------------- +-- Unicode Smart Quote detection (#21843) + +isDoubleSmartQuote :: Char -> Bool +isDoubleSmartQuote '“' = True +isDoubleSmartQuote '”' = True +isDoubleSmartQuote _ = False + +isSingleSmartQuote :: Char -> Bool +isSingleSmartQuote '‘' = True +isSingleSmartQuote '’' = True +isSingleSmartQuote _ = False + +isSmartQuote :: AlexAccPred ExtsBitmap +isSmartQuote _ _ _ (AI _ buf) = let c = prevChar buf ' ' in isSingleSmartQuote c || isDoubleSmartQuote c + +smart_quote_error_message :: Char -> PsLoc -> MsgEnvelope PsMessage +smart_quote_error_message c loc = + let (correct_char, correct_char_name) = + if isSingleSmartQuote c then ('\'', "Single Quote") else ('"', "Quotation Mark") + err = mkPlainErrorMsgEnvelope (mkSrcSpanPs (mkPsSpan loc loc)) $ + PsErrUnicodeCharLooksLike c correct_char correct_char_name in + err + +smart_quote_error :: Action +smart_quote_error span buf _len _buf2 = do + let c = currentChar buf + addFatalError (smart_quote_error_message c (psSpanStart span)) + +add_smart_quote_error :: Char -> PsLoc -> P a +add_smart_quote_error c loc = addFatalError (smart_quote_error_message c loc) + +add_nonfatal_smart_quote_error :: Char -> PsLoc -> P () +add_nonfatal_smart_quote_error c loc = addError (smart_quote_error_message c loc) + +advance_to_smart_quote_character :: P () +advance_to_smart_quote_character = do + i <- getInput + case alexGetChar' i of + Just (c, _) | isDoubleSmartQuote c -> return () + Just (_, i2) -> do setInput i2; advance_to_smart_quote_character + Nothing -> return () -- should never get here + +-- ----------------------------------------------------------------------------- -- Warnings warnTab :: Action diff --git a/compiler/GHC/Types/Error/Codes.hs b/compiler/GHC/Types/Error/Codes.hs index 6bda338d4e..7a26ee637d 100644 --- a/compiler/GHC/Types/Error/Codes.hs +++ b/compiler/GHC/Types/Error/Codes.hs @@ -267,6 +267,7 @@ type family GhcDiagnosticCode c = n | n -> c where GhcDiagnosticCode "PsErrIllegalGadtRecordMultiplicity" = 37475 GhcDiagnosticCode "PsErrInvalidCApiImport" = 72744 GhcDiagnosticCode "PsErrMultipleConForNewtype" = 05380 + GhcDiagnosticCode "PsErrUnicodeCharLooksLike" = 31623 -- Driver diagnostic codes GhcDiagnosticCode "DriverMissingHomeModules" = 32850 diff --git a/testsuite/tests/parser/should_fail/T21843a.hs b/testsuite/tests/parser/should_fail/T21843a.hs new file mode 100644 index 0000000000..1b56d86553 --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843a.hs @@ -0,0 +1,3 @@ +module UnicodeSmartQuotes where + +badString = “hello” diff --git a/testsuite/tests/parser/should_fail/T21843a.stderr b/testsuite/tests/parser/should_fail/T21843a.stderr new file mode 100644 index 0000000000..11ad47d94a --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843a.stderr @@ -0,0 +1,4 @@ + +T21843a.hs:3:13: [GHC-31623] + Unicode character '“' ('/8220') looks like '"' (Quotation Mark), but it is not + diff --git a/testsuite/tests/parser/should_fail/T21843b.hs b/testsuite/tests/parser/should_fail/T21843b.hs new file mode 100644 index 0000000000..57bf9e81d5 --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843b.hs @@ -0,0 +1,3 @@ +module UnicodeSmartQuotes where + +badChar = ‘x’ diff --git a/testsuite/tests/parser/should_fail/T21843b.stderr b/testsuite/tests/parser/should_fail/T21843b.stderr new file mode 100644 index 0000000000..34c531c3f8 --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843b.stderr @@ -0,0 +1,3 @@ + +T21843b.hs:3:11: [GHC-31623] + Unicode character '‘' ('/8216') looks like ''' (Single Quote), but it is not diff --git a/testsuite/tests/parser/should_fail/T21843c.hs b/testsuite/tests/parser/should_fail/T21843c.hs new file mode 100644 index 0000000000..d3aa809185 --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843c.hs @@ -0,0 +1,3 @@ +module UnicodeSmartQuotes where + +badString = "hello” diff --git a/testsuite/tests/parser/should_fail/T21843c.stderr b/testsuite/tests/parser/should_fail/T21843c.stderr new file mode 100644 index 0000000000..54146a42eb --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843c.stderr @@ -0,0 +1,6 @@ + +T21843c.hs:3:19: [GHC-31623] + Unicode character '”' ('/8221') looks like '"' (Quotation Mark), but it is not + +T21843c.hs:3:20: [GHC-21231] + lexical error in string/character literal at character '/n' diff --git a/testsuite/tests/parser/should_fail/T21843d.hs b/testsuite/tests/parser/should_fail/T21843d.hs new file mode 100644 index 0000000000..440967ce6c --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843d.hs @@ -0,0 +1,4 @@ +module UnicodeSmartQuotes where + +badChar = 'x’ + diff --git a/testsuite/tests/parser/should_fail/T21843d.stderr b/testsuite/tests/parser/should_fail/T21843d.stderr new file mode 100644 index 0000000000..4ee47ed8ef --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843d.stderr @@ -0,0 +1,3 @@ + +T21843d.hs:3:13: [GHC-31623] + Unicode character '’' ('/8217') looks like ''' (Single Quote), but it is not diff --git a/testsuite/tests/parser/should_fail/T21843e.hs b/testsuite/tests/parser/should_fail/T21843e.hs new file mode 100644 index 0000000000..5fd3b9614a --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843e.hs @@ -0,0 +1,3 @@ +module UnicodeSmartQuotes where + +badString = "\”" diff --git a/testsuite/tests/parser/should_fail/T21843e.stderr b/testsuite/tests/parser/should_fail/T21843e.stderr new file mode 100644 index 0000000000..c39d573b87 --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843e.stderr @@ -0,0 +1,3 @@ + +T21843e.hs:3:15: [GHC-31623] + Unicode character '”' ('/8221') looks like '"' (Quotation Mark), but it is not diff --git a/testsuite/tests/parser/should_fail/T21843f.hs b/testsuite/tests/parser/should_fail/T21843f.hs new file mode 100644 index 0000000000..884ff11d71 --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843f.hs @@ -0,0 +1,3 @@ +module UnicodeSmartQuotes where + +badChar = '\‘' diff --git a/testsuite/tests/parser/should_fail/T21843f.stderr b/testsuite/tests/parser/should_fail/T21843f.stderr new file mode 100644 index 0000000000..198917937a --- /dev/null +++ b/testsuite/tests/parser/should_fail/T21843f.stderr @@ -0,0 +1,3 @@ + +T21843f.hs:3:13: [GHC-31623] + Unicode character '‘' ('/8216') looks like ''' (Single Quote), but it is not diff --git a/testsuite/tests/parser/should_fail/all.T b/testsuite/tests/parser/should_fail/all.T index 8713d91f2b..9dc87514c5 100644 --- a/testsuite/tests/parser/should_fail/all.T +++ b/testsuite/tests/parser/should_fail/all.T @@ -209,3 +209,9 @@ test('T20385A', normal, compile_fail, ['']) test('T20385B', normal, compile_fail, ['']) test('T16999', normal, compile_fail, ['']) test('T22070', normal, compile_fail, ['']) +test('T21843a', normal, compile_fail, ['']) +test('T21843b', normal, compile_fail, ['']) +test('T21843c', normal, compile_fail, ['']) +test('T21843d', normal, compile_fail, ['']) +test('T21843e', normal, compile_fail, ['']) +test('T21843f', normal, compile_fail, ['']) |