diff options
author | Richard Eisenberg <eir@cis.upenn.edu> | 2014-11-03 15:34:53 -0500 |
---|---|---|
committer | Richard Eisenberg <eir@cis.upenn.edu> | 2014-11-21 11:15:46 -0500 |
commit | da2fca9e2be8c61c91c034d8c2302d8b1d1e7b41 (patch) | |
tree | ccc30f5a5b1d8ff4cc1ef66cfec4499bb28846d8 | |
parent | dbf360a5264d5d6597e046dcd9b4f49effa91eee (diff) | |
download | haskell-da2fca9e2be8c61c91c034d8c2302d8b1d1e7b41.tar.gz |
Fix #7484, checking for good binder names in Convert.
This commit also refactors a bunch of lexeme-oriented code into
a new module Lexeme, and includes a submodule update for haddock.
-rw-r--r-- | compiler/basicTypes/Lexeme.hs | 252 | ||||
-rw-r--r-- | compiler/basicTypes/OccName.lhs | 72 | ||||
-rw-r--r-- | compiler/ghc.cabal.in | 1 | ||||
-rw-r--r-- | compiler/ghc.mk | 1 | ||||
-rw-r--r-- | compiler/hsSyn/Convert.lhs | 14 | ||||
-rw-r--r-- | compiler/parser/Lexer.x | 6 | ||||
-rw-r--r-- | compiler/typecheck/TcGenDeriv.lhs | 1 | ||||
-rw-r--r-- | compiler/typecheck/TcSplice.lhs | 1 | ||||
-rw-r--r-- | testsuite/tests/th/all.T | 2 | ||||
m--------- | utils/haddock | 0 |
10 files changed, 270 insertions, 80 deletions
diff --git a/compiler/basicTypes/Lexeme.hs b/compiler/basicTypes/Lexeme.hs new file mode 100644 index 0000000000..c5bda4dbdd --- /dev/null +++ b/compiler/basicTypes/Lexeme.hs @@ -0,0 +1,252 @@ +-- (c) The GHC Team +-- +-- Functions to evaluate whether or not a string is a valid identifier. +-- There is considerable overlap between the logic here and the logic +-- in Lexer.x, but sadly there seems to be way to merge them. + +module Lexeme ( + -- * Lexical characteristics of Haskell names + + -- | Use these functions to figure what kind of name a 'FastString' + -- represents; these functions do /not/ check that the identifier + -- is valid. + + isLexCon, isLexVar, isLexId, isLexSym, + isLexConId, isLexConSym, isLexVarId, isLexVarSym, + startsVarSym, startsVarId, startsConSym, startsConId, + + -- * Validating identifiers + + -- | These functions (working over plain old 'String's) check + -- to make sure that the identifier is valid. + okVarOcc, okConOcc, okTcOcc, + okVarIdOcc, okVarSymOcc, okConIdOcc, okConSymOcc + + -- Some of the exports above are not used within GHC, but may + -- be of value to GHC API users. + + ) where + +import FastString + +import Data.Char +import qualified Data.Set as Set + +{- + +************************************************************************ +* * + Lexical categories +* * +************************************************************************ + +These functions test strings to see if they fit the lexical categories +defined in the Haskell report. + +Note [Classification of generated names] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some names generated for internal use can show up in debugging output, +e.g. when using -ddump-simpl. These generated names start with a $ +but should still be pretty-printed using prefix notation. We make sure +this is the case in isLexVarSym by only classifying a name as a symbol +if all its characters are symbols, not just its first one. +-} + +isLexCon, isLexVar, isLexId, isLexSym :: FastString -> Bool +isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FastString -> Bool + +isLexCon cs = isLexConId cs || isLexConSym cs +isLexVar cs = isLexVarId cs || isLexVarSym cs + +isLexId cs = isLexConId cs || isLexVarId cs +isLexSym cs = isLexConSym cs || isLexVarSym cs + +------------- +isLexConId cs -- Prefix type or data constructors + | nullFS cs = False -- e.g. "Foo", "[]", "(,)" + | cs == (fsLit "[]") = True + | otherwise = startsConId (headFS cs) + +isLexVarId cs -- Ordinary prefix identifiers + | nullFS cs = False -- e.g. "x", "_x" + | otherwise = startsVarId (headFS cs) + +isLexConSym cs -- Infix type or data constructors + | nullFS cs = False -- e.g. ":-:", ":", "->" + | cs == (fsLit "->") = True + | otherwise = startsConSym (headFS cs) + +isLexVarSym fs -- Infix identifiers e.g. "+" + | fs == (fsLit "~R#") = True + | otherwise + = case (if nullFS fs then [] else unpackFS fs) of + [] -> False + (c:cs) -> startsVarSym c && all isVarSymChar cs + -- See Note [Classification of generated names] + +------------- +startsVarSym, startsVarId, startsConSym, startsConId :: Char -> Bool +startsVarSym c = startsVarSymASCII c || (ord c > 0x7f && isSymbol c) -- Infix Ids +startsConSym c = c == ':' -- Infix data constructors +startsVarId c = c == '_' || case generalCategory c of -- Ordinary Ids + LowercaseLetter -> True + OtherLetter -> True -- See #1103 + _ -> False +startsConId c = isUpper c || c == '(' -- Ordinary type constructors and data constructors + +startsVarSymASCII :: Char -> Bool +startsVarSymASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-" + +isVarSymChar :: Char -> Bool +isVarSymChar c = c == ':' || startsVarSym c + +{- + +************************************************************************ +* * + Detecting valid names for Template Haskell +* * +************************************************************************ + +-} + +---------------------- +-- External interface +---------------------- + +-- | Is this an acceptable variable name? +okVarOcc :: String -> Bool +okVarOcc str@(c:_) + | startsVarId c + = okVarIdOcc str + | startsVarSym c + = okVarSymOcc str +okVarOcc _ = False + +-- | Is this an acceptable constructor name? +okConOcc :: String -> Bool +okConOcc str@(c:_) + | startsConId c + = okConIdOcc str + | startsConSym c + = okConSymOcc str + | str == "[]" + = True +okConOcc _ = False + +-- | Is this an acceptable type name? +okTcOcc :: String -> Bool +okTcOcc "[]" = True +okTcOcc "->" = True +okTcOcc "~" = True +okTcOcc str@(c:_) + | startsConId c + = okConIdOcc str + | startsConSym c + = okConSymOcc str + | startsVarSym c + = okVarSymOcc str +okTcOcc _ = False + +-- | Is this an acceptable alphanumeric variable name, assuming it starts +-- with an acceptable letter? +okVarIdOcc :: String -> Bool +okVarIdOcc str = okIdOcc str && + not (str `Set.member` reservedIds) + +-- | Is this an acceptable symbolic variable name, assuming it starts +-- with an acceptable character? +okVarSymOcc :: String -> Bool +okVarSymOcc str = all okSymChar str && + not (str `Set.member` reservedOps) && + not (isDashes str) + +-- | Is this an acceptable alphanumeric constructor name, assuming it +-- starts with an acceptable letter? +okConIdOcc :: String -> Bool +okConIdOcc str = okIdOcc str || + is_tuple_name1 str + where + -- check for tuple name, starting at the beginning + is_tuple_name1 ('(' : rest) = is_tuple_name2 rest + is_tuple_name1 _ = False + + -- check for tuple tail + is_tuple_name2 ")" = True + is_tuple_name2 (',' : rest) = is_tuple_name2 rest + is_tuple_name2 (ws : rest) + | isSpace ws = is_tuple_name2 rest + is_tuple_name2 _ = False + +-- | Is this an acceptable symbolic constructor name, assuming it +-- starts with an acceptable character? +okConSymOcc :: String -> Bool +okConSymOcc ":" = True +okConSymOcc str = all okSymChar str && + not (str `Set.member` reservedOps) + +---------------------- +-- Internal functions +---------------------- + +-- | Is this string an acceptable id, possibly with a suffix of hashes, +-- but not worrying about case or clashing with reserved words? +okIdOcc :: String -> Bool +okIdOcc str + = let hashes = dropWhile okIdChar str in + all (== '#') hashes -- -XMagicHash allows a suffix of hashes + -- of course, `all` says "True" to an empty list + +-- | Is this character acceptable in an identifier (after the first letter)? +-- See alexGetByte in Lexer.x +okIdChar :: Char -> Bool +okIdChar c = case generalCategory c of + UppercaseLetter -> True + LowercaseLetter -> True + OtherLetter -> True + TitlecaseLetter -> True + DecimalNumber -> True + OtherNumber -> True + _ -> c == '\'' || c == '_' + +-- | Is this character acceptable in a symbol (after the first char)? +-- See alexGetByte in Lexer.x +okSymChar :: Char -> Bool +okSymChar c + | c `elem` specialSymbols + = False + | c `elem` "_\"'" + = False + | otherwise + = case generalCategory c of + ConnectorPunctuation -> True + DashPunctuation -> True + OtherPunctuation -> True + MathSymbol -> True + CurrencySymbol -> True + ModifierSymbol -> True + OtherSymbol -> True + _ -> False + +-- | All reserved identifiers. Taken from section 2.4 of the 2010 Report. +reservedIds :: Set.Set String +reservedIds = Set.fromList [ "case", "class", "data", "default", "deriving" + , "do", "else", "foreign", "if", "import", "in" + , "infix", "infixl", "infixr", "instance", "let" + , "module", "newtype", "of", "then", "type", "where" + , "_" ] + +-- | All punctuation that cannot appear in symbols. See $special in Lexer.x. +specialSymbols :: [Char] +specialSymbols = "(),;[]`{}" + +-- | All reserved operators. Taken from section 2.4 of the 2010 Report. +reservedOps :: Set.Set String +reservedOps = Set.fromList [ "..", ":", "::", "=", "\\", "|", "<-", "->" + , "@", "~", "=>" ] + +-- | Does this string contain only dashes and has at least 2 of them? +isDashes :: String -> Bool +isDashes ('-' : '-' : rest) = all (== '-') rest +isDashes _ = False diff --git a/compiler/basicTypes/OccName.lhs b/compiler/basicTypes/OccName.lhs index 0010ad37dc..fdc7c95918 100644 --- a/compiler/basicTypes/OccName.lhs +++ b/compiler/basicTypes/OccName.lhs @@ -94,11 +94,6 @@ module OccName ( -- * Tidying up TidyOccEnv, emptyTidyOccEnv, tidyOccName, initTidyOccEnv, - -- * Lexical characteristics of Haskell names - isLexCon, isLexVar, isLexId, isLexSym, - isLexConId, isLexConSym, isLexVarId, isLexVarSym, - startsVarSym, startsVarId, startsConSym, startsConId, - -- FsEnv FastStringEnv, emptyFsEnv, lookupFsEnv, extendFsEnv, mkFsEnv ) where @@ -110,6 +105,7 @@ import UniqFM import UniqSet import FastString import Outputable +import Lexeme import Binary import Data.Char import Data.Data @@ -851,72 +847,6 @@ tidyOccName env occ@(OccName occ_sp fs) %************************************************************************ %* * -\subsection{Lexical categories} -%* * -%************************************************************************ - -These functions test strings to see if they fit the lexical categories -defined in the Haskell report. - -Note [Classification of generated names] -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Some names generated for internal use can show up in debugging output, -e.g. when using -ddump-simpl. These generated names start with a $ -but should still be pretty-printed using prefix notation. We make sure -this is the case in isLexVarSym by only classifying a name as a symbol -if all its characters are symbols, not just its first one. - -\begin{code} -isLexCon, isLexVar, isLexId, isLexSym :: FastString -> Bool -isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FastString -> Bool - -isLexCon cs = isLexConId cs || isLexConSym cs -isLexVar cs = isLexVarId cs || isLexVarSym cs - -isLexId cs = isLexConId cs || isLexVarId cs -isLexSym cs = isLexConSym cs || isLexVarSym cs - -------------- - -isLexConId cs -- Prefix type or data constructors - | nullFS cs = False -- e.g. "Foo", "[]", "(,)" - | cs == (fsLit "[]") = True - | otherwise = startsConId (headFS cs) - -isLexVarId cs -- Ordinary prefix identifiers - | nullFS cs = False -- e.g. "x", "_x" - | otherwise = startsVarId (headFS cs) - -isLexConSym cs -- Infix type or data constructors - | nullFS cs = False -- e.g. ":-:", ":", "->" - | cs == (fsLit "->") = True - | otherwise = startsConSym (headFS cs) - -isLexVarSym fs -- Infix identifiers e.g. "+" - | fs == (fsLit "~R#") = True - | otherwise - = case (if nullFS fs then [] else unpackFS fs) of - [] -> False - (c:cs) -> startsVarSym c && all isVarSymChar cs - -- See Note [Classification of generated names] - -------------- -startsVarSym, startsVarId, startsConSym, startsConId :: Char -> Bool -startsVarSym c = isSymbolASCII c || (ord c > 0x7f && isSymbol c) -- Infix Ids -startsConSym c = c == ':' -- Infix data constructors -startsVarId c = isLower c || c == '_' -- Ordinary Ids -startsConId c = isUpper c || c == '(' -- Ordinary type constructors and data constructors - -isSymbolASCII :: Char -> Bool -isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-" - -isVarSymChar :: Char -> Bool -isVarSymChar c = c == ':' || startsVarSym c -\end{code} - -%************************************************************************ -%* * Binary instance Here rather than BinIface because OccName is abstract %* * diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in index bfc2e9cf7c..4aa2e3a4d5 100644 --- a/compiler/ghc.cabal.in +++ b/compiler/ghc.cabal.in @@ -161,6 +161,7 @@ Library Hooks Id IdInfo + Lexeme Literal Llvm Llvm.AbsSyn diff --git a/compiler/ghc.mk b/compiler/ghc.mk index 9716ef285a..752a607c72 100644 --- a/compiler/ghc.mk +++ b/compiler/ghc.mk @@ -536,6 +536,7 @@ compiler_stage2_dll0_MODULES = \ IfaceType \ InstEnv \ Kind \ + Lexeme \ ListSetOps \ Literal \ LoadIface \ diff --git a/compiler/hsSyn/Convert.lhs b/compiler/hsSyn/Convert.lhs index 83c286d4dc..141b8b840a 100644 --- a/compiler/hsSyn/Convert.lhs +++ b/compiler/hsSyn/Convert.lhs @@ -30,6 +30,7 @@ import ForeignCall import Unique import ErrUtils import Bag +import Lexeme import Util import FastString import Outputable @@ -1122,14 +1123,11 @@ cvtName ctxt_ns (TH.Name occ flavour) occ_str = TH.occString occ okOcc :: OccName.NameSpace -> String -> Bool -okOcc _ [] = False -okOcc ns str@(c:_) - | OccName.isVarNameSpace ns = startsVarId c || startsVarSym c - | OccName.isDataConNameSpace ns = startsConId c || startsConSym c || str == "[]" - | otherwise = startsConId c || startsConSym c || - startsVarSym c || str == "[]" || str == "->" - -- allow type operators like "+" - +okOcc ns str + | OccName.isVarNameSpace ns = okVarOcc str + | OccName.isDataConNameSpace ns = okConOcc str + | otherwise = okTcOcc str + -- Determine the name space of a name in a type -- isVarName :: TH.Name -> Bool diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x index b4223c82d8..1e8712b2d5 100644 --- a/compiler/parser/Lexer.x +++ b/compiler/parser/Lexer.x @@ -115,6 +115,8 @@ import Ctype -- ----------------------------------------------------------------------------- -- Alex "Character set macros" +-- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs +-- Any changes here should likely be reflected there. $unispace = \x05 -- Trick Alex into handling Unicode. See alexGetByte. $nl = [\n\r\f] $whitechar = [$nl\v\ $unispace] @@ -1802,6 +1804,10 @@ alexGetByte (AI loc s) -- character is encountered we output these values -- with the actual character value hidden in the state. | otherwise = + -- NB: The logic behind these definitions is also reflected + -- in basicTypes/Lexeme.hs + -- Any changes here should likely be reflected there. + case generalCategory c of UppercaseLetter -> upper LowercaseLetter -> lower diff --git a/compiler/typecheck/TcGenDeriv.lhs b/compiler/typecheck/TcGenDeriv.lhs index df45001870..0779e67363 100644 --- a/compiler/typecheck/TcGenDeriv.lhs +++ b/compiler/typecheck/TcGenDeriv.lhs @@ -60,6 +60,7 @@ import Util import Var import MonadUtils import Outputable +import Lexeme import FastString import Pair import Bag diff --git a/compiler/typecheck/TcSplice.lhs b/compiler/typecheck/TcSplice.lhs index 3302d028a5..7c8085ebe2 100644 --- a/compiler/typecheck/TcSplice.lhs +++ b/compiler/typecheck/TcSplice.lhs @@ -91,6 +91,7 @@ import BasicTypes hiding( SuccessFlag(..) ) import Maybes( MaybeErr(..) ) import DynFlags import Panic +import Lexeme import FastString import Outputable import Control.Monad ( when ) diff --git a/testsuite/tests/th/all.T b/testsuite/tests/th/all.T index 5109473d96..1144156607 100644 --- a/testsuite/tests/th/all.T +++ b/testsuite/tests/th/all.T @@ -343,4 +343,4 @@ test('T9066', normal, compile, ['-v0']) test('T8100', normal, compile, ['-v0']) test('T9064', normal, compile, ['-v0']) test('T9209', normal, compile_fail, ['-v0']) -test('T7484', expect_broken(7484), compile_fail, ['-v0']) +test('T7484', normal, compile_fail, ['-v0']) diff --git a/utils/haddock b/utils/haddock -Subproject 19409126be62383bc64d79698b265ffaf96269a +Subproject 2b3712d701c1df626abbc60525c35e735272e45 |