Fix #7484, checking for good binder names in Convert.

This commit also refactors a bunch of lexeme-oriented code into a new module Lexeme, and includes a submodule update for haddock.
author: Richard Eisenberg <eir@cis.upenn.edu> 2014-11-03 15:34:53 -0500
committer: Richard Eisenberg <eir@cis.upenn.edu> 2014-11-21 11:15:46 -0500
commit: da2fca9e2be8c61c91c034d8c2302d8b1d1e7b41 (patch)
tree: ccc30f5a5b1d8ff4cc1ef66cfec4499bb28846d8
parent: dbf360a5264d5d6597e046dcd9b4f49effa91eee (diff)
download: haskell-da2fca9e2be8c61c91c034d8c2302d8b1d1e7b41.tar.gz
10 files changed, 270 insertions, 80 deletions
diff --git a/compiler/basicTypes/Lexeme.hs b/compiler/basicTypes/Lexeme.hs
new file mode 100644
index 0000000000..c5bda4dbdd
--- /dev/null
+++ b/compiler/basicTypes/Lexeme.hs
@@ -0,0 +1,252 @@
+-- (c) The GHC Team
+--
+-- Functions to evaluate whether or not a string is a valid identifier.
+-- There is considerable overlap between the logic here and the logic
+-- in Lexer.x, but sadly there seems to be way to merge them.
+
+module Lexeme (
+          -- * Lexical characteristics of Haskell names
+  
+          -- | Use these functions to figure what kind of name a 'FastString'
+          -- represents; these functions do /not/ check that the identifier
+          -- is valid.
+  
+        isLexCon, isLexVar, isLexId, isLexSym,
+        isLexConId, isLexConSym, isLexVarId, isLexVarSym,
+        startsVarSym, startsVarId, startsConSym, startsConId,
+
+          -- * Validating identifiers
+
+          -- | These functions (working over plain old 'String's) check
+          -- to make sure that the identifier is valid.
+        okVarOcc, okConOcc, okTcOcc,
+        okVarIdOcc, okVarSymOcc, okConIdOcc, okConSymOcc
+
+        -- Some of the exports above are not used within GHC, but may
+        -- be of value to GHC API users.
+
+  ) where
+
+import FastString
+
+import Data.Char
+import qualified Data.Set as Set
+
+{-
+
+************************************************************************
+*                                                                      *
+    Lexical categories
+*                                                                      *
+************************************************************************
+
+These functions test strings to see if they fit the lexical categories
+defined in the Haskell report.
+
+Note [Classification of generated names]
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Some names generated for internal use can show up in debugging output,
+e.g.  when using -ddump-simpl. These generated names start with a $
+but should still be pretty-printed using prefix notation. We make sure
+this is the case in isLexVarSym by only classifying a name as a symbol
+if all its characters are symbols, not just its first one.
+-}
+
+isLexCon,   isLexVar,    isLexId,    isLexSym    :: FastString -> Bool
+isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FastString -> Bool
+
+isLexCon cs = isLexConId  cs || isLexConSym cs
+isLexVar cs = isLexVarId  cs || isLexVarSym cs
+
+isLexId  cs = isLexConId  cs || isLexVarId  cs
+isLexSym cs = isLexConSym cs || isLexVarSym cs
+
+-------------
+isLexConId cs                           -- Prefix type or data constructors
+  | nullFS cs          = False          --      e.g. "Foo", "[]", "(,)"
+  | cs == (fsLit "[]") = True
+  | otherwise          = startsConId (headFS cs)
+
+isLexVarId cs                           -- Ordinary prefix identifiers
+  | nullFS cs         = False           --      e.g. "x", "_x"
+  | otherwise         = startsVarId (headFS cs)
+
+isLexConSym cs                          -- Infix type or data constructors
+  | nullFS cs          = False          --      e.g. ":-:", ":", "->"
+  | cs == (fsLit "->") = True
+  | otherwise          = startsConSym (headFS cs)
+
+isLexVarSym fs                          -- Infix identifiers e.g. "+"
+  | fs == (fsLit "~R#") = True
+  | otherwise
+  = case (if nullFS fs then [] else unpackFS fs) of
+      [] -> False
+      (c:cs) -> startsVarSym c && all isVarSymChar cs
+        -- See Note [Classification of generated names]
+
+-------------
+startsVarSym, startsVarId, startsConSym, startsConId :: Char -> Bool
+startsVarSym c = startsVarSymASCII c || (ord c > 0x7f && isSymbol c)  -- Infix Ids
+startsConSym c = c == ':'               -- Infix data constructors
+startsVarId c  = c == '_' || case generalCategory c of  -- Ordinary Ids
+  LowercaseLetter -> True
+  OtherLetter     -> True   -- See #1103
+  _               -> False
+startsConId c  = isUpper c || c == '('  -- Ordinary type constructors and data constructors
+
+startsVarSymASCII :: Char -> Bool
+startsVarSymASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
+
+isVarSymChar :: Char -> Bool
+isVarSymChar c = c == ':' || startsVarSym c
+
+{-
+
+************************************************************************
+*                                                                      *
+    Detecting valid names for Template Haskell
+*                                                                      *
+************************************************************************
+
+-}
+
+----------------------
+-- External interface 
+----------------------
+
+-- | Is this an acceptable variable name?
+okVarOcc :: String -> Bool
+okVarOcc str@(c:_)
+  | startsVarId c
+  = okVarIdOcc str
+  | startsVarSym c
+  = okVarSymOcc str
+okVarOcc _ = False
+
+-- | Is this an acceptable constructor name?
+okConOcc :: String -> Bool
+okConOcc str@(c:_)
+  | startsConId c
+  = okConIdOcc str
+  | startsConSym c
+  = okConSymOcc str
+  | str == "[]"
+  = True
+okConOcc _ = False
+
+-- | Is this an acceptable type name?
+okTcOcc :: String -> Bool
+okTcOcc "[]" = True
+okTcOcc "->" = True
+okTcOcc "~"  = True
+okTcOcc str@(c:_)
+  | startsConId c
+  = okConIdOcc str
+  | startsConSym c
+  = okConSymOcc str
+  | startsVarSym c
+  = okVarSymOcc str
+okTcOcc _ = False
+
+-- | Is this an acceptable alphanumeric variable name, assuming it starts
+-- with an acceptable letter?
+okVarIdOcc :: String -> Bool
+okVarIdOcc str = okIdOcc str &&
+                 not (str `Set.member` reservedIds)
+
+-- | Is this an acceptable symbolic variable name, assuming it starts
+-- with an acceptable character?
+okVarSymOcc :: String -> Bool
+okVarSymOcc str = all okSymChar str &&
+                  not (str `Set.member` reservedOps) &&
+                  not (isDashes str)
+
+-- | Is this an acceptable alphanumeric constructor name, assuming it
+-- starts with an acceptable letter?
+okConIdOcc :: String -> Bool
+okConIdOcc str = okIdOcc str ||
+                 is_tuple_name1 str
+  where
+    -- check for tuple name, starting at the beginning
+    is_tuple_name1 ('(' : rest) = is_tuple_name2 rest
+    is_tuple_name1 _            = False
+
+    -- check for tuple tail
+    is_tuple_name2 ")"          = True
+    is_tuple_name2 (',' : rest) = is_tuple_name2 rest
+    is_tuple_name2 (ws  : rest)
+      | isSpace ws              = is_tuple_name2 rest
+    is_tuple_name2 _            = False
+
+-- | Is this an acceptable symbolic constructor name, assuming it
+-- starts with an acceptable character?
+okConSymOcc :: String -> Bool
+okConSymOcc ":" = True
+okConSymOcc str = all okSymChar str &&
+                  not (str `Set.member` reservedOps)
+
+----------------------
+-- Internal functions
+----------------------
+
+-- | Is this string an acceptable id, possibly with a suffix of hashes,
+-- but not worrying about case or clashing with reserved words?
+okIdOcc :: String -> Bool
+okIdOcc str
+  = let hashes = dropWhile okIdChar str in
+    all (== '#') hashes   -- -XMagicHash allows a suffix of hashes
+                          -- of course, `all` says "True" to an empty list
+
+-- | Is this character acceptable in an identifier (after the first letter)?
+-- See alexGetByte in Lexer.x
+okIdChar :: Char -> Bool
+okIdChar c = case generalCategory c of
+  UppercaseLetter -> True
+  LowercaseLetter -> True
+  OtherLetter     -> True
+  TitlecaseLetter -> True
+  DecimalNumber   -> True
+  OtherNumber     -> True
+  _               -> c == '\'' || c == '_'
+
+-- | Is this character acceptable in a symbol (after the first char)?
+-- See alexGetByte in Lexer.x
+okSymChar :: Char -> Bool
+okSymChar c
+  | c `elem` specialSymbols
+  = False
+  | c `elem` "_\"'"
+  = False
+  | otherwise
+  = case generalCategory c of
+      ConnectorPunctuation -> True
+      DashPunctuation      -> True
+      OtherPunctuation     -> True
+      MathSymbol           -> True
+      CurrencySymbol       -> True
+      ModifierSymbol       -> True
+      OtherSymbol          -> True
+      _                    -> False
+    
+-- | All reserved identifiers. Taken from section 2.4 of the 2010 Report.
+reservedIds :: Set.Set String
+reservedIds = Set.fromList [ "case", "class", "data", "default", "deriving"
+                           , "do", "else", "foreign", "if", "import", "in"
+                           , "infix", "infixl", "infixr", "instance", "let"
+                           , "module", "newtype", "of", "then", "type", "where"
+                           , "_" ]
+
+-- | All punctuation that cannot appear in symbols. See $special in Lexer.x.
+specialSymbols :: [Char]
+specialSymbols = "(),;[]`{}"
+
+-- | All reserved operators. Taken from section 2.4 of the 2010 Report.
+reservedOps :: Set.Set String
+reservedOps = Set.fromList [ "..", ":", "::", "=", "\\", "|", "<-", "->"
+                           , "@", "~", "=>" ]
+
+-- | Does this string contain only dashes and has at least 2 of them?
+isDashes :: String -> Bool
+isDashes ('-' : '-' : rest) = all (== '-') rest
+isDashes _                  = False
diff --git a/compiler/basicTypes/OccName.lhs b/compiler/basicTypes/OccName.lhs
index 0010ad37dc..fdc7c95918 100644
--- a/compiler/basicTypes/OccName.lhs
+++ b/compiler/basicTypes/OccName.lhs
@@ -94,11 +94,6 @@ module OccName (
         -- * Tidying up
         TidyOccEnv, emptyTidyOccEnv, tidyOccName, initTidyOccEnv,
 
-        -- * Lexical characteristics of Haskell names
-        isLexCon, isLexVar, isLexId, isLexSym,
-        isLexConId, isLexConSym, isLexVarId, isLexVarSym,
-        startsVarSym, startsVarId, startsConSym, startsConId,
-
         -- FsEnv
         FastStringEnv, emptyFsEnv, lookupFsEnv, extendFsEnv, mkFsEnv
     ) where
@@ -110,6 +105,7 @@ import UniqFM
 import UniqSet
 import FastString
 import Outputable
+import Lexeme
 import Binary
 import Data.Char
 import Data.Data
@@ -851,72 +847,6 @@ tidyOccName env occ@(OccName occ_sp fs)
 
 %************************************************************************
 %*                                                                      *
-\subsection{Lexical categories}
-%*                                                                      *
-%************************************************************************
-
-These functions test strings to see if they fit the lexical categories
-defined in the Haskell report.
-
-Note [Classification of generated names]
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Some names generated for internal use can show up in debugging output,
-e.g.  when using -ddump-simpl. These generated names start with a $
-but should still be pretty-printed using prefix notation. We make sure
-this is the case in isLexVarSym by only classifying a name as a symbol
-if all its characters are symbols, not just its first one.
-
-\begin{code}
-isLexCon,   isLexVar,    isLexId,    isLexSym    :: FastString -> Bool
-isLexConId, isLexConSym, isLexVarId, isLexVarSym :: FastString -> Bool
-
-isLexCon cs = isLexConId  cs || isLexConSym cs
-isLexVar cs = isLexVarId  cs || isLexVarSym cs
-
-isLexId  cs = isLexConId  cs || isLexVarId  cs
-isLexSym cs = isLexConSym cs || isLexVarSym cs
-
--------------
-
-isLexConId cs                           -- Prefix type or data constructors
-  | nullFS cs          = False          --      e.g. "Foo", "[]", "(,)"
-  | cs == (fsLit "[]") = True
-  | otherwise          = startsConId (headFS cs)
-
-isLexVarId cs                           -- Ordinary prefix identifiers
-  | nullFS cs         = False           --      e.g. "x", "_x"
-  | otherwise         = startsVarId (headFS cs)
-
-isLexConSym cs                          -- Infix type or data constructors
-  | nullFS cs          = False          --      e.g. ":-:", ":", "->"
-  | cs == (fsLit "->") = True
-  | otherwise          = startsConSym (headFS cs)
-
-isLexVarSym fs                          -- Infix identifiers e.g. "+"
-  | fs == (fsLit "~R#") = True
-  | otherwise
-  = case (if nullFS fs then [] else unpackFS fs) of
-      [] -> False
-      (c:cs) -> startsVarSym c && all isVarSymChar cs
-        -- See Note [Classification of generated names]
-
--------------
-startsVarSym, startsVarId, startsConSym, startsConId :: Char -> Bool
-startsVarSym c = isSymbolASCII c || (ord c > 0x7f && isSymbol c)  -- Infix Ids
-startsConSym c = c == ':'               -- Infix data constructors
-startsVarId c  = isLower c || c == '_'  -- Ordinary Ids
-startsConId c  = isUpper c || c == '('  -- Ordinary type constructors and data constructors
-
-isSymbolASCII :: Char -> Bool
-isSymbolASCII c = c `elem` "!#$%&*+./<=>?@\\^|~-"
-
-isVarSymChar :: Char -> Bool
-isVarSymChar c = c == ':' || startsVarSym c
-\end{code}
-
-%************************************************************************
-%*                                                                      *
                 Binary instance
     Here rather than BinIface because OccName is abstract
 %*                                                                      *
diff --git a/compiler/ghc.cabal.in b/compiler/ghc.cabal.in
index bfc2e9cf7c..4aa2e3a4d5 100644
--- a/compiler/ghc.cabal.in
+++ b/compiler/ghc.cabal.in
@@ -161,6 +161,7 @@ Library
         Hooks
         Id
         IdInfo
+        Lexeme
         Literal
         Llvm
         Llvm.AbsSyn
diff --git a/compiler/ghc.mk b/compiler/ghc.mk
index 9716ef285a..752a607c72 100644
--- a/compiler/ghc.mk
+++ b/compiler/ghc.mk
@@ -536,6 +536,7 @@ compiler_stage2_dll0_MODULES = \
 	IfaceType \
 	InstEnv \
 	Kind \
+	Lexeme \
 	ListSetOps \
 	Literal \
 	LoadIface \
diff --git a/compiler/hsSyn/Convert.lhs b/compiler/hsSyn/Convert.lhs
index 83c286d4dc..141b8b840a 100644
--- a/compiler/hsSyn/Convert.lhs
+++ b/compiler/hsSyn/Convert.lhs
@@ -30,6 +30,7 @@ import ForeignCall
 import Unique
 import ErrUtils
 import Bag
+import Lexeme
 import Util
 import FastString
 import Outputable
@@ -1122,14 +1123,11 @@ cvtName ctxt_ns (TH.Name occ flavour)
     occ_str = TH.occString occ
 
 okOcc :: OccName.NameSpace -> String -> Bool
-okOcc _  []      = False
-okOcc ns str@(c:_)
-  | OccName.isVarNameSpace ns     = startsVarId c || startsVarSym c
-  | OccName.isDataConNameSpace ns = startsConId c || startsConSym c || str == "[]"
-  | otherwise                     = startsConId c || startsConSym c ||
-                                    startsVarSym c || str == "[]" || str == "->"
-                                     -- allow type operators like "+"
-
+okOcc ns str
+  | OccName.isVarNameSpace ns     = okVarOcc str
+  | OccName.isDataConNameSpace ns = okConOcc str
+  | otherwise                     = okTcOcc  str
+            
 -- Determine the name space of a name in a type
 --
 isVarName :: TH.Name -> Bool
diff --git a/compiler/parser/Lexer.x b/compiler/parser/Lexer.x
index b4223c82d8..1e8712b2d5 100644
--- a/compiler/parser/Lexer.x
+++ b/compiler/parser/Lexer.x
@@ -115,6 +115,8 @@ import Ctype
 -- -----------------------------------------------------------------------------
 -- Alex "Character set macros"
 
+-- NB: The logic behind these definitions is also reflected in basicTypes/Lexeme.hs
+-- Any changes here should likely be reflected there.
 $unispace    = \x05 -- Trick Alex into handling Unicode. See alexGetByte.
 $nl          = [\n\r\f]
 $whitechar   = [$nl\v\ $unispace]
@@ -1802,6 +1804,10 @@ alexGetByte (AI loc s)
           -- character is encountered we output these values
           -- with the actual character value hidden in the state.
           | otherwise =
+                -- NB: The logic behind these definitions is also reflected
+                -- in basicTypes/Lexeme.hs
+                -- Any changes here should likely be reflected there.
+
                 case generalCategory c of
                   UppercaseLetter       -> upper
                   LowercaseLetter       -> lower
diff --git a/compiler/typecheck/TcGenDeriv.lhs b/compiler/typecheck/TcGenDeriv.lhs
index df45001870..0779e67363 100644
--- a/compiler/typecheck/TcGenDeriv.lhs
+++ b/compiler/typecheck/TcGenDeriv.lhs
@@ -60,6 +60,7 @@ import Util
 import Var
 import MonadUtils
 import Outputable
+import Lexeme
 import FastString
 import Pair
 import Bag
diff --git a/compiler/typecheck/TcSplice.lhs b/compiler/typecheck/TcSplice.lhs
index 3302d028a5..7c8085ebe2 100644
--- a/compiler/typecheck/TcSplice.lhs
+++ b/compiler/typecheck/TcSplice.lhs
@@ -91,6 +91,7 @@ import BasicTypes hiding( SuccessFlag(..) )
 import Maybes( MaybeErr(..) )
 import DynFlags
 import Panic
+import Lexeme
 import FastString
 import Outputable
 import Control.Monad    ( when )
diff --git a/testsuite/tests/th/all.T b/testsuite/tests/th/all.T
index 5109473d96..1144156607 100644
--- a/testsuite/tests/th/all.T
+++ b/testsuite/tests/th/all.T
@@ -343,4 +343,4 @@ test('T9066', normal, compile, ['-v0'])
 test('T8100', normal, compile, ['-v0'])
 test('T9064', normal, compile, ['-v0'])
 test('T9209', normal, compile_fail, ['-v0'])
-test('T7484', expect_broken(7484), compile_fail, ['-v0'])
+test('T7484', normal, compile_fail, ['-v0'])
diff --git a/utils/haddock b/utils/haddock
-Subproject 19409126be62383bc64d79698b265ffaf96269a
+Subproject 2b3712d701c1df626abbc60525c35e735272e45
author	Richard Eisenberg <eir@cis.upenn.edu>	2014-11-03 15:34:53 -0500
committer	Richard Eisenberg <eir@cis.upenn.edu>	2014-11-21 11:15:46 -0500
commit	da2fca9e2be8c61c91c034d8c2302d8b1d1e7b41 (patch)
tree	ccc30f5a5b1d8ff4cc1ef66cfec4499bb28846d8
parent	dbf360a5264d5d6597e046dcd9b4f49effa91eee (diff)
download	haskell-da2fca9e2be8c61c91c034d8c2302d8b1d1e7b41.tar.gz