1 files changed, 378 insertions, 0 deletions
diff --git a/compiler/GHC/Parser/Annotation.hs b/compiler/GHC/Parser/Annotation.hs
new file mode 100644
index 0000000000..dbd1f79e23
--- /dev/null
+++ b/compiler/GHC/Parser/Annotation.hs
@@ -0,0 +1,378 @@
+{-# LANGUAGE DeriveDataTypeable #-}
+
+module GHC.Parser.Annotation (
+  getAnnotation, getAndRemoveAnnotation,
+  getAnnotationComments,getAndRemoveAnnotationComments,
+  ApiAnns(..),
+  ApiAnnKey,
+  AnnKeywordId(..),
+  AnnotationComment(..),
+  IsUnicodeSyntax(..),
+  unicodeAnn,
+  HasE(..),
+  LRdrName -- Exists for haddocks only
+  ) where
+
+import GhcPrelude
+
+import GHC.Types.Name.Reader
+import Outputable
+import GHC.Types.SrcLoc
+import qualified Data.Map as Map
+import Data.Data
+
+
+{-
+Note [Api annotations]
+~~~~~~~~~~~~~~~~~~~~~~
+Given a parse tree of a Haskell module, how can we reconstruct
+the original Haskell source code, retaining all whitespace and
+source code comments?  We need to track the locations of all
+elements from the original source: this includes keywords such as
+'let' / 'in' / 'do' etc as well as punctuation such as commas and
+braces, and also comments.  We collectively refer to this
+metadata as the "API annotations".
+
+Rather than annotate the resulting parse tree with these locations
+directly (this would be a major change to some fairly core data
+structures in GHC), we instead capture locations for these elements in a
+structure separate from the parse tree, and returned in the
+pm_annotations field of the ParsedModule type.
+
+The full ApiAnns type is
+
+> data ApiAnns =
+>  ApiAnns
+>    { apiAnnItems :: Map.Map ApiAnnKey [RealSrcSpan],
+>      apiAnnEofPos :: Maybe RealSrcSpan,
+>      apiAnnComments :: Map.Map RealSrcSpan [RealLocated AnnotationComment],
+>      apiAnnRogueComments :: [RealLocated AnnotationComment]
+>    }
+
+NON-COMMENT ELEMENTS
+
+Intuitively, every AST element directly contains a bag of keywords
+(keywords can show up more than once in a node: a semicolon i.e. newline
+can show up multiple times before the next AST element), each of which
+needs to be associated with its location in the original source code.
+
+Consequently, the structure that records non-comment elements is logically
+a two level map, from the RealSrcSpan of the AST element containing it, to
+a map from keywords ('AnnKeywordId') to all locations of the keyword directly
+in the AST element:
+
+> type ApiAnnKey = (RealSrcSpan,AnnKeywordId)
+>
+> Map.Map ApiAnnKey [RealSrcSpan]
+
+So
+
+> let x = 1 in 2 *x
+
+would result in the AST element
+
+  L span (HsLet (binds for x = 1) (2 * x))
+
+and the annotations
+
+  (span,AnnLet) having the location of the 'let' keyword
+  (span,AnnEqual) having the location of the '=' sign
+  (span,AnnIn)  having the location of the 'in' keyword
+
+For any given element in the AST, there is only a set number of
+keywords that are applicable for it (e.g., you'll never see an
+'import' keyword associated with a let-binding.)  The set of allowed
+keywords is documented in a comment associated with the constructor
+of a given AST element, although the ground truth is in GHC.Parser
+and GHC.Parser.PostProcess (which actually add the annotations; see #13012).
+
+COMMENT ELEMENTS
+
+Every comment is associated with a *located* AnnotationComment.
+We associate comments with the lowest (most specific) AST element
+enclosing them:
+
+> Map.Map RealSrcSpan [RealLocated AnnotationComment]
+
+PARSER STATE
+
+There are three fields in PState (the parser state) which play a role
+with annotations.
+
+>  annotations :: [(ApiAnnKey,[RealSrcSpan])],
+>  comment_q :: [RealLocated AnnotationComment],
+>  annotations_comments :: [(RealSrcSpan,[RealLocated AnnotationComment])]
+
+The 'annotations' and 'annotations_comments' fields are simple: they simply
+accumulate annotations that will end up in 'ApiAnns' at the end
+(after they are passed to Map.fromList).
+
+The 'comment_q' field captures comments as they are seen in the token stream,
+so that when they are ready to be allocated via the parser they are
+available (at the time we lex a comment, we don't know what the enclosing
+AST node of it is, so we can't associate it with a RealSrcSpan in
+annotations_comments).
+
+PARSER EMISSION OF ANNOTATIONS
+
+The parser interacts with the lexer using the function
+
+> addAnnotation :: RealSrcSpan -> AnnKeywordId -> RealSrcSpan -> P ()
+
+which takes the AST element RealSrcSpan, the annotation keyword and the
+target RealSrcSpan.
+
+This adds the annotation to the `annotations` field of `PState` and
+transfers any comments in `comment_q` WHICH ARE ENCLOSED by
+the RealSrcSpan of this element to the `annotations_comments`
+field.  (Comments which are outside of this annotation are deferred
+until later. 'allocateComments' in 'GHC.Parser.Lexer' is responsible for
+making sure we only attach comments that actually fit in the 'SrcSpan'.)
+
+The wiki page describing this feature is
+https://gitlab.haskell.org/ghc/ghc/wikis/api-annotations
+
+-}
+-- ---------------------------------------------------------------------
+
+-- | The API annotations gathered for a module: the locations of keywords
+-- and punctuation, plus the source comments, kept in a structure separate
+-- from the parse tree.
+--
+-- If you update this, update the Note [Api annotations] above
+data ApiAnns =
+  ApiAnns
+    { -- Locations of every keyword occurrence, keyed by the span of the
+      -- AST element directly containing it together with the keyword.
+      apiAnnItems :: Map.Map ApiAnnKey [RealSrcSpan],
+      -- NOTE(review): presumably the position of the end of the file,
+      -- when one was recorded -- confirm against the parser.
+      apiAnnEofPos :: Maybe RealSrcSpan,
+      -- Comments, keyed by the span of the lowest (most specific) AST
+      -- element enclosing them.
+      apiAnnComments :: Map.Map RealSrcSpan [RealLocated AnnotationComment],
+      -- NOTE(review): presumably comments that could not be attached to
+      -- any AST element -- confirm against the parser.
+      apiAnnRogueComments :: [RealLocated AnnotationComment]
+    }
+
+-- | Key into 'apiAnnItems': the span of the AST element that directly
+-- contains a keyword, paired with the keyword itself.
+--
+-- If you update this, update the Note [Api annotations] above
+type ApiAnnKey = (RealSrcSpan,AnnKeywordId)
+
+
+-- | Look up every recorded location of the keyword @ann@ inside the AST
+-- element whose 'RealSrcSpan' is @span@.  The result is the empty list
+-- when no annotation is stored under that (span, keyword) pair.
+getAnnotation :: ApiAnns -> RealSrcSpan -> AnnKeywordId -> [RealSrcSpan]
+getAnnotation anns span ann =
+  Map.findWithDefault [] (span,ann) (apiAnnItems anns)
+
+-- | Look up every recorded location of the keyword @ann@ inside the AST
+-- element whose 'RealSrcSpan' is @span@, and return it together with the
+-- annotations from which that entry has been deleted.  When there is no
+-- such entry the result is the empty list and the annotations unchanged.
+getAndRemoveAnnotation :: ApiAnns -> RealSrcSpan -> AnnKeywordId
+                       -> ([RealSrcSpan],ApiAnns)
+getAndRemoveAnnotation anns span ann =
+  maybe ([],anns) extract (Map.lookup key items)
+  where
+    items = apiAnnItems anns
+    key   = (span,ann)
+    -- Hand back the stored locations and drop the entry from the map.
+    extract locs = (locs,anns{ apiAnnItems = Map.delete key items })
+
+-- | Fetch the comments allocated to the given 'RealSrcSpan'; the empty
+-- list if none are stored under it.
+--
+--  Note: the same span may belong to more than one AST element, so
+--  callers should beware of duplicates.
+getAnnotationComments :: ApiAnns -> RealSrcSpan -> [RealLocated AnnotationComment]
+getAnnotationComments anns span =
+  Map.findWithDefault [] span (apiAnnComments anns)
+
+-- | Fetch the comments allocated to the given 'RealSrcSpan' and return
+-- them together with the annotations from which that entry has been
+-- deleted.  When nothing is stored under the span the result is the empty
+-- list and the annotations unchanged.
+getAndRemoveAnnotationComments :: ApiAnns -> RealSrcSpan
+                               -> ([RealLocated AnnotationComment],ApiAnns)
+getAndRemoveAnnotationComments anns span =
+  maybe ([],anns) extract (Map.lookup span comments)
+  where
+    comments = apiAnnComments anns
+    -- Hand back the stored comments and drop the entry from the map.
+    extract cs = (cs,anns{ apiAnnComments = Map.delete span comments })
+
+-- --------------------------------------------------------------------
+
+-- | API Annotations exist so that tools can perform source to source
+-- conversions of Haskell code. They are used to keep track of the
+-- various syntactic keywords that are not captured in the existing
+-- AST.
+--
+-- The annotations, together with original source comments are made
+-- available in the @'pm_annotations'@ field of @'GHC.ParsedModule'@.
+-- Comments are only retained if @'Opt_KeepRawTokenStream'@ is set in
+-- @'DynFlags.DynFlags'@ before parsing.
+--
+-- The wiki page describing this feature is
+-- https://gitlab.haskell.org/ghc/ghc/wikis/api-annotations
+--
+-- Note: in general the names of these are taken from the
+-- corresponding token, unless otherwise noted
+-- See Note [Api annotations] above for details of the usage
+data AnnKeywordId
+    = AnnAnyclass
+    | AnnAs
+    | AnnAt
+    | AnnBang  -- ^ '!'
+    | AnnBackquote -- ^ '`'
+    | AnnBy
+    | AnnCase -- ^ case or lambda case
+    | AnnClass
+    | AnnClose -- ^  '\#)' or '\#-}'  etc
+    | AnnCloseB -- ^ '|)'
+    | AnnCloseBU -- ^ '|)', unicode variant
+    | AnnCloseC -- ^ '}'
+    | AnnCloseQ  -- ^ '|]'
+    | AnnCloseQU -- ^ '|]', unicode variant
+    | AnnCloseP -- ^ ')'
+    | AnnCloseS -- ^ ']'
+    | AnnColon
+    | AnnComma -- ^ as a list separator
+    | AnnCommaTuple -- ^ in a RdrName for a tuple
+    | AnnDarrow -- ^ '=>'
+    | AnnDarrowU -- ^ '=>', unicode variant
+    | AnnData
+    | AnnDcolon -- ^ '::'
+    | AnnDcolonU -- ^ '::', unicode variant
+    | AnnDefault
+    | AnnDeriving
+    | AnnDo
+    | AnnDot    -- ^ '.'
+    | AnnDotdot -- ^ '..'
+    | AnnElse
+    | AnnEqual
+    | AnnExport
+    | AnnFamily
+    | AnnForall
+    | AnnForallU -- ^ Unicode variant
+    | AnnForeign
+    | AnnFunId -- ^ for function name in matches where there are
+               -- multiple equations for the function.
+    | AnnGroup
+    | AnnHeader -- ^ for CType
+    | AnnHiding
+    | AnnIf
+    | AnnImport
+    | AnnIn
+    | AnnInfix -- ^ 'infix' or 'infixl' or 'infixr'
+    | AnnInstance
+    | AnnLam
+    | AnnLarrow     -- ^ '<-'
+    | AnnLarrowU    -- ^ '<-', unicode variant
+    | AnnLet
+    | AnnMdo
+    | AnnMinus -- ^ '-'
+    | AnnModule
+    | AnnNewtype
+    | AnnName -- ^ where a name loses its location in the AST, this carries it
+    | AnnOf
+    | AnnOpen    -- ^ '(\#' or '{-\# LANGUAGE' etc
+    | AnnOpenB   -- ^ '(|'
+    | AnnOpenBU  -- ^ '(|', unicode variant
+    | AnnOpenC   -- ^ '{'
+    | AnnOpenE   -- ^ '[e|' or '[e||'
+    | AnnOpenEQ  -- ^ '[|'
+    | AnnOpenEQU -- ^ '[|', unicode variant
+    | AnnOpenP   -- ^ '('
+    | AnnOpenS   -- ^ '['
+    | AnnDollar          -- ^ prefix '$'   -- TemplateHaskell
+    | AnnDollarDollar    -- ^ prefix '$$'  -- TemplateHaskell
+    | AnnPackageName
+    | AnnPattern
+    | AnnProc
+    | AnnQualified
+    | AnnRarrow -- ^ '->'
+    | AnnRarrowU -- ^ '->', unicode variant
+    | AnnRec
+    | AnnRole
+    | AnnSafe
+    | AnnSemi -- ^ ';'
+    | AnnSimpleQuote -- ^ '''
+    | AnnSignature
+    | AnnStatic -- ^ 'static'
+    | AnnStock
+    | AnnThen
+    | AnnThIdSplice -- ^ '$'
+    | AnnThIdTySplice -- ^ '$$'
+    | AnnThTyQuote -- ^ double '''
+    | AnnTilde -- ^ '~'
+    | AnnType
+    | AnnUnit -- ^ '()' for types
+    | AnnUsing
+    | AnnVal  -- ^ e.g. INTEGER
+    | AnnValStr  -- ^ String value, will need quotes when output
+    | AnnVbar -- ^ '|'
+    | AnnVia -- ^ 'via'
+    | AnnWhere
+    | Annlarrowtail -- ^ '-<'
+    | AnnlarrowtailU -- ^ '-<', unicode variant
+    | Annrarrowtail -- ^ '>-'
+    | AnnrarrowtailU -- ^ '>-', unicode variant
+    | AnnLarrowtail -- ^ '-<<'
+    | AnnLarrowtailU -- ^ '-<<', unicode variant
+    | AnnRarrowtail -- ^ '>>-'
+    | AnnRarrowtailU -- ^ '>>-', unicode variant
+    deriving (Eq, Ord, Data, Show)
+
+-- | Pretty-prints a keyword id as the text of its derived 'Show' output.
+instance Outputable AnnKeywordId where
+  ppr = text . show
+
+-- ---------------------------------------------------------------------
+
+-- | A source comment retained as part of the API annotations, classified
+-- by the kind of comment it is.
+data AnnotationComment =
+  -- Documentation annotations
+    AnnDocCommentNext  String     -- ^ something beginning '-- |'
+  | AnnDocCommentPrev  String     -- ^ something beginning '-- ^'
+  | AnnDocCommentNamed String     -- ^ something beginning '-- $'
+  | AnnDocSection      Int String -- ^ a section heading
+  | AnnDocOptions      String     -- ^ doc options (prune, ignore-exports, etc)
+  | AnnLineComment     String     -- ^ comment starting by "--"
+  | AnnBlockComment    String     -- ^ comment in {- -}
+    deriving (Eq, Ord, Data, Show)
+-- Note: these are based on the Token versions, but the Token type is
+-- defined in GHC.Parser.Lexer and bringing it in here would create a loop
+
+-- | Pretty-prints a comment as the text of its derived 'Show' output.
+instance Outputable AnnotationComment where
+  ppr = text . show
+
+-- | A 'Located' 'RdrName'.
+--
+--   - 'AnnKeywordId' : 'AnnOpen', 'AnnClose', 'AnnComma',
+--             'AnnRarrow',
+--             'AnnTilde'
+--   - May have 'AnnComma' when in a list
+type LRdrName = Located RdrName
+
+
+-- | Certain tokens can have alternate representations when unicode syntax is
+-- enabled. This flag is attached to those tokens in the lexer so that the
+-- original source representation can be reproduced in the corresponding
+-- API annotation (see 'unicodeAnn').
+data IsUnicodeSyntax = UnicodeSyntax | NormalSyntax
+    deriving (Eq, Ord, Data, Show)
+
+-- | Map an annotation keyword to its unicode-variant counterpart.
+-- Keywords that have no unicode variant are returned unchanged.
+unicodeAnn :: AnnKeywordId -> AnnKeywordId
+unicodeAnn ann = case ann of
+  AnnForall     -> AnnForallU
+  AnnDcolon     -> AnnDcolonU
+  AnnLarrow     -> AnnLarrowU
+  AnnRarrow     -> AnnRarrowU
+  AnnDarrow     -> AnnDarrowU
+  Annlarrowtail -> AnnlarrowtailU
+  Annrarrowtail -> AnnrarrowtailU
+  AnnLarrowtail -> AnnLarrowtailU
+  AnnRarrowtail -> AnnRarrowtailU
+  AnnOpenB      -> AnnOpenBU
+  AnnCloseB     -> AnnCloseBU
+  AnnOpenEQ     -> AnnOpenEQU
+  AnnCloseQ     -> AnnCloseQU
+  _             -> ann
+
+
+-- | Some Template Haskell tokens have two variants, one with an @e@ and
+-- one without:
+--
+-- >  [| or [e|
+-- >  [|| or [e||
+--
+-- This type indicates whether the @e@ is present or not.
+data HasE = HasE | NoE
+     deriving (Eq, Ord, Data, Show)