summaryrefslogtreecommitdiff
path: root/ghc/lib/misc/Regex.lhs
diff options
context:
space:
mode:
Diffstat (limited to 'ghc/lib/misc/Regex.lhs')
-rw-r--r--ghc/lib/misc/Regex.lhs370
1 files changed, 0 insertions, 370 deletions
diff --git a/ghc/lib/misc/Regex.lhs b/ghc/lib/misc/Regex.lhs
deleted file mode 100644
index c418bc281f..0000000000
--- a/ghc/lib/misc/Regex.lhs
+++ /dev/null
@@ -1,370 +0,0 @@
-\section[regex]{Haskell binding to the GNU regex library}
-
-What follows is a straightforward binding to the functions
-provided by the GNU regex library (the GNU group of functions with Perl
-like syntax)
-
-\begin{code}
-{-# OPTIONS -#include "cbits/ghcRegex.h" #-}
-
-module Regex (
- PatBuffer(..),
- re_compile_pattern,
- re_match,
- re_search,
- re_match2,
- re_search2,
-
- REmatch(..)
- ) where
-
-import GlaExts
-import CCall
-import PackedString
-import Array ( array, bounds, (!) )
-import PrelArr ( MutableByteArray(..), Array(..) )
-import PrelGHC ( MutableByteArray# )
-import Char ( ord )
-import Foreign
-
-\end{code}
-
-First, the higher level matching structure that the functions herein
-return:
-\begin{code}
---
--- GroupBounds hold the interval where a group
--- matched inside a string, e.g.
---
--- matching "reg(exp)" "a regexp" returns the pair (5,7) for the
--- (exp) group. (PackedString indices start from 0)
-
-type GroupBounds = (Int, Int)
-
-data REmatch
- = REmatch (Array Int GroupBounds) -- for $1, ... $n
- GroupBounds -- for $` (everything before match)
- GroupBounds -- for $& (entire matched string)
- GroupBounds -- for $' (everything after)
- GroupBounds -- for $+ (matched by last bracket)
-\end{code}
-
-Prior to any matching (or searching), the regular expression
-have to compiled into an internal form, the pattern buffer.
-Represent the pattern buffer as a Haskell heap object:
-
-\begin{code}
-data PatBuffer = PatBuffer# (MutableByteArray# RealWorld)
-instance CCallable PatBuffer
-instance CReturnable PatBuffer
-
-createPatBuffer :: Bool -> IO PatBuffer
-
-createPatBuffer insensitive
- = _casm_ ``%r = (int)sizeof(struct re_pattern_buffer);'' >>= \ sz ->
- stToIO (newCharArray (0::Int,sz)) >>= \ (MutableByteArray _ _ pbuf#) ->
- let
- pbuf = PatBuffer# pbuf#
- in
- (if insensitive then
- {-
- See comment re: fastmap below
- -}
- ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ tmap ->
- {-
- Set up the translate table so that any lowercase
- char. gets mapped to an uppercase one. Beacuse quoting
- inside CAsmStrings is Problematic, we pass in the ordinal values
- of 'a','z' and 'A'
- -}
- _casm_ ``{ int i;
-
- for(i=0; i<256; i++)
- ((char *)%0)[i] = (char)i;
- for(i=(int)%1;i <=(int)%2;i++)
- ((char *)%0)[i] = i - ((int)%1 - (int)%3);
- }'' tmap (ord 'a') (ord 'z') (ord 'A') >>
- _casm_ ``((struct re_pattern_buffer *)%0)->translate = %1; '' pbuf tmap
- else
- _casm_ ``((struct re_pattern_buffer *)%0)->translate = 0; '' pbuf) >>
- {-
- Use a fastmap to speed things up, would like to have the fastmap
- in the Haskell heap, but it will get GCed before we can say regexp,
- as the reference to it is buried inside a ByteArray :-(
- -}
- ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ fmap ->
- _casm_ `` ((struct re_pattern_buffer *)%0)->fastmap = %1; '' pbuf fmap >>
- {-
- We want the compiler of the pattern to alloc. memory
- for the pattern.
- -}
- _casm_ `` ((struct re_pattern_buffer *)%0)->buffer = 0; '' pbuf >>
- _casm_ `` ((struct re_pattern_buffer *)%0)->allocated = 0; '' pbuf >>
- return pbuf
-\end{code}
-
-@re_compile_pattern@ converts a regular expression into a pattern buffer,
-GNU style.
-
-Q: should we lift the syntax bits configuration up to the Haskell
-programmer level ?
-
-\begin{code}
-re_compile_pattern :: PackedString -- pattern to compile
- -> Bool -- True <=> assume single-line mode
- -> Bool -- True <=> case-insensitive
- -> IO PatBuffer
-
-re_compile_pattern str single_line_mode insensitive
- = createPatBuffer insensitive >>= \ pbuf ->
- (if single_line_mode then -- match a multi-line buffer
- _casm_ ``re_syntax_options = RE_PERL_SINGLELINE_SYNTAX;''
- else
- _casm_ ``re_syntax_options = RE_PERL_MULTILINE_SYNTAX;'') >>
-
- _casm_ `` (int)re_compile_pattern((char *)%0,
- (int)%1,
- (struct re_pattern_buffer *)%2);''
- (unpackPS str) (lengthPS str) pbuf >>= \ () ->
- --
- -- No checking for how the compilation of the pattern went yet.
- --
- return pbuf
-\end{code}
-
-Got a match?
-
-Each call to re_match uses a new re_registers structures, so we need
-to ask the regex library to allocate enough memory to store the
-registers in each time. That's what the line '... REGS_UNALLOCATED'
-is all about.
-
-\begin{code}
-re_match :: PatBuffer -- compiled regexp
- -> PackedString -- string to match
- -> Int -- start position
- -> Bool -- True <=> record results in registers
- -> IO (Maybe REmatch)
-
-re_match pbuf str start reg
- = ((if reg then -- record result of match in registers
- _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
- else
- _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs ->
- _casm_ ``((struct re_pattern_buffer *)%0)->regs_allocated = REGS_UNALLOCATED;
- %r=(int)re_match((struct re_pattern_buffer *)%0,
- (char *)%1,
- (int)%2,
- (int)%3,
- (struct re_registers *)%4);'' pbuf
- (unpackPS str)
- (lengthPS str)
- start
- regs >>= \ match_res ->
- if match_res == ((-2)::Int) then
- error "re_match: Internal error"
- else if match_res < 0 then
- _casm_ ``free((struct re_registers *)%0); '' regs >>
- return Nothing
- else
- build_re_match start (lengthPS str) regs >>= \ arr ->
- _casm_ ``free(((struct re_registers *)%0)->start);
- free(((struct re_registers *)%0)->end);
- free((struct re_registers *)%0); '' regs >>
- return (Just arr)
-\end{code}
-
-Matching on 2 strings is useful when you're dealing with multiple
-buffers, which is something that could prove useful for PackedStrings,
-as we don't want to stuff the contents of a file into one massive heap
-chunk, but load (smaller chunks) on demand.
-
-\begin{code}
-re_match2 :: PatBuffer
- -> PackedString
- -> PackedString
- -> Int
- -> Int
- -> Bool
- -> IO (Maybe REmatch)
-
-re_match2 pbuf str1 str2 start stop reg
- = ((if reg then -- record result of match in registers
- _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
- else
- _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs ->
- _casm_ ``%r=(int)re_match_2((struct re_pattern_buffer *)%0,
- (char *)%1,
- (int)%2,
- (char *)%3,
- (int)%4,
- (int)%5,
- (struct re_registers *)%6,
- (int)%7);'' pbuf
- (unpackPS str1)
- (lengthPS str1)
- (unpackPS str2)
- (lengthPS str2)
- start
- regs
- stop >>= \ match_res ->
- if match_res == ((-2)::Int) then
- error "re_match2: Internal error"
- else if match_res < 0 then
- _casm_ ``free((struct re_registers *)%0); '' regs >>
- return Nothing
- else
- build_re_match start stop regs >>= \ arr ->
- _casm_ ``free((struct re_registers *)%0); '' regs >>
- return (Just arr)
-\end{code}
-
-Find all the matches in a string:
-\begin{code}
-re_search :: PatBuffer -- the compiled regexp
- -> PackedString -- the string to search
- -> Int -- start index
- -> Int -- stop index
- -> Bool -- record result of match in registers
- -> IO (Maybe REmatch)
-
-re_search pbuf str start range reg
- = (if reg then -- record result of match in registers
- _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
- else
- _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs ->
- _casm_ ``((struct re_pattern_buffer *)%0)->regs_allocated = REGS_UNALLOCATED;
- %r=(int)re_search((struct re_pattern_buffer *)%0,
- (char *)%1,
- (int)%2,
- (int)%3,
- (int)%4,
- (struct re_registers *)%5);'' pbuf
- (unpackPS str)
- (lengthPS str)
- start
- range
- regs >>= \ match_res ->
- if match_res== ((-1)::Int) then
- _casm_ `` free((struct re_registers *)%0); '' regs >>
- return Nothing
- else
- let
- (st,en) = if range > start then
- (start,range)
- else
- (range,start)
- in
- build_re_match st en regs >>= \ arr ->
- _casm_ ``free(((struct re_registers *)%0)->start);
- free(((struct re_registers *)%0)->end);
- free((struct re_registers *)%0); '' regs >>
- return (Just arr)
-\end{code}
-
-Double buffer search:
-\begin{code}
-re_search2 :: PatBuffer
- -> PackedString
- -> PackedString
- -> Int
- -> Int
- -> Int
- -> Bool
- -> IO (Maybe REmatch)
-
-re_search2 pbuf str1 str2 start range stop reg
-
- = (if reg then -- record result of match in registers
- _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));''
- else
- _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs ->
- _casm_ ``%r=(int)re_search_2((struct re_pattern_buffer *)%0,
- (char *)%1,
- (int)%2,
- (char *)%3,
- (int)%4,
- (int)%5,
- (int)%6,
- (struct re_registers *)%7,
- (int)%8);'' pbuf
- (unpackPS str1)
- (lengthPS str1)
- (unpackPS str2)
- (lengthPS str2)
- start
- range
- regs
- stop >>= \ match_res ->
- if match_res== ((-1)::Int) then
- _casm_ `` free((struct re_registers *)%0); '' regs >>
- return Nothing
- else
- let
- (st,en) = if range > start then
- (start,range)
- else
- (range,start)
- in
- build_re_match st en regs >>= \ arr ->
- _casm_ `` free((struct re_registers *)%0); '' regs >>
- return (Just arr)
-\end{code}
-
-\begin{code}
-build_re_match :: Int
- -> Int
- -> Addr
- -> IO REmatch
-
-build_re_match str_start str_end regs
- = _casm_ ``%r=(int)(*(struct re_registers *)%0).num_regs;'' regs >>= \ len ->
- match_reg_to_array regs len >>= \ (match_start,match_end,arr) ->
- let
- (1,x) = bounds arr
-
- bef = (str_start,match_start) -- $'
- aft = (match_end,str_end) -- $`
- lst = arr!x -- $+
- mtch = (match_start,match_end) -- $&
- in
- return (REmatch arr
- bef
- mtch
- aft
- lst)
- where
- match_reg_to_array rs len
- = trundleIO rs (0,[]) len >>= \ (no,ls) ->
- let
- (st,end,ls')
- = case ls of
- [] -> (0,0,[])
- [(a,b)] -> (a,b,ls)
- ((a,b):xs) -> (a,b,xs)
- in
- return
- (st,
- end,
- array (1,max 1 (no-1))
- [ (i, x) | (i,x) <- zip [1..] ls'])
-
- trundleIO :: Addr
- -> (Int,[(Int,Int)])
- -> Int
- -> IO (Int,[(Int,Int)])
-
- trundleIO rs (i,acc) len
- | i==len = return (i,reverse acc)
- | otherwise
- = _casm_ ``%r = (int)(((struct re_registers *)%0)->start)[(int)%1];'' rs i >>= \ start ->
- _casm_ ``%r = (int)(((struct re_registers *)%0)->end)[(int)%1];'' rs i >>= \ end ->
- let
- acc' = (start,end):acc
- in
- if (start == (-1)) && (end == (-1)) then
- return (i,reverse acc)
- else
- trundleIO rs (i+1,acc') len
-\end{code}
-