diff options
Diffstat (limited to 'ghc/lib/misc/Regex.lhs')
-rw-r--r-- | ghc/lib/misc/Regex.lhs | 370 |
1 files changed, 0 insertions, 370 deletions
diff --git a/ghc/lib/misc/Regex.lhs b/ghc/lib/misc/Regex.lhs deleted file mode 100644 index c418bc281f..0000000000 --- a/ghc/lib/misc/Regex.lhs +++ /dev/null @@ -1,370 +0,0 @@ -\section[regex]{Haskell binding to the GNU regex library} - -What follows is a straightforward binding to the functions -provided by the GNU regex library (the GNU group of functions with Perl -like syntax) - -\begin{code} -{-# OPTIONS -#include "cbits/ghcRegex.h" #-} - -module Regex ( - PatBuffer(..), - re_compile_pattern, - re_match, - re_search, - re_match2, - re_search2, - - REmatch(..) - ) where - -import GlaExts -import CCall -import PackedString -import Array ( array, bounds, (!) ) -import PrelArr ( MutableByteArray(..), Array(..) ) -import PrelGHC ( MutableByteArray# ) -import Char ( ord ) -import Foreign - -\end{code} - -First, the higher level matching structure that the functions herein -return: -\begin{code} --- --- GroupBounds hold the interval where a group --- matched inside a string, e.g. --- --- matching "reg(exp)" "a regexp" returns the pair (5,7) for the --- (exp) group. (PackedString indices start from 0) - -type GroupBounds = (Int, Int) - -data REmatch - = REmatch (Array Int GroupBounds) -- for $1, ... $n - GroupBounds -- for $` (everything before match) - GroupBounds -- for $& (entire matched string) - GroupBounds -- for $' (everything after) - GroupBounds -- for $+ (matched by last bracket) -\end{code} - -Prior to any matching (or searching), the regular expression -have to compiled into an internal form, the pattern buffer. -Represent the pattern buffer as a Haskell heap object: - -\begin{code} -data PatBuffer = PatBuffer# (MutableByteArray# RealWorld) -instance CCallable PatBuffer -instance CReturnable PatBuffer - -createPatBuffer :: Bool -> IO PatBuffer - -createPatBuffer insensitive - = _casm_ ``%r = (int)sizeof(struct re_pattern_buffer);'' >>= \ sz -> - stToIO (newCharArray (0::Int,sz)) >>= \ (MutableByteArray _ _ pbuf#) -> - let - pbuf = PatBuffer# pbuf# - in - (if insensitive then - {- - See comment re: fastmap below - -} - ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ tmap -> - {- - Set up the translate table so that any lowercase - char. gets mapped to an uppercase one. Beacuse quoting - inside CAsmStrings is Problematic, we pass in the ordinal values - of 'a','z' and 'A' - -} - _casm_ ``{ int i; - - for(i=0; i<256; i++) - ((char *)%0)[i] = (char)i; - for(i=(int)%1;i <=(int)%2;i++) - ((char *)%0)[i] = i - ((int)%1 - (int)%3); - }'' tmap (ord 'a') (ord 'z') (ord 'A') >> - _casm_ ``((struct re_pattern_buffer *)%0)->translate = %1; '' pbuf tmap - else - _casm_ ``((struct re_pattern_buffer *)%0)->translate = 0; '' pbuf) >> - {- - Use a fastmap to speed things up, would like to have the fastmap - in the Haskell heap, but it will get GCed before we can say regexp, - as the reference to it is buried inside a ByteArray :-( - -} - ((_casm_ ``%r = (char *)malloc(256*sizeof(char));'')::IO Addr) >>= \ fmap -> - _casm_ `` ((struct re_pattern_buffer *)%0)->fastmap = %1; '' pbuf fmap >> - {- - We want the compiler of the pattern to alloc. memory - for the pattern. - -} - _casm_ `` ((struct re_pattern_buffer *)%0)->buffer = 0; '' pbuf >> - _casm_ `` ((struct re_pattern_buffer *)%0)->allocated = 0; '' pbuf >> - return pbuf -\end{code} - -@re_compile_pattern@ converts a regular expression into a pattern buffer, -GNU style. - -Q: should we lift the syntax bits configuration up to the Haskell -programmer level ? - -\begin{code} -re_compile_pattern :: PackedString -- pattern to compile - -> Bool -- True <=> assume single-line mode - -> Bool -- True <=> case-insensitive - -> IO PatBuffer - -re_compile_pattern str single_line_mode insensitive - = createPatBuffer insensitive >>= \ pbuf -> - (if single_line_mode then -- match a multi-line buffer - _casm_ ``re_syntax_options = RE_PERL_SINGLELINE_SYNTAX;'' - else - _casm_ ``re_syntax_options = RE_PERL_MULTILINE_SYNTAX;'') >> - - _casm_ `` (int)re_compile_pattern((char *)%0, - (int)%1, - (struct re_pattern_buffer *)%2);'' - (unpackPS str) (lengthPS str) pbuf >>= \ () -> - -- - -- No checking for how the compilation of the pattern went yet. - -- - return pbuf -\end{code} - -Got a match? - -Each call to re_match uses a new re_registers structures, so we need -to ask the regex library to allocate enough memory to store the -registers in each time. That's what the line '... REGS_UNALLOCATED' -is all about. - -\begin{code} -re_match :: PatBuffer -- compiled regexp - -> PackedString -- string to match - -> Int -- start position - -> Bool -- True <=> record results in registers - -> IO (Maybe REmatch) - -re_match pbuf str start reg - = ((if reg then -- record result of match in registers - _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));'' - else - _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs -> - _casm_ ``((struct re_pattern_buffer *)%0)->regs_allocated = REGS_UNALLOCATED; - %r=(int)re_match((struct re_pattern_buffer *)%0, - (char *)%1, - (int)%2, - (int)%3, - (struct re_registers *)%4);'' pbuf - (unpackPS str) - (lengthPS str) - start - regs >>= \ match_res -> - if match_res == ((-2)::Int) then - error "re_match: Internal error" - else if match_res < 0 then - _casm_ ``free((struct re_registers *)%0); '' regs >> - return Nothing - else - build_re_match start (lengthPS str) regs >>= \ arr -> - _casm_ ``free(((struct re_registers *)%0)->start); - free(((struct re_registers *)%0)->end); - free((struct re_registers *)%0); '' regs >> - return (Just arr) -\end{code} - -Matching on 2 strings is useful when you're dealing with multiple -buffers, which is something that could prove useful for PackedStrings, -as we don't want to stuff the contents of a file into one massive heap -chunk, but load (smaller chunks) on demand. - -\begin{code} -re_match2 :: PatBuffer - -> PackedString - -> PackedString - -> Int - -> Int - -> Bool - -> IO (Maybe REmatch) - -re_match2 pbuf str1 str2 start stop reg - = ((if reg then -- record result of match in registers - _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));'' - else - _casm_ ``%r = (struct re_registers *)NULL;'')::IO Addr) >>= \ regs -> - _casm_ ``%r=(int)re_match_2((struct re_pattern_buffer *)%0, - (char *)%1, - (int)%2, - (char *)%3, - (int)%4, - (int)%5, - (struct re_registers *)%6, - (int)%7);'' pbuf - (unpackPS str1) - (lengthPS str1) - (unpackPS str2) - (lengthPS str2) - start - regs - stop >>= \ match_res -> - if match_res == ((-2)::Int) then - error "re_match2: Internal error" - else if match_res < 0 then - _casm_ ``free((struct re_registers *)%0); '' regs >> - return Nothing - else - build_re_match start stop regs >>= \ arr -> - _casm_ ``free((struct re_registers *)%0); '' regs >> - return (Just arr) -\end{code} - -Find all the matches in a string: -\begin{code} -re_search :: PatBuffer -- the compiled regexp - -> PackedString -- the string to search - -> Int -- start index - -> Int -- stop index - -> Bool -- record result of match in registers - -> IO (Maybe REmatch) - -re_search pbuf str start range reg - = (if reg then -- record result of match in registers - _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));'' - else - _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs -> - _casm_ ``((struct re_pattern_buffer *)%0)->regs_allocated = REGS_UNALLOCATED; - %r=(int)re_search((struct re_pattern_buffer *)%0, - (char *)%1, - (int)%2, - (int)%3, - (int)%4, - (struct re_registers *)%5);'' pbuf - (unpackPS str) - (lengthPS str) - start - range - regs >>= \ match_res -> - if match_res== ((-1)::Int) then - _casm_ `` free((struct re_registers *)%0); '' regs >> - return Nothing - else - let - (st,en) = if range > start then - (start,range) - else - (range,start) - in - build_re_match st en regs >>= \ arr -> - _casm_ ``free(((struct re_registers *)%0)->start); - free(((struct re_registers *)%0)->end); - free((struct re_registers *)%0); '' regs >> - return (Just arr) -\end{code} - -Double buffer search: -\begin{code} -re_search2 :: PatBuffer - -> PackedString - -> PackedString - -> Int - -> Int - -> Int - -> Bool - -> IO (Maybe REmatch) - -re_search2 pbuf str1 str2 start range stop reg - - = (if reg then -- record result of match in registers - _casm_ ``%r = (struct re_registers *)malloc(sizeof(struct re_registers *));'' - else - _casm_ ``%r = (struct re_registers *)NULL;'') >>= \ regs -> - _casm_ ``%r=(int)re_search_2((struct re_pattern_buffer *)%0, - (char *)%1, - (int)%2, - (char *)%3, - (int)%4, - (int)%5, - (int)%6, - (struct re_registers *)%7, - (int)%8);'' pbuf - (unpackPS str1) - (lengthPS str1) - (unpackPS str2) - (lengthPS str2) - start - range - regs - stop >>= \ match_res -> - if match_res== ((-1)::Int) then - _casm_ `` free((struct re_registers *)%0); '' regs >> - return Nothing - else - let - (st,en) = if range > start then - (start,range) - else - (range,start) - in - build_re_match st en regs >>= \ arr -> - _casm_ `` free((struct re_registers *)%0); '' regs >> - return (Just arr) -\end{code} - -\begin{code} -build_re_match :: Int - -> Int - -> Addr - -> IO REmatch - -build_re_match str_start str_end regs - = _casm_ ``%r=(int)(*(struct re_registers *)%0).num_regs;'' regs >>= \ len -> - match_reg_to_array regs len >>= \ (match_start,match_end,arr) -> - let - (1,x) = bounds arr - - bef = (str_start,match_start) -- $' - aft = (match_end,str_end) -- $` - lst = arr!x -- $+ - mtch = (match_start,match_end) -- $& - in - return (REmatch arr - bef - mtch - aft - lst) - where - match_reg_to_array rs len - = trundleIO rs (0,[]) len >>= \ (no,ls) -> - let - (st,end,ls') - = case ls of - [] -> (0,0,[]) - [(a,b)] -> (a,b,ls) - ((a,b):xs) -> (a,b,xs) - in - return - (st, - end, - array (1,max 1 (no-1)) - [ (i, x) | (i,x) <- zip [1..] ls']) - - trundleIO :: Addr - -> (Int,[(Int,Int)]) - -> Int - -> IO (Int,[(Int,Int)]) - - trundleIO rs (i,acc) len - | i==len = return (i,reverse acc) - | otherwise - = _casm_ ``%r = (int)(((struct re_registers *)%0)->start)[(int)%1];'' rs i >>= \ start -> - _casm_ ``%r = (int)(((struct re_registers *)%0)->end)[(int)%1];'' rs i >>= \ end -> - let - acc' = (start,end):acc - in - if (start == (-1)) && (end == (-1)) then - return (i,reverse acc) - else - trundleIO rs (i+1,acc') len -\end{code} - |