summaryrefslogtreecommitdiff
path: root/libraries/base/Text/Regex/Posix.hsc
blob: 8c84dbd3914a71247fb099b6c3e8e6f1a428c097 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
-----------------------------------------------------------------------------
-- |
-- Module      :  Text.Regex.Posix
-- Copyright   :  (c) The University of Glasgow 2001
-- License     :  BSD-style (see the file libraries/core/LICENSE)
-- 
-- Maintainer  :  libraries@haskell.org
-- Stability   :  experimental
-- Portability :  non-portable (only on platforms that provide POSIX regexps)
--
-- Interface to the POSIX regular expression library.
-- ToDo: should have an interface using PackedStrings.
--
-----------------------------------------------------------------------------

module Text.Regex.Posix (
	Regex,	 	-- abstract

	regcomp, 	-- :: String -> Int -> IO Regex

	regexec, 	-- :: Regex		     -- pattern
	         	-- -> String		     -- string to match
	         	-- -> IO (Maybe (String,     -- everything before match
	         	-- 	 	 String,     -- matched portion
	         	--		 String,     -- everything after match
	         	-- 	 	 [String]))  -- subexpression matches

	regExtended,	-- (flag to regcomp) use extended regex syntax
	regIgnoreCase,	-- (flag to regcomp) ignore case when matching
	regNewline	-- (flag to regcomp) '.' doesn't match newline
  ) where

#include <sys/types.h>
#include "regex.h"

import Prelude

import Foreign
import Foreign.C

newtype Regex = Regex (ForeignPtr CRegex)

-- -----------------------------------------------------------------------------
-- regcomp

regcomp :: String -> Int -> IO Regex
regcomp pattern flags = do
  regex_ptr <- mallocBytes (#const sizeof(regex_t))
  regex_fptr <- newForeignPtr regex_ptr (regfree regex_ptr)
  r <- withCString pattern $ \cstr ->
    	 withForeignPtr regex_fptr $ \p ->
           c_regcomp p cstr (fromIntegral flags)
  if (r == 0)
     then return (Regex regex_fptr)
     else error "Text.Regex.Posix.regcomp: error in pattern" -- ToDo

regfree :: Ptr CRegex -> IO ()
regfree p_regex = do
  c_regfree p_regex
  free p_regex

-- -----------------------------------------------------------------------------
-- regexec

regexec :: Regex			-- pattern
	-> String			-- string to match
	-> IO (Maybe (String,		-- everything before match
		      String,		-- matched portion
		      String,		-- everything after match
		      [String])) 	-- subexpression matches

regexec (Regex regex_fptr) str = do
  withCString str $ \cstr -> do
    withForeignPtr regex_fptr $ \regex_ptr -> do
      nsub <- (#peek regex_t, re_nsub) regex_ptr
      let nsub_int = fromIntegral (nsub :: CSize)
      allocaBytes ((1 + nsub_int) * (#const sizeof(regmatch_t))) $ \p_match -> do
		-- add one because index zero covers the whole match
        r <- c_regexec regex_ptr cstr (1 + nsub) p_match 0{-no flags for now-}

        if (r /= 0) then return Nothing else do 

        (before,match,after) <- matched_parts str p_match

        sub_strs <- 
	  mapM (unpack str) $ take nsub_int $ tail $
	     iterate (`plusPtr` (#const sizeof(regmatch_t))) p_match

        return (Just (before, match, after, sub_strs))

matched_parts :: String -> Ptr CRegMatch -> IO (String, String, String)
matched_parts string p_match = do
  start <- (#peek regmatch_t, rm_so) p_match :: IO CInt
  end   <- (#peek regmatch_t, rm_eo) p_match :: IO CInt
  let s = fromIntegral start; e = fromIntegral end
  return ( take (s-1) string, 
	   take (e-s) (drop s string),
	   drop e string )  

unpack :: String -> Ptr CRegMatch -> IO (String)
unpack string p_match = do
  start <- (#peek regmatch_t, rm_so) p_match :: IO CInt
  end   <- (#peek regmatch_t, rm_eo) p_match :: IO CInt
  -- the subexpression may not have matched at all, perhaps because it
  -- was optional.  In this case, the offsets are set to -1.
  if (start == -1) then return "" else do
  return (take (fromIntegral (end-start)) (drop (fromIntegral start) string))

-- -----------------------------------------------------------------------------
-- The POSIX regex C interface

-- Flags for regexec
#enum Int,, \
	REG_NOTBOL, \
	REG_NOTEOL \

-- Return values from regexec
#enum Int,, \
	REG_NOMATCH
--	REG_ESPACE

-- Flags for regcomp
#enum Int,, \
	REG_EXTENDED, \
	regIgnoreCase = REG_ICASE, \
	REG_NOSUB, \
	REG_NEWLINE

-- Error codes from regcomp
#enum Int,, \
	REG_BADBR, \
	REG_BADPAT, \
	REG_BADRPT, \
	REG_ECOLLATE, \
	REG_ECTYPE, \
	REG_EESCAPE, \
	REG_ESUBREG, \
	REG_EBRACK, \
	REG_EPAREN, \
	REG_EBRACE, \
	REG_ERANGE, \
	REG_ESPACE

type CRegex    = ()
type CRegMatch = ()

foreign import ccall unsafe "regcomp"
  c_regcomp :: Ptr CRegex -> CString -> CInt -> IO CInt

foreign import ccall  unsafe "regfree"
  c_regfree :: Ptr CRegex -> IO ()

foreign import ccall unsafe "regexec"
  c_regexec :: Ptr CRegex -> CString -> CSize
	    -> Ptr CRegMatch -> CInt -> IO CInt