summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Reuben Thomas <rrt@sc3d.org> 2010-09-12 17:38:04 +0200
committer: Reuben Thomas <rrt@sc3d.org> 2010-09-12 22:42:04 +0200
commit: 247b99d4e7f0ac1c7856d6f9b65cea1cf88ff898 (patch)
tree: 23cdc0deeca7998c22360103d3420835bbfd4cf5
parent: cfeb0ba81b624a15001da525ca902ba6e138c816 (diff)
download: lrexlib-247b99d4e7f0ac1c7856d6f9b65cea1cf88ff898.tar.gz
Remove gnu_rex.setsyntax, reimplement syntax flags as compilation
flags, adding the per-feature flags (which is why we need them to be numbers, not strings, so they can be combined). Hence, remove the special “syn” argument from the constructors, and alter the tests accordingly. Improve the documentation of GNU regex in one or two places. Rename the “reverse” execution flag to “backward”, for clarity.
-rw-r--r-- doc/manual.txt 39
-rw-r--r-- src/gnu/lgnu.c 150
-rw-r--r-- test/emacs_sets.lua 9
3 files changed, 83 insertions, 115 deletions
diff --git a/doc/manual.txt b/doc/manual.txt
index 0237b44..cda6f26 100644
--- a/doc/manual.txt
+++ b/doc/manual.txt
@@ -399,11 +399,12 @@ constants in the used library. They are formed as follows:
but for ONIG_OPTION_xxx constants, alias strings are created additionally,
e.g., the value of ONIG_OPTION_IGNORECASE constant becomes accessible via
either of two keys: ``"ONIG_OPTION_IGNORECASE"`` and ``"IGNORECASE"``.
-* **GNU**: the GNU library provides the flags ``not_bol``, which stops
- a beginning-of-line anchor from matching at the start of a string,
- ``not_eol``, which stops an end-of-line anchor from matching at the
- end of a string, and ``reverse`` which causes the search to be
- performed backwards.
+* **GNU**: the GNU library provides the flags ``not_bol``, which stops a
+ beginning-of-line anchor from matching at the start of a string, ``not_eol``,
+ which stops an end-of-line anchor from matching at the end of a string, and
+ ``backward`` which causes the search to be performed backwards, as well as the
+ RE_xxx syntax specifiers (as defined in regex.h), omitting the RE\_ prefix.
+ For example, RE_SYNTAX_GREP becomes ``SYNTAX_GREP`` in Lua.
------------------------------------------------------------
@@ -622,37 +623,17 @@ GNU-only functions and methods
new
---
-:funcdef:`rex.new (patt, [cf], [syn], [tr])`
+:funcdef:`rex.new (patt, [cf], [tr])`
-The *syntax* parameter (*syn*) must be one of the predefined strings that are
-formed from the RE_SYNTAX_xxx identifiers defined in regex.h, by means of
-omitting the RE_SYNTAX\_ part. For example, RE_SYNTAX_GREP becomes ``"GREP"`` on
-the Lua side. The default value, used when the parameter is not supplied or
-``nil``, is either ``"POSIX_EXTENDED"`` (at start-up), or the value set by the
-last setsyntax_ call.
+If the compilation flags (*cf*) are not supplied or ``nil``, the default syntax
+is ``SYNTAX_POSIX_EXTENDED``. Note that this is not the same as passing a value
+of zero, which is the same as ``SYNTAX_EMACS``.
The *translation* parameter (*tr*) is a map of eight-bit character codes (0 to
255 inclusive) to 8-bit characters (strings). If this parameter is given, the
pattern is translated at compilation time, and each string to be matched is
translated when it is being matched.
-setsyntax
----------
-
-:funcdef:`rex_gnu.setsyntax (syntax)`
-
-This function sets the default syntax for the GNU library (see the ``new``
-method above for the interpretation of the *syntax* parameter). The specified
-syntax will be further used for compiling string regex patterns by all relevant
-functions, unless the *syn* argument is passed to those functions explicitly.
-
-**Returns:** nothing
-
-**Examples:**
-
- 1. ``rex_gnu.setsyntax ("POSIX") -- use POSIX regex syntax as the default``
- 2. ``rex_gnu.setsyntax ("EMACS") -- use Emacs regex syntax as the default``
-
Oniguruma-only functions and methods
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/src/gnu/lgnu.c b/src/gnu/lgnu.c
index 1f9d76b..82e37fb 100644
--- a/src/gnu/lgnu.c
+++ b/src/gnu/lgnu.c
@@ -27,13 +27,13 @@
#define REX_TYPENAME REX_LIBNAME"_regex"
-#define ALG_CFLAGS_DFLT 0
+#define ALG_CFLAGS_DFLT RE_SYNTAX_POSIX_EXTENDED
#define ALG_EFLAGS_DFLT 0
-#define ALG_GETCFLAGS(L,pos) ALG_CFLAGS_DFLT
+#define ALG_GETCFLAGS(L,pos) luaL_optint(L, pos, ALG_CFLAGS_DFLT)
-static void checkarg_compile (lua_State *L, int pos, TArgComp *argC);
-#define ALG_GETCARGS(a,b,c) checkarg_compile(a,b,c)
+static const unsigned char *gettranslate (lua_State *L, int pos);
+#define ALG_GETCARGS(L,pos,argC) argC->translate = gettranslate (L, pos)
#define ALG_NOMATCH(res) ((res) == -1 || (res) == -2)
#define ALG_ISMATCH(res) ((res) >= 0)
@@ -72,9 +72,9 @@ typedef struct {
*/
/* Execution flags, which we need to simulate as GNU does not use flags for this. */
-#define GNU_NOTBOL 1
-#define GNU_NOTEOL 2
-#define GNU_REVERSE 4
+#define GNU_NOTBOL 1
+#define GNU_NOTEOL 2
+#define GNU_BACKWARD 4
static int generate_error (lua_State *L, const TUserdata *ud, int errcode) {
const char *errmsg;
@@ -114,78 +114,20 @@ static const unsigned char *gettranslate (lua_State *L, int pos) {
return translate;
}
-typedef struct {
- const char * name;
- int value;
-} EncPair;
-
-/* ATTENTION:
- This array must always be kept alphabetically sorted, as it's used in the
- binary search, so take care when manually inserting new elements.
- */
-static EncPair Syntaxes[] = {
- { "AWK", RE_SYNTAX_AWK },
- { "ED", RE_SYNTAX_ED },
- { "EGREP", RE_SYNTAX_EGREP },
- { "EMACS", RE_SYNTAX_EMACS },
- { "GNU_AWK", RE_SYNTAX_GNU_AWK },
- { "GREP", RE_SYNTAX_GREP },
- { "POSIX_AWK", RE_SYNTAX_POSIX_AWK },
- { "POSIX_BASIC", RE_SYNTAX_POSIX_BASIC },
- { "POSIX_EGREP", RE_SYNTAX_POSIX_EGREP },
- { "POSIX_EXTENDED", RE_SYNTAX_POSIX_EXTENDED },
- { "POSIX_MINIMAL_BASIC", RE_SYNTAX_POSIX_MINIMAL_BASIC },
- { "POSIX_MINIMAL_EXTENDED", RE_SYNTAX_POSIX_MINIMAL_EXTENDED },
- { "SED", RE_SYNTAX_SED },
-};
-
-static int fcmp (const void *p1, const void *p2) {
- return strcmp (((EncPair*) p1)->name, ((EncPair*) p2)->name);
-}
-
-static int getsyntax (lua_State *L, int pos) {
- EncPair key, *found;
- if ((key.name = luaL_optstring (L, pos, NULL)) == NULL)
- return -1;
- found = (EncPair*) bsearch (&key, Syntaxes, sizeof (Syntaxes) / sizeof (EncPair),
- sizeof (EncPair), fcmp);
- if (found == NULL)
- luaL_argerror (L, pos, "invalid or unsupported syntax string");
- return found->value;
-}
-
-static void checkarg_compile (lua_State *L, int pos, TArgComp *argC) {
- argC->translate = gettranslate (L, pos);
- argC->gnusyn = getsyntax (L, pos + 1);
-}
-
static void seteflags (TGnu *ud, TArgExec *argE) {
ud->r.not_bol = (argE->eflags & GNU_NOTBOL) != 0;
ud->r.not_eol = (argE->eflags & GNU_NOTEOL) != 0;
}
-/*
- rex.setsyntax (syntax)
- @param syntax: one of the predefined strings listed in array 'Syntaxes'
- @return: nothing
-*/
-static int LGnu_setsyntax (lua_State *L) {
- (void) luaL_checkstring (L, 1);
- re_set_syntax (getsyntax (L, 1));
- return 0;
-}
-
static int compile_regex (lua_State *L, const TArgComp *argC, TGnu **pud) {
const char *res;
TGnu *ud;
- reg_syntax_t old_syntax = 0;
int ret;
ud = (TGnu *)lua_newuserdata (L, sizeof (TGnu));
memset (ud, 0, sizeof (TGnu)); /* initialize all members to 0 */
- if (argC->gnusyn >= 0)
- old_syntax = re_set_syntax (argC->gnusyn);
+ re_set_syntax (argC->cflags);
/* translate table is never written to, so this cast is safe */
ud->r.translate = (unsigned char *) argC->translate;
@@ -195,9 +137,6 @@ static int compile_regex (lua_State *L, const TArgComp *argC, TGnu **pud) {
ud->errmsg = res;
ret = generate_error (L, ud, 0);
} else {
- if (argC->cflags & REG_NOSUB)
- ud->r.no_sub = 1;
-
lua_pushvalue (L, LUA_ENVIRONINDEX);
lua_setmetatable (L, -2);
@@ -205,8 +144,6 @@ static int compile_regex (lua_State *L, const TArgComp *argC, TGnu **pud) {
ret = 1;
}
- if (argC->gnusyn >= 0)
- re_set_syntax (old_syntax);
return ret;
}
@@ -216,7 +153,7 @@ static int gmatch_exec (TUserdata *ud, TArgExec *argE) {
ud->r.not_bol = 1;
argE->text += argE->startoffset;
argE->textlen -= argE->startoffset;
- if (argE->eflags & GNU_REVERSE)
+ if (argE->eflags & GNU_BACKWARD)
return re_search (&ud->r, argE->text, argE->textlen, argE->textlen, -argE->textlen, &ud->match);
else
return re_search (&ud->r, argE->text, argE->textlen, 0, argE->textlen, &ud->match);
@@ -230,7 +167,7 @@ static int findmatch_exec (TGnu *ud, TArgExec *argE) {
argE->text += argE->startoffset;
argE->textlen -= argE->startoffset;
seteflags (ud, argE);
- if (argE->eflags & GNU_REVERSE)
+ if (argE->eflags & GNU_BACKWARD)
return re_search (&ud->r, argE->text, argE->textlen, argE->textlen, -argE->textlen, &ud->match);
else
return re_search (&ud->r, argE->text, argE->textlen, 0, argE->textlen, &ud->match);
@@ -240,7 +177,7 @@ static int gsub_exec (TGnu *ud, TArgExec *argE, int st) {
seteflags (ud, argE);
if (st > 0)
ud->r.not_bol = 1;
- if (argE->eflags & GNU_REVERSE)
+ if (argE->eflags & GNU_BACKWARD)
return re_search (&ud->r, argE->text + st, argE->textlen - st, argE->textlen - st, -(argE->textlen - st), &ud->match);
else
return re_search (&ud->r, argE->text + st, argE->textlen - st, 0, argE->textlen - st, &ud->match);
@@ -250,7 +187,7 @@ static int split_exec (TGnu *ud, TArgExec *argE, int offset) {
seteflags (ud, argE);
if (offset > 0)
ud->r.not_bol = 1;
- if (argE->eflags & GNU_REVERSE)
+ if (argE->eflags & GNU_BACKWARD)
return re_search (&ud->r, argE->text + offset, argE->textlen - offset, argE->textlen - offset, -(argE->textlen - offset), &ud->match);
else
return re_search (&ud->r, argE->text + offset, argE->textlen - offset, 0, argE->textlen - offset, &ud->match);
@@ -278,15 +215,67 @@ static int Gnu_tostring (lua_State *L) {
static flag_pair gnu_flags[] =
{
- { "not_bol", GNU_NOTBOL },
- { "not_eol", GNU_NOTEOL },
- { "reverse", GNU_REVERSE },
+ { "not_bol", GNU_NOTBOL },
+ { "not_eol", GNU_NOTEOL },
+ { "backward", GNU_BACKWARD },
+/*---------------------------------------------------------------------------*/
+ { NULL, 0 }
+};
+
+static flag_pair gnu_syntax_flags[] = {
+ /* Syntax flag sets. */
+ { "SYNTAX_EMACS", RE_SYNTAX_EMACS },
+ { "SYNTAX_AWK", RE_SYNTAX_AWK },
+ { "SYNTAX_GNU_AWK", RE_SYNTAX_GNU_AWK },
+ { "SYNTAX_POSIX_AWK", RE_SYNTAX_POSIX_AWK },
+ { "SYNTAX_POSIX_AWK", RE_SYNTAX_POSIX_AWK },
+ { "SYNTAX_EGREP", RE_SYNTAX_EGREP },
+ { "SYNTAX_POSIX_EGREP", RE_SYNTAX_POSIX_EGREP },
+ { "SYNTAX_ED", RE_SYNTAX_ED },
+ { "SYNTAX_SED", RE_SYNTAX_SED },
+ { "SYNTAX_POSIX_AWK", RE_SYNTAX_POSIX_AWK },
+ { "SYNTAX_GREP", RE_SYNTAX_GREP },
+ { "SYNTAX_POSIX_BASIC", RE_SYNTAX_POSIX_BASIC },
+ { "SYNTAX_POSIX_MINIMAL_BASIC", RE_SYNTAX_POSIX_MINIMAL_BASIC },
+ { "SYNTAX_POSIX_EXTENDED", RE_SYNTAX_POSIX_EXTENDED },
+ { "SYNTAX_POSIX_MINIMAL_EXTENDED", RE_SYNTAX_POSIX_MINIMAL_EXTENDED },
+
+ /* Individual syntax flags. */
+ { "BACKSLASH_ESCAPE_IN_LISTS", RE_BACKSLASH_ESCAPE_IN_LISTS },
+ { "BK_PLUS_QM", RE_BK_PLUS_QM },
+ { "CHAR_CLASSES", RE_CHAR_CLASSES },
+ { "CONTEXT_INDEP_ANCHORS", RE_CONTEXT_INDEP_ANCHORS },
+ { "CONTEXT_INDEP_OPS", RE_CONTEXT_INDEP_OPS },
+ { "CONTEXT_INVALID_OPS", RE_CONTEXT_INVALID_OPS },
+ { "DOT_NEWLINE", RE_DOT_NEWLINE },
+ { "DOT_NOT_NULL", RE_DOT_NOT_NULL },
+ { "HAT_LISTS_NOT_NEWLINE", RE_HAT_LISTS_NOT_NEWLINE },
+ { "INTERVALS", RE_INTERVALS },
+ { "LIMITED_OPS", RE_LIMITED_OPS },
+ { "NEWLINE_ALT", RE_NEWLINE_ALT },
+ { "NO_BK_BRACES", RE_NO_BK_BRACES },
+ { "NO_BK_PARENS", RE_NO_BK_PARENS },
+ { "NO_BK_REFS", RE_NO_BK_REFS },
+ { "NO_BK_VBAR", RE_NO_BK_VBAR },
+ { "NO_EMPTY_RANGES", RE_NO_EMPTY_RANGES },
+ { "UNMATCHED_RIGHT_PAREN_ORD", RE_UNMATCHED_RIGHT_PAREN_ORD },
+ { "NO_POSIX_BACKTRACKING", RE_NO_POSIX_BACKTRACKING },
+ { "NO_GNU_OPS", RE_NO_GNU_OPS },
+ { "DEBUG", RE_DEBUG },
+ { "INVALID_INTERVAL_ORD", RE_INVALID_INTERVAL_ORD },
+ { "ICASE", RE_ICASE },
+ { "CARET_ANCHORS_HERE", RE_CARET_ANCHORS_HERE },
+ { "CONTEXT_INVALID_DUP", RE_CONTEXT_INVALID_DUP },
+ { "NO_SUB", RE_NO_SUB },
+#ifdef RE_PLAIN
+ { "PLAIN", RE_PLAIN },
+#endif
/*---------------------------------------------------------------------------*/
{ NULL, 0 }
};
static int Gnu_get_flags (lua_State *L) {
- const flag_pair* fps[] = { gnu_flags, NULL };
+ const flag_pair* fps[] = { gnu_flags, gnu_syntax_flags, NULL };
return get_flags (L, fps);
}
@@ -309,15 +298,12 @@ static const luaL_reg rexlib[] = {
{ "new", ud_new },
{ "flags", Gnu_get_flags },
{ "plainfind", plainfind_func },
- { "setsyntax", LGnu_setsyntax },
{ NULL, NULL }
};
/* Open the library */
REX_API int REX_OPENLIB (lua_State *L)
{
- re_set_syntax (RE_SYNTAX_POSIX_EXTENDED);
-
/* create a new function environment to serve as a metatable for methods */
lua_newtable (L);
lua_pushvalue (L, -1);
diff --git a/test/emacs_sets.lua b/test/emacs_sets.lua
index 19f0a3b..fb2b024 100644
--- a/test/emacs_sets.lua
+++ b/test/emacs_sets.lua
@@ -12,7 +12,7 @@ local function set_f_gmatch (lib, flg)
-- gmatch (s, p, [cf], [ef])
local function test_gmatch (subj, patt)
local out, guard = {}, 10
- for a, b in lib.gmatch (subj, patt, nil, nil, nil, "EMACS") do
+ for a, b in lib.gmatch (subj, patt, flg.SYNTAX_EMACS, nil) do
table.insert (out, { norm(a), norm(b) })
guard = guard - 1
if guard == 0 then break end
@@ -31,7 +31,7 @@ local function set_f_split (lib, flg)
-- split (s, p, [cf], [ef])
local function test_split (subj, patt)
local out, guard = {}, 10
- for a, b, c in lib.split (subj, patt, nil, nil, nil, "EMACS") do
+ for a, b, c in lib.split (subj, patt, flg.SYNTAX_EMACS, nil) do
table.insert (out, { norm(a), norm(b), norm(c) })
guard = guard - 1
if guard == 0 then break end
@@ -48,8 +48,9 @@ end
return function (libname)
local lib = require (libname)
+ local flags = lib.flags ()
return {
- set_f_gmatch (lib),
- set_f_split (lib),
+ set_f_gmatch (lib, flags),
+ set_f_split (lib, flags),
}
end