diff options
author | Reuben Thomas <rrt@sc3d.org> | 2010-07-26 19:18:11 +0100 |
---|---|---|
committer | Reuben Thomas <rrt@sc3d.org> | 2010-07-26 19:18:11 +0100 |
commit | 271f9f34af6cec4f27f75fa486365e0263b4e4e4 (patch) | |
tree | 8a1dc839b4231972f679279b0905966cc0e2c0c0 /src | |
parent | e3ccbbb7c3c5db5478ead6488b2d3630e8427fad (diff) | |
download | lrexlib-271f9f34af6cec4f27f75fa486365e0263b4e4e4.tar.gz |
Various improvements to GNU regex support:
1. Change default syntax to POSIX_EXTENDED, as agreed with Shmuel.
2. Add setsyntax function to set the syntax. Use it to run a much
smaller emacs_sets.lua, while using common_sets.lua (with normal
syntax) as the main test.
This commit also introduces preliminary untested code to support
translation arrays, and code to parse a syntax argument so per-regex
syntax can be selected.
Diffstat (limited to 'src')
-rwxr-xr-x | src/algo.h | 8 | ||||
-rwxr-xr-x | src/common.h | 10 | ||||
-rwxr-xr-x | src/gnu/lgnu.c | 143 |
3 files changed, 109 insertions, 52 deletions
@@ -20,6 +20,10 @@ static int generate_error (lua_State *L, const TUserdata *ud, int errcode); # define ALG_OPTSYNTAX(a,b,c) #endif +#ifndef ALG_OPTTRANSLATE +# define ALG_OPTTRANSLATE(a,b,c) +#endif + #ifndef DO_NAMED_SUBPATTERNS #define DO_NAMED_SUBPATTERNS(a,b,c) #endif @@ -118,6 +122,7 @@ static void checkarg_new (lua_State *L, TArgComp *argC) { argC->pattern = luaL_checklstring (L, 1, &argC->patlen); argC->cflags = ALG_GETCFLAGS (L, 2); ALG_OPTLOCALE (argC, L, 3); + ALG_OPTTRANSLATE (argC, L, 3); ALG_OPTSYNTAX (argC, L, 4); } @@ -138,6 +143,7 @@ static void checkarg_gsub (lua_State *L, TArgComp *argC, TArgExec *argE) { argC->cflags = ALG_GETCFLAGS (L, 5); argE->eflags = luaL_optint (L, 6, ALG_EFLAGS_DFLT); ALG_OPTLOCALE (argC, L, 7); + ALG_OPTTRANSLATE (argC, L, 7); ALG_OPTSYNTAX (argC, L, 8); } @@ -151,6 +157,7 @@ static void checkarg_find_func (lua_State *L, TArgComp *argC, TArgExec *argE) { argC->cflags = ALG_GETCFLAGS (L, 4); argE->eflags = luaL_optint (L, 5, ALG_EFLAGS_DFLT); ALG_OPTLOCALE (argC, L, 6); + ALG_OPTTRANSLATE (argC, L, 6); ALG_OPTSYNTAX (argC, L, 7); } @@ -163,6 +170,7 @@ static void checkarg_gmatch_split (lua_State *L, TArgComp *argC, TArgExec *argE) argC->cflags = ALG_GETCFLAGS (L, 3); argE->eflags = luaL_optint (L, 4, ALG_EFLAGS_DFLT); ALG_OPTLOCALE (argC, L, 5); + ALG_OPTTRANSLATE (argC, L, 5); ALG_OPTSYNTAX (argC, L, 6); } diff --git a/src/common.h b/src/common.h index f492af8..de8d52d 100755 --- a/src/common.h +++ b/src/common.h @@ -31,10 +31,12 @@ typedef struct { /* compile arguments */ size_t patlen; void * ud; int cflags; - const char * locale; /* PCRE, Oniguruma */ - const unsigned char * tables; /* PCRE */ - int tablespos; /* PCRE */ - void * syntax; /* Oniguruma */ + const char * locale; /* PCRE, Oniguruma */ + const unsigned char * tables; /* PCRE */ + int tablespos; /* PCRE */ + void * syntax; /* Oniguruma */ + const unsigned char * translate; /* GNU */ + int gnusyn; /* GNU */ } TArgComp; typedef struct { /* exec arguments */ diff --git a/src/gnu/lgnu.c b/src/gnu/lgnu.c index 0a5cab3..16cad7b 100755 --- a/src/gnu/lgnu.c +++ b/src/gnu/lgnu.c @@ -39,6 +39,12 @@ static int getcflags (lua_State *L, int pos); unsigned __REPB_PREFIX(newline_anchor) : 1; */ +static void opttranslate (TArgComp *argC, lua_State *L, int pos); +#define ALG_OPTTRANSLATE(a,b,c) opttranslate(a,b,c) + +static void optsyntax (TArgComp *argC, lua_State *L, int pos); +#define ALG_OPTSYNTAX(a,b,c) optsyntax(a,b,c) + #define ALG_NOMATCH -1 /* FIXME: -2 for internal error is also possible; take arg like ALG_ISMATCH */ #define ALG_ISMATCH(res) ((res) >= 0) #define ALG_SUBBEG(ud,n) ud->match.start[n] @@ -81,30 +87,10 @@ static int getcflags (lua_State *L, int pos) { case LUA_TNIL: return ALG_CFLAGS_DFLT; default: - return luaL_typeerror (L, pos, "FIXME: translation tables not yet implemented"); + return luaL_typeerror (L, pos, "FIXME: compilation flags not yet implemented"); } } -/* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes - RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this - isn't minimal, since other operators, such as \`, aren't disabled. */ -#define RE_SYNTAX_POSIX_MINIMAL_BASIC \ - (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS) - -#define RE_SYNTAX_POSIX_EXTENDED \ - (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \ - | RE_NO_BK_PARENS | RE_NO_BK_VBAR \ - | RE_CONTEXT_INVALID_OPS | RE_UNMATCHED_RIGHT_PAREN_ORD) - -/* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INDEP_OPS is - removed and RE_NO_BK_REFS is added. */ -#define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \ - (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \ - | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \ - | RE_NO_BK_PARENS | RE_NO_BK_REFS \ - | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD) - static int generate_error (lua_State *L, const TUserdata *ud, int errcode) { const char *errmsg; switch (errcode) { @@ -118,24 +104,99 @@ static int generate_error (lua_State *L, const TUserdata *ud, int errcode) { errmsg = "internal error in GNU regex"; break; default: - errmsg = "internal error in lrexlib"; + errmsg = "unknown error"; } return luaL_error (L, "%s", errmsg); } +#define ALG_TRANSLATE_SIZE (UCHAR_MAX + 1) +static void opttranslate (TArgComp *argC, lua_State *L, int pos) { + if (!lua_isnoneornil (L, pos)) { + unsigned i; + + argC->translate = (const unsigned char *) Lmalloc (L, ALG_TRANSLATE_SIZE); + memset ((unsigned char *) argC->translate, 0, ALG_TRANSLATE_SIZE); /* initialize all members to 0 */ + for (i = 0; i < ALG_TRANSLATE_SIZE; i++) { + lua_pushinteger (L, i); + lua_gettable (L, pos); + if (lua_tostring (L, -1)) + ((unsigned char *) argC->translate)[i] = *lua_tostring (L, -1); + lua_pop (L, 1); + } + } else + argC->translate = NULL; +} + +typedef struct { + const char * name; + int value; +} EncPair; + +/* ATTENTION: + This array must always be kept alphabetically sorted, as it's used in the + binary search, so take care when manually inserting new elements. + */ +static EncPair Syntaxes[] = { + { "AWK", RE_SYNTAX_AWK }, + { "ED", RE_SYNTAX_ED }, + { "EGREP", RE_SYNTAX_EGREP }, + { "EMACS", RE_SYNTAX_EMACS }, + { "GNU_AWK", RE_SYNTAX_GNU_AWK }, + { "GREP", RE_SYNTAX_GREP }, + { "POSIX_AWK", RE_SYNTAX_POSIX_AWK }, + { "POSIX_BASIC", RE_SYNTAX_POSIX_BASIC }, + { "POSIX_EGREP", RE_SYNTAX_POSIX_EGREP }, + { "POSIX_EXTENDED", RE_SYNTAX_POSIX_EXTENDED }, + { "POSIX_MINIMAL_BASIC", RE_SYNTAX_POSIX_MINIMAL_BASIC }, + { "POSIX_MINIMAL_EXTENDED", RE_SYNTAX_POSIX_MINIMAL_EXTENDED }, + { "SED", RE_SYNTAX_SED }, +}; + +static int fcmp (const void *p1, const void *p2) { + return strcmp (((EncPair*) p1)->name, ((EncPair*) p2)->name); +} + +static int getsyntax (lua_State *L, int pos) { + EncPair key, *found; + if ((key.name = luaL_optstring (L, pos, NULL)) == NULL) + return RE_SYNTAX_POSIX_EXTENDED; + found = (EncPair*) bsearch (&key, Syntaxes, sizeof (Syntaxes) / sizeof (EncPair), + sizeof (EncPair), fcmp); + if (found == NULL) + luaL_argerror (L, pos, "invalid or unsupported syntax string"); + return found->value; +} + +static void optsyntax (TArgComp *argC, lua_State *L, int pos) { + argC->gnusyn = getsyntax (L, pos); +} + +/* + rex.setsyntax (syntax) + @param syntax: one of the predefined strings listed in array 'Syntaxes' + @return: nothing +*/ +static int LGnu_setsyntax (lua_State *L) { + (void) luaL_checkstring (L, 1); + re_set_syntax (getsyntax (L, 1)); + return 0; +} + static int compile_regex (lua_State *L, const TArgComp *argC, TGnu **pud) { const char *res; TGnu *ud; - reg_syntax_t old_syntax; + /* reg_syntax_t old_syntax; */ int ret; ud = (TGnu *)lua_newuserdata (L, sizeof (TGnu)); memset (ud, 0, sizeof (TGnu)); /* initialize all members to 0 */ + /* FIXME: take syntax parameter in cflags */ + /* old_syntax = re_set_syntax (cflags->syntax); */ + /* translate table is never written to, so this cast is safe */ - /* FIXME: ud->r.translate = (unsigned char *) translate; */ + ud->r.translate = (unsigned char *) argC->translate; - old_syntax = re_set_syntax (RE_SYNTAX_EMACS); /* FIXME: take syntax parameter in cflags */ res = re_compile_pattern (argC->pattern, argC->patlen, &ud->r); if (res != NULL) { ud->errmsg = res; @@ -151,7 +212,7 @@ static int compile_regex (lua_State *L, const TArgComp *argC, TGnu **pud) { ret = 1; } - re_set_syntax (old_syntax); + /* FIXME: re_set_syntax (old_syntax); */ return ret; } @@ -206,26 +267,10 @@ static int Gnu_tostring (lua_State *L) { return 1; } -static flag_pair gnu_flags[] = -{ - { "SYNTAX_EMACS", RE_SYNTAX_EMACS }, - { "SYNTAX_AWK", RE_SYNTAX_AWK }, - { "SYNTAX_GNU_AWK", RE_SYNTAX_GNU_AWK }, - { "SYNTAX_POSIX_AWK", RE_SYNTAX_POSIX_AWK }, - { "SYNTAX_GREP", RE_SYNTAX_GREP }, - { "SYNTAX_EGREP", RE_SYNTAX_EGREP }, - { "SYNTAX_POSIX_EGREP", RE_SYNTAX_POSIX_EGREP }, - { "SYNTAX_ED", RE_SYNTAX_ED }, - { "SYNTAX_SED", RE_SYNTAX_SED }, - { "SYNTAX_POSIX_BASIC", RE_SYNTAX_POSIX_BASIC }, -/*---------------------------------------------------------------------------*/ - { NULL, 0 } -}; - -static int Gnu_get_flags (lua_State *L) { - const flag_pair* fps[] = { gnu_flags, NULL }; - return get_flags (L, fps); -} +/* static int Gnu_get_flags (lua_State *L) { */ +/* const flag_pair* fps[] = { gnu_flags, NULL }; */ +/* return get_flags (L, fps); */ +/* } */ static const luaL_reg gnumeta[] = { { "exec", ud_exec }, @@ -247,15 +292,17 @@ static const luaL_reg rexlib[] = { { "gsub", gsub }, { "split", split }, { "new", ud_new }, - { "flags", Gnu_get_flags }, + /* { "flags", Gnu_get_flags }, */ { "plainfind", plainfind_func }, - /* { "set_syntax", set_syntax }, */ + { "setsyntax", LGnu_setsyntax }, { NULL, NULL } }; /* Open the library */ REX_API int REX_OPENLIB (lua_State *L) { + re_set_syntax (RE_SYNTAX_POSIX_EXTENDED); + /* create a new function environment to serve as a metatable for methods */ lua_newtable (L); lua_pushvalue (L, -1); |