diff options
-rwxr-xr-x | Makefile | 35 | ||||
-rwxr-xr-x | doc/manual.txt | 240 | ||||
-rwxr-xr-x | src/algo.h | 8 | ||||
-rwxr-xr-x | src/common.h | 15 | ||||
-rwxr-xr-x | test/onig_sets.lua | 147 | ||||
-rwxr-xr-x | test/runtest.lua | 1 |
6 files changed, 348 insertions, 98 deletions
@@ -2,25 +2,44 @@ # See src/*.mak for user-definable settings -all: build_pcre test_pcre build_posix test_posix +POSIX = src/posix +PCRE = src/pcre +ONIG = src/oniguruma + +all: build test + +build: build_pcre build_posix build_onig + +test: test_pcre test_posix test_onig + +clean: clean_pcre clean_posix clean_onig build_pcre: - make -C src -f rex_pcre.mak + make -C $(PCRE) -f rex_pcre.mak build_posix: - make -C src -f rex_posix.mak + make -C $(POSIX) -f rex_posix.mak + +build_onig: + make -C $(ONIG) -f rex_onig.mak test_pcre: - cd test && lua ./runtest.lua -d../src pcre + cd test && lua ./runtest.lua -d../$(PCRE) pcre test_posix: - cd test && lua ./runtest.lua -d../src posix + cd test && lua ./runtest.lua -d../$(POSIX) posix + +test_onig: + cd test && lua ./runtest.lua -d../$(ONIG) onig clean_pcre: - make -C src -f rex_pcre.mak clean + make -C $(PCRE) -f rex_pcre.mak clean clean_posix: - make -C src -f rex_posix.mak clean + make -C $(POSIX) -f rex_posix.mak clean -.PHONY: all build_pcre test_pcre build_posix test_posix clean_pcre clean_posix +clean_onig: + make -C $(ONIG) -f rex_onig.mak clean +.PHONY: all build test clean build_pcre test_pcre clean_pcre build_posix \ + test_posix clean_posix build_onig test_onig clean_onig diff --git a/doc/manual.txt b/doc/manual.txt index 87d7210..f571956 100755 --- a/doc/manual.txt +++ b/doc/manual.txt @@ -10,17 +10,18 @@ Lrexlib 2.4 Reference Manual Introduction ~~~~~~~~~~~~ -**Lrexlib** provides bindings of the two principal regular expression library -interfaces (POSIX_ and PCRE_) to Lua_ 5.1. +**Lrexlib** provides bindings of the three principal regular expression library +interfaces (POSIX_, PCRE_ and Oniguruma_) to Lua_ 5.1. -**Lrexlib** builds into shared libraries called by default *rex_posix.so* and -*rex_pcre.so*, which can be used with *require*. +**Lrexlib** builds into shared libraries called by default *rex_posix.so*, +*rex_pcre.so* and *rex_onig.so*, which can be used with *require*. 
**Lrexlib** is copyright Reuben Thomas 2000-2008 and copyright Shmuel Zeigerman 2004-2008, and is released under the MIT license. .. _POSIX: http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html .. _PCRE: http://www.pcre.org/pcre.txt +.. _Oniguruma: http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt .. _Lua: http://www.lua.org ------------------------------------------------------------ @@ -39,67 +40,93 @@ Notes MyFunc (arg1, arg2, [arg3], [arg4]) -3. Throughout this document, the identifier *rex* is used in place of either - *rex_posix* or *rex_pcre*, that are the default namespaces for the - corresponding libraries. +3. Throughout this document (unless it causes ambiguity), the identifier *rex* + is used in place of either *rex_posix*, *rex_pcre* or *rex_onig*, that are + the default namespaces for the corresponding libraries. 4. All functions receiving a regular expression pattern as an argument will - generate an error if that pattern is found invalid by the used POSIX_ / PCRE_ - library. + generate an error if that pattern is found invalid by the used + POSIX_ / PCRE_ / Oniguruma_ library. 5. All functions receiving a string-type regex argument accept a compiled regex - too. In this case, the cf_ and locale_ arguments are ignored (should be - either supplied as nils or omitted). + too. In this case, the cf_, locale_ and syntax_ arguments are ignored (should + be either supplied as nils or omitted). .. _cf: 6. The default value for *compilation flags* (*cf*) that Lrexlib uses when the parameter is not supplied or ``nil``, is: - * 0 for PCRE * REG_EXTENDED for POSIX regex library - - For PCRE, *cf* may also be supplied as a string, whose characters stand for - PCRE compilation flags. 
Combinations of the following characters (case - sensitive) are supported: - - =============== ================== - **Character** **PCRE flag** - =============== ================== - **i** PCRE_CASELESS - **m** PCRE_MULTILINE - **s** PCRE_DOTALL - **x** PCRE_EXTENDED - **U** PCRE_UNGREEDY - **X** PCRE_EXTRA - =============== ================== + * 0 for PCRE + * ONIG_OPTION_NONE for Oniguruma + + **PCRE**, **Oniguruma**: *cf* may also be supplied as a string, whose + characters stand for compilation flags. Combinations of the following + characters (case sensitive) are supported: + + =============== ================== ============================== + **Character** **PCRE flag** **Oniguruma flag** + =============== ================== ============================== + **i** PCRE_CASELESS ONIG_OPTION_IGNORECASE + **m** PCRE_MULTILINE ONIG_OPTION_NEGATE_SINGLELINE + **s** PCRE_DOTALL ONIG_OPTION_MULTILINE + **x** PCRE_EXTENDED ONIG_OPTION_EXTEND + **U** PCRE_UNGREEDY n/a + **X** PCRE_EXTRA n/a + =============== ================== ============================== .. _ef: 7. The default value for *execution flags* (*ef*) that Lrexlib uses when the parameter is not supplied or ``nil``, is: - * 0 for PCRE * 0 for standard POSIX regex library * REG_STARTEND for those POSIX regex libraries that support it, e.g. Spencer's. + * 0 for PCRE + * 0 for Oniguruma .. _locale: -8. Parameter *locale* (*lo*) can be either a string (e.g., "French_France.1252"), - or a userdata obtained from a call to maketables_. The default value, used - when the parameter is not supplied or ``nil``, is the built-in PCRE set of - character tables. +8. **PCRE:** parameter *locale* (*lo*) can be either a string (e.g., + "French_France.1252"), or a userdata obtained from a call to maketables_. + The default value, used when the parameter is not supplied or ``nil``, + is the built-in PCRE set of character tables. 
+ + **Oniguruma:** this parameter (which actually should be named "encoding" + rather than "locale") must be one of the predefined strings that are formed + from the ONIG_ENCODING_xxx identifiers defined in oniguruma.h, by means of + omitting the ONIG_ENCODING\_ part. For example, ONIG_ENCODING_UTF8 becomes + ``"UTF8"`` on the Lua side (or ``"utf8"``, as this parameter is case + insensitive). The default value, used when the parameter is not supplied or + ``nil``, is ``"ASCII"``. + + If the caller-supplied value of this parameter is not one of the predefined + "encoding" strings, an error is raised. + +.. _syntax: + +9. **Oniguruma:** parameter *syntax* (*syn*) must be one of the predefined + strings that are formed from the ONIG_SYNTAX_xxx identifiers defined in + oniguruma.h, by means of omitting the ONIG_SYNTAX\_ part. For example, + ONIG_SYNTAX_JAVA becomes ``"JAVA"`` on the Lua side (or ``"java"``, as this + parameter is case insensitive). The default value, used when the parameter is + not supplied or ``nil``, is either ``"RUBY"`` (at the start-up), or the value + set by the last setdefaultsyntax_ call. + + If the caller-supplied value of the `syntax` parameter is not one of the + predefined "syntax" strings, an error is raised. ------------------------------------------------------------ -Common (PCRE and POSIX) functions and methods +Functions and methods common for all bindings ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ match ----- -:funcdef:`rex.match (subj, patt, [init], [cf], [ef], [lo])` +:funcdef:`rex.match (subj, patt, [init], [cf], [ef], [lo], [syn])` or @@ -108,8 +135,6 @@ or The function searches for the first match of the regexp *patt* in the string *subj*, starting from offset *init*, subject to flags *cf* and *ef*. -PCRE: A locale *lo* may be specified. 
- +---------+-------------------------------+--------+-------------+ |Parameter| Description | Type |Default Value| +=========+===============================+========+=============+ @@ -128,10 +153,12 @@ PCRE: A locale *lo* may be specified. +---------+-------------------------------+--------+-------------+ | [ef] | execution flags (bitwise OR) | number | ef_ | +---------+-------------------------------+--------+-------------+ - | [lo] |[PCRE] locale |string |locale_ | + | [lo] |[PCRE, Oniguruma] locale |string |locale_ | | | |or | | | | |userdata| | +---------+-------------------------------+--------+-------------+ + | [syn] |[Oniguruma] syntax | string |syntax_ | + +---------+-------------------------------+--------+-------------+ **Returns on success:** 1. All substring matches ("captures"), in the order they appear in the @@ -147,7 +174,7 @@ PCRE: A locale *lo* may be specified. find ---- -:funcdef:`rex.find (subj, patt, [init], [cf], [ef], [lo])` +:funcdef:`rex.find (subj, patt, [init], [cf], [ef], [lo], [syn])` or @@ -156,8 +183,6 @@ or The function searches for the first match of the regexp *patt* in the string *subj*, starting from offset *init*, subject to flags *cf* and *ef*. -PCRE: A locale *lo* may be specified. - +---------+-------------------------------+--------+-------------+ |Parameter| Description | Type |Default Value| +=========+===============================+========+=============+ @@ -176,10 +201,12 @@ PCRE: A locale *lo* may be specified. 
+---------+-------------------------------+--------+-------------+ | [ef] | execution flags (bitwise OR) | number | ef_ | +---------+-------------------------------+--------+-------------+ - | [lo] |[PCRE] locale |string |locale_ | + | [lo] |[PCRE, Oniguruma] locale |string |locale_ | | | |or | | | | |userdata| | +---------+-------------------------------+--------+-------------+ + | [syn] |[Oniguruma] syntax | string |syntax_ | + +---------+-------------------------------+--------+-------------+ **Returns on success:** 1. The start point of the match (a number). @@ -196,14 +223,12 @@ PCRE: A locale *lo* may be specified. gmatch ------ -:funcdef:`rex.gmatch (subj, patt, [cf], [ef], [lo])` +:funcdef:`rex.gmatch (subj, patt, [cf], [ef], [lo], [syn])` The function is intended for use in the *generic for* Lua construct. It returns an iterator for repeated matching of the pattern *patt* in the string *subj*, subject to flags *cf* and *ef*. -PCRE: A locale *lo* may be specified. - +---------+-------------------------------+--------+-------------+ |Parameter| Description | Type |Default Value| +=========+===============================+========+=============+ @@ -217,10 +242,12 @@ PCRE: A locale *lo* may be specified. +---------+-------------------------------+--------+-------------+ | [ef] |execution flags (bitwise OR) |number | ef_ | +---------+-------------------------------+--------+-------------+ - | [lo] |[PCRE] locale |string |locale_ | + | [lo] |[PCRE, Oniguruma] locale |string |locale_ | | | |or | | | | |userdata| | +---------+-------------------------------+--------+-------------+ + | [syn] |[Oniguruma] syntax | string |syntax_ | + +---------+-------------------------------+--------+-------------+ The iterator function is called by Lua. On every iteration (that is, on every match), it returns all captures in the order they appear in the pattern (or the @@ -232,14 +259,12 @@ till the subject fails to match. 
gsub ---- -:funcdef:`rex.gsub (subj, patt, repl, [n], [cf], [ef], [lo])` +:funcdef:`rex.gsub (subj, patt, repl, [n], [cf], [ef], [lo], [syn])` This function searches for all matches of the pattern *patt* in the string *subj* and replaces them according to the parameters *repl* and *n* (see details below). -PCRE: A locale *lo* may be specified. - +---------+-----------------------------------+-------------------------+-------------+ |Parameter| Description | Type |Default Value| +=========+===================================+=========================+=============+ @@ -256,9 +281,11 @@ PCRE: A locale *lo* may be specified. +---------+-----------------------------------+-------------------------+-------------+ | [ef] |execution flags (bitwise OR) | number | ef_ | +---------+-----------------------------------+-------------------------+-------------+ - | [lo] |[PCRE] locale | string or userdata |locale_ | + | [lo] |[PCRE, Oniguruma] locale | string or userdata |locale_ | | | | | | +---------+-----------------------------------+-------------------------+-------------+ + | [syn] |[Oniguruma] syntax | string |syntax_ | + +---------+-----------------------------------+-------------------------+-------------+ **Returns:** 1. The subject string with the substitutions made. @@ -350,7 +377,7 @@ PCRE: A locale *lo* may be specified. split ----- -:funcdef:`rex.split (subj, sep, [cf], [ef], [lo])` +:funcdef:`rex.split (subj, sep, [cf], [ef], [lo], [syn])` The function is intended for use in the *generic for* Lua construct. It is used for splitting a subject string *subj* into parts (*sections*). @@ -360,8 +387,6 @@ The *sep* parameter is a regular expression pattern representing The function returns an iterator for repeated matching of the pattern *sep* in the string *subj*, subject to flags *cf* and *ef*. -PCRE: A locale *lo* may be specified. 
- +---------+-------------------------------+--------+-------------+ |Parameter| Description | Type |Default Value| +=========+===============================+========+=============+ @@ -375,10 +400,12 @@ PCRE: A locale *lo* may be specified. +---------+-------------------------------+--------+-------------+ | [ef] |execution flags (bitwise OR) |number | ef_ | +---------+-------------------------------+--------+-------------+ - | [lo] |[PCRE] locale |string |locale_ | + | [lo] |[PCRE, Oniguruma] locale |string |locale_ | | | |or | | | | |userdata| | +---------+-------------------------------+--------+-------------+ + | [syn] |[Oniguruma] syntax | string |syntax_ | + +---------+-------------------------------+--------+-------------+ **On every iteration pass, the iterator returns:** @@ -400,15 +427,15 @@ flags :funcdef:`rex.flags ([tb])` This function returns a table containing numeric values of the constants defined -by the used regex library (either PCRE or POSIX). Those constants are keyed by -their names (strings). If the table argument *tb* is supplied then it is used as -the output table, else a new table is created. +by the used regex library. Those constants are keyed by their names (strings). +If the table argument *tb* is supplied then it is used as the output table, +else a new table is created. The constants contained in the returned table can then be used in most functions and methods where *compilation flags* or *execution flags* can be specified. They can also be used for comparing with return codes of some functions and -methods for determining the reason of failure. For details, see PCRE_ and POSIX_ -documentation. +methods for determining the reason of failure. For details, see POSIX_, PCRE_ +and Oniguruma_ documentation. +---------+--------------------------------+--------+-------------+ |Parameter| Description | Type |Default Value| @@ -419,20 +446,29 @@ documentation. **Returns:** 1. A table filled with the results. 
+**Notes:** +The keys in the `tb` table are formed from the names of the corresponding +constants in the used library. They are formed as follows: + +* **POSIX:** prefix REG\_ is omitted, e.g. REG_ICASE becomes ``"ICASE"``. +* **PCRE:** prefix PCRE\_ is omitted, e.g. PCRE_CASELESS becomes + ``"CASELESS"``. +* **Oniguruma:** names of constants are converted to strings with no alteration, + but for ONIG_OPTION_xxx constants, alias strings are created additionally, + e.g., the value of the ONIG_OPTION_IGNORECASE constant becomes accessible via + either of two keys: ``"ONIG_OPTION_IGNORECASE"`` and ``"IGNORECASE"``. + ------------------------------------------------------------ new --- -:funcdef:`rex.new (patt, [cf], [lo])` +:funcdef:`rex.new (patt, [cf], [lo], [syn])` The functions compiles regular expression *patt* into a regular expression -object whose internal representation is correspondent to the library used (PCRE -or POSIX regex). The returned result then can be used by the methods `tfind`_, -`exec`_ and `dfa_exec`_. Regular expression objects are automatically garbage -collected. - -PCRE: A locale *lo* may be specified. +object whose internal representation corresponds to the library used. +The returned result then can be used by the methods, e.g. `tfind`_, `exec`_, +etc. Regular expression objects are automatically garbage collected. +---------+-------------------------------+--------+-------------+ |Parameter| Description | Type |Default Value| @@ -441,10 +477,12 @@ PCRE: A locale *lo* may be specified. 
+---------+-------------------------------+--------+-------------+ | [cf] |compilation flags (bitwise OR) | number | cf_ | +---------+-------------------------------+--------+-------------+ - | [lo] |[PCRE] locale |string |locale_ | + | [lo] |[PCRE, Oniguruma] locale |string |locale_ | | | |or | | | | |userdata| | +---------+-------------------------------+--------+-------------+ + | [syn] |[Oniguruma] syntax | string |syntax_ | + +---------+-------------------------------+--------+-------------+ **Returns:** 1. Compiled regular expression (a userdata). @@ -479,17 +517,17 @@ string *subj*, starting from offset *init*, subject to execution flags *ef*. result, in a table. This table contains ``false`` in the positions where the corresponding sub-pattern did not participate in the match. - 1. PCRE: if *named subpatterns* are used then the table also contains - substring matches keyed by their correspondent subpattern names - (strings). + 1. **PCRE**, **Oniguruma**: if *named subpatterns* are used then the table + also contains substring matches keyed by their correspondent subpattern + names (strings). **Returns on failure:** 1. ``nil`` **Notes:** - 1. If *named subpatterns* (see PCRE_ docs) are used then the returned table - also contains substring matches keyed by their correspondent subpattern - names (strings). + 1. If *named subpatterns* (see PCRE_ and Oniguruma_ docs) are used then the + returned table also contains substring matches keyed by their correspondent + subpattern names (strings). ------------------------------------------------------------ @@ -522,9 +560,9 @@ string *subj*, starting from offset *init*, subject to execution flags *ef*. positions where the corresponding sub-pattern did not participate in the match. - 1. PCRE: if *named subpatterns* are used then the table also contains - substring matches keyed by their correspondent subpattern names - (strings). + 1. 
**PCRE**, **Oniguruma**: if *named subpatterns* are used then the table + also contains substring matches keyed by their correspondent subpattern + names (strings). **Returns on failure:** 1. ``nil`` @@ -585,9 +623,9 @@ string *subj*, using a DFA matching algorithm. maketables ---------- -[PCRE only. See *pcre_maketables* in the PCRE_ docs.] +[See *pcre_maketables* in the PCRE_ docs.] -:funcdef:`rex.maketables ()` +:funcdef:`rex_pcre.maketables ()` Creates a set of character tables corresponding to the current locale and returns it as a userdata. The returned value can be passed to any Lrexlib @@ -600,7 +638,7 @@ config [PCRE 4.0 and later. See *pcre_config* in the PCRE_ docs.] -:funcdef:`rex.config ([tb])` +:funcdef:`rex_pcre.config ([tb])` This function returns a table containing the values of the configuration parameters used at PCRE library build-time. Those parameters (numbers) are @@ -618,18 +656,54 @@ is used as the output table, else a new table is created. ------------------------------------------------------------ -version -------- +.. _version: + +rex_pcre.version +---------------- -[PCRE only. See *pcre_version* in the PCRE_ docs.] +[See *pcre_version* in the PCRE_ docs.] -:funcdef:`rex.version ()` +:funcdef:`rex_pcre.version ()` This function returns a string containing the version of the used PCRE library and its release date. ------------------------------------------------------------ +Oniguruma-only functions and methods +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +setdefaultsyntax +---------------- + +:funcdef:`rex_onig.setdefaultsyntax (syntax)` + +This function sets the default syntax for the Oniguruma library, according to +value of the string syntax_. The specified syntax will be further used for +interpreting string regex patterns by all relevant functions, unless `syntax` +argument is passed to those functions explicitly. + +**Returns:** nothing + +**Examples:** + + 1. 
``rex_onig.setdefaultsyntax ("ASIS") -- use plain text syntax as the default`` + 2. ``rex_onig.setdefaultsyntax ("PERL") -- use PERL regex syntax as the default`` + +------------------------------------------------------------ + +rex_onig.version +---------------- + +[See *onig_version* in the Oniguruma docs.] + +:funcdef:`rex_onig.version ()` + +This function returns a string containing the version of the used Oniguruma +library. + +------------------------------------------------------------ + Other functions ~~~~~~~~~~~~~~~ @@ -645,7 +719,7 @@ The function searches for the first match of the string *patt* in the subject themselves. * Both strings *subj* and *patt* can have embedded zeros. * The flag *ci* specifies case-insensitive search (current locale is used). - * This function uses neither PCRE nor POSIX regex library. + * This function uses no regex library. +---------+---------------------------+--------+-------------+ |Parameter| Description | Type |Default Value| @@ -16,6 +16,10 @@ static int generate_error (lua_State *L, const TUserdata *ud, int errcode); # define ALG_OPTLOCALE(a,b,c) #endif +#ifndef ALG_OPTSYNTAX +# define ALG_OPTSYNTAX(a,b,c) +#endif + #ifndef DO_NAMED_SUBPATTERNS #define DO_NAMED_SUBPATTERNS(a,b,c) #endif @@ -113,6 +117,7 @@ static void checkarg_new (lua_State *L, TArgComp *argC) { argC->pattern = luaL_checklstring (L, 1, &argC->patlen); argC->cflags = ALG_GETCFLAGS (L, 2); ALG_OPTLOCALE (argC, L, 3); + ALG_OPTSYNTAX (argC, L, 4); } @@ -132,6 +137,7 @@ static void checkarg_gsub (lua_State *L, TArgComp *argC, TArgExec *argE) { argC->cflags = ALG_GETCFLAGS (L, 5); argE->eflags = luaL_optint (L, 6, ALG_EFLAGS_DFLT); ALG_OPTLOCALE (argC, L, 7); + ALG_OPTSYNTAX (argC, L, 8); } @@ -144,6 +150,7 @@ static void checkarg_find_func (lua_State *L, TArgComp *argC, TArgExec *argE) { argC->cflags = ALG_GETCFLAGS (L, 4); argE->eflags = luaL_optint (L, 5, ALG_EFLAGS_DFLT); ALG_OPTLOCALE (argC, L, 6); + ALG_OPTSYNTAX (argC, L, 7); } @@ -155,6 +162,7 
@@ static void checkarg_gmatch_split (lua_State *L, TArgComp *argC, TArgExec *argE) argC->cflags = ALG_GETCFLAGS (L, 3); argE->eflags = luaL_optint (L, 4, ALG_EFLAGS_DFLT); ALG_OPTLOCALE (argC, L, 5); + ALG_OPTSYNTAX (argC, L, 6); } diff --git a/src/common.h b/src/common.h index 6f25ec3..06d9856 100755 --- a/src/common.h +++ b/src/common.h @@ -27,9 +27,10 @@ typedef struct { /* compile arguments */ size_t patlen; void * ud; int cflags; - const char * locale; - const unsigned char * tables; - int tablespos; + const char * locale; /* PCRE, Oniguruma */ + const unsigned char * tables; /* PCRE */ + int tablespos; /* PCRE */ + void * syntax; /* Oniguruma */ } TArgComp; typedef struct { /* exec arguments */ @@ -39,10 +40,10 @@ typedef struct { /* exec arguments */ int eflags; int funcpos; int maxmatch; - int funcpos2; /* used with gsub */ - int reptype; /* used with gsub */ - size_t ovecsize; /* used with dfa_exec */ - size_t wscount; /* used with dfa_exec */ + int funcpos2; /* used with gsub */ + int reptype; /* used with gsub */ + size_t ovecsize; /* PCRE: dfa_exec */ + size_t wscount; /* PCRE: dfa_exec */ } TArgExec; struct tagFreeList; /* forward declaration */ diff --git a/test/onig_sets.lua b/test/onig_sets.lua new file mode 100755 index 0000000..83555a6 --- /dev/null +++ b/test/onig_sets.lua @@ -0,0 +1,147 @@ +-- See Copyright Notice in the file LICENSE + +local luatest = require "luatest" +local N = luatest.NT + +local function norm(a) return a==nil and N or a end + +local function fill (n, m) + local t = {} + for i = n, m, -1 do table.insert (t, i) end + return t +end + +local function set_named_subpatterns (lib, flg) + return { + Name = "Named Subpatterns", + Func = function (methodname, subj, patt, name1, name2) + local r = lib.new (patt) + local _,_,caps = r[methodname] (r, subj) + return norm(caps[name1]), norm(caps[name2]) + end, + --{} + { {"tfind", "abcd", "(?<dog>.)b.(?<cat>d)", "dog", "cat"}, {"a","d"} }, + { {"exec", "abcd", "(?<dog>.)b.(?<cat>d)", 
"dog", "cat"}, {"a","d"} }, + } +end + +local function set_f_find (lib, flg) + local cp1251 = + "ÀÁÂÃÄŨÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÜÛÚÝÞßàáâãäå¸æçèéêëìíîïðñòóôõö÷øùüûúýþÿ" + local loc = "CP1251" + return { + Name = "Function find", + Func = lib.find, + --{subj, patt, st,cf,ef,lo}, { results } + { {"abcd", ".+", 5}, { N } }, -- failing st + { {"abcd", ".*?"}, { 1,0 } }, -- non-greedy + { {"abc", "aBC", N,flg.IGNORECASE}, { 1,3 } }, -- cf + { {"abc", "aBC", N,"i" }, { 1,3 } }, -- cf + { {cp1251, "[[:upper:]]+", N,N,N, loc}, { 1,33} }, -- locale + { {cp1251, "[[:lower:]]+", N,N,N, loc}, {34,66} }, -- locale + { {cp1251, "\\w+", N,N,N, loc}, {1, 66} }, -- locale +} +end + +local function set_f_match (lib, flg) + return { + Name = "Function match", + Func = lib.match, + --{subj, patt, st,cf,ef,lo}, { results } + { {"abcd", ".+", 5}, { N }}, -- failing st + { {"abcd", ".*?"}, { "" }}, -- non-greedy + { {"abc", "aBC", N,flg.IGNORECASE}, {"abc" }}, -- cf + { {"abc", "aBC", N,"i" }, {"abc" }}, -- cf +} +end + +local function set_f_gmatch (lib, flg) + -- gmatch (s, p, [cf], [ef]) + local pCSV = "(^[^,]*)|,([^,]*)" + local F = false + local function test_gmatch (subj, patt) + local out, guard = {}, 10 + for a, b in lib.gmatch (subj, patt) do + table.insert (out, { norm(a), norm(b) }) + guard = guard - 1 + if guard == 0 then break end + end + return unpack (out) + end + return { + Name = "Function gmatch", + Func = test_gmatch, + --{ subj patt results } + { {"a\0c", "." 
}, {{"a",N},{"\0",N},{"c",N}} },--nuls in subj + { {"", pCSV}, {{"",F}} }, + { {"12", pCSV}, {{"12",F}} }, + ----{ {",", pCSV}, {{"", F},{F,""}} }, + { {"12,,45", pCSV}, {{"12",F},{F,""},{F,"45"}} }, + ----{ {",,12,45,,ab,", pCSV}, {{"",F},{F,""},{F,"12"},{F,"45"},{F,""},{F,"ab"},{F,""}} }, + } +end + +local function set_f_split (lib, flg) + -- split (s, p, [cf], [ef]) + local function test_split (subj, patt) + local out, guard = {}, 10 + for a, b, c in lib.split (subj, patt) do + table.insert (out, { norm(a), norm(b), norm(c) }) + guard = guard - 1 + if guard == 0 then break end + end + return unpack (out) + end + return { + Name = "Function split", + Func = test_split, + --{ subj patt results } + { {"a,\0,c", ","}, {{"a",",",N},{"\0",",",N},{"c",N,N}, } },--nuls in subj + { {"ab", "$"}, {{"ab","",N}, {"",N,N}, } }, + { {"ab", "^|$"}, {{"", "", N}, {"ab","",N}, {"",N,N}, } }, + { {"ab45ab","(?<=ab).*?"}, {{"ab","",N}, {"45ab","",N},{"",N,N}, } }, + { {"ab", "\\b"}, {{"", "", N}, {"ab","",N}, {"",N,N}, } }, + } +end + +local function set_m_exec (lib, flg) + return { + Name = "Method exec", + Method = "exec", +--{patt,cf,lo}, {subj,st,ef} { results } + { {".+"}, {"abcd",5}, { N } }, -- failing st + { {".*?"}, {"abcd"}, {1,0,{}} }, -- non-greedy + { {"aBC",flg.IGNORECASE}, {"abc"}, {1,3,{}} }, -- cf + { {"aBC","i" }, {"abc"}, {1,3,{}} }, -- cf +} +end + +local function set_m_tfind (lib, flg) + return { + Name = "Method tfind", + Method = "tfind", +--{patt,cf,lo}, {subj,st,ef} { results } + { {".+"}, {"abcd",5}, { N } }, -- failing st + { {".*?"}, {"abcd"}, {1,0,{}} }, -- non-greedy + { {"aBC",flg.IGNORECASE}, {"abc"}, {1,3,{}} }, -- cf + { {"aBC","i" }, {"abc"}, {1,3,{}} }, -- cf +} +end + +return function (libname) + local lib = require (libname) + local flags = lib.flags () + local sets = { + set_f_match (lib, flags), + set_f_find (lib, flags), + set_f_gmatch (lib, flags), + set_f_split (lib, flags), + set_m_exec (lib, flags), + set_m_tfind (lib, flags), + } + 
local MAJOR = tonumber(lib.version():match("%d+")) + if MAJOR >= 0 then + table.insert (sets, set_named_subpatterns (lib, flags)) + end + return sets +end diff --git a/test/runtest.lua b/test/runtest.lua index ede0c0c..cc75205 100755 --- a/test/runtest.lua +++ b/test/runtest.lua @@ -48,6 +48,7 @@ local avail_tests = { pcre = { lib = "rex_pcre", "common_sets", "pcre_sets", "pcre_sets2", }, pcre_nr = { lib = "rex_pcre_nr", "common_sets", "pcre_sets", "pcre_sets2", }, pcre45 = { lib = "rex_pcre45", "common_sets", "pcre_sets", "pcre_sets2", }, + onig = { lib = "rex_onig", "common_sets", "onig_sets", } } do |