summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorshmuz <shmuz>2008-07-30 17:40:04 +0000
committershmuz <shmuz>2008-07-30 17:40:04 +0000
commitde14e217536363827c9b24551c36d4205630ce02 (patch)
tree74eaf70d0680910be37185b25f5a1ba9d1761583
parent357e3817481bba56eca4c5e7e844b515f6605c49 (diff)
downloadlrexlib-de14e217536363827c9b24551c36d4205630ce02.tar.gz
Changes related to Oniguruma addition and directory layout change.
-rwxr-xr-xMakefile35
-rwxr-xr-xdoc/manual.txt240
-rwxr-xr-xsrc/algo.h8
-rwxr-xr-xsrc/common.h15
-rwxr-xr-xtest/onig_sets.lua147
-rwxr-xr-xtest/runtest.lua1
6 files changed, 348 insertions, 98 deletions
diff --git a/Makefile b/Makefile
index b4e3277..9c017c3 100755
--- a/Makefile
+++ b/Makefile
@@ -2,25 +2,44 @@
# See src/*.mak for user-definable settings
-all: build_pcre test_pcre build_posix test_posix
+POSIX = src/posix
+PCRE = src/pcre
+ONIG = src/oniguruma
+
+all: build test
+
+build: build_pcre build_posix build_onig
+
+test: test_pcre test_posix test_onig
+
+clean: clean_pcre clean_posix clean_onig
build_pcre:
- make -C src -f rex_pcre.mak
+ make -C $(PCRE) -f rex_pcre.mak
build_posix:
- make -C src -f rex_posix.mak
+ make -C $(POSIX) -f rex_posix.mak
+
+build_onig:
+ make -C $(ONIG) -f rex_onig.mak
test_pcre:
- cd test && lua ./runtest.lua -d../src pcre
+ cd test && lua ./runtest.lua -d../$(PCRE) pcre
test_posix:
- cd test && lua ./runtest.lua -d../src posix
+ cd test && lua ./runtest.lua -d../$(POSIX) posix
+
+test_onig:
+ cd test && lua ./runtest.lua -d../$(ONIG) onig
clean_pcre:
- make -C src -f rex_pcre.mak clean
+ make -C $(PCRE) -f rex_pcre.mak clean
clean_posix:
- make -C src -f rex_posix.mak clean
+ make -C $(POSIX) -f rex_posix.mak clean
-.PHONY: all build_pcre test_pcre build_posix test_posix clean_pcre clean_posix
+clean_onig:
+ make -C $(ONIG) -f rex_onig.mak clean
+.PHONY: all build test clean build_pcre test_pcre clean_pcre build_posix \
+ test_posix clean_posix build_onig test_onig clean_onig
diff --git a/doc/manual.txt b/doc/manual.txt
index 87d7210..f571956 100755
--- a/doc/manual.txt
+++ b/doc/manual.txt
@@ -10,17 +10,18 @@ Lrexlib 2.4 Reference Manual
Introduction
~~~~~~~~~~~~
-**Lrexlib** provides bindings of the two principal regular expression library
-interfaces (POSIX_ and PCRE_) to Lua_ 5.1.
+**Lrexlib** provides bindings of the three principal regular expression library
+interfaces (POSIX_, PCRE_ and Oniguruma_) to Lua_ 5.1.
-**Lrexlib** builds into shared libraries called by default *rex_posix.so* and
-*rex_pcre.so*, which can be used with *require*.
+**Lrexlib** builds into shared libraries called by default *rex_posix.so*,
+*rex_pcre.so* and *rex_onig.so*, which can be used with *require*.
**Lrexlib** is copyright Reuben Thomas 2000-2008 and copyright Shmuel Zeigerman
2004-2008, and is released under the MIT license.
.. _POSIX: http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
.. _PCRE: http://www.pcre.org/pcre.txt
+.. _Oniguruma: http://www.geocities.jp/kosako3/oniguruma/doc/RE.txt
.. _Lua: http://www.lua.org
------------------------------------------------------------
@@ -39,67 +40,93 @@ Notes
MyFunc (arg1, arg2, [arg3], [arg4])
-3. Throughout this document, the identifier *rex* is used in place of either
- *rex_posix* or *rex_pcre*, that are the default namespaces for the
- corresponding libraries.
+3. Throughout this document (unless it causes ambiguity), the identifier *rex*
+ is used in place of *rex_posix*, *rex_pcre* or *rex_onig*, which are
+ the default namespaces for the corresponding libraries.
4. All functions receiving a regular expression pattern as an argument will
- generate an error if that pattern is found invalid by the used POSIX_ / PCRE_
- library.
+ generate an error if that pattern is found invalid by the used
+ POSIX_ / PCRE_ / Oniguruma_ library.
5. All functions receiving a string-type regex argument accept a compiled regex
- too. In this case, the cf_ and locale_ arguments are ignored (should be
- either supplied as nils or omitted).
+ too. In this case, the cf_, locale_ and syntax_ arguments are ignored (should
+ be either supplied as nils or omitted).
.. _cf:
6. The default value for *compilation flags* (*cf*) that Lrexlib uses when
the parameter is not supplied or ``nil``, is:
- * 0 for PCRE
* REG_EXTENDED for POSIX regex library
-
- For PCRE, *cf* may also be supplied as a string, whose characters stand for
- PCRE compilation flags. Combinations of the following characters (case
- sensitive) are supported:
-
- =============== ==================
- **Character** **PCRE flag**
- =============== ==================
- **i** PCRE_CASELESS
- **m** PCRE_MULTILINE
- **s** PCRE_DOTALL
- **x** PCRE_EXTENDED
- **U** PCRE_UNGREEDY
- **X** PCRE_EXTRA
- =============== ==================
+ * 0 for PCRE
+ * ONIG_OPTION_NONE for Oniguruma
+
+ **PCRE**, **Oniguruma**: *cf* may also be supplied as a string, whose
+ characters stand for compilation flags. Combinations of the following
+ characters (case sensitive) are supported:
+
+ =============== ================== ==============================
+ **Character** **PCRE flag** **Oniguruma flag**
+ =============== ================== ==============================
+ **i** PCRE_CASELESS ONIG_OPTION_IGNORECASE
+ **m** PCRE_MULTILINE ONIG_OPTION_NEGATE_SINGLELINE
+ **s** PCRE_DOTALL ONIG_OPTION_MULTILINE
+ **x** PCRE_EXTENDED ONIG_OPTION_EXTEND
+ **U** PCRE_UNGREEDY n/a
+ **X** PCRE_EXTRA n/a
+ =============== ================== ==============================
.. _ef:
7. The default value for *execution flags* (*ef*) that Lrexlib uses when
the parameter is not supplied or ``nil``, is:
- * 0 for PCRE
* 0 for standard POSIX regex library
* REG_STARTEND for those POSIX regex libraries that support it,
e.g. Spencer's.
+ * 0 for PCRE
+ * 0 for Oniguruma
.. _locale:
-8. Parameter *locale* (*lo*) can be either a string (e.g., "French_France.1252"),
- or a userdata obtained from a call to maketables_. The default value, used
- when the parameter is not supplied or ``nil``, is the built-in PCRE set of
- character tables.
+8. **PCRE:** parameter *locale* (*lo*) can be either a string (e.g.,
+ "French_France.1252"), or a userdata obtained from a call to maketables_.
+ The default value, used when the parameter is not supplied or ``nil``,
+ is the built-in PCRE set of character tables.
+
+ **Oniguruma:** this parameter (which actually should be named "encoding"
+ rather than "locale") must be one of the predefined strings that are formed
+ from the ONIG_ENCODING_xxx identifiers defined in oniguruma.h, by means of
+ omitting the ONIG_ENCODING\_ part. For example, ONIG_ENCODING_UTF8 becomes
+ ``"UTF8"`` on the Lua side (or ``"utf8"``, as this parameter is case
+ insensitive). The default value, used when the parameter is not supplied or
+ ``nil``, is ``"ASCII"``.
+
+ If the caller-supplied value of this parameter is not one of the predefined
+ "encoding" strings, an error is raised.
+
+.. _syntax:
+
+9. **Oniguruma:** parameter *syntax* (*syn*) must be one of the predefined
+ strings that are formed from the ONIG_SYNTAX_xxx identifiers defined in
+ oniguruma.h, by means of omitting the ONIG_SYNTAX\_ part. For example,
+ ONIG_SYNTAX_JAVA becomes ``"JAVA"`` on the Lua side (or ``"java"``, as this
+ parameter is case insensitive). The default value, used when the parameter is
+ not supplied or ``nil``, is either ``"RUBY"`` (at the start-up), or the value
+ set by the last setdefaultsyntax_ call.
+
+ If the caller-supplied value of the `syntax` parameter is not one of the
+ predefined "syntax" strings, an error is raised.
------------------------------------------------------------
-Common (PCRE and POSIX) functions and methods
+Functions and methods common for all bindings
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
match
-----
-:funcdef:`rex.match (subj, patt, [init], [cf], [ef], [lo])`
+:funcdef:`rex.match (subj, patt, [init], [cf], [ef], [lo], [syn])`
or
@@ -108,8 +135,6 @@ or
The function searches for the first match of the regexp *patt* in the string
*subj*, starting from offset *init*, subject to flags *cf* and *ef*.
-PCRE: A locale *lo* may be specified.
-
+---------+-------------------------------+--------+-------------+
|Parameter| Description | Type |Default Value|
+=========+===============================+========+=============+
@@ -128,10 +153,12 @@ PCRE: A locale *lo* may be specified.
+---------+-------------------------------+--------+-------------+
| [ef] | execution flags (bitwise OR) | number | ef_ |
+---------+-------------------------------+--------+-------------+
- | [lo] |[PCRE] locale |string |locale_ |
+ | [lo] |[PCRE, Oniguruma] locale |string |locale_ |
| | |or | |
| | |userdata| |
+---------+-------------------------------+--------+-------------+
+ | [syn] |[Oniguruma] syntax | string |syntax_ |
+ +---------+-------------------------------+--------+-------------+
**Returns on success:**
1. All substring matches ("captures"), in the order they appear in the
@@ -147,7 +174,7 @@ PCRE: A locale *lo* may be specified.
find
----
-:funcdef:`rex.find (subj, patt, [init], [cf], [ef], [lo])`
+:funcdef:`rex.find (subj, patt, [init], [cf], [ef], [lo], [syn])`
or
@@ -156,8 +183,6 @@ or
The function searches for the first match of the regexp *patt* in the string
*subj*, starting from offset *init*, subject to flags *cf* and *ef*.
-PCRE: A locale *lo* may be specified.
-
+---------+-------------------------------+--------+-------------+
|Parameter| Description | Type |Default Value|
+=========+===============================+========+=============+
@@ -176,10 +201,12 @@ PCRE: A locale *lo* may be specified.
+---------+-------------------------------+--------+-------------+
| [ef] | execution flags (bitwise OR) | number | ef_ |
+---------+-------------------------------+--------+-------------+
- | [lo] |[PCRE] locale |string |locale_ |
+ | [lo] |[PCRE, Oniguruma] locale |string |locale_ |
| | |or | |
| | |userdata| |
+---------+-------------------------------+--------+-------------+
+ | [syn] |[Oniguruma] syntax | string |syntax_ |
+ +---------+-------------------------------+--------+-------------+
**Returns on success:**
1. The start point of the match (a number).
@@ -196,14 +223,12 @@ PCRE: A locale *lo* may be specified.
gmatch
------
-:funcdef:`rex.gmatch (subj, patt, [cf], [ef], [lo])`
+:funcdef:`rex.gmatch (subj, patt, [cf], [ef], [lo], [syn])`
The function is intended for use in the *generic for* Lua construct.
It returns an iterator for repeated matching of the pattern *patt* in
the string *subj*, subject to flags *cf* and *ef*.
-PCRE: A locale *lo* may be specified.
-
+---------+-------------------------------+--------+-------------+
|Parameter| Description | Type |Default Value|
+=========+===============================+========+=============+
@@ -217,10 +242,12 @@ PCRE: A locale *lo* may be specified.
+---------+-------------------------------+--------+-------------+
| [ef] |execution flags (bitwise OR) |number | ef_ |
+---------+-------------------------------+--------+-------------+
- | [lo] |[PCRE] locale |string |locale_ |
+ | [lo] |[PCRE, Oniguruma] locale |string |locale_ |
| | |or | |
| | |userdata| |
+---------+-------------------------------+--------+-------------+
+ | [syn] |[Oniguruma] syntax | string |syntax_ |
+ +---------+-------------------------------+--------+-------------+
The iterator function is called by Lua. On every iteration (that is, on every
match), it returns all captures in the order they appear in the pattern (or the
@@ -232,14 +259,12 @@ till the subject fails to match.
gsub
----
-:funcdef:`rex.gsub (subj, patt, repl, [n], [cf], [ef], [lo])`
+:funcdef:`rex.gsub (subj, patt, repl, [n], [cf], [ef], [lo], [syn])`
This function searches for all matches of the pattern *patt* in the string
*subj* and replaces them according to the parameters *repl* and *n* (see details
below).
-PCRE: A locale *lo* may be specified.
-
+---------+-----------------------------------+-------------------------+-------------+
|Parameter| Description | Type |Default Value|
+=========+===================================+=========================+=============+
@@ -256,9 +281,11 @@ PCRE: A locale *lo* may be specified.
+---------+-----------------------------------+-------------------------+-------------+
| [ef] |execution flags (bitwise OR) | number | ef_ |
+---------+-----------------------------------+-------------------------+-------------+
- | [lo] |[PCRE] locale | string or userdata |locale_ |
+ | [lo] |[PCRE, Oniguruma] locale | string or userdata |locale_ |
| | | | |
+---------+-----------------------------------+-------------------------+-------------+
+ | [syn] |[Oniguruma] syntax | string |syntax_ |
+ +---------+-----------------------------------+-------------------------+-------------+
**Returns:**
1. The subject string with the substitutions made.
@@ -350,7 +377,7 @@ PCRE: A locale *lo* may be specified.
split
-----
-:funcdef:`rex.split (subj, sep, [cf], [ef], [lo])`
+:funcdef:`rex.split (subj, sep, [cf], [ef], [lo], [syn])`
The function is intended for use in the *generic for* Lua construct.
It is used for splitting a subject string *subj* into parts (*sections*).
@@ -360,8 +387,6 @@ The *sep* parameter is a regular expression pattern representing
The function returns an iterator for repeated matching of the pattern *sep* in
the string *subj*, subject to flags *cf* and *ef*.
-PCRE: A locale *lo* may be specified.
-
+---------+-------------------------------+--------+-------------+
|Parameter| Description | Type |Default Value|
+=========+===============================+========+=============+
@@ -375,10 +400,12 @@ PCRE: A locale *lo* may be specified.
+---------+-------------------------------+--------+-------------+
| [ef] |execution flags (bitwise OR) |number | ef_ |
+---------+-------------------------------+--------+-------------+
- | [lo] |[PCRE] locale |string |locale_ |
+ | [lo] |[PCRE, Oniguruma] locale |string |locale_ |
| | |or | |
| | |userdata| |
+---------+-------------------------------+--------+-------------+
+ | [syn] |[Oniguruma] syntax | string |syntax_ |
+ +---------+-------------------------------+--------+-------------+
**On every iteration pass, the iterator returns:**
@@ -400,15 +427,15 @@ flags
:funcdef:`rex.flags ([tb])`
This function returns a table containing numeric values of the constants defined
-by the used regex library (either PCRE or POSIX). Those constants are keyed by
-their names (strings). If the table argument *tb* is supplied then it is used as
-the output table, else a new table is created.
+by the used regex library. Those constants are keyed by their names (strings).
+If the table argument *tb* is supplied then it is used as the output table,
+else a new table is created.
The constants contained in the returned table can then be used in most functions
and methods where *compilation flags* or *execution flags* can be specified.
They can also be used for comparing with return codes of some functions and
-methods for determining the reason of failure. For details, see PCRE_ and POSIX_
-documentation.
+methods for determining the reason of failure. For details, see POSIX_, PCRE_
+and Oniguruma_ documentation.
+---------+--------------------------------+--------+-------------+
|Parameter| Description | Type |Default Value|
@@ -419,20 +446,29 @@ documentation.
**Returns:**
1. A table filled with the results.
+**Notes:**
+The keys in the `tb` table are formed from the names of the corresponding
+constants in the used library. They are formed as follows:
+
+* **POSIX:** prefix REG\_ is omitted, e.g. REG_ICASE becomes ``"ICASE"``.
+* **PCRE:** prefix PCRE\_ is omitted, e.g. PCRE_CASELESS becomes
+ ``"CASELESS"``.
+* **Oniguruma:** names of constants are converted to strings with no alteration,
+ but for ONIG_OPTION_xxx constants, alias strings are created additionally,
+ e.g., the value of ONIG_OPTION_IGNORECASE constant becomes accessible via
+ either of two keys: ``"ONIG_OPTION_IGNORECASE"`` and ``"IGNORECASE"``.
+
------------------------------------------------------------
new
---
-:funcdef:`rex.new (patt, [cf], [lo])`
+:funcdef:`rex.new (patt, [cf], [lo], [syn])`
The functions compiles regular expression *patt* into a regular expression
-object whose internal representation is correspondent to the library used (PCRE
-or POSIX regex). The returned result then can be used by the methods `tfind`_,
-`exec`_ and `dfa_exec`_. Regular expression objects are automatically garbage
-collected.
-
-PCRE: A locale *lo* may be specified.
+object whose internal representation corresponds to the library used.
+The returned result then can be used by the methods, e.g. `tfind`_, `exec`_,
+etc. Regular expression objects are automatically garbage collected.
+---------+-------------------------------+--------+-------------+
|Parameter| Description | Type |Default Value|
@@ -441,10 +477,12 @@ PCRE: A locale *lo* may be specified.
+---------+-------------------------------+--------+-------------+
| [cf] |compilation flags (bitwise OR) | number | cf_ |
+---------+-------------------------------+--------+-------------+
- | [lo] |[PCRE] locale |string |locale_ |
+ | [lo] |[PCRE, Oniguruma] locale |string |locale_ |
| | |or | |
| | |userdata| |
+---------+-------------------------------+--------+-------------+
+ | [syn] |[Oniguruma] syntax | string |syntax_ |
+ +---------+-------------------------------+--------+-------------+
**Returns:**
1. Compiled regular expression (a userdata).
@@ -479,17 +517,17 @@ string *subj*, starting from offset *init*, subject to execution flags *ef*.
result, in a table. This table contains ``false`` in the positions where the
corresponding sub-pattern did not participate in the match.
- 1. PCRE: if *named subpatterns* are used then the table also contains
- substring matches keyed by their correspondent subpattern names
- (strings).
+ 1. **PCRE**, **Oniguruma**: if *named subpatterns* are used then the table
+ also contains substring matches keyed by their correspondent subpattern
+ names (strings).
**Returns on failure:**
1. ``nil``
**Notes:**
- 1. If *named subpatterns* (see PCRE_ docs) are used then the returned table
- also contains substring matches keyed by their correspondent subpattern
- names (strings).
+ 1. If *named subpatterns* (see PCRE_ and Oniguruma_ docs) are used then the
+ returned table also contains substring matches keyed by their correspondent
+ subpattern names (strings).
------------------------------------------------------------
@@ -522,9 +560,9 @@ string *subj*, starting from offset *init*, subject to execution flags *ef*.
positions where the corresponding sub-pattern did not participate in the
match.
- 1. PCRE: if *named subpatterns* are used then the table also contains
- substring matches keyed by their correspondent subpattern names
- (strings).
+ 1. **PCRE**, **Oniguruma**: if *named subpatterns* are used then the table
+ also contains substring matches keyed by their correspondent subpattern
+ names (strings).
**Returns on failure:**
1. ``nil``
@@ -585,9 +623,9 @@ string *subj*, using a DFA matching algorithm.
maketables
----------
-[PCRE only. See *pcre_maketables* in the PCRE_ docs.]
+[See *pcre_maketables* in the PCRE_ docs.]
-:funcdef:`rex.maketables ()`
+:funcdef:`rex_pcre.maketables ()`
Creates a set of character tables corresponding to the current locale and
returns it as a userdata. The returned value can be passed to any Lrexlib
@@ -600,7 +638,7 @@ config
[PCRE 4.0 and later. See *pcre_config* in the PCRE_ docs.]
-:funcdef:`rex.config ([tb])`
+:funcdef:`rex_pcre.config ([tb])`
This function returns a table containing the values of the configuration
parameters used at PCRE library build-time. Those parameters (numbers) are
@@ -618,18 +656,54 @@ is used as the output table, else a new table is created.
------------------------------------------------------------
-version
--------
+.. _version:
+
+rex_pcre.version
+----------------
-[PCRE only. See *pcre_version* in the PCRE_ docs.]
+[See *pcre_version* in the PCRE_ docs.]
-:funcdef:`rex.version ()`
+:funcdef:`rex_pcre.version ()`
This function returns a string containing the version of the used PCRE library
and its release date.
------------------------------------------------------------
+Oniguruma-only functions and methods
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+setdefaultsyntax
+----------------
+
+:funcdef:`rex_onig.setdefaultsyntax (syntax)`
+
+This function sets the default syntax for the Oniguruma library, according to
+the value of the string syntax_. The specified syntax will be further used for
+interpreting string regex patterns by all relevant functions, unless the
+`syntax` argument is passed to those functions explicitly.
+
+**Returns:** nothing
+
+**Examples:**
+
+ 1. ``rex_onig.setdefaultsyntax ("ASIS") -- use plain text syntax as the default``
+ 2. ``rex_onig.setdefaultsyntax ("PERL") -- use PERL regex syntax as the default``
+
+------------------------------------------------------------
+
+rex_onig.version
+----------------
+
+[See *onig_version* in the Oniguruma_ docs.]
+
+:funcdef:`rex_onig.version ()`
+
+This function returns a string containing the version of the used Oniguruma
+library.
+
+------------------------------------------------------------
+
Other functions
~~~~~~~~~~~~~~~
@@ -645,7 +719,7 @@ The function searches for the first match of the string *patt* in the subject
themselves.
* Both strings *subj* and *patt* can have embedded zeros.
* The flag *ci* specifies case-insensitive search (current locale is used).
- * This function uses neither PCRE nor POSIX regex library.
+ * This function uses no regex library.
+---------+---------------------------+--------+-------------+
|Parameter| Description | Type |Default Value|
diff --git a/src/algo.h b/src/algo.h
index a72edd3..4230df7 100755
--- a/src/algo.h
+++ b/src/algo.h
@@ -16,6 +16,10 @@ static int generate_error (lua_State *L, const TUserdata *ud, int errcode);
# define ALG_OPTLOCALE(a,b,c)
#endif
+#ifndef ALG_OPTSYNTAX
+# define ALG_OPTSYNTAX(a,b,c)
+#endif
+
#ifndef DO_NAMED_SUBPATTERNS
#define DO_NAMED_SUBPATTERNS(a,b,c)
#endif
@@ -113,6 +117,7 @@ static void checkarg_new (lua_State *L, TArgComp *argC) {
argC->pattern = luaL_checklstring (L, 1, &argC->patlen);
argC->cflags = ALG_GETCFLAGS (L, 2);
ALG_OPTLOCALE (argC, L, 3);
+ ALG_OPTSYNTAX (argC, L, 4);
}
@@ -132,6 +137,7 @@ static void checkarg_gsub (lua_State *L, TArgComp *argC, TArgExec *argE) {
argC->cflags = ALG_GETCFLAGS (L, 5);
argE->eflags = luaL_optint (L, 6, ALG_EFLAGS_DFLT);
ALG_OPTLOCALE (argC, L, 7);
+ ALG_OPTSYNTAX (argC, L, 8);
}
@@ -144,6 +150,7 @@ static void checkarg_find_func (lua_State *L, TArgComp *argC, TArgExec *argE) {
argC->cflags = ALG_GETCFLAGS (L, 4);
argE->eflags = luaL_optint (L, 5, ALG_EFLAGS_DFLT);
ALG_OPTLOCALE (argC, L, 6);
+ ALG_OPTSYNTAX (argC, L, 7);
}
@@ -155,6 +162,7 @@ static void checkarg_gmatch_split (lua_State *L, TArgComp *argC, TArgExec *argE)
argC->cflags = ALG_GETCFLAGS (L, 3);
argE->eflags = luaL_optint (L, 4, ALG_EFLAGS_DFLT);
ALG_OPTLOCALE (argC, L, 5);
+ ALG_OPTSYNTAX (argC, L, 6);
}
diff --git a/src/common.h b/src/common.h
index 6f25ec3..06d9856 100755
--- a/src/common.h
+++ b/src/common.h
@@ -27,9 +27,10 @@ typedef struct { /* compile arguments */
size_t patlen;
void * ud;
int cflags;
- const char * locale;
- const unsigned char * tables;
- int tablespos;
+ const char * locale; /* PCRE, Oniguruma */
+ const unsigned char * tables; /* PCRE */
+ int tablespos; /* PCRE */
+ void * syntax; /* Oniguruma */
} TArgComp;
typedef struct { /* exec arguments */
@@ -39,10 +40,10 @@ typedef struct { /* exec arguments */
int eflags;
int funcpos;
int maxmatch;
- int funcpos2; /* used with gsub */
- int reptype; /* used with gsub */
- size_t ovecsize; /* used with dfa_exec */
- size_t wscount; /* used with dfa_exec */
+ int funcpos2; /* used with gsub */
+ int reptype; /* used with gsub */
+ size_t ovecsize; /* PCRE: dfa_exec */
+ size_t wscount; /* PCRE: dfa_exec */
} TArgExec;
struct tagFreeList; /* forward declaration */
diff --git a/test/onig_sets.lua b/test/onig_sets.lua
new file mode 100755
index 0000000..83555a6
--- /dev/null
+++ b/test/onig_sets.lua
@@ -0,0 +1,147 @@
+-- See Copyright Notice in the file LICENSE
+
+local luatest = require "luatest"
+local N = luatest.NT
+
+local function norm(a) return a==nil and N or a end
+
+local function fill (n, m)
+ local t = {}
+ for i = n, m, -1 do table.insert (t, i) end
+ return t
+end
+
+local function set_named_subpatterns (lib, flg)
+ return {
+ Name = "Named Subpatterns",
+ Func = function (methodname, subj, patt, name1, name2)
+ local r = lib.new (patt)
+ local _,_,caps = r[methodname] (r, subj)
+ return norm(caps[name1]), norm(caps[name2])
+ end,
+ --{}
+ { {"tfind", "abcd", "(?<dog>.)b.(?<cat>d)", "dog", "cat"}, {"a","d"} },
+ { {"exec", "abcd", "(?<dog>.)b.(?<cat>d)", "dog", "cat"}, {"a","d"} },
+ }
+end
+
+local function set_f_find (lib, flg)
+ local cp1251 =
+ "ÀÁÂÃÄŨÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÜÛÚÝÞßàáâãäå¸æçèéêëìíîïðñòóôõö÷øùüûúýþÿ"
+ local loc = "CP1251"
+ return {
+ Name = "Function find",
+ Func = lib.find,
+ --{subj, patt, st,cf,ef,lo}, { results }
+ { {"abcd", ".+", 5}, { N } }, -- failing st
+ { {"abcd", ".*?"}, { 1,0 } }, -- non-greedy
+ { {"abc", "aBC", N,flg.IGNORECASE}, { 1,3 } }, -- cf
+ { {"abc", "aBC", N,"i" }, { 1,3 } }, -- cf
+ { {cp1251, "[[:upper:]]+", N,N,N, loc}, { 1,33} }, -- locale
+ { {cp1251, "[[:lower:]]+", N,N,N, loc}, {34,66} }, -- locale
+ { {cp1251, "\\w+", N,N,N, loc}, {1, 66} }, -- locale
+}
+end
+
+local function set_f_match (lib, flg)
+ return {
+ Name = "Function match",
+ Func = lib.match,
+ --{subj, patt, st,cf,ef,lo}, { results }
+ { {"abcd", ".+", 5}, { N }}, -- failing st
+ { {"abcd", ".*?"}, { "" }}, -- non-greedy
+ { {"abc", "aBC", N,flg.IGNORECASE}, {"abc" }}, -- cf
+ { {"abc", "aBC", N,"i" }, {"abc" }}, -- cf
+}
+end
+
+local function set_f_gmatch (lib, flg)
+ -- gmatch (s, p, [cf], [ef])
+ local pCSV = "(^[^,]*)|,([^,]*)"
+ local F = false
+ local function test_gmatch (subj, patt)
+ local out, guard = {}, 10
+ for a, b in lib.gmatch (subj, patt) do
+ table.insert (out, { norm(a), norm(b) })
+ guard = guard - 1
+ if guard == 0 then break end
+ end
+ return unpack (out)
+ end
+ return {
+ Name = "Function gmatch",
+ Func = test_gmatch,
+ --{ subj patt results }
+ { {"a\0c", "." }, {{"a",N},{"\0",N},{"c",N}} },--nuls in subj
+ { {"", pCSV}, {{"",F}} },
+ { {"12", pCSV}, {{"12",F}} },
+ ----{ {",", pCSV}, {{"", F},{F,""}} },
+ { {"12,,45", pCSV}, {{"12",F},{F,""},{F,"45"}} },
+ ----{ {",,12,45,,ab,", pCSV}, {{"",F},{F,""},{F,"12"},{F,"45"},{F,""},{F,"ab"},{F,""}} },
+ }
+end
+
+local function set_f_split (lib, flg)
+ -- split (s, p, [cf], [ef])
+ local function test_split (subj, patt)
+ local out, guard = {}, 10
+ for a, b, c in lib.split (subj, patt) do
+ table.insert (out, { norm(a), norm(b), norm(c) })
+ guard = guard - 1
+ if guard == 0 then break end
+ end
+ return unpack (out)
+ end
+ return {
+ Name = "Function split",
+ Func = test_split,
+ --{ subj patt results }
+ { {"a,\0,c", ","}, {{"a",",",N},{"\0",",",N},{"c",N,N}, } },--nuls in subj
+ { {"ab", "$"}, {{"ab","",N}, {"",N,N}, } },
+ { {"ab", "^|$"}, {{"", "", N}, {"ab","",N}, {"",N,N}, } },
+ { {"ab45ab","(?<=ab).*?"}, {{"ab","",N}, {"45ab","",N},{"",N,N}, } },
+ { {"ab", "\\b"}, {{"", "", N}, {"ab","",N}, {"",N,N}, } },
+ }
+end
+
+local function set_m_exec (lib, flg)
+ return {
+ Name = "Method exec",
+ Method = "exec",
+--{patt,cf,lo}, {subj,st,ef} { results }
+ { {".+"}, {"abcd",5}, { N } }, -- failing st
+ { {".*?"}, {"abcd"}, {1,0,{}} }, -- non-greedy
+ { {"aBC",flg.IGNORECASE}, {"abc"}, {1,3,{}} }, -- cf
+ { {"aBC","i" }, {"abc"}, {1,3,{}} }, -- cf
+}
+end
+
+local function set_m_tfind (lib, flg)
+ return {
+ Name = "Method tfind",
+ Method = "tfind",
+--{patt,cf,lo}, {subj,st,ef} { results }
+ { {".+"}, {"abcd",5}, { N } }, -- failing st
+ { {".*?"}, {"abcd"}, {1,0,{}} }, -- non-greedy
+ { {"aBC",flg.IGNORECASE}, {"abc"}, {1,3,{}} }, -- cf
+ { {"aBC","i" }, {"abc"}, {1,3,{}} }, -- cf
+}
+end
+
+return function (libname)
+ local lib = require (libname)
+ local flags = lib.flags ()
+ local sets = {
+ set_f_match (lib, flags),
+ set_f_find (lib, flags),
+ set_f_gmatch (lib, flags),
+ set_f_split (lib, flags),
+ set_m_exec (lib, flags),
+ set_m_tfind (lib, flags),
+ }
+ local MAJOR = tonumber(lib.version():match("%d+"))
+ if MAJOR >= 0 then
+ table.insert (sets, set_named_subpatterns (lib, flags))
+ end
+ return sets
+end
diff --git a/test/runtest.lua b/test/runtest.lua
index ede0c0c..cc75205 100755
--- a/test/runtest.lua
+++ b/test/runtest.lua
@@ -48,6 +48,7 @@ local avail_tests = {
pcre = { lib = "rex_pcre", "common_sets", "pcre_sets", "pcre_sets2", },
pcre_nr = { lib = "rex_pcre_nr", "common_sets", "pcre_sets", "pcre_sets2", },
pcre45 = { lib = "rex_pcre45", "common_sets", "pcre_sets", "pcre_sets2", },
+ onig = { lib = "rex_onig", "common_sets", "onig_sets", }
}
do