From 9a4df7ffa58f90a802b9ce5704cb2452cb54fc9e Mon Sep 17 00:00:00 2001 From: Shmuel Zeigerman Date: Sun, 12 Feb 2012 23:31:32 +0200 Subject: TRE binding: add wide-character functions. --- src/algo.h | 18 ++++--- src/common.c | 39 +++++++++++++++ src/common.h | 1 + src/tre/ltre_w.c | 1 + test/tre_sets.lua | 138 +++++++++++++++++++++++++++--------------------------- 5 files changed, 121 insertions(+), 76 deletions(-) diff --git a/src/algo.h b/src/algo.h index 65db6c5..d5ad9d7 100644 --- a/src/algo.h +++ b/src/algo.h @@ -17,6 +17,10 @@ static int generate_error (lua_State *L, const TUserdata *ud, int errcode); # define ALG_CHARSIZE 1 #endif +#ifndef BUFFERZ_PUTREPSTRING +# define BUFFERZ_PUTREPSTRING bufferZ_putrepstring +#endif + #ifndef ALG_GETCARGS # define ALG_GETCARGS(a,b,c) #endif @@ -213,7 +217,7 @@ static int gsub (lua_State *L) { /*------------------------------------------------------------------*/ if (argE.reptype == LUA_TSTRING) { buffer_init (&BufRep, 256, L, &freelist); - bufferZ_putrepstring (&BufRep, argE.funcpos, ALG_NSUB(ud)); + BUFFERZ_PUTREPSTRING (&BufRep, argE.funcpos, ALG_NSUB(ud)); } /*------------------------------------------------------------------*/ if (argE.maxmatch == GSUB_CONDITIONAL) { @@ -231,8 +235,8 @@ static int gsub (lua_State *L) { #ifdef ALG_USERETRY if (retry) { if (st < (int)argE.textlen) { /* advance by 1 char (not replaced) */ - buffer_addlstring (&BufOut, argE.text + st, 1); - ++st; + buffer_addlstring (&BufOut, argE.text + st, ALG_CHARSIZE); + st += ALG_CHARSIZE; retry = 0; continue; } @@ -309,8 +313,8 @@ static int gsub (lua_State *L) { if (argE.maxmatch == GSUB_CONDITIONAL) { /* Call the function */ lua_pushvalue (L, argE.funcpos2); - lua_pushinteger (L, from + 1); - lua_pushinteger (L, to); + lua_pushinteger (L, from/ALG_CHARSIZE + 1); + lua_pushinteger (L, to/ALG_CHARSIZE); if (argE.reptype == LUA_TSTRING) buffer_pushresult (&BufTemp); else { @@ -359,8 +363,8 @@ static int gsub (lua_State *L) { retry = 1; #else /* advance by 1 char (not replaced) */ - buffer_addlstring (&BufOut, argE.text + st, 1); - ++st; + buffer_addlstring (&BufOut, argE.text + st, ALG_CHARSIZE); + st += ALG_CHARSIZE; #endif } else break; diff --git a/src/common.c b/src/common.c index ae6147c..03a2f1d 100644 --- a/src/common.c +++ b/src/common.c @@ -220,6 +220,45 @@ void bufferZ_putrepstring (TBuffer *BufRep, int reppos, int nsub) { } } +/* 1. When called repeatedly on the same TBuffer, its existing data + is discarded and overwritten by the new data. + 2. The TBuffer's array is never shrunk by this function. +*/ +void bufferZ_putrepstringW (TBuffer *BufRep, int reppos, int nsub) { + char dbuf[] = { 0, 0 }; + size_t replen; + const wchar_t *p = (const wchar_t*) lua_tolstring (BufRep->L, reppos, &replen); + replen /= sizeof(wchar_t); + const wchar_t *end = p + replen; + BufRep->top = 0; + while (p < end) { + const wchar_t *q; + for (q = p; q < end && *q != L'%'; ++q) + {} + if (q != p) + bufferZ_addlstring (BufRep, p, (q - p) * sizeof(wchar_t)); + if (q < end) { + if (++q < end) { /* skip % */ + if (iswdigit (*q)) { + int num; + *dbuf = *q & 0xFF; + num = atoi (dbuf); + if (num == 1 && nsub == 0) + num = 0; + else if (num > nsub) { + freelist_free (BufRep->freelist); + luaL_error (BufRep->L, "invalid capture index"); + } + bufferZ_addnum (BufRep, num); + } + else bufferZ_addlstring (BufRep, q, 1 * sizeof(wchar_t)); + } + p = q + 1; + } + else break; + } +} + /****************************************************************************** The intended use of this function is as follows: size_t iter = 0; diff --git a/src/common.h b/src/common.h index de8d52d..04ef7cb 100644 --- a/src/common.h +++ b/src/common.h @@ -83,6 +83,7 @@ void buffer_addvalue (TBuffer *buf, int stackpos); void buffer_pushresult (TBuffer *buf); void bufferZ_putrepstring (TBuffer *buf, int reppos, int nsub); +void bufferZ_putrepstringW (TBuffer *buf, int reppos, int nsub); int bufferZ_next (TBuffer *buf, size_t *iter, size_t *len, const char **str); int get_int_field (lua_State *L, const char* field); diff --git a/src/tre/ltre_w.c b/src/tre/ltre_w.c index bd3ca1e..7f8f87b 100644 --- a/src/tre/ltre_w.c +++ b/src/tre/ltre_w.c @@ -25,6 +25,7 @@ #define ALG_CFLAGS_DFLT REG_EXTENDED #define ALG_EFLAGS_DFLT 0 #define ALG_CHARSIZE 2 +#define BUFFERZ_PUTREPSTRING bufferZ_putrepstringW #define ALG_NOMATCH(res) ((res) == REG_NOMATCH) #define ALG_ISMATCH(res) ((res) == 0) diff --git a/test/tre_sets.lua b/test/tre_sets.lua index 2b25b48..b2169a1 100644 --- a/test/tre_sets.lua +++ b/test/tre_sets.lua @@ -185,7 +185,7 @@ local function set_f_wgsub1 (lib, flg) return { Name = "Function wgsub, set1", Func = get_wgsub (lib), - --{ s, p, f, n, res1, res2, res3 }, + --{ s, p, f, n, res1, res2, res3 }, { {subj, cpat, L"", 0}, {subj, 0, 0} }, -- test "n" + empty_replace { {subj, pat, L"", 0}, {subj, 0, 0} }, -- test "n" + empty_replace { {subj, pat, L"", -1}, {subj, 0, 0} }, -- test "n" + empty_replace @@ -207,7 +207,7 @@ local function set_f_wgsub2 (lib, flg) return { Name = "Function wgsub, set2", Func = get_wgsub (lib), - --{ s, p, f, n, res1, res2, res3 }, + --{ s, p, f, n, res1, res2, res3 }, { {subj, pat, L"<%1>" }, {L"b", 2, 2} }, -- test non-escaped chars in f { {subj, pat, L"%<%1%>" }, {L"b", 2, 2} }, -- test escaped chars in f { {subj, pat, L"" }, {L"b", 2, 2} }, -- test empty replace @@ -224,12 +224,12 @@ local function set_f_wgsub3 (lib, flg) return { Name = "Function wgsub, set3", Func = get_wgsub (lib), - --{ s, p, f, n, res1,res2,res3 }, - { {L"abc", L"a", "%0" }, {"abc", 1, 1} }, -- test (in)valid capture index - { {L"abc", L"a", "%1" }, {"abc", 1, 1} }, - { {L"abc", L"[ac]", "%1" }, {"abc", 2, 2} }, - { {L"abc", L"(a)", "%1" }, {"abc", 1, 1} }, - { {L"abc", L"(a)", "%2" }, "invalid capture index" }, + --{ s, p, f, n, res1,res2,res3 }, + { {L"abc", L"a", L"%0" }, {L"abc", 1, 1} }, -- test (in)valid capture index + { {L"abc", L"a", L"%1" }, {L"abc", 1, 1} }, + { {L"abc", L"[ac]", L"%1" }, {L"abc", 2, 2} }, + { {L"abc", L"(a)", L"%1" }, {L"abc", 1, 1} }, + { {L"abc", L"(a)", L"%2" }, "invalid capture index" }, } end @@ -237,87 +237,87 @@ local function set_f_wgsub4 (lib, flg) return { Name = "Function wgsub, set4", Func = get_wgsub (lib), - --{ s, p, f, n, res1, res2, res3 }, - { {"a2c3", ".", "#" }, {"####", 4, 4} }, -- test . - { {"a2c3", ".+", "#" }, {"#", 1, 1} }, -- test .+ - { {"a2c3", ".*", "#" }, {"##", 2, 2} }, -- test .* - { {"/* */ */", "\\/\\*(.*)\\*\\/", "#" }, {"#", 1, 1} }, - { {"a2c3", "[0-9]", "#" }, {"a#c#", 2, 2} }, -- test %d - { {"a2c3", "[^0-9]", "#" }, {"#2#3", 2, 2} }, -- test %D - { {"a \t\nb", "[ \t\n]", "#" }, {"a###b", 3, 3} }, -- test %s - { {"a \t\nb", "[^ \t\n]", "#" }, {"# \t\n#", 2, 2} }, -- test %S + --{ s, p, f, n, res1, res2, res3 }, + { {L"a2c3", L".", L"#" }, {L"####", 4, 4} }, -- test . + { {L"a2c3", L".+", L"#" }, {L"#", 1, 1} }, -- test .+ + { {L"a2c3", L".*", L"#" }, {L"##", 2, 2} }, -- test .* + { {L"/* */ */", L"\\/\\*(.*)\\*\\/", L"#" }, {L"#", 1, 1} }, + { {L"a2c3", L"[0-9]", L"#" }, {L"a#c#", 2, 2} }, -- test %d + { {L"a2c3", L"[^0-9]", L"#" }, {L"#2#3", 2, 2} }, -- test %D + { {L"a \t\nb", L"[ \t\n]", L"#" }, {L"a###b", 3, 3} }, -- test %s + { {L"a \t\nb", L"[^ \t\n]", L"#" }, {L"# \t\n#", 2, 2} }, -- test %S } end local function set_f_wgsub5 (lib, flg) local function frep1 () end -- returns nothing - local function frep2 () return "#" end -- ignores arguments - local function frep3 (...) return table.concat({...}, ",") end -- "normal" + local function frep2 () return L"#" end -- ignores arguments + local function frep3 (...) return table.concat({...}, L",") end -- "normal" local function frep4 () return {} end -- invalid return type - local function frep5 () return "7", "a" end -- 2-nd return is "a" - local function frep6 () return "7", "break" end -- 2-nd return is "break" - local subj = "a2c3" + local function frep5 () return L"7", L"a" end -- 2-nd return is "a" + local function frep6 () return L"7", "break" end -- 2-nd return is "break" + local subj = L"a2c3" return { Name = "Function wgsub, set5", Func = get_wgsub (lib), - --{ s, p, f, n, res1, res2, res3 }, - { {subj, "a(.)c(.)", frep1 }, {subj, 1, 0} }, - { {subj, "a(.)c(.)", frep2 }, {"#", 1, 1} }, - { {subj, "a(.)c(.)", frep3 }, {"2,3", 1, 1} }, - { {subj, "a.c.", frep3 }, {subj, 1, 1} }, - { {subj, "z*", frep1 }, {subj, 5, 0} }, - { {subj, "z*", frep2 }, {"#a#2#c#3#", 5, 5} }, - { {subj, "z*", frep3 }, {subj, 5, 5} }, - { {subj, subj, frep4 }, "invalid return type" }, - { {"abc",".", frep5 }, {"777", 3, 3} }, - { {"abc",".", frep6 }, {"777", 3, 3} }, + --{ s, p, f, n, res1, res2, res3 }, + { {subj, L"a(.)c(.)", frep1 }, {subj, 1, 0} }, + { {subj, L"a(.)c(.)", frep2 }, {L"#", 1, 1} }, + { {subj, L"a(.)c(.)", frep3 }, {L"2,3", 1, 1} }, + { {subj, L"a.c.", frep3 }, {subj, 1, 1} }, + { {subj, L"z*", frep1 }, {subj, 5, 0} }, + { {subj, L"z*", frep2 }, {L"#a#2#c#3#", 5, 5} }, + { {subj, L"z*", frep3 }, {subj, 5, 5} }, + { {subj, subj, frep4 }, "invalid return type" }, + { {L"abc",L".", frep5 }, {L"777", 3, 3} }, + { {L"abc",L".", frep6 }, {L"777", 3, 3} }, } end local function set_f_wgsub6 (lib, flg) - local tab1, tab2, tab3 = {}, { ["2"] = 56 }, { ["2"] = {} } - local subj = "a2c3" + local tab1, tab2, tab3 = {}, { [L"2"] = 56 }, { [L"2"] = {} } + local subj = L"a2c3" return { Name = "Function wgsub, set6", Func = get_wgsub (lib), - --{ s, p, f, n, res1,res2,res3 }, - { {subj, "a(.)c(.)", tab1 }, {subj, 1, 0} }, - { {subj, "a(.)c(.)", tab2 }, {"56", 1, 1} }, - { {subj, "a(.)c(.)", tab3 }, "invalid replacement type" }, - { {subj, "a.c.", tab1 }, {subj, 1, 0} }, - { {subj, "a.c.", tab2 }, {subj, 1, 0} }, - { {subj, "a.c.", tab3 }, {subj, 1, 0} }, + --{ s, p, f, n, res1,res2,res3 }, + { {subj, L"a(.)c(.)", tab1 }, {subj, 1, 0} }, + { {subj, L"a(.)c(.)", tab2 }, {"56", 1, 1} }, + { {subj, L"a(.)c(.)", tab3 }, "invalid replacement type" }, + { {subj, L"a.c.", tab1 }, {subj, 1, 0} }, + { {subj, L"a.c.", tab2 }, {subj, 1, 0} }, + { {subj, L"a.c.", tab3 }, {subj, 1, 0} }, } end local function set_f_wgsub8 (lib, flg) - local subj, patt, repl = "abcdef", "..", "*" + local subj, patt, repl = L"abcdef", L"..", L"*" return { Name = "Function wgsub, set8", Func = get_wgsub (lib), --{ s, p, f, n, res1, res2, res3 }, - { {subj, patt, repl, function() end }, {"abcdef", 3, 0} }, - { {subj, patt, repl, function() return nil end }, {"abcdef", 3, 0} }, - { {subj, patt, repl, function() return false end }, {"abcdef", 3, 0} }, - { {subj, patt, repl, function() return true end }, {"***", 3, 3} }, - { {subj, patt, repl, function() return {} end }, {"***", 3, 3} }, - { {subj, patt, repl, function() return "#" end }, {"###", 3, 3} }, - { {subj, patt, repl, function() return 57 end }, {"575757", 3, 3} }, - { {subj, patt, repl, function (from) return from end }, {"135", 3, 3} }, - { {subj, patt, repl, function (from, to) return to end }, {"246", 3, 3} }, + { {subj, patt, repl, function() end }, {L"abcdef", 3, 0} }, + { {subj, patt, repl, function() return nil end }, {L"abcdef", 3, 0} }, + { {subj, patt, repl, function() return false end }, {L"abcdef", 3, 0} }, + { {subj, patt, repl, function() return true end }, {L"***", 3, 3} }, + { {subj, patt, repl, function() return {} end }, {L"***", 3, 3} }, + { {subj, patt, repl, function() return L"#" end }, {L"###", 3, 3} }, + { {subj, patt, repl, function() return 57 end }, {"575757", 3, 3} }, + { {subj, patt, repl, function (from) return from end }, {"135", 3, 3} }, + { {subj, patt, repl, function (from, to) return to end }, {"246", 3, 3} }, { {subj, patt, repl, function (from,to,rep) return rep end }, - {"***", 3, 3} }, + {L"***", 3, 3} }, { {subj, patt, repl, function (from, to, rep) return rep..to..from end }, - {"*21*43*65", 3, 3} }, - { {subj, patt, repl, function() return nil end }, {"abcdef", 3, 0} }, - { {subj, patt, repl, function() return nil, nil end }, {"abcdef", 3, 0} }, - { {subj, patt, repl, function() return nil, false end }, {"abcdef", 3, 0} }, - { {subj, patt, repl, function() return nil, true end }, {"ab**", 3, 2} }, - { {subj, patt, repl, function() return true, true end }, {"***", 3, 3} }, - { {subj, patt, repl, function() return nil, 0 end }, {"abcdef", 1, 0} }, - { {subj, patt, repl, function() return true, 0 end }, {"*cdef", 1, 1} }, - { {subj, patt, repl, function() return nil, 1 end }, {"ab*ef", 2, 1} }, - { {subj, patt, repl, function() return true, 1 end }, {"**ef", 2, 2} }, + {L"*".."21"..L"*".."43"..L"*".."65", 3, 3} }, + { {subj, patt, repl, function() return nil end }, {L"abcdef", 3, 0} }, + { {subj, patt, repl, function() return nil, nil end }, {L"abcdef", 3, 0} }, + { {subj, patt, repl, function() return nil, false end }, {L"abcdef", 3, 0} }, + { {subj, patt, repl, function() return nil, true end }, {L"ab**", 3, 2} }, + { {subj, patt, repl, function() return true, true end }, {L"***", 3, 3} }, + { {subj, patt, repl, function() return nil, 0 end }, {L"abcdef", 1, 0} }, + { {subj, patt, repl, function() return true, 0 end }, {L"*cdef", 1, 1} }, + { {subj, patt, repl, function() return nil, 1 end }, {L"ab*ef", 2, 1} }, + { {subj, patt, repl, function() return true, 1 end }, {L"**ef", 2, 2} }, } end @@ -334,12 +334,12 @@ return function (libname) set_m_wfind (lib), set_m_wmatch (lib), set_f_wgsub1 (lib), - --set_f_wgsub2 (lib), - --set_f_wgsub3 (lib), - --set_f_wgsub4 (lib), - --set_f_wgsub5 (lib), - --set_f_wgsub6 (lib), - --set_f_wgsub8 (lib), + set_f_wgsub2 (lib), + set_f_wgsub3 (lib), + set_f_wgsub4 (lib), + set_f_wgsub5 (lib), + set_f_wgsub6 (lib), + set_f_wgsub8 (lib), --set_f_plainfind (lib), } end -- cgit v1.2.1