diff options
author | Russell Branca <chewbranca@apache.org> | 2019-07-02 13:31:33 -0700 |
---|---|---|
committer | Paul J. Davis <paul.joseph.davis@gmail.com> | 2019-07-31 11:55:30 -0500 |
commit | d42d9b75e6a8f46edad13d47aef71d0f08175ddc (patch) | |
tree | cf3d52ff4ab1758b6feea7a1a3b9976589213b34 | |
parent | 8e574e987c967cb8aa319719d5f5d1f8f4a78fd7 (diff) | |
download | couchdb-d42d9b75e6a8f46edad13d47aef71d0f08175ddc.tar.gz |
Expose ICU ucol_getSortKey
-rw-r--r-- | src/couch/priv/icu_driver/couch_icu_driver.c | 74 | ||||
-rw-r--r-- | src/couch/src/couch_util.erl | 13 | ||||
-rw-r--r-- | src/couch/test/eunit/couch_util_tests.erl | 140 |
3 files changed, 219 insertions, 8 deletions
diff --git a/src/couch/priv/icu_driver/couch_icu_driver.c b/src/couch/priv/icu_driver/couch_icu_driver.c index 4d9bb982d..ffccf2e9d 100644 --- a/src/couch/priv/icu_driver/couch_icu_driver.c +++ b/src/couch/priv/icu_driver/couch_icu_driver.c @@ -30,6 +30,8 @@ specific language governing permissions and limitations under the License. #include <string.h> /* for memcpy */ #endif +#define BUFFER_SIZE 1024 + typedef struct { ErlDrvPort port; @@ -54,6 +56,8 @@ static ErlDrvData couch_drv_start(ErlDrvPort port, char *buff) UErrorCode status = U_ZERO_ERROR; couch_drv_data* pData = (couch_drv_data*)driver_alloc(sizeof(couch_drv_data)); + set_port_control_flags(port, PORT_CONTROL_FLAG_BINARY); + if (pData == NULL) return ERL_DRV_ERROR_GENERAL; @@ -84,14 +88,17 @@ ErlDrvSSizeT return_control_result(void* pLocalResult, int localLen, char **ppRetBuf, ErlDrvSizeT returnLen) { + ErlDrvBinary* buf = NULL; + if (*ppRetBuf == NULL || localLen > returnLen) { - *ppRetBuf = (char*)driver_alloc_binary(localLen); - if(*ppRetBuf == NULL) { - return -1; - } + buf = driver_alloc_binary(localLen); + memcpy(buf->orig_bytes, pLocalResult, localLen); + *ppRetBuf = (char*) buf; + return localLen; + } else { + memcpy(*ppRetBuf, pLocalResult, localLen); + return localLen; } - memcpy(*ppRetBuf, pLocalResult, localLen); - return localLen; } static ErlDrvSSizeT @@ -147,6 +154,61 @@ couch_drv_control(ErlDrvData drv_data, unsigned int command, return return_control_result(&response, sizeof(response), rbuf, rlen); } + case 2: /* GET_SORT_KEY: */ + { + + UChar source[BUFFER_SIZE]; + UChar* sourcePtr = source; + int32_t sourceLen = BUFFER_SIZE; + + uint8_t sortKey[BUFFER_SIZE]; + uint8_t* sortKeyPtr = sortKey; + int32_t sortKeyLen = BUFFER_SIZE; + + int32_t inputLen; + + UErrorCode status = U_ZERO_ERROR; + ErlDrvSSizeT res; + + /* first 32bits are the length */ + memcpy(&inputLen, pBuf, sizeof(inputLen)); + pBuf += sizeof(inputLen); + + u_strFromUTF8(sourcePtr, BUFFER_SIZE, &sourceLen, pBuf, inputLen, &status); + + if (sourceLen >= BUFFER_SIZE) { + /* reset status or next u_strFromUTF8 call will auto-fail */ + status = U_ZERO_ERROR; + sourcePtr = (UChar*) malloc(sourceLen * sizeof(UChar)); + u_strFromUTF8(sourcePtr, sourceLen, NULL, pBuf, inputLen, &status); + if (U_FAILURE(status)) { + rbuf = NULL; + return 0; + } + } else if (U_FAILURE(status)) { + rbuf = NULL; + return 0; + } + + sortKeyLen = ucol_getSortKey(pData->coll, sourcePtr, sourceLen, sortKeyPtr, BUFFER_SIZE); + + if (sortKeyLen > BUFFER_SIZE) { + sortKeyPtr = (uint8_t*) malloc(sortKeyLen); + ucol_getSortKey(pData->coll, sourcePtr, sourceLen, sortKeyPtr, sortKeyLen); + } + + res = return_control_result(sortKeyPtr, sortKeyLen, rbuf, rlen); + + if (sourcePtr != source) { + free(sourcePtr); + } + + if (sortKeyPtr != sortKey) { + free(sortKeyPtr); + } + + return res; + } default: return -1; diff --git a/src/couch/src/couch_util.erl b/src/couch/src/couch_util.erl index 62e17ce36..b3553a5b5 100644 --- a/src/couch/src/couch_util.erl +++ b/src/couch/src/couch_util.erl @@ -14,7 +14,7 @@ -export([priv_dir/0, normpath/1, fold_files/5]). -export([should_flush/0, should_flush/1, to_existing_atom/1]). --export([rand32/0, implode/2, collate/2, collate/3]). +-export([rand32/0, implode/2, collate/2, collate/3, get_sort_key/1]). -export([abs_pathname/1,abs_pathname/2, trim/1, drop_dot_couch_ext/1]). -export([encodeBase64Url/1, decodeBase64Url/1]). -export([validate_utf8/1, to_hex/1, parse_term/1, dict_find/3]). @@ -406,11 +406,20 @@ collate(A, B, Options) when is_binary(A), is_binary(B) -> SizeA = byte_size(A), SizeB = byte_size(B), Bin = <<SizeA:32/native, A/binary, SizeB:32/native, B/binary>>, - [Result] = erlang:port_control(drv_port(), Operation, Bin), + <<Result>> = erlang:port_control(drv_port(), Operation, Bin), % Result is 0 for lt, 1 for eq and 2 for gt. Subtract 1 to return the % expected typical -1, 0, 1 Result - 1. +get_sort_key(Str) when is_binary(Str) -> + Operation = 2, % get_sort_key + Size = byte_size(Str), + Bin = <<Size:32/native, Str/binary>>, + case erlang:port_control(drv_port(), Operation, Bin) of + <<>> -> error; + Res -> Res + end. + should_flush() -> should_flush(?FLUSH_MAX_MEM). diff --git a/src/couch/test/eunit/couch_util_tests.erl b/src/couch/test/eunit/couch_util_tests.erl index 3e145c4f6..b518028cc 100644 --- a/src/couch/test/eunit/couch_util_tests.erl +++ b/src/couch/test/eunit/couch_util_tests.erl @@ -14,6 +14,12 @@ -include_lib("couch/include/couch_eunit.hrl"). +% For generating poisson distributed string lengths +% in the random unicode generation. This shoots +% for lengths centered around 24 characters. To +% change, replace this value with math:exp(-Length). +-define(POISSON_LIMIT, 3.775134544279098e-11). +-define(RANDOM_TEST_SIZE, 10000). setup() -> %% We cannot start driver from here since it becomes bounded to eunit @@ -168,3 +174,137 @@ to_hex_test_() -> ?_assertEqual("", couch_util:to_hex(<<>>)), ?_assertEqual("010203faff", couch_util:to_hex(<<1, 2, 3, 250, 255>>)) ]. + +sort_key_test_() -> + { + "Sort Key tests", + [ + { + foreach, + fun setup/0, fun teardown/1, + [ + fun test_get_sort_key/1, + fun test_get_sort_key_jiffy_string/1, + fun test_get_sort_key_fails_on_bad_input/1, + fun test_get_sort_key_longer_than_buffer/1, + fun test_sort_key_collation/1, + fun test_sort_key_list_sort/1 + ] + } + ] + }. + +test_get_sort_key(_) -> + Strs = [ + <<"">>, + <<"foo">>, + <<"bar">>, + <<"Bar">>, + <<"baz">>, + <<"BAZ">>, + <<"quaz">>, + <<"1234fdsa">>, + <<"1234">>, + <<"pizza">> + ], + Pairs = [{S1, S2} || S1 <- Strs, S2 <- Strs], + lists:map(fun({S1, S2}) -> + S1K = couch_util:get_sort_key(S1), + S2K = couch_util:get_sort_key(S2), + SortRes = sort_keys(S1K, S2K), + Comment = list_to_binary(io_lib:format("strcmp(~p, ~p)", [S1, S2])), + CollRes = couch_util:collate(S1, S2), + {Comment, ?_assertEqual(SortRes, CollRes)} + end, Pairs). + +test_get_sort_key_jiffy_string(_) -> + %% jiffy:decode does not null terminate strings + %% so we use it here to test unterminated strings + {[{S1,S2}]} = jiffy:decode(<<"{\"foo\": \"bar\"}">>), + S1K = couch_util:get_sort_key(S1), + S2K = couch_util:get_sort_key(S2), + SortRes = sort_keys(S1K, S2K), + CollRes = couch_util:collate(S1, S2), + ?_assertEqual(SortRes, CollRes). + +test_get_sort_key_fails_on_bad_input(_) -> + %% generated with crypto:strong_rand_bytes + %% contains invalid character, should error + S = <<209,98,222,144,60,163,72,134,206,157>>, + Res = couch_util:get_sort_key(S), + ?_assertEqual(error, Res). + +test_get_sort_key_longer_than_buffer(_) -> + %% stack allocated buffer is 1024 units + %% test resize logic with strings > 1024 char + Extra = list_to_binary(["a" || _ <- lists:seq(1, 1200)]), + ?_assert(is_binary(Extra)). + +test_sort_key_collation(_) -> + ?_test(begin + lists:foreach(fun(_) -> + K1 = random_unicode_binary(), + SK1 = couch_util:get_sort_key(K1), + + K2 = random_unicode_binary(), + SK2 = couch_util:get_sort_key(K2), + + % Probably kinda silly but whatevs + ?assertEqual(couch_util:collate(K1, K1), sort_keys(SK1, SK1)), + ?assertEqual(couch_util:collate(K2, K2), sort_keys(SK2, SK2)), + + ?assertEqual(couch_util:collate(K1, K2), sort_keys(SK1, SK2)), + ?assertEqual(couch_util:collate(K2, K1), sort_keys(SK2, SK1)) + end, lists:seq(1, ?RANDOM_TEST_SIZE)) + end). + +test_sort_key_list_sort(_) -> + ?_test(begin + RandomKeys = lists:map(fun(_) -> + random_unicode_binary() + end, lists:seq(1, ?RANDOM_TEST_SIZE)), + + CollationSorted = lists:sort(fun(A, B) -> + couch_util:collate(A, B) =< 0 + end, RandomKeys), + + SortKeys = lists:map(fun(K) -> + {couch_util:get_sort_key(K), K} + end, RandomKeys), + {_, SortKeySorted} = lists:unzip(lists:sort(SortKeys)), + + ?assertEqual(CollationSorted, SortKeySorted) + end). + +sort_keys(S1, S2) -> + case S1 < S2 of + true -> + -1; + false -> case S1 =:= S2 of + true -> + 0; + false -> + 1 + end + end. + +random_unicode_binary() -> + Size = poisson_length(0, rand:uniform()), + Chars = [random_unicode_char() || _ <- lists:seq(1, Size)], + <<_/binary>> = unicode:characters_to_binary(Chars). + +poisson_length(N, Acc) when Acc > ?POISSON_LIMIT -> + poisson_length(N + 1, Acc * rand:uniform()); +poisson_length(N, _) -> + N. + +random_unicode_char() -> + BaseChar = rand:uniform(16#FFFD + 1) - 1, + case BaseChar of + BC when BC >= 16#D800, BC =< 16#DFFF -> + % This range is reserved for surrogate pair + % encodings. + random_unicode_char(); + BC -> + BC + end. |