diff options
-rw-r--r-- | numpy/core/src/multiarray/item_selection.c | 3 | ||||
-rw-r--r-- | numpy/core/src/npysort/npysort_common.h | 8 | ||||
-rw-r--r-- | numpy/core/src/npysort/timsort.c.src | 2170 |
3 files changed, 1922 insertions, 259 deletions
diff --git a/numpy/core/src/multiarray/item_selection.c b/numpy/core/src/multiarray/item_selection.c index 6265aa207..7f560363c 100644 --- a/numpy/core/src/multiarray/item_selection.c +++ b/numpy/core/src/multiarray/item_selection.c @@ -1289,6 +1289,9 @@ PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which) case NPY_MERGESORT: argsort = npy_amergesort; break; + case NPY_TIMSORT: + argsort = npy_atimsort; + break; } } else { diff --git a/numpy/core/src/npysort/npysort_common.h b/numpy/core/src/npysort/npysort_common.h index a22045b41..5fd03b96f 100644 --- a/numpy/core/src/npysort/npysort_common.h +++ b/numpy/core/src/npysort/npysort_common.h @@ -273,10 +273,10 @@ STRING_SWAP(char *s1, char *s2, size_t len) NPY_INLINE static int -STRING_LT(char *s1, char *s2, size_t len) +STRING_LT(const char *s1, const char *s2, size_t len) { - const unsigned char *c1 = (unsigned char *)s1; - const unsigned char *c2 = (unsigned char *)s2; + const unsigned char *c1 = (const unsigned char *)s1; + const unsigned char *c2 = (const unsigned char *)s2; size_t i; int ret = 0; @@ -311,7 +311,7 @@ UNICODE_SWAP(npy_ucs4 *s1, npy_ucs4 *s2, size_t len) NPY_INLINE static int -UNICODE_LT(npy_ucs4 *s1, npy_ucs4 *s2, size_t len) +UNICODE_LT(const npy_ucs4 *s1, const npy_ucs4 *s2, size_t len) { size_t i; int ret = 0; diff --git a/numpy/core/src/npysort/timsort.c.src b/numpy/core/src/npysort/timsort.c.src index cf2377555..1fd86ff97 100644 --- a/numpy/core/src/npysort/timsort.c.src +++ b/numpy/core/src/npysort/timsort.c.src @@ -26,21 +26,22 @@ * The heap sort is included for completeness. */ + +/* For details of Timsort, refer to + * https://github.com/python/cpython/blob/3.7/Objects/listsort.txt + */ + #define NPY_NO_DEPRECATED_API NPY_API_VERSION #include "npy_sort.h" #include "npysort_common.h" #include <stdlib.h> -#define NOT_USED NPY_UNUSED(unused) -#define PYA_QS_STACK 100 -#define SMALL_QUICKSORT 15 -#define SMALL_MERGESORT 20 -#define SMALL_STRING 16 - +/* enough for 32 * 1.618 ** 128 elements */ #define TIMSORT_STACK_SIZE 128 + npy_intp compute_min_run(npy_intp num) { npy_intp r = 0; @@ -54,11 +55,41 @@ npy_intp compute_min_run(npy_intp num) } typedef struct { - npy_intp s; - npy_intp l; + npy_intp s; /* start pointer */ + npy_intp l; /* length */ } run; +/* buffer for argsort. Declared here to avoid multiple declarations. */ +typedef struct { + npy_intp *pw; + npy_intp size; +} buffer_intp; + + +/* buffer method */ +static NPY_INLINE int +resize_buffer_intp(buffer_intp *buffer, npy_intp new_size) +{ + if (new_size <= buffer->size) { + return 0; + } + + if (NPY_UNLIKELY(buffer->pw == NULL)) { + buffer->pw = malloc(new_size * sizeof(npy_intp)); + } else { + buffer->pw = realloc(buffer->pw, new_size * sizeof(npy_intp)); + } + + buffer->size = new_size; + + if (NPY_UNLIKELY(buffer->pw == NULL)) { + return -NPY_ENOMEM; + } else { + return 0; + } +} + /* ***************************************************************************** ** NUMERIC SORTS ** @@ -122,14 +153,12 @@ count_run_@suff@(@type@ *arr, npy_intp l, npy_intp num, npy_intp minrun) pl = arr + l; - // (not strictly) ascending sequence + /* (not strictly) ascending sequence */ if (!@TYPE@_LT(*(pl + 1), *pl)) { - for (pi = pl + 1; pi < pl + num - 1 && !@TYPE@_LT(*(pi + 1), *pi); ++pi) { + for (pi = pl + 1; pi < arr + num - 1 && !@TYPE@_LT(*(pi + 1), *pi); ++pi) { } - - // (strictly) descending sequence - } else { - for (pi = pl + 1; pi < pl + num - 1 && @TYPE@_LT(*(pi + 1), *pi); ++pi) { + } else { /* (strictly) descending sequence */ + for (pi = pl + 1; pi < arr + num - 1 && @TYPE@_LT(*(pi + 1), *pi); ++pi) { } for (pj = pl, pr = pi; pj < pr; ++pj, --pr) { @@ -137,7 +166,8 @@ count_run_@suff@(@type@ *arr, npy_intp l, npy_intp num, npy_intp minrun) } } - sz = pi - pl + 1; + ++pi; + sz = pi - pl; if (sz < minrun) { if (l + minrun < num) { @@ -148,6 +178,7 @@ count_run_@suff@(@type@ *arr, npy_intp l, npy_intp num, npy_intp minrun) pr = pl + sz; + /* insertion sort */ for (; pi < pr; ++pi) { vc = *pi; pj = pi; @@ -165,6 +196,9 @@ count_run_@suff@(@type@ *arr, npy_intp l, npy_intp num, npy_intp minrun) } +/* when the left part of the array (p1) is smaller, copy p1 to buffer + * and merge from left to right + */ static void merge_left_@suff@(@type@ *p1, npy_intp l1, @type@ *p2, npy_intp l2, @type@ *p3) @@ -188,6 +222,9 @@ merge_left_@suff@(@type@ *p1, npy_intp l1, @type@ *p2, npy_intp l2, } +/* when the right part of the array (p2) is smaller, copy p2 to buffer + * and merge from right to left + */ static void merge_right_@suff@(@type@ *p1, npy_intp l1, @type@ *p2, npy_intp l2, @type@ *p3) @@ -243,6 +280,7 @@ gallop_right_@suff@(const @type@ *arr, const npy_intp size, const @type@ key) break; } else { last_ofs = ofs; + /* ofs = 1, 3, 7, 15... */ ofs = (ofs << 1) + 1; } } @@ -319,7 +357,10 @@ merge_at_@suff@(@type@ *arr, const run *stack, const npy_intp at, l1 = stack[at].l; s2 = stack[at + 1].s; l2 = stack[at + 1].l; - /* arr[s2] belongs to arr[s1+k] */ + /* arr[s2] belongs to arr[s1+k]. + * if try to comment this out for debugging purpose, remember + * in the merging process the first element is skipped + */ k = gallop_right_@suff@(arr + s1, l1, arr[s2]); if (l1 == k) { @@ -436,32 +477,30 @@ force_collapse_@suff@(@type@ *arr, run *stack, npy_intp *stack_ptr, int -timsort_@suff@(void *start, npy_intp num, void *NOT_USED) +timsort_@suff@(void *start, npy_intp num, void *NPY_UNUSED(varr)) { int ret; npy_intp l, n, stack_ptr, minrun; buffer_@suff@ buffer; - @type@ *arr; run stack[TIMSORT_STACK_SIZE]; buffer.pw = NULL; buffer.size = 0; stack_ptr = 0; minrun = compute_min_run(num); - arr = start; for (l = 0; l < num;) { - n = count_run_@suff@(arr, l, num, minrun); + n = count_run_@suff@(start, l, num, minrun); stack[stack_ptr].s = l; stack[stack_ptr].l = n; ++stack_ptr; - ret = try_collapse_@suff@(arr, stack, &stack_ptr, &buffer); + ret = try_collapse_@suff@(start, stack, &stack_ptr, &buffer); if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } l += n; } - ret = force_collapse_@suff@(arr, stack, &stack_ptr, &buffer); + ret = force_collapse_@suff@(start, stack, &stack_ptr, &buffer); if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } @@ -476,78 +515,407 @@ cleanup: } -/* Below copied directly from mergesort */ +/* argsort */ -static void -amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw) +static npy_intp +acount_run_@suff@(@type@ *arr, npy_intp *tosort, npy_intp l, npy_intp num, + npy_intp minrun) { - @type@ vp; - npy_intp vi, *pi, *pj, *pk, *pm; + npy_intp sz; + @type@ vc; + npy_intp vi; + npy_intp *pl, *pi, *pj, *pr; - if (pr - pl > SMALL_MERGESORT) { - /* merge sort */ - pm = pl + ((pr - pl) >> 1); - amergesort0_@suff@(pl, pm, v, pw); - amergesort0_@suff@(pm, pr, v, pw); + if (NPY_UNLIKELY(num - l == 1)) { + return 1; + } - for (pi = pw, pj = pl; pj < pm;) { - *pi++ = *pj++; - } + pl = tosort + l; - pi = pw + (pm - pl); - pj = pw; - pk = pl; + /* (not strictly) ascending sequence */ + if (!@TYPE@_LT(arr[*(pl + 1)], arr[*pl])) { + for (pi = pl + 1; pi < tosort + num - 1 + && !@TYPE@_LT(arr[*(pi + 1)], arr[*pi]); ++pi) { + } + } else { /* (strictly) descending sequence */ + for (pi = pl + 1; pi < tosort + num - 1 + && @TYPE@_LT(arr[*(pi + 1)], arr[*pi]); ++pi) { + } - while (pj < pi && pm < pr) { - if (@TYPE@_LT(v[*pm], v[*pj])) { - *pk++ = *pm++; - } else { - *pk++ = *pj++; - } + for (pj = pl, pr = pi; pj < pr; ++pj, --pr) { + INTP_SWAP(*pj, *pr); } + } + + ++pi; + sz = pi - pl; - while (pj < pi) { - *pk++ = *pj++; + if (sz < minrun) { + if (l + minrun < num) { + sz = minrun; + } else { + sz = num - l; } - } else { + + pr = pl + sz; + /* insertion sort */ - for (pi = pl + 1; pi < pr; ++pi) { + for (; pi < pr; ++pi) { vi = *pi; - vp = v[vi]; + vc = arr[*pi]; pj = pi; - pk = pi - 1; - while (pj > pl && @TYPE@_LT(vp, v[*pk])) { - *pj-- = *pk--; + while (pl < pj && @TYPE@_LT(vc, arr[*(pj - 1)])) { + *pj = *(pj - 1); + --pj; } *pj = vi; } } + + return sz; } -int -atimsort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *NOT_USED) +static npy_intp +agallop_right_@suff@(const @type@ *arr, const npy_intp *tosort, + const npy_intp size, const @type@ key) { - npy_intp *pl, *pr, *pw; - pl = tosort; - pr = pl + num; - pw = malloc((num / 2) * sizeof(npy_intp)); + npy_intp last_ofs, ofs, m; - if (pw == NULL) { - return -NPY_ENOMEM; + if (@TYPE@_LT(key, arr[tosort[0]])) { + return 0; + } + + last_ofs = 0; + ofs = 1; + + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; /* arr[ofs] is never accessed */ + break; + } + + if (@TYPE@_LT(key, arr[tosort[ofs]])) { + break; + } else { + last_ofs = ofs; + /* ofs = 1, 3, 7, 15... */ + ofs = (ofs << 1) + 1; + } + } + + /* now that arr[tosort[last_ofs]] <= key < arr[tosort[ofs]] */ + while (last_ofs + 1 < ofs) { + m = last_ofs + ((ofs - last_ofs) >> 1); + + if (@TYPE@_LT(key, arr[tosort[m]])) { + ofs = m; + } else { + last_ofs = m; + } + } + + /* now that arr[tosort[ofs-1]] <= key < arr[tosort[ofs]] */ + return ofs; +} + + + +static npy_intp +agallop_left_@suff@(const @type@ *arr, const npy_intp *tosort, + const npy_intp size, const @type@ key) +{ + npy_intp last_ofs, ofs, l, m, r; + + if (@TYPE@_LT(arr[tosort[size - 1]], key)) { + return size; + } + + last_ofs = 0; + ofs = 1; + + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; + break; + } + + if (@TYPE@_LT(arr[tosort[size - ofs - 1]], key)) { + break; + } else { + last_ofs = ofs; + ofs = (ofs << 1) + 1; + } + } + + /* now that arr[tosort[size-ofs-1]] < key <= arr[tosort[size-last_ofs-1]] */ + l = size - ofs - 1; + r = size - last_ofs - 1; + + while (l + 1 < r) { + m = l + ((r - l) >> 1); + + if (@TYPE@_LT(arr[tosort[m]], key)) { + l = m; + } else { + r = m; + } + } + + /* now that arr[tosort[r-1]] < key <= arr[tosort[r]] */ + return r; +} + + +static void +amerge_left_@suff@(@type@ *arr, npy_intp *p1, npy_intp l1, npy_intp *p2, + npy_intp l2, + npy_intp *p3) +{ + npy_intp *end = p2 + l2; + memcpy(p3, p1, sizeof(npy_intp) * l1); + /* first element must be in p2 otherwise skipped in the caller */ + *p1++ = *p2++; + + while (p1 < p2 && p2 < end) { + if (@TYPE@_LT(arr[*p2], arr[*p3])) { + *p1++ = *p2++; + } else { + *p1++ = *p3++; + } + } + + if (p1 != p2) { + memcpy(p1, p3, sizeof(npy_intp) * (p2 - p1)); + } +} + + +static void +amerge_right_@suff@(@type@ *arr, npy_intp* p1, npy_intp l1, npy_intp *p2, + npy_intp l2, + npy_intp *p3) +{ + npy_intp ofs; + npy_intp *start = p1 - 1; + memcpy(p3, p2, sizeof(npy_intp) * l2); + p1 += l1 - 1; + p2 += l2 - 1; + p3 += l2 - 1; + /* first element must be in p1 otherwise skipped in the caller */ + *p2-- = *p1--; + + while (p1 < p2 && start < p1) { + if (@TYPE@_LT(arr[*p3], arr[*p1])) { + *p2-- = *p1--; + } else { + *p2-- = *p3--; + } + } + + if (p1 != p2) { + ofs = p2 - start; + memcpy(start + 1, p3 - ofs + 1, sizeof(npy_intp) * ofs); + } +} + + +static int +amerge_at_@suff@(@type@ *arr, npy_intp *tosort, const run *stack, + const npy_intp at, + buffer_intp *buffer) +{ + int ret; + npy_intp s1, l1, s2, l2, k; + npy_intp *p1, *p2; + s1 = stack[at].s; + l1 = stack[at].l; + s2 = stack[at + 1].s; + l2 = stack[at + 1].l; + /* tosort[s2] belongs to tosort[s1+k] */ + k = agallop_right_@suff@(arr, tosort + s1, l1, arr[tosort[s2]]); + + if (l1 == k) { + /* already sorted */ + return 0; + } + + p1 = tosort + s1 + k; + l1 -= k; + p2 = tosort + s2; + /* tosort[s2-1] belongs to tosort[s2+l2] */ + l2 = agallop_left_@suff@(arr, tosort + s2, l2, arr[tosort[s2 - 1]]); + + if (l2 < l1) { + ret = resize_buffer_intp(buffer, l2); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + amerge_right_@suff@(arr, p1, l1, p2, l2, buffer->pw); + } else { + ret = resize_buffer_intp(buffer, l1); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + amerge_left_@suff@(arr, p1, l1, p2, l2, buffer->pw); + } + + return 0; +} + + +static int +atry_collapse_@suff@(@type@ *arr, npy_intp *tosort, run *stack, + npy_intp *stack_ptr, + buffer_intp *buffer) +{ + int ret; + npy_intp A, B, C, top; + top = *stack_ptr; + + while (1 < top) { + B = stack[top - 2].l; + C = stack[top - 1].l; + + if ((2 < top && stack[top - 3].l <= B + C) || + (3 < top && stack[top - 4].l <= stack[top - 3].l + B)) { + A = stack[top - 3].l; + + if (A <= C) { + ret = amerge_at_@suff@(arr, tosort, stack, top - 3, buffer); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 3].l += B; + stack[top - 2] = stack[top - 1]; + --top; + } else { + ret = amerge_at_@suff@(arr, tosort, stack, top - 2, buffer); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; + } + } else if (1 < top && B <= C) { + ret = amerge_at_@suff@(arr, tosort, stack, top - 2, buffer); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; + } else { + break; + } + } + + *stack_ptr = top; + return 0; +} + + +static int +aforce_collapse_@suff@(@type@ *arr, npy_intp *tosort, run *stack, + npy_intp *stack_ptr, + buffer_intp *buffer) +{ + int ret; + npy_intp top = *stack_ptr; + + while (2 < top) { + if (stack[top - 3].l <= stack[top - 1].l) { + ret = amerge_at_@suff@(arr, tosort, stack, top - 3, buffer); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 3].l += stack[top - 2].l; + stack[top - 2] = stack[top - 1]; + --top; + } else { + ret = amerge_at_@suff@(arr, tosort, stack, top - 2, buffer); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += stack[top - 1].l; + --top; + } + } + + if (1 < top) { + ret = amerge_at_@suff@(arr, tosort, stack, top - 2, buffer); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } } - amergesort0_@suff@(pl, pr, v, pw); - free(pw); return 0; } + +int +atimsort_@suff@(void *v, npy_intp *tosort, npy_intp num, + void *NPY_UNUSED(varr)) +{ + int ret; + npy_intp l, n, stack_ptr, minrun; + buffer_intp buffer; + run stack[TIMSORT_STACK_SIZE]; + buffer.pw = NULL; + buffer.size = 0; + stack_ptr = 0; + minrun = compute_min_run(num); + + for (l = 0; l < num;) { + n = acount_run_@suff@(v, tosort, l, num, minrun); + stack[stack_ptr].s = l; + stack[stack_ptr].l = n; + ++stack_ptr; + ret = atry_collapse_@suff@(v, tosort, stack, &stack_ptr, &buffer); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + l += n; + } + + ret = aforce_collapse_@suff@(v, tosort, stack, &stack_ptr, &buffer); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + ret = 0; +cleanup: + + if (buffer.pw != NULL) { + free(buffer.pw); + } + + return ret; +} + /**end repeat**/ + +/* For string sorts and generic sort, element comparisons are very expensive, + * and the time cost of insertion sort (involves N**2 comparison) clearly hurts. + * Implementing binary insertion sort and probably gallop mode during merging process + * can hopefully boost the performance. Here as a temporary workaround we use shorter + * run length to reduce the cost of insertion sort. + */ + +npy_intp compute_min_run_short(npy_intp num) +{ + npy_intp r = 0; + + while (16 < num) { + r |= num & 1; + num >>= 1; + } + + return num + r; +} + /* ***************************************************************************** ** STRING SORTS ** @@ -562,171 +930,809 @@ atimsort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *NOT_USED) * #type = npy_char, npy_ucs4# */ -static void -mergesort0_@suff@(@type@ *pl, @type@ *pr, @type@ *pw, @type@ *vp, size_t len) -{ - @type@ *pi, *pj, *pk, *pm; - - if ((size_t)(pr - pl) > SMALL_MERGESORT * len) { - /* merge sort */ - pm = pl + (((pr - pl) / len) >> 1) * len; - mergesort0_@suff@(pl, pm, pw, vp, len); - mergesort0_@suff@(pm, pr, pw, vp, len); - @TYPE@_COPY(pw, pl, pm - pl); - pi = pw + (pm - pl); - pj = pw; - pk = pl; - - while (pj < pi && pm < pr) { - if (@TYPE@_LT(pm, pj, len)) { - @TYPE@_COPY(pk, pm, len); - pm += len; - pk += len; - } else { - @TYPE@_COPY(pk, pj, len); - pj += len; - pk += len; - } - } - @TYPE@_COPY(pk, pj, pi - pj); +typedef struct { + @type@ * pw; + npy_intp size; + size_t len; +} buffer_@suff@; + + +static NPY_INLINE int +resize_buffer_@suff@(buffer_@suff@ *buffer, npy_intp new_size) +{ + if (new_size <= buffer->size) { + return 0; + } + + if (NPY_UNLIKELY(buffer->pw == NULL)) { + buffer->pw = malloc(sizeof(@type@) * new_size * buffer->len); } else { + buffer->pw = realloc(buffer->pw, sizeof(@type@) * new_size * buffer->len); + } + + buffer->size = new_size; + + if (NPY_UNLIKELY(buffer->pw == NULL)) { + return -NPY_ENOMEM; + } else { + return 0; + } +} + + +static npy_intp +count_run_@suff@(@type@ *arr, npy_intp l, npy_intp num, npy_intp minrun, + @type@ *vp, size_t len) +{ + npy_intp sz; + @type@ *pl, *pi, *pj, *pr; + + if (NPY_UNLIKELY(num - l == 1)) { + return 1; + } + + pl = arr + l * len; + + /* (not strictly) ascending sequence */ + if (!@TYPE@_LT(pl + len, pl, len)) { + for (pi = pl + len; pi < arr + (num - 1) * len + && !@TYPE@_LT(pi + len, pi, len); pi += len) { + } + } else { /* (strictly) descending sequence */ + for (pi = pl + len; pi < arr + (num - 1) * len + && @TYPE@_LT(pi + len, pi, len); pi += len) { + } + + for (pj = pl, pr = pi; pj < pr; pj += len, pr -= len) { + @TYPE@_SWAP(pj, pr, len); + } + } + + pi += len; + sz = (pi - pl) / len; + + if (sz < minrun) { + if (l + minrun < num) { + sz = minrun; + } else { + sz = num - l; + } + + pr = pl + sz * len; + /* insertion sort */ - for (pi = pl + len; pi < pr; pi += len) { + for (; pi < pr; pi += len) { @TYPE@_COPY(vp, pi, len); pj = pi; - pk = pi - len; - while (pj > pl && @TYPE@_LT(vp, pk, len)) { - @TYPE@_COPY(pj, pk, len); + while (pl < pj && @TYPE@_LT(vp, pj - len, len)) { + @TYPE@_COPY(pj, pj - len, len); pj -= len; - pk -= len; } @TYPE@_COPY(pj, vp, len); } } + + return sz; } -int -timsort_@suff@(void *start, npy_intp num, void *varr) +static npy_intp +gallop_right_@suff@(const @type@ *arr, const npy_intp size, + const @type@ *key, size_t len) { - PyArrayObject *arr = varr; - size_t elsize = PyArray_ITEMSIZE(arr); - size_t len = elsize / sizeof(@type@); - @type@ *pl, *pr, *pw, *vp; - int err = 0; + npy_intp last_ofs, ofs, m; - /* Items that have zero size don't make sense to sort */ - if (elsize == 0) { + if (@TYPE@_LT(key, arr, len)) { return 0; } - pl = start; - pr = pl + num * len; - pw = malloc((num / 2) * elsize); + last_ofs = 0; + ofs = 1; + + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; /* arr[ofs] is never accessed */ + break; + } - if (pw == NULL) { - err = -NPY_ENOMEM; - goto fail_0; + if (@TYPE@_LT(key, arr + ofs * len, len)) { + break; + } else { + last_ofs = ofs; + /* ofs = 1, 3, 7, 15... */ + ofs = (ofs << 1) + 1; + } } - vp = malloc(elsize); + /* now that arr[last_ofs*len] <= key < arr[ofs*len] */ + while (last_ofs + 1 < ofs) { + m = last_ofs + ((ofs - last_ofs) >> 1); - if (vp == NULL) { - err = -NPY_ENOMEM; - goto fail_1; + if (@TYPE@_LT(key, arr + m * len, len)) { + ofs = m; + } else { + last_ofs = m; + } } - mergesort0_@suff@(pl, pr, pw, vp, len); - free(vp); -fail_1: - free(pw); -fail_0: - return err; + /* now that arr[(ofs-1)*len] <= key < arr[ofs*len] */ + return ofs; } -static void -amergesort0_@suff@(npy_intp *pl, npy_intp *pr, @type@ *v, npy_intp *pw, + +static npy_intp +gallop_left_@suff@(const @type@ *arr, const npy_intp size, const @type@ *key, size_t len) { - @type@ *vp; - npy_intp vi, *pi, *pj, *pk, *pm; + npy_intp last_ofs, ofs, l, m, r; + + if (@TYPE@_LT(arr + (size - 1) * len, key, len)) { + return size; + } + + last_ofs = 0; + ofs = 1; + + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; + break; + } + + if (@TYPE@_LT(arr + (size - ofs - 1) * len, key, len)) { + break; + } else { + last_ofs = ofs; + ofs = (ofs << 1) + 1; + } + } + + /* now that arr[(size-ofs-1)*len] < key <= arr[(size-last_ofs-1)*len] */ + l = size - ofs - 1; + r = size - last_ofs - 1; + + while (l + 1 < r) { + m = l + ((r - l) >> 1); + + if (@TYPE@_LT(arr + m * len, key, len)) { + l = m; + } else { + r = m; + } + } + + /* now that arr[(r-1)*len] < key <= arr[r*len] */ + return r; +} + + +static void +merge_left_@suff@(@type@ *p1, npy_intp l1, @type@ *p2, npy_intp l2, + @type@ *p3, size_t len) +{ + @type@ *end = p2 + l2 * len; + memcpy(p3, p1, sizeof(@type@) * l1 * len); + /* first element must be in p2 otherwise skipped in the caller */ + @TYPE@_COPY(p1, p2, len); + p1 += len; + p2 += len; + + while (p1 < p2 && p2 < end) { + if (@TYPE@_LT(p2, p3, len)) { + @TYPE@_COPY(p1, p2, len); + p1 += len; + p2 += len; + } else { + @TYPE@_COPY(p1, p3, len); + p1 += len; + p3 += len; + } + } + + if (p1 != p2) { + memcpy(p1, p3, sizeof(@type@) * (p2 - p1)); + } +} + - if (pr - pl > SMALL_MERGESORT) { - /* merge sort */ - pm = pl + ((pr - pl) >> 1); - amergesort0_@suff@(pl, pm, v, pw, len); - amergesort0_@suff@(pm, pr, v, pw, len); +static void +merge_right_@suff@(@type@ *p1, npy_intp l1, @type@ *p2, npy_intp l2, + @type@ *p3, size_t len) +{ + npy_intp ofs; + @type@ *start = p1 - len; + memcpy(p3, p2, sizeof(@type@) * l2 * len); + p1 += (l1 - 1) * len; + p2 += (l2 - 1) * len; + p3 += (l2 - 1) * len; + /* first element must be in p1 otherwise skipped in the caller */ + @TYPE@_COPY(p2, p1, len); + p2 -= len; + p1 -= len; - for (pi = pw, pj = pl; pj < pm;) { - *pi++ = *pj++; + while (p1 < p2 && start < p1) { + if (@TYPE@_LT(p3, p1, len)) { + @TYPE@_COPY(p2, p1, len); + p2 -= len; + p1 -= len; + } else { + @TYPE@_COPY(p2, p3, len); + p2 -= len; + p3 -= len; } + } + + if (p1 != p2) { + ofs = p2 - start; + memcpy(start + len, p3 - ofs + len, sizeof(@type@) * ofs); + } +} + + +static int +merge_at_@suff@(@type@ *arr, const run *stack, const npy_intp at, + buffer_@suff@ *buffer, size_t len) +{ + int ret; + npy_intp s1, l1, s2, l2, k; + @type@ *p1, *p2; + s1 = stack[at].s; + l1 = stack[at].l; + s2 = stack[at + 1].s; + l2 = stack[at + 1].l; + /* arr[s2] belongs to arr[s1+k] */ + @TYPE@_COPY(buffer->pw, arr + s2 * len, len); + k = gallop_right_@suff@(arr + s1 * len, l1, buffer->pw, len); + + if (l1 == k) { + /* already sorted */ + return 0; + } + + p1 = arr + (s1 + k) * len; + l1 -= k; + p2 = arr + s2 * len; + /* arr[s2-1] belongs to arr[s2+l2] */ + @TYPE@_COPY(buffer->pw, arr + (s2 - 1) * len, len); + l2 = gallop_left_@suff@(arr + s2 * len, l2, buffer->pw, len); + + if (l2 < l1) { + ret = resize_buffer_@suff@(buffer, l2); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + merge_right_@suff@(p1, l1, p2, l2, buffer->pw, len); + } else { + ret = resize_buffer_@suff@(buffer, l1); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } - pi = pw + (pm - pl); - pj = pw; - pk = pl; + merge_left_@suff@(p1, l1, p2, l2, buffer->pw, len); + } + + return 0; +} + + +static int +try_collapse_@suff@(@type@ *arr, run *stack, npy_intp *stack_ptr, + buffer_@suff@ *buffer, size_t len) +{ + int ret; + npy_intp A, B, C, top; + top = *stack_ptr; + + while (1 < top) { + B = stack[top - 2].l; + C = stack[top - 1].l; + + if ((2 < top && stack[top - 3].l <= B + C) || + (3 < top && stack[top - 4].l <= stack[top - 3].l + B)) { + A = stack[top - 3].l; + + if (A <= C) { + ret = merge_at_@suff@(arr, stack, top - 3, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } - while (pj < pi && pm < pr) { - if (@TYPE@_LT(v + (*pm)*len, v + (*pj)*len, len)) { - *pk++ = *pm++; + stack[top - 3].l += B; + stack[top - 2] = stack[top - 1]; + --top; } else { - *pk++ = *pj++; + ret = merge_at_@suff@(arr, stack, top - 2, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; } + } else if (1 < top && B <= C) { + ret = merge_at_@suff@(arr, stack, top - 2, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; + } else { + break; } + } + + *stack_ptr = top; + return 0; +} + - while (pj < pi) { - *pk++ = *pj++; +static int +force_collapse_@suff@(@type@ *arr, run *stack, npy_intp *stack_ptr, + buffer_@suff@ *buffer, size_t len) +{ + int ret; + npy_intp top = *stack_ptr; + + while (2 < top) { + if (stack[top - 3].l <= stack[top - 1].l) { + ret = merge_at_@suff@(arr, stack, top - 3, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 3].l += stack[top - 2].l; + stack[top - 2] = stack[top - 1]; + --top; + } else { + ret = merge_at_@suff@(arr, stack, top - 2, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += stack[top - 1].l; + --top; } - } else { + } + + if (1 < top) { + ret = merge_at_@suff@(arr, stack, top - 2, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + } + + return 0; +} + + +int +timsort_@suff@(void *start, npy_intp num, void *varr) +{ + PyArrayObject *arr = varr; + size_t elsize = PyArray_ITEMSIZE(arr); + size_t len = elsize / sizeof(@type@); + int ret; + npy_intp l, n, stack_ptr, minrun; + run stack[TIMSORT_STACK_SIZE]; + buffer_@suff@ buffer; + buffer.pw = NULL; + buffer.size = 0; + buffer.len = len; + stack_ptr = 0; + minrun = compute_min_run_short(num); + /* used for insertion sort and gallop key */ + ret = resize_buffer_@suff@(&buffer, 1); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + for (l = 0; l < num;) { + n = count_run_@suff@(start, l, num, minrun, buffer.pw, len); + /* both s and l are scaled by len */ + stack[stack_ptr].s = l; + stack[stack_ptr].l = n; + ++stack_ptr; + ret = try_collapse_@suff@(start, stack, &stack_ptr, &buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + l += n; + } + + ret = force_collapse_@suff@(start, stack, &stack_ptr, &buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + ret = 0; +cleanup: + + if (buffer.pw != NULL) { + free(buffer.pw); + } + + return ret; +} + + +/* argsort */ + + +static npy_intp +acount_run_@suff@(@type@ *arr, npy_intp *tosort, npy_intp l, npy_intp num, + npy_intp minrun, size_t len) +{ + npy_intp sz; + npy_intp vi; + npy_intp *pl, *pi, *pj, *pr; + + if (NPY_UNLIKELY(num - l == 1)) { + return 1; + } + + pl = tosort + l; + + /* (not strictly) ascending sequence */ + if (!@TYPE@_LT(arr + (*(pl + 1)) * len, arr + (*pl) * len, len)) { + for (pi = pl + 1; pi < tosort + num - 1 + && !@TYPE@_LT(arr + (*(pi + 1)) * len, arr + (*pi) * len, len); ++pi) { + } + } else { /* (strictly) descending sequence */ + for (pi = pl + 1; pi < tosort + num - 1 + && @TYPE@_LT(arr + (*(pi + 1)) * len, arr + (*pi) * len, len); ++pi) { + } + + for (pj = pl, pr = pi; pj < pr; ++pj, --pr) { + INTP_SWAP(*pj, *pr); + } + } + + ++pi; + sz = pi - pl; + + if (sz < minrun) { + if (l + minrun < num) { + sz = minrun; + } else { + sz = num - l; + } + + pr = pl + sz; + /* insertion sort */ - for (pi = pl + 1; pi < pr; ++pi) { + for (; pi < pr; ++pi) { vi = *pi; - vp = v + vi * len; pj = pi; - pk = pi - 1; - while (pj > pl && @TYPE@_LT(vp, v + (*pk)*len, len)) { - *pj-- = *pk--; + while (pl < pj && @TYPE@_LT(arr + vi * len, arr + (*(pj - 1)) * len, len)) { + *pj = *(pj - 1); + --pj; } *pj = vi; } } + + return sz; +} + + +static npy_intp +agallop_left_@suff@(const @type@ *arr, const npy_intp *tosort, + const npy_intp size, const @type@ *key, size_t len) +{ + npy_intp last_ofs, ofs, l, m, r; + + if (@TYPE@_LT(arr + tosort[size - 1] * len, key, len)) { + return size; + } + + last_ofs = 0; + ofs = 1; + + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; + break; + } + + if (@TYPE@_LT(arr + tosort[size - ofs - 1] * len, key, len)) { + break; + } else { + last_ofs = ofs; + ofs = (ofs << 1) + 1; + } + } + + /* now that arr[tosort[size-ofs-1]*len] < key <= arr[tosort[size-last_ofs-1]*len] */ + l = size - ofs - 1; + r = size - last_ofs - 1; + + while (l + 1 < r) { + m = l + ((r - l) >> 1); + + if (@TYPE@_LT(arr + tosort[m] * len, key, len)) { + l = m; + } else { + r = m; + } + } + + /* now that arr[tosort[r-1]*len] < key <= arr[tosort[r]*len] */ + return r; +} + + +static npy_intp +agallop_right_@suff@(const @type@ *arr, const npy_intp *tosort, + const npy_intp size, const @type@ *key, size_t len) +{ + npy_intp last_ofs, ofs, m; + + if (@TYPE@_LT(key, arr + tosort[0] * len, len)) { + return 0; + } + + last_ofs = 0; + ofs = 1; + + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; /* arr[ofs] is never accessed */ + break; + } + + if (@TYPE@_LT(key, arr + tosort[ofs] * len, len)) { + break; + } else { + last_ofs = ofs; + /* ofs = 1, 3, 7, 15... */ + ofs = (ofs << 1) + 1; + } + } + + /* now that arr[tosort[last_ofs]*len] <= key < arr[tosort[ofs]*len] */ + while (last_ofs + 1 < ofs) { + m = last_ofs + ((ofs - last_ofs) >> 1); + + if (@TYPE@_LT(key, arr + tosort[m] * len, len)) { + ofs = m; + } else { + last_ofs = m; + } + } + + /* now that arr[tosort[ofs-1]*len] <= key < arr[tosort[ofs]*len] */ + return ofs; +} + + + +static void +amerge_left_@suff@(@type@ *arr, npy_intp *p1, npy_intp l1, npy_intp *p2, + npy_intp l2, npy_intp *p3, size_t len) +{ + npy_intp *end = p2 + l2; + memcpy(p3, p1, sizeof(npy_intp) * l1); + /* first element must be in p2 otherwise skipped in the caller */ + *p1++ = *p2++; + + while (p1 < p2 && p2 < end) { + if (@TYPE@_LT(arr + (*p2) * len, arr + (*p3) * len, len)) { + *p1++ = *p2++; + } else { + *p1++ = *p3++; + } + } + + if (p1 != p2) { + memcpy(p1, p3, sizeof(npy_intp) * (p2 - p1)); + } +} + + +static void +amerge_right_@suff@(@type@ *arr, npy_intp* p1, npy_intp l1, npy_intp *p2, + npy_intp l2, npy_intp *p3, size_t len) +{ + npy_intp ofs; + npy_intp *start = p1 - 1; + memcpy(p3, p2, sizeof(npy_intp) * l2); + p1 += l1 - 1; + p2 += l2 - 1; + p3 += l2 - 1; + /* first element must be in p1 otherwise skipped in the caller */ + *p2-- = *p1--; + + while (p1 < p2 && start < p1) { + if (@TYPE@_LT(arr + (*p3) * len, arr + (*p1) * len, len)) { + *p2-- = *p1--; + } else { + *p2-- = *p3--; + } + } + + if (p1 != p2) { + ofs = p2 - start; + memcpy(start + 1, p3 - ofs + 1, sizeof(npy_intp) * ofs); + } +} + + + +static int +amerge_at_@suff@(@type@ *arr, npy_intp *tosort, const run *stack, + const npy_intp at, buffer_intp *buffer, size_t len) +{ + int ret; + npy_intp s1, l1, s2, l2, k; + npy_intp *p1, *p2; + s1 = stack[at].s; + l1 = stack[at].l; + s2 = stack[at + 1].s; + l2 = stack[at + 1].l; + /* tosort[s2] belongs to tosort[s1+k] */ + k = agallop_right_@suff@(arr, tosort + s1, l1, arr + tosort[s2] * len, len); + + if (l1 == k) { + /* already sorted */ + return 0; + } + + p1 = tosort + s1 + k; + l1 -= k; + p2 = tosort + s2; + /* tosort[s2-1] belongs to tosort[s2+l2] */ + l2 = agallop_left_@suff@(arr, tosort + s2, l2, arr + tosort[s2 - 1] * len, + len); + + if (l2 < l1) { + ret = resize_buffer_intp(buffer, l2); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + amerge_right_@suff@(arr, p1, l1, p2, l2, buffer->pw, len); + } else { + ret = resize_buffer_intp(buffer, l1); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + amerge_left_@suff@(arr, p1, l1, p2, l2, buffer->pw, len); + } + + return 0; +} + + +static int +atry_collapse_@suff@(@type@ *arr, npy_intp *tosort, run *stack, + npy_intp *stack_ptr, buffer_intp *buffer, size_t len) +{ + int ret; + npy_intp A, B, C, top; + top = *stack_ptr; + + while (1 < top) { + B = stack[top - 2].l; + C = stack[top - 1].l; + + if ((2 < top && stack[top - 3].l <= B + C) || + (3 < top && stack[top - 4].l <= stack[top - 3].l + B)) { + A = stack[top - 3].l; + + if (A <= C) { + ret = amerge_at_@suff@(arr, tosort, stack, top - 3, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 3].l += B; + stack[top - 2] = stack[top - 1]; + --top; + } else { + ret = amerge_at_@suff@(arr, tosort, stack, top - 2, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; + } + } else if (1 < top && B <= C) { + ret = amerge_at_@suff@(arr, tosort, stack, top - 2, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; + } else { + break; + } + } + + *stack_ptr = top; + return 0; +} + + + +static int +aforce_collapse_@suff@(@type@ *arr, npy_intp *tosort, run *stack, + npy_intp *stack_ptr, buffer_intp *buffer, size_t len) +{ + int ret; + npy_intp top = *stack_ptr; + + while (2 < top) { + if (stack[top - 3].l <= stack[top - 1].l) { + ret = amerge_at_@suff@(arr, tosort, stack, top - 3, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 3].l += stack[top - 2].l; + stack[top - 2] = stack[top - 1]; + --top; + } else { + ret = amerge_at_@suff@(arr, tosort, stack, top - 2, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += stack[top - 1].l; + --top; + } + } + + if (1 < top) { + ret = amerge_at_@suff@(arr, tosort, stack, top - 2, buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + } + + return 0; } int -atimsort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *varr) +atimsort_@suff@(void *start, npy_intp *tosort, npy_intp num, void *varr) { PyArrayObject *arr = varr; size_t elsize = PyArray_ITEMSIZE(arr); size_t len = elsize / sizeof(@type@); - npy_intp *pl, *pr, *pw; + int ret; + npy_intp l, n, stack_ptr, minrun; + run stack[TIMSORT_STACK_SIZE]; + buffer_intp buffer; + buffer.pw = NULL; + buffer.size = 0; + stack_ptr = 0; + minrun = compute_min_run_short(num); - /* Items that have zero size don't make sense to sort */ - if (elsize == 0) { - return 0; + for (l = 0; l < num;) { + n = acount_run_@suff@(start, tosort, l, num, minrun, len); + /* both s and l are scaled by len */ + stack[stack_ptr].s = l; + stack[stack_ptr].l = n; + ++stack_ptr; + ret = atry_collapse_@suff@(start, tosort, stack, &stack_ptr, &buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + l += n; } - pl = tosort; - pr = pl + num; - pw = malloc((num / 2) * sizeof(npy_intp)); + ret = aforce_collapse_@suff@(start, tosort, stack, &stack_ptr, &buffer, len); - if (pw == NULL) { - return -NPY_ENOMEM; + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + ret = 0; +cleanup: + + if (buffer.pw != NULL) { + free(buffer.pw); } - amergesort0_@suff@(pl, pr, v, pw, len); - free(pw); - return 0; + return ret; } + /**end repeat**/ + /* ***************************************************************************** ** GENERIC SORT ** @@ -734,157 +1740,811 @@ atimsort_@suff@(void *v, npy_intp *tosort, npy_intp num, void *varr) */ -static void -npy_mergesort0(char *pl, char *pr, char *pw, char *vp, npy_intp elsize, - PyArray_CompareFunc *cmp, PyArrayObject *arr) -{ - char *pi, *pj, *pk, *pm; - - if (pr - pl > SMALL_MERGESORT * elsize) { - /* merge sort */ - pm = pl + (((pr - pl) / elsize) >> 1) * elsize; - npy_mergesort0(pl, pm, pw, vp, elsize, cmp, arr); - npy_mergesort0(pm, pr, pw, vp, elsize, cmp, arr); - GENERIC_COPY(pw, pl, pm - pl); - pi = pw + (pm - pl); - pj = pw; - pk = pl; - - while (pj < pi && pm < pr) { - if (cmp(pm, pj, arr) < 0) { - GENERIC_COPY(pk, pm, elsize); - pm += elsize; - pk += elsize; - } else { - GENERIC_COPY(pk, pj, elsize); - pj += elsize; - pk += elsize; - } - } +typedef struct { + char *pw; + npy_intp size; + size_t len; +} buffer_char; + - GENERIC_COPY(pk, pj, pi - pj); +static NPY_INLINE int +resize_buffer_char(buffer_char *buffer, npy_intp new_size) +{ + if (new_size <= buffer->size) { + return 0; + } + + if (NPY_UNLIKELY(buffer->pw == NULL)) { + buffer->pw = malloc(sizeof(char) * new_size * buffer->len); + } else { + buffer->pw = realloc(buffer->pw, sizeof(char) * new_size * buffer->len); + } + + buffer->size = new_size; + + if (NPY_UNLIKELY(buffer->pw == NULL)) { + return -NPY_ENOMEM; } else { + return 0; + } +} + + +static npy_intp +npy_count_run(char *arr, npy_intp l, npy_intp num, npy_intp minrun, + char *vp, size_t len, PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + npy_intp sz; + char *pl, *pi, *pj, *pr; + + if (NPY_UNLIKELY(num - l == 1)) { + return 1; + } + + pl = arr + l * len; + + /* (not strictly) ascending sequence */ + if (cmp(pl, pl + len, py_arr) <= 0) { + for (pi = pl + len; pi < arr + (num - 1) * len + && cmp(pi, pi + len, py_arr) <= 0; pi += len) { + } + } else { /* (strictly) descending sequence */ + for (pi = pl + len; pi < arr + (num - 1) * len + && cmp(pi + len, pi, py_arr) < 0; pi += len) { + } + + for (pj = pl, pr = pi; pj < pr; pj += len, pr -= len) { + GENERIC_SWAP(pj, pr, len); + } + } + + pi += len; + sz = (pi - pl) / len; + + if (sz < minrun) { + if (l + minrun < num) { + sz = minrun; + } else { + sz = num - l; + } + + pr = pl + sz * len; + /* insertion sort */ - for (pi = pl + elsize; pi < pr; pi += elsize) { - GENERIC_COPY(vp, pi, elsize); + for (; pi < pr; pi += len) { + GENERIC_COPY(vp, pi, len); pj = pi; - pk = pi - elsize; - while (pj > pl && cmp(vp, pk, arr) < 0) { - GENERIC_COPY(pj, pk, elsize); - pj -= elsize; - pk -= elsize; + while (pl < pj && cmp(vp, pj - len, py_arr) < 0) { + GENERIC_COPY(pj, pj - len, len); + pj -= len; } - GENERIC_COPY(pj, vp, elsize); + GENERIC_COPY(pj, vp, len); } } + + return sz; } -int -npy_timsort(void *start, npy_intp num, void *varr) +static npy_intp +npy_gallop_right(const char *arr, const npy_intp size, const char *key, + size_t len, PyArray_CompareFunc *cmp, PyArrayObject *py_arr) { - PyArrayObject *arr = varr; - npy_intp elsize = PyArray_ITEMSIZE(arr); - PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare; - char *pl = start; - char *pr = pl + num * elsize; - char *pw; - char *vp; - int err = -NPY_ENOMEM; + npy_intp last_ofs, ofs, m; - /* Items that have zero size don't make sense to sort */ - if (elsize == 0) { + if (cmp(key, arr, py_arr) < 0) { return 0; } - pw = malloc((num >> 1) * elsize); - vp = malloc(elsize); + last_ofs = 0; + ofs = 1; + + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; /* arr[ofs] is never accessed */ + break; + } - if (pw != NULL && vp != NULL) { - npy_mergesort0(pl, pr, pw, vp, elsize, cmp, arr); - err = 0; + if (cmp(key, arr + ofs * len, py_arr) < 0) { + break; + } else { + last_ofs = ofs; + /* ofs = 1, 3, 7, 15... */ + ofs = (ofs << 1) + 1; + } + } + + /* now that arr[last_ofs*len] <= key < arr[ofs*len] */ + while (last_ofs + 1 < ofs) { + m = last_ofs + ((ofs - last_ofs) >> 1); + + if (cmp(key, arr + m * len, py_arr) < 0) { + ofs = m; + } else { + last_ofs = m; + } } - free(vp); - free(pw); - return err; + /* now that arr[(ofs-1)*len] <= key < arr[ofs*len] */ + return ofs; +} + + + +static npy_intp +npy_gallop_left(const char *arr, const npy_intp size, const char *key, + size_t len, PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + npy_intp last_ofs, ofs, l, m, r; + + if (cmp(arr + (size - 1) * len, key, py_arr) < 0) { + return size; + } + + last_ofs = 0; + ofs = 1; + + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; + break; + } + + if (cmp(arr + (size - ofs - 1) * len, key, py_arr) < 0) { + break; + } else { + last_ofs = ofs; + ofs = (ofs << 1) + 1; + } + } + + /* now that arr[(size-ofs-1)*len] < key <= arr[(size-last_ofs-1)*len] */ + l = size - ofs - 1; + r = size - last_ofs - 1; + + while (l + 1 < r) { + m = l + ((r - l) >> 1); + + if (cmp(arr + m * len, key, py_arr) < 0) { + l = m; + } else { + r = m; + } + } + + /* now that arr[(r-1)*len] < key <= arr[r*len] */ + return r; } static void -npy_amergesort0(npy_intp *pl, npy_intp *pr, char *v, npy_intp *pw, - npy_intp elsize, PyArray_CompareFunc *cmp, PyArrayObject *arr) +npy_merge_left(char *p1, npy_intp l1, char *p2, npy_intp l2, + char *p3, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) { - char *vp; - npy_intp vi, *pi, *pj, *pk, *pm; + char *end = p2 + l2 * len; + memcpy(p3, p1, sizeof(char) * l1 * len); + /* first element must be in p2 otherwise skipped in the caller */ + GENERIC_COPY(p1, p2, len); + p1 += len; + p2 += len; + + while (p1 < p2 && p2 < end) { + if (cmp(p2, p3, py_arr) < 0) { + GENERIC_COPY(p1, p2, len); + p1 += len; + p2 += len; + } else { + GENERIC_COPY(p1, p3, len); + p1 += len; + p3 += len; + } + } + + if (p1 != p2) { + memcpy(p1, p3, sizeof(char) * (p2 - p1)); + } +} + - if (pr - pl > SMALL_MERGESORT) { - /* merge sort */ - pm = pl + ((pr - pl) >> 1); - npy_amergesort0(pl, pm, v, pw, elsize, cmp, arr); - npy_amergesort0(pm, pr, v, pw, elsize, cmp, arr); +static void +npy_merge_right(char *p1, npy_intp l1, char *p2, npy_intp l2, + char *p3, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + npy_intp ofs; + char *start = p1 - len; + memcpy(p3, p2, sizeof(char) * l2 * len); + p1 += (l1 - 1) * len; + p2 += (l2 - 1) * len; + p3 += (l2 - 1) * len; + /* first element must be in p1 otherwise skipped in the caller */ + GENERIC_COPY(p2, p1, len); + p2 -= len; + p1 -= len; - for (pi = pw, pj = pl; pj < pm;) { - *pi++ = *pj++; + while (p1 < p2 && start < p1) { + if (cmp(p3, p1, py_arr) < 0) { + GENERIC_COPY(p2, p1, len); + p2 -= len; + p1 -= len; + } else { + GENERIC_COPY(p2, p3, len); + p2 -= len; + p3 -= len; } + } + + if (p1 != p2) { + ofs = p2 - start; + memcpy(start + len, p3 - ofs + len, sizeof(char) * ofs); + } +} - pi = pw + (pm - pl); - pj = pw; - pk = pl; - while (pj < pi && pm < pr) { - if (cmp(v + (*pm)*elsize, v + (*pj)*elsize, arr) < 0) { - *pk++ = *pm++; + +static int +npy_merge_at(char *arr, const run *stack, const npy_intp at, + buffer_char *buffer, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + int ret; + npy_intp s1, l1, s2, l2, k; + char *p1, *p2; + s1 = stack[at].s; + l1 = stack[at].l; + s2 = stack[at + 1].s; + l2 = stack[at + 1].l; + /* arr[s2] belongs to arr[s1+k] */ + GENERIC_COPY(buffer->pw, arr + s2 * len, len); + k = npy_gallop_right(arr + s1 * len, l1, buffer->pw, len, cmp, py_arr); + + if (l1 == k) { + /* already sorted */ + return 0; + } + + p1 = arr + (s1 + k) * len; + l1 -= k; + p2 = arr + s2 * len; + /* arr[s2-1] belongs to arr[s2+l2] */ + GENERIC_COPY(buffer->pw, arr + (s2 - 1) * len, len); + l2 = npy_gallop_left(arr + s2 * len, l2, buffer->pw, len, cmp, py_arr); + + if (l2 < l1) { + ret = resize_buffer_char(buffer, l2); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + npy_merge_right(p1, l1, p2, l2, buffer->pw, len, cmp, py_arr); + } else { + ret = resize_buffer_char(buffer, l1); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + npy_merge_left(p1, l1, p2, l2, buffer->pw, len, cmp, py_arr); + } + + return 0; +} + + +static int +npy_try_collapse(char *arr, run *stack, npy_intp *stack_ptr, + buffer_char *buffer, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + int ret; + npy_intp A, B, C, top; + top = *stack_ptr; + + while (1 < top) { + B = stack[top - 2].l; + C = stack[top - 1].l; + + if ((2 < top && stack[top - 3].l <= B + C) || + (3 < top && stack[top - 4].l <= stack[top - 3].l + B)) { + A = stack[top - 3].l; + + if (A <= C) { + ret = npy_merge_at(arr, stack, top - 3, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 3].l += B; + stack[top - 2] = stack[top - 1]; + --top; } else { - *pk++ = *pj++; + ret = npy_merge_at(arr, stack, top - 2, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; } + } else if (1 < top && B <= C) { + ret = npy_merge_at(arr, stack, top - 2, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; + } else { + break; } + } + + *stack_ptr = top; + return 0; +} + - while (pj < pi) { - *pk++ = *pj++; +static int +npy_force_collapse(char *arr, run *stack, npy_intp *stack_ptr, + buffer_char *buffer, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + int ret; + npy_intp top = *stack_ptr; + + while (2 < top) { + if (stack[top - 3].l <= stack[top - 1].l) { + ret = npy_merge_at(arr, stack, top - 3, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 3].l += stack[top - 2].l; + stack[top - 2] = stack[top - 1]; + --top; + } else { + ret = npy_merge_at(arr, stack, top - 2, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += stack[top - 1].l; + --top; } - } else { + } + + if (1 < top) { + ret = npy_merge_at(arr, stack, top - 2, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + } + + return 0; +} + + +int +npy_timsort(void *start, npy_intp num, void *varr) +{ + size_t len = PyArray_ITEMSIZE(varr); + PyArray_CompareFunc *cmp = PyArray_DESCR(varr)->f->compare; + int ret; + npy_intp l, n, stack_ptr, minrun; + run stack[TIMSORT_STACK_SIZE]; + buffer_char buffer; + buffer.pw = NULL; + buffer.size = 0; + buffer.len = len; + stack_ptr = 0; + minrun = compute_min_run_short(num); + /* used for insertion sort and gallop key */ + ret = resize_buffer_char(&buffer, len); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + for (l = 0; l < num;) { + n = npy_count_run(start, l, num, minrun, buffer.pw, len, cmp, varr); + /* both s and l are scaled by len */ + stack[stack_ptr].s = l; + stack[stack_ptr].l = n; + ++stack_ptr; + ret = npy_try_collapse(start, stack, &stack_ptr, &buffer, len, cmp, varr); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + l += n; + } + + ret = npy_force_collapse(start, stack, &stack_ptr, &buffer, len, cmp, varr); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + ret = 0; +cleanup: + + if (buffer.pw != NULL) { + free(buffer.pw); + } + + return ret; +} + + +/* argsort */ + +static npy_intp +npy_acount_run(char *arr, npy_intp *tosort, npy_intp l, npy_intp num, + npy_intp minrun, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + npy_intp sz; + npy_intp vi; + npy_intp *pl, *pi, *pj, *pr; + + if (NPY_UNLIKELY(num - l == 1)) { + return 1; + } + + pl = tosort + l; + + /* (not strictly) ascending sequence */ + if (cmp(arr + (*pl) * len, arr + (*(pl + 1)) * len, py_arr) <= 0) { + for (pi = pl + 1; pi < tosort + num - 1 + && cmp(arr + (*pi) * len, arr + (*(pi + 1)) * len, py_arr) <= 0; ++pi) { + } + } else { /* (strictly) descending sequence */ + for (pi = pl + 1; pi < tosort + num - 1 + && cmp(arr + (*(pi + 1)) * len, arr + (*pi) * len, py_arr) < 0; ++pi) { + } + + for (pj = pl, pr = pi; pj < pr; ++pj, --pr) { + INTP_SWAP(*pj, *pr); + } + } + + ++pi; + sz = pi - pl; + + if (sz < minrun) { + if (l + minrun < num) { + sz = minrun; + } else { + sz = num - l; + } + + pr = pl + sz; + /* insertion sort */ - for (pi = pl + 1; pi < pr; ++pi) { + for (; pi < pr; ++pi) { vi = *pi; - vp = v + vi * elsize; pj = pi; - pk = pi - 1; - while (pj > pl && cmp(vp, v + (*pk)*elsize, arr) < 0) { - *pj-- = *pk--; + while (pl < pj && cmp(arr + vi * len, arr + (*(pj - 1)) * len, py_arr) < 0) { + *pj = *(pj - 1); + --pj; } *pj = vi; } } + + return sz; } -int -npy_atimsort(void *v, npy_intp *tosort, npy_intp num, void *varr) +static npy_intp +npy_agallop_left(const char *arr, const npy_intp *tosort, + const npy_intp size, const char *key, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) { - PyArrayObject *arr = varr; - npy_intp elsize = PyArray_ITEMSIZE(arr); - PyArray_CompareFunc *cmp = PyArray_DESCR(arr)->f->compare; - npy_intp *pl, *pr, *pw; + npy_intp last_ofs, ofs, l, m, r; + + if (cmp(arr + tosort[size - 1] * len, key, py_arr) < 0) { + return size; + } + + last_ofs = 0; + ofs = 1; + + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; + break; + } - /* Items that have zero size don't make sense to sort */ - if (elsize == 0) { + if (cmp(arr + tosort[size - ofs - 1] * len, key, py_arr) < 0) { + break; + } else { + last_ofs = ofs; + ofs = (ofs << 1) + 1; + } + } + + /* now that arr[tosort[size-ofs-1]*len] < key <= arr[tosort[size-last_ofs-1]*len] */ + l = size - ofs - 1; + r = size - last_ofs - 1; + + while (l + 1 < r) { + m = l + ((r - l) >> 1); + + if (cmp(arr + tosort[m] * len, key, py_arr) < 0) { + l = m; + } else { + r = m; + } + } + + /* now that arr[tosort[r-1]*len] < key <= arr[tosort[r]*len] */ + return r; +} + + +static npy_intp +npy_agallop_right(const char *arr, const npy_intp *tosort, + const npy_intp size, const char *key, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + npy_intp last_ofs, ofs, m; + + if (cmp(key, arr + tosort[0] * len, py_arr) < 0) { return 0; } - pl = tosort; - pr = pl + num; - pw = malloc((num >> 1) * sizeof(npy_intp)); + last_ofs = 0; + ofs = 1; - if (pw == NULL) { - return -NPY_ENOMEM; + for (;;) { + if (size <= ofs || ofs < 0) { + ofs = size; /* arr[ofs] is never accessed */ + break; + } + + if (cmp(key, arr + tosort[ofs] * len, py_arr) < 0) { + break; + } else { + last_ofs = ofs; + /* ofs = 1, 3, 7, 15... */ + ofs = (ofs << 1) + 1; + } + } + + /* now that arr[tosort[last_ofs]*len] <= key < arr[tosort[ofs]*len] */ + while (last_ofs + 1 < ofs) { + m = last_ofs + ((ofs - last_ofs) >> 1); + + if (cmp(key, arr + tosort[m] * len, py_arr) < 0) { + ofs = m; + } else { + last_ofs = m; + } + } + + /* now that arr[tosort[ofs-1]*len] <= key < arr[tosort[ofs]*len] */ + return ofs; +} + + +static void +npy_amerge_left(char *arr, npy_intp *p1, npy_intp l1, npy_intp *p2, + npy_intp l2, npy_intp *p3, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + npy_intp *end = p2 + l2; + memcpy(p3, p1, sizeof(npy_intp) * l1); + /* first element must be in p2 otherwise skipped in the caller */ + *p1++ = *p2++; + + while (p1 < p2 && p2 < end) { + if (cmp(arr + (*p2) * len, arr + (*p3) * len, py_arr) < 0) { + *p1++ = *p2++; + } else { + *p1++ = *p3++; + } + } + + if (p1 != p2) { + memcpy(p1, p3, sizeof(npy_intp) * (p2 - p1)); + } +} + + +static void +npy_amerge_right(char *arr, npy_intp* p1, npy_intp l1, npy_intp *p2, + npy_intp l2, npy_intp *p3, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + npy_intp ofs; + npy_intp *start = p1 - 1; + memcpy(p3, p2, sizeof(npy_intp) * l2); + p1 += l1 - 1; + p2 += l2 - 1; + p3 += l2 - 1; + /* first element must be in p1 otherwise skipped in the caller */ + *p2-- = *p1--; + + while (p1 < p2 && start < p1) { + if (cmp(arr + (*p3) * len, arr + (*p1) * len, py_arr) < 0) { + *p2-- = *p1--; + } else { + *p2-- = *p3--; + } + } + + if (p1 != p2) { + ofs = p2 - start; + memcpy(start + 1, p3 - ofs + 1, sizeof(npy_intp) * ofs); + } +} + + + +static int +npy_amerge_at(char *arr, npy_intp *tosort, const run *stack, + const npy_intp at, buffer_intp *buffer, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + int ret; + npy_intp s1, l1, s2, l2, k; + npy_intp *p1, *p2; + s1 = stack[at].s; + l1 = stack[at].l; + s2 = stack[at + 1].s; + l2 = stack[at + 1].l; + /* tosort[s2] belongs to tosort[s1+k] */ + k = npy_agallop_right(arr, tosort + s1, l1, arr + tosort[s2] * len, len, cmp, + py_arr); + + if (l1 == k) { + /* already sorted */ + return 0; + } + + p1 = tosort + s1 + k; + l1 -= k; + p2 = tosort + s2; + /* tosort[s2-1] belongs to tosort[s2+l2] */ + l2 = npy_agallop_left(arr, tosort + s2, l2, arr + tosort[s2 - 1] * len, + len, cmp, py_arr); + + if (l2 < l1) { + ret = resize_buffer_intp(buffer, l2); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + npy_amerge_right(arr, p1, l1, p2, l2, buffer->pw, len, cmp, py_arr); + } else { + ret = resize_buffer_intp(buffer, l1); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + npy_amerge_left(arr, p1, l1, p2, l2, buffer->pw, len, cmp, py_arr); } - npy_amergesort0(pl, pr, v, pw, elsize, cmp, arr); - free(pw); return 0; } + + +static int +npy_atry_collapse(char *arr, npy_intp *tosort, run *stack, + npy_intp *stack_ptr, buffer_intp *buffer, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + int ret; + npy_intp A, B, C, top; + top = *stack_ptr; + + while (1 < top) { + B = stack[top - 2].l; + C = stack[top - 1].l; + + if ((2 < top && stack[top - 3].l <= B + C) || + (3 < top && stack[top - 4].l <= stack[top - 3].l + B)) { + A = stack[top - 3].l; + + if (A <= C) { + ret = npy_amerge_at(arr, tosort, stack, top - 3, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 3].l += B; + stack[top - 2] = stack[top - 1]; + --top; + } else { + ret = npy_amerge_at(arr, tosort, stack, top - 2, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; + } + } else if (1 < top && B <= C) { + ret = npy_amerge_at(arr, tosort, stack, top - 2, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += C; + --top; + } else { + break; + } + } + + *stack_ptr = top; + return 0; +} + + +static int +npy_aforce_collapse(char *arr, npy_intp *tosort, run *stack, + npy_intp *stack_ptr, buffer_intp *buffer, size_t len, + PyArray_CompareFunc *cmp, PyArrayObject *py_arr) +{ + int ret; + npy_intp top = *stack_ptr; + + while (2 < top) { + if (stack[top - 3].l <= stack[top - 1].l) { + ret = npy_amerge_at(arr, tosort, stack, top - 3, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 3].l += stack[top - 2].l; + stack[top - 2] = stack[top - 1]; + --top; + } else { + ret = npy_amerge_at(arr, tosort, stack, top - 2, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + + stack[top - 2].l += stack[top - 1].l; + --top; + } + } + + if (1 < top) { + ret = npy_amerge_at(arr, tosort, stack, top - 2, buffer, len, cmp, py_arr); + + if (NPY_UNLIKELY(ret < 0)) { return ret; } + } + + return 0; +} + + +int +npy_atimsort(void *start, npy_intp *tosort, npy_intp num, void *varr) +{ + size_t len = PyArray_ITEMSIZE(varr); + PyArray_CompareFunc *cmp = PyArray_DESCR(varr)->f->compare; + int ret; + npy_intp l, n, stack_ptr, minrun; + run stack[TIMSORT_STACK_SIZE]; + buffer_intp buffer; + buffer.pw = NULL; + buffer.size = 0; + stack_ptr = 0; + minrun = compute_min_run_short(num); + + for (l = 0; l < num;) { + n = npy_acount_run(start, tosort, l, num, minrun, len, cmp, varr); + /* both s and l are scaled by len */ + stack[stack_ptr].s = l; + stack[stack_ptr].l = n; + ++stack_ptr; + ret = npy_atry_collapse(start, tosort, stack, &stack_ptr, &buffer, len, cmp, + varr); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + l += n; + } + + ret = npy_aforce_collapse(start, tosort, stack, &stack_ptr, &buffer, len, + cmp, varr); + + if (NPY_UNLIKELY(ret < 0)) { goto cleanup; } + + ret = 0; +cleanup: + + if (buffer.pw != NULL) { + free(buffer.pw); + } + + return ret; +} |