| author | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2020-01-29 09:42:48 -0800 |
|---|---|---|
| committer | Raghuveer Devulapalli <raghuveer.devulapalli@intel.com> | 2020-02-01 20:14:38 -0800 |
| commit | d5b4b721cce90adea3592c126087f1fbe489784e (patch) | |
| tree | d3f9c051eb54604b375842a7558ccd2cd6d5c95e /numpy/core/src | |
| parent | 5562a8c93fe18f0a51d9051f0c25c7cf525312fe (diff) | |
| download | numpy-d5b4b721cce90adea3592c126087f1fbe489784e.tar.gz | |
ENH: Improve performance of absolute for CFLOAT and CDOUBLE
Diffstat (limited to 'numpy/core/src')
| -rw-r--r-- | numpy/core/src/umath/simd.inc.src | 106 |

1 file changed, 81 insertions, 25 deletions
```diff
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index e3c0ee3cc..7ec90f9c8 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -1725,13 +1725,17 @@ avx512_hsub_@vsub@(const @vtype@ x)
 }
 
 static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F @vtype@
-avx512_cabsolute_@vsub@(const @vtype@ x)
+avx512_cabsolute_@vsub@(const @vtype@ x1,
+                        const @vtype@ x2,
+                        const __m512i re_indices,
+                        const __m512i im_indices)
 {
     @vtype@ inf = _mm512_set1_@vsub@(@INF@);
     @vtype@ nan = _mm512_set1_@vsub@(@NAN@);
-    @vtype@ x_abs = avx512_abs_@vsub@(x);
-    @vtype@ re = _mm512_maskz_compress_@vsub@(@cmpx_re_mask@, x_abs);
-    @vtype@ im = _mm512_maskz_compress_@vsub@(@cmpx_img_mask@, x_abs);
+    @vtype@ x1_abs = avx512_abs_@vsub@(x1);
+    @vtype@ x2_abs = avx512_abs_@vsub@(x2);
+    @vtype@ re = _mm512_permutex2var_@vsub@(x1_abs, re_indices, x2_abs);
+    @vtype@ im = _mm512_permutex2var_@vsub@(x1_abs, im_indices , x2_abs);
     /*
      * If real or imag = INF, then convert it to inf + j*inf
      * Handles: inf + j*nan, nan + j*inf
@@ -2621,12 +2625,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
  * #type = npy_float, npy_double#
  * #num_lanes = 16, 8#
  * #vsuffix = ps, pd#
+ * #epi_vsub = epi32, epi64#
  * #mask = __mmask16, __mmask8#
  * #vtype = __m512, __m512d#
  * #scale = 4, 8#
  * #vindextype = __m512i, __m256i#
  * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
  * #storemask = 0xFF, 0xF#
+ * #IS_FLOAT = 1, 0#
  */
 
 /**begin repeat1
@@ -2669,9 +2675,8 @@ AVX512F_@func@_@TYPE@(char **args, const npy_intp *dimensions, const npy_intp *s
 /**end repeat1**/
 
 /**begin repeat1
- * #func = absolute, square, conjugate#
- * #vectorf = avx512_cabsolute, avx512_csquare, avx512_conjugate#
- * #is_out_real = 1, 0, 0#
+ * #func = square, conjugate#
+ * #vectorf = avx512_csquare, avx512_conjugate#
  */
 
 #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
@@ -2695,19 +2700,12 @@ AVX512F_@func@_@TYPE@(@type@ * op,
     }
     @vindextype@ vindex = @vindexload@((@vindextype@*)index_ip1);
     @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
-#if @is_out_real@
-    @mask@ store_mask = _mm512_kand(avx512_get_full_load_mask_@vsuffix@(), @storemask@);
-#endif
     @vtype@ zeros = _mm512_setzero_@vsuffix@();
 
     while (num_remaining_elements > 0) {
         if (num_remaining_elements < @num_lanes@) {
             load_mask = avx512_get_partial_load_mask_@vsuffix@(
                     num_remaining_elements, @num_lanes@);
-#if @is_out_real@
-            store_mask = avx512_get_partial_load_mask_@vsuffix@(
-                    num_remaining_elements/2, @num_lanes@);
-#endif
         }
         @vtype@ x1;
         if (stride_ip1 == 1) {
@@ -2719,27 +2717,85 @@ AVX512F_@func@_@TYPE@(@type@ * op,
         @vtype@ out = @vectorf@_@vsuffix@(x1);
 
-#if @is_out_real@
-        _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
-        op += @num_lanes@/2;
-#else
         _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
         op += @num_lanes@;
-#endif
-
        ip += @num_lanes@*stride_ip1;
         num_remaining_elements -= @num_lanes@;
     }
-#if @is_out_real@
+}
+#endif
+/**end repeat1**/
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_GCC_OPT_3 NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_absolute_@TYPE@(@type@ * op,
+                        @type@ * ip,
+                        const npy_intp array_size,
+                        const npy_intp steps)
+{
+    npy_intp num_remaining_elements = 2*array_size;
+    const npy_intp stride_ip1 = steps/(npy_intp)sizeof(@type@)/2;
+
     /*
-     * Ignore invalid exception for cabsolute generated by vmaxps/vmaxpd
-     * and vminps/vminpd instructions
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via max_stride
      */
-    npy_clear_floatstatus_barrier((char*)op);
+    npy_int32 index_ip[32];
+    for (npy_int32 ii = 0; ii < 2*@num_lanes@; ii=ii+2) {
+        index_ip[ii] = ii*stride_ip1;
+        index_ip[ii+1] = ii*stride_ip1 + 1;
+    }
+    @vindextype@ vindex1 = @vindexload@((@vindextype@*)index_ip);
+    @vindextype@ vindex2 = @vindexload@((@vindextype@*)(index_ip+@num_lanes@));
+
+    @mask@ load_mask1 = avx512_get_full_load_mask_@vsuffix@();
+    @mask@ load_mask2 = avx512_get_full_load_mask_@vsuffix@();
+    @mask@ store_mask = avx512_get_full_load_mask_@vsuffix@();
+    @vtype@ zeros = _mm512_setzero_@vsuffix@();
+
+#if @IS_FLOAT@
+    __m512i re_index = _mm512_set_epi32(30,28,26,24,22,20,18,16,14,12,10,8,6,4,2,0);
+    __m512i im_index = _mm512_set_epi32(31,29,27,25,23,21,19,17,15,13,11,9,7,5,3,1);
+#else
+    __m512i re_index = _mm512_set_epi64(14,12,10,8,6,4,2,0);
+    __m512i im_index = _mm512_set_epi64(15,13,11,9,7,5,3,1);
 #endif
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask1 = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements, @num_lanes@);
+            load_mask2 = 0x0000;
+            store_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements/2, @num_lanes@);
+        } else if (num_remaining_elements < 2*@num_lanes@) {
+            load_mask1 = avx512_get_full_load_mask_@vsuffix@();
+            load_mask2 = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements - @num_lanes@, @num_lanes@);
+            store_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements/2, @num_lanes@);
+        }
+        @vtype@ x1, x2;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask1, ip);
+            x2 = avx512_masked_load_@vsuffix@(load_mask2, ip+@num_lanes@);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex1, load_mask1);
+            x2 = avx512_masked_gather_@vsuffix@(zeros, ip, vindex2, load_mask2);
+        }
+
+        @vtype@ out = avx512_cabsolute_@vsuffix@(x1, x2, re_index, im_index);
+
+        _mm512_mask_storeu_@vsuffix@(op, store_mask, out);
+        op += @num_lanes@;
+        ip += 2*@num_lanes@*stride_ip1;
+        num_remaining_elements -= 2*@num_lanes@;
+    }
+    npy_clear_floatstatus_barrier((char*)op);
 }
+
 #endif
-/**end repeat1**/
 
 /**end repeat**/
 
 /*
```
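The first hunk carries the core idea: rather than compressing the even (real) and odd (imaginary) lanes out of one half-used register with `_mm512_maskz_compress_@vsub@`, the kernel now loads two full registers and splits them with one `_mm512_permutex2var_@vsub@` per component, so every iteration produces a full vector of results. Below is a minimal standalone sketch of that deinterleave for single precision; the helper name `cabs16_ps` is hypothetical, and the plain `sqrt(re*re + im*im)` body stands in for the kernel's inf/nan fixups.

```c
#include <immintrin.h>

/* Hypothetical helper (not the NumPy source): |z| for 16 complex floats
 * stored as interleaved re/im pairs across two AVX-512 registers.  The
 * real avx512_cabsolute_* first takes elementwise absolute values and
 * then patches inf/nan inputs; only the index trick is shown here. */
static __m512 cabs16_ps(__m512 x1, __m512 x2)
{
    /* lanes 0,2,...,30 of the (x1, x2) pair -> real parts */
    const __m512i re_idx = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16,
                                            14, 12, 10, 8, 6, 4, 2, 0);
    /* lanes 1,3,...,31 of the pair -> imaginary parts */
    const __m512i im_idx = _mm512_set_epi32(31, 29, 27, 25, 23, 21, 19, 17,
                                            15, 13, 11, 9, 7, 5, 3, 1);
    __m512 re = _mm512_permutex2var_ps(x1, re_idx, x2);
    __m512 im = _mm512_permutex2var_ps(x1, im_idx, x2);
    /* sqrt(re^2 + im^2): 16 absolute values per call */
    return _mm512_sqrt_ps(_mm512_fmadd_ps(re, re, _mm512_mul_ps(im, im)));
}
```

Because the output register is now always full, the old `@is_out_real@` half-width `store_mask` plumbing in the generic `AVX512F_@func@_@TYPE@` loop becomes dead, which accounts for most of the deletions above; absolute gets its own dedicated loop instead.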
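The loop epilogue is the other subtle part: each pass of the new `AVX512F_absolute_@TYPE@` consumes `2*@num_lanes@` input floats but stores only `@num_lanes@` results, so the tail needs a mask per input register plus a narrower store mask. A sketch of that bookkeeping for the float case, assuming a hypothetical `partial_mask16` in place of the file's `avx512_get_partial_load_mask_ps`:

```c
#include <immintrin.h>

/* Hypothetical stand-in for avx512_get_partial_load_mask_ps: a mask with
 * the lowest `remaining` of 16 lanes set. */
static __mmask16 partial_mask16(long remaining)
{
    return (__mmask16)((1u << remaining) - 1u);
}

/* Mirrors the tail logic of the new loop (num_lanes == 16; `remaining`
 * counts floats, i.e. 2 per complex number).  Two 16-float loads feed one
 * 16-float store of absolute values. */
static void tail_masks(long remaining,
                       __mmask16 *load1, __mmask16 *load2, __mmask16 *store)
{
    const long num_lanes = 16;
    *load1 = *load2 = *store = 0xFFFF;       /* full iteration */
    if (remaining < num_lanes) {             /* only x1 partially live */
        *load1 = partial_mask16(remaining);
        *load2 = 0;
        *store = partial_mask16(remaining / 2);
    } else if (remaining < 2 * num_lanes) {  /* x2 partially live */
        *load2 = partial_mask16(remaining - num_lanes);
        *store = partial_mask16(remaining / 2);
    }
}
```

The trailing `npy_clear_floatstatus_barrier` call serves the same purpose as the comment it replaces described: the vmaxps/vmaxpd and vminps/vminpd based inf/nan handling inside `avx512_cabsolute_@vsub@` can raise a spurious invalid-FP exception that must be discarded.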