[RNG] Add optimizations (#138)

[RNG] Add optimizations
author: paveldyakov <70019067+paveldyakov@users.noreply.github.com> 2021-03-03 10:10:38 +0300
committer: GitHub <noreply@github.com> 2021-03-03 10:10:38 +0300
commit: 39a58022a0eb67a8a8b63fdd46d9bfc3acaa9425 (patch)
tree: f045494b1f7ae1d84022fbb8173e0572033aeb27
parent: 2e03e11e8d481a2d75fe924acbf16925642f303c (diff)
download: llvm-39a58022a0eb67a8a8b63fdd46d9bfc3acaa9425.tar.gz
7 files changed, 330 insertions, 116 deletions
diff --git a/include/oneapi/dpl/internal/random_impl/discard_block_engine.h b/include/oneapi/dpl/internal/random_impl/discard_block_engine.h
index eb891a71e5e6..c1b5ba8667ca 100644
--- a/include/oneapi/dpl/internal/random_impl/discard_block_engine.h
+++ b/include/oneapi/dpl/internal/random_impl/discard_block_engine.h
@@ -82,25 +82,42 @@ class discard_block_engine
         if (!__num_to_skip)
             return;
 
-        for (; __num_to_skip > 0; --__num_to_skip)
-            generate_internal_scalar<internal::type_traits_t<result_type>::num_elems>();
+        if (__num_to_skip < (used_block - n_))
+        {
+            n_ += __num_to_skip;
+            engine_.discard(__num_to_skip);
+        }
+        else
+        {
+            unsigned long long __n_skip =
+                __num_to_skip + static_cast<unsigned long long>((__num_to_skip + n_) / used_block) *
+                                    static_cast<unsigned long long>(block_size - used_block);
+            // Check the overflow case
+            if (__n_skip >= __num_to_skip)
+            {
+                n_ = (__num_to_skip - (used_block - n_)) % used_block;
+                engine_.discard(__n_skip);
+            }
+            else
+            {
+                for (; __num_to_skip > 0; --__num_to_skip)
+                    operator()();
+            }
+        }
     }
 
     // operator () returns bits of engine recurrence
     result_type
     operator()()
     {
-        result_type res = generate_internal<internal::type_traits_t<result_type>::num_elems>();
-
-        return res;
+        return generate_internal<internal::type_traits_t<result_type>::num_elems>();
     }
 
     // operator () overload for result portion generation
     result_type
-    operator()(unsigned int __randoms_num)
+    operator()(unsigned int __random_nums)
     {
-        result_type res = generate_internal<internal::type_traits_t<result_type>::num_elems>(__randoms_num);
-        return res;
+        return generate_internal<internal::type_traits_t<result_type>::num_elems>(__random_nums);
     }
 
     // Property function
@@ -154,25 +171,36 @@ class discard_block_engine
     generate_internal()
     {
         result_type __res;
-        for (int __i = 0; __i < _N; ++__i)
+        if (_N < (used_block - n_))
         {
-            __res[__i] = generate_internal_scalar<internal::type_traits_t<result_type>::num_elems>();
+            __res = engine_();
+            n_ += _N;
+        }
+        else
+        {
+            for (int __i = 0; __i < _N; ++__i)
+            {
+                __res[__i] = generate_internal_scalar<internal::type_traits_t<result_type>::num_elems>();
+            }
         }
-
         return __res;
     }
 
     template <int _N>
     typename ::std::enable_if<(_N > 0), result_type>::type
-    generate_internal(unsigned int __randoms_num)
+    generate_internal(unsigned int __random_nums)
     {
-        result_type __res;
-        for (unsigned int __i = 0; __i < __randoms_num; ++__i)
+        if (__random_nums >= _N)
+            return operator()();
+
+        result_type __part_vec;
+
+        for (unsigned int __i = 0; __i < __random_nums; ++__i)
         {
-            __res[__i] = generate_internal_scalar<internal::type_traits_t<result_type>::num_elems>();
+            __part_vec[__i] = generate_internal_scalar<internal::type_traits_t<result_type>::num_elems>();
         }
 
-        return __res;
+        return __part_vec;
     }
 
     _Engine engine_;
diff --git a/include/oneapi/dpl/internal/random_impl/linear_congruential_engine.h b/include/oneapi/dpl/internal/random_impl/linear_congruential_engine.h
index 96efa8ea94da..d5a7d418fda0 100644
--- a/include/oneapi/dpl/internal/random_impl/linear_congruential_engine.h
+++ b/include/oneapi/dpl/internal/random_impl/linear_congruential_engine.h
@@ -75,9 +75,9 @@ class linear_congruential_engine
         // Skipping sequence
         if (__num_to_skip == 0)
             return;
-        constexpr bool flag = (increment == 0) && (modulus < ::std::numeric_limits<::std::uint32_t>::max()) &&
-                              (multiplier < ::std::numeric_limits<::std::uint32_t>::max());
-        skip_seq<internal::type_traits_t<result_type>::num_elems, flag>(__num_to_skip);
+        constexpr bool __flag = (increment == 0) && (modulus < ::std::numeric_limits<::std::uint32_t>::max()) &&
+                                (multiplier < ::std::numeric_limits<::std::uint32_t>::max());
+        skip_seq<internal::type_traits_t<result_type>::num_elems, __flag>(__num_to_skip);
     }
 
     // operator () returns bits of engine recurrence
@@ -94,9 +94,9 @@ class linear_congruential_engine
 
     // operator () overload for result portion generation
     result_type
-    operator()(unsigned int __randoms_num)
+    operator()(unsigned int __random_nums)
     {
-        return result_portion_internal<internal::type_traits_t<result_type>::num_elems>(__randoms_num);
+        return result_portion_internal<internal::type_traits_t<result_type>::num_elems>(__random_nums);
     }
 
   private:
@@ -143,7 +143,7 @@ class linear_congruential_engine
 
         state_[0] = mod_scalar(state_[0]);
 
-        for (int __i = 1u; __i < _N; __i++)
+        for (int __i = 1u; __i < _N; ++__i)
             state_[__i] = mod_scalar(state_[__i - 1u]);
     }
 
@@ -154,9 +154,7 @@ class linear_congruential_engine
         ::std::uint64_t __a2;
         ::std::uint64_t __mod = static_cast<::std::uint64_t>(modulus);
         ::std::uint64_t __a = static_cast<::std::uint64_t>(multiplier);
-        scalar_type __r;
-
-        __r = 1;
+        scalar_type __r = 1;
 
         do
         {
@@ -182,7 +180,7 @@ class linear_congruential_engine
     typename ::std::enable_if<(_N == 0) && (_FLAG == false)>::type
     skip_seq(unsigned long long __num_to_skip)
     {
-        for (unsigned long long __i = 0; __i < __num_to_skip; ++__i)
+        for (; __num_to_skip > 0; --__num_to_skip)
             state_ = mod_scalar(state_);
     }
 
@@ -190,7 +188,7 @@ class linear_congruential_engine
     typename ::std::enable_if<(_N == 1) && (_FLAG == false)>::type
     skip_seq(unsigned long long __num_to_skip)
     {
-        for (unsigned long long __i = 0; __i < __num_to_skip; ++__i)
+        for (; __num_to_skip > 0; --__num_to_skip)
             state_[0] = mod_scalar(state_[0]);
     }
 
@@ -198,11 +196,11 @@ class linear_congruential_engine
     typename ::std::enable_if<(_N > 1) && (_FLAG == false)>::type
     skip_seq(unsigned long long __num_to_skip)
     {
-        for (unsigned long long __i = 0; __i < __num_to_skip; ++__i)
+        for (; __num_to_skip > 0; --__num_to_skip)
         {
-            for (int __j = 0; __j < (_N - 1); ++__j)
+            for (int __i = 0; __i < (_N - 1); ++__i)
             {
-                state_[__j] = state_[__j + 1];
+                state_[__i] = state_[__i + 1];
             }
             state_[_N - 1] = mod_scalar(state_[_N - 2]);
         }
@@ -232,24 +230,24 @@ class linear_congruential_engine
     {
         ::std::uint64_t __mod = modulus, __inc = increment;
         ::std::uint64_t __mult = pow_mult_n(__num_to_skip);
-        for (unsigned int __i = 0; __i < _N; ++__i)
+        for (int __i = 0; __i < _N; ++__i)
             state_[__i] = static_cast<scalar_type>(((__mult * static_cast<::std::uint64_t>(state_[__i])) % __mod));
     }
 
     // result_portion implementation
     template <int _N>
     typename ::std::enable_if<(_N > 0), result_type>::type
-    result_portion_internal(unsigned int __randoms_num)
+    result_portion_internal(unsigned int __random_nums)
     {
         result_type __part_vec;
-        if (__randoms_num < 1)
-            return __part_vec;
 
-        unsigned int __num_to_gen = (__randoms_num > _N) ? _N : __randoms_num;
-        for (unsigned int __i = 0; __i < __num_to_gen; ++__i)
+        if (__random_nums >= _N)
+            return operator()();
+
+        for (unsigned int __i = 0; __i < __random_nums; ++__i)
             __part_vec[__i] = state_[__i];
 
-        discard(__num_to_gen);
+        discard(__random_nums);
         return __part_vec;
     }
 
diff --git a/include/oneapi/dpl/internal/random_impl/normal_distribution.h b/include/oneapi/dpl/internal/random_impl/normal_distribution.h
index 60ef897cfc4c..08615a6c3b95 100644
--- a/include/oneapi/dpl/internal/random_impl/normal_distribution.h
+++ b/include/oneapi/dpl/internal/random_impl/normal_distribution.h
@@ -118,23 +118,16 @@ class normal_distribution
 
     template <class _Engine>
     result_type
-    operator()(_Engine& __engine, unsigned int __randoms_num)
+    operator()(_Engine& __engine, unsigned int __random_nums)
     {
-        return operator()<_Engine>(__engine, param_type(mean_, stddev_), __randoms_num);
+        return operator()<_Engine>(__engine, param_type(mean_, stddev_), __random_nums);
     }
 
     template <class _Engine>
     result_type
-    operator()(_Engine& __engine, const param_type& __params, unsigned int __randoms_num)
+    operator()(_Engine& __engine, const param_type& __params, unsigned int __random_nums)
     {
-        result_type __part_vec;
-        if (__randoms_num < 1)
-            return __part_vec;
-
-        int __portion = (__randoms_num > size_of_type_) ? size_of_type_ : __randoms_num;
-
-        __part_vec = result_portion_internal<size_of_type_, _Engine>(__engine, __params, __portion);
-        return __part_vec;
+        return result_portion_internal<size_of_type_, _Engine>(__engine, __params, __random_nums);
     }
 
   private:
@@ -161,14 +154,14 @@ class normal_distribution
 
     // Callback function
     template <typename _Type = float>
-    scalar_type
+    inline scalar_type
     callback()
     {
         return ((scalar_type*)(internal::gaussian_sp_table))[1];
     }
 
     template <>
-    scalar_type
+    inline scalar_type
     callback<double>()
     {
         return ((scalar_type*)(internal::gaussian_dp_table))[1];
@@ -176,14 +169,14 @@ class normal_distribution
 
     // Get 2 * pi function
     template <typename _Type = float>
-    scalar_type
+    inline scalar_type
     pi2()
     {
         return ((scalar_type*)(internal::gaussian_sp_table))[0];
     }
 
     template <>
-    scalar_type
+    inline scalar_type
     pi2<double>()
     {
         return ((scalar_type*)(internal::gaussian_dp_table))[0];
@@ -194,9 +187,10 @@ class normal_distribution
     typename ::std::enable_if<(_Ndistr != 0), result_type>::type
     generate(_Engine& __engine, const param_type __params)
     {
-        return generate_vec_internal(__engine, __params, _Ndistr);
+        return generate_vec<_Ndistr, _Engine>(__engine, __params);
     }
 
+    // Specialization of the scalar generation
     template <int _Ndistr, class _Engine>
     typename ::std::enable_if<(_Ndistr == 0), result_type>::type
     generate(_Engine& __engine, const param_type __params)
@@ -232,14 +226,86 @@ class normal_distribution
         return __res;
     }
 
-    // Implementation for the generate vector function
+    // Specialization of the vector generation with size = [1; 3]
+    template <int __N, class _Engine>
+    typename ::std::enable_if<(__N <= 3), result_type>::type
+    generate_vec(_Engine& __engine, const param_type __params)
+    {
+        return generate_n_elems<_Engine>(__engine, __params, __N);
+    }
+
+    // Specialization of the vector generation with size = [4; 8; 16]: whole-vector Box-Muller over __N / 2 pairs
+    template <int __N, class _Engine>
+    typename ::std::enable_if<(__N > 3), result_type>::type
+    generate_vec(_Engine& __engine, const param_type __params)
+    {
+        uniform_result_type __u;
+        scalar_type __mean = __params.first, __stddev = __params.second;
+        result_type __res;
+
+        constexpr unsigned int __vec_size = __N / 2;
+        sycl::vec<scalar_type, __vec_size> __sin, __cos;
+        sycl::vec<scalar_type, __vec_size> __u1_transformed;
+
+        __u = uniform_real_distribution_(__engine,
+                                         param_type(static_cast<scalar_type>(0.0), static_cast<scalar_type>(1.0)), __N);
+
+        sycl::vec<scalar_type, __vec_size> __u1 = __u.even();
+        sycl::vec<scalar_type, __vec_size> __u2 = __u.odd();
+
+        // log(__u1) per lane; lanes where __u1 == 0 take the callback() table value instead
+        __u1_transformed =
+            select(sycl::log(__u1), sycl::vec<scalar_type, __vec_size>{callback<scalar_type>()},
+                   sycl::isequal(__u1, sycl::vec<scalar_type, __vec_size>{static_cast<scalar_type>(0.0)}));
+
+        // Sine is returned, cosine is written through &__cos
+        __sin = sycl::sincos(pi2<scalar_type>() * __u2, &__cos);
+
+        if (!flag_)
+        {
+            __u1_transformed = sycl::sqrt(static_cast<scalar_type>(-2.0) * __u1_transformed);
+            __res.even() = __u1_transformed * __sin * __stddev + __mean;
+            __res.odd() = __u1_transformed * __cos * __stddev + __mean;
+
+            // flag_ is left false: every pair generated here was used in full
+        }
+        else
+        {
+            __res[0] = __mean + __stddev * (sycl::sqrt(-static_cast<scalar_type>(2.0) * saved_ln_) *
+                                            sycl::cos(pi2<scalar_type>() * saved_u2_));
+            // Stop at __N - 2: the last pair is split below, and __res[__i + 2] must stay inside the vector
+            for (int __i = 0, __j = 0; __i < __N - 2; __i += 2, ++__j)
+            {
+                __res[__i + 1] =
+                    (sycl::sqrt(static_cast<scalar_type>(-2.0) * __u1_transformed[__j]) * __sin[__j]) * __stddev +
+                    __mean;
+                __res[__i + 2] =
+                    (sycl::sqrt(static_cast<scalar_type>(-2.0) * __u1_transformed[__j]) * __cos[__j]) * __stddev +
+                    __mean;
+            }
+            // Last pair: its sine term fills the final slot
+            __res[__N - 1] = (sycl::sqrt(static_cast<scalar_type>(-2.0) * __u1_transformed[__vec_size - 1]) *
+                              __sin[__vec_size - 1]) *
+                                 __stddev +
+                             __mean;
+
+            saved_ln_ = __u1_transformed[__vec_size - 1];
+            saved_u2_ = __u2[__vec_size - 1];
+
+            // flag_ is left true: the last pair's log and u2 are kept in saved_ln_ / saved_u2_
+        }
+        return __res;
+    }
+
+    // Implementation for the N vector's elements generation
     template <class _Engine>
     result_type
-    generate_vec_internal(_Engine& __engine, const param_type __params, unsigned int __N)
+    generate_n_elems(_Engine& __engine, const param_type __params, unsigned int __N)
     {
 
         uniform_result_type __u;
         scalar_type __u1, __u2, __ln;
+        scalar_type __sin, __cos;
         scalar_type __mean = __params.first, __stddev = __params.second;
         result_type __res;
 
@@ -253,11 +319,12 @@ class normal_distribution
             {
                 __u1 = __u[__i];
                 __u2 = __u[__i + 1];
+
+                __sin = sycl::sincos(pi2<scalar_type>() * __u2, &__cos);
+
                 __ln = (__u1 == static_cast<scalar_type>(0.0)) ? callback<scalar_type>() : sycl::log(__u1);
-                __res[__i] = __mean + __stddev * (sycl::sqrt(-static_cast<scalar_type>(2.0) * __ln) *
-                                                  sycl::sin(pi2<scalar_type>() * __u2));
-                __res[__i + 1] = __mean + __stddev * (sycl::sqrt(-static_cast<scalar_type>(2.0) * __ln) *
-                                                      sycl::cos(pi2<scalar_type>() * __u2));
+                __res[__i] = __mean + __stddev * (sycl::sqrt(-static_cast<scalar_type>(2.0) * __ln) * __sin);
+                __res[__i + 1] = __mean + __stddev * (sycl::sqrt(-static_cast<scalar_type>(2.0) * __ln) * __cos);
             }
             if (__tail)
             {
@@ -286,13 +353,13 @@ class normal_distribution
 
             for (unsigned int __i = 1; __i < (__N - __tail); __i += 2)
             {
                 __u1 = __u[__i - 1];
                 __u2 = __u[__i];
+                // __u2 must be loaded before its sine/cosine are taken (sine returned, cosine via &__cos)
+                __sin = sycl::sincos(pi2<scalar_type>() * __u2, &__cos);
                 __ln = (__u1 == static_cast<scalar_type>(0.0)) ? callback<scalar_type>() : sycl::log(__u1);
-                __res[__i] = __mean + __stddev * (sycl::sqrt(-static_cast<scalar_type>(2.0) * __ln) *
-                                                  sycl::sin(pi2<scalar_type>() * __u2));
-                __res[__i + 1] = __mean + __stddev * (sycl::sqrt(-static_cast<scalar_type>(2.0) * __ln) *
-                                                      sycl::cos(pi2<scalar_type>() * __u2));
+                __res[__i] = __mean + __stddev * (sycl::sqrt(-static_cast<scalar_type>(2.0) * __ln) * __sin);
+                __res[__i + 1] = __mean + __stddev * (sycl::sqrt(-static_cast<scalar_type>(2.0) * __ln) * __cos);
             }
             if (__tail)
             {
@@ -315,7 +382,14 @@ class normal_distribution
     typename ::std::enable_if<(_Ndistr != 0), result_type>::type
     result_portion_internal(_Engine& __engine, const param_type __params, unsigned int __N)
     {
-        return generate_vec_internal(__engine, __params, __N);
+        result_type __part_vec;
+        if (__N == 0)
+            return __part_vec;
+        else if (__N >= _Ndistr)
+            return generate<_Ndistr, _Engine>(__engine, __params); // whole vector, still driven by the caller's __params
+        // Partial request: only the first __N elements are generated
+        __part_vec = generate_n_elems(__engine, __params, __N);
+        return __part_vec;
     }
 };
 
diff --git a/include/oneapi/dpl/internal/random_impl/subtract_with_carry_engine.h b/include/oneapi/dpl/internal/random_impl/subtract_with_carry_engine.h
index 48831f699881..918e394c6872 100644
--- a/include/oneapi/dpl/internal/random_impl/subtract_with_carry_engine.h
+++ b/include/oneapi/dpl/internal/random_impl/subtract_with_carry_engine.h
@@ -84,18 +84,14 @@ class subtract_with_carry_engine
     result_type
     operator()()
     {
-        result_type res = generate_internal<internal::type_traits_t<result_type>::num_elems>();
-
-        return res;
+        return generate_internal<internal::type_traits_t<result_type>::num_elems>();
     }
 
     // operator () overload for result portion generation
     result_type
-    operator()(unsigned int __randoms_num)
+    operator()(unsigned int __random_nums)
     {
-        result_type __res = generate_internal<internal::type_traits_t<result_type>::num_elems>(__randoms_num);
-
-        return __res;
+        return result_portion_internal<internal::type_traits_t<result_type>::num_elems>(__random_nums);
     }
 
   private:
@@ -142,12 +138,14 @@ class subtract_with_carry_engine
         }
         else
         {
-            x_[i_] = max() - x_[__id_2] - c_ + x_[__id_1] + 1u;
+            x_[__id] = max() - x_[__id_2] - c_ + x_[__id_1] + 1u;
             c_ = 1;
         }
+
         i_++;
         if (i_ >= long_lag)
             i_ = 0;
+
         return x_[__id];
     };
 
@@ -171,17 +169,22 @@ class subtract_with_carry_engine
         return __res;
     }
 
+    // result_portion implementation
     template <int _N>
     typename ::std::enable_if<(_N > 0), result_type>::type
-    generate_internal(unsigned int __randoms_num)
+    result_portion_internal(unsigned int __random_nums)
     {
-        result_type __res;
-        for (unsigned int __i = 0; __i < __randoms_num; ++__i)
+        result_type __part_vec;
+
+        if (__random_nums >= _N)
+            return operator()();
+
+        for (unsigned int __i = 0; __i < __random_nums; ++__i)
         {
-            __res[__i] = generate_internal_scalar();
+            __part_vec[__i] = generate_internal_scalar();
         }
 
-        return __res;
+        return __part_vec;
     }
 
     scalar_type x_[long_lag];
diff --git a/include/oneapi/dpl/internal/random_impl/uniform_int_distribution.h b/include/oneapi/dpl/internal/random_impl/uniform_int_distribution.h
index 3954ce411764..9e4b6530ba03 100644
--- a/include/oneapi/dpl/internal/random_impl/uniform_int_distribution.h
+++ b/include/oneapi/dpl/internal/random_impl/uniform_int_distribution.h
@@ -104,23 +104,16 @@ class uniform_int_distribution
     // Generation by portion
     template <class _Engine>
     result_type
-    operator()(_Engine& __engine, unsigned int __randoms_num)
+    operator()(_Engine& __engine, unsigned int __random_nums)
     {
-        return operator()<_Engine>(__engine, param_type(a_, b_), __randoms_num);
+        return operator()<_Engine>(__engine, param_type(a_, b_), __random_nums);
     }
 
     template <class _Engine>
     result_type
-    operator()(_Engine& __engine, const param_type& __params, unsigned int __randoms_num)
+    operator()(_Engine& __engine, const param_type& __params, unsigned int __random_nums)
     {
-        result_type __part_vec;
-        if (__randoms_num < 1)
-            return __part_vec;
-
-        unsigned int __portion = (__randoms_num > size_of_type_) ? size_of_type_ : __randoms_num;
-
-        __part_vec = result_portion_internal<size_of_type_, _Engine>(__engine, __params, __portion);
-        return __part_vec;
+        return result_portion_internal<size_of_type_, _Engine>(__engine, __params, __random_nums);
     }
 
   private:
@@ -150,7 +143,11 @@ class uniform_int_distribution
             __engine, ::std::pair<double, double>(static_cast<double>(__params.first),
                                                   static_cast<double>(__params.second) + 1.0));
 
-        return __res.template convert<scalar_type, sycl::rounding_mode::rte>();
+        result_type __res_ret;
+        for (int __i = 0; __i < _Ndistr; ++__i)
+            __res_ret[__i] = static_cast<scalar_type>(__res[__i]);
+
+        return __res_ret;
     }
 
     template <int _Ndistr, class _Engine>
@@ -169,13 +166,22 @@ class uniform_int_distribution
     typename ::std::enable_if<(_Ndistr != 0), result_type>::type
     result_portion_internal(_Engine& __engine, const param_type& __params, unsigned int __N)
     {
+        result_type __part_vec;
+        if (__N == 0)
+            return __part_vec;
+        else if (__N >= _Ndistr)
+            return operator()(__engine, __params); // whole vector, still driven by the caller's __params
+        // Partial request: __N reals are drawn with upper bound second + 1, then cast to scalar_type
         RealType __res =
             uniform_real_distribution_(__engine,
                                        ::std::pair<double, double>(static_cast<double>(__params.first),
                                                                    static_cast<double>(__params.second) + 1.0),
                                        __N);
 
-        return __res.template convert<scalar_type, sycl::rounding_mode::rte>();
+        for (unsigned int __i = 0; __i < __N; ++__i)
+            __part_vec[__i] = static_cast<scalar_type>(__res[__i]);
+
+        return __part_vec;
     }
 };
 
diff --git a/include/oneapi/dpl/internal/random_impl/uniform_real_distribution.h b/include/oneapi/dpl/internal/random_impl/uniform_real_distribution.h
index 3c4bf1803084..8aa4ea2aa831 100644
--- a/include/oneapi/dpl/internal/random_impl/uniform_real_distribution.h
+++ b/include/oneapi/dpl/internal/random_impl/uniform_real_distribution.h
@@ -98,33 +98,23 @@ class uniform_real_distribution
     result_type
     operator()(_Engine& __engine, const param_type& __params)
     {
-        result_type __res =
-            generate<size_of_type_, internal::type_traits_t<typename _Engine::result_type>::num_elems, _Engine>(
-                __engine, __params);
-        return __res;
+        return generate<size_of_type_, internal::type_traits_t<typename _Engine::result_type>::num_elems, _Engine>(
+            __engine, __params);
     }
 
     template <class _Engine>
     result_type
-    operator()(_Engine& __engine, unsigned int __randoms_num)
+    operator()(_Engine& __engine, unsigned int __random_nums)
     {
-        return operator()<_Engine>(__engine, param_type(a_, b_), __randoms_num);
+        return operator()<_Engine>(__engine, param_type(a_, b_), __random_nums);
     }
 
     template <class _Engine>
     result_type
-    operator()(_Engine& __engine, const param_type& __params, unsigned int __randoms_num)
+    operator()(_Engine& __engine, const param_type& __params, unsigned int __random_nums)
     {
-        result_type __part_vec;
-        if (__randoms_num < 1)
-            return __part_vec;
-
-        unsigned int __portion = (__randoms_num > size_of_type_) ? size_of_type_ : __randoms_num;
-
-        __part_vec =
-            result_portion_internal<size_of_type_, internal::type_traits_t<typename _Engine::result_type>::num_elems,
-                                    _Engine>(__engine, __params, __portion);
-        return __part_vec;
+        return result_portion_internal<size_of_type_, internal::type_traits_t<typename _Engine::result_type>::num_elems,
+                                       _Engine>(__engine, __params, __random_nums);
     }
 
   private:
@@ -145,7 +135,11 @@ class uniform_real_distribution
     generate(_Engine& __engine, const param_type& __params)
     {
         auto __engine_output = __engine();
-        auto __res = __engine_output.template convert<scalar_type, sycl::rounding_mode::rte>();
+        result_type __res;
+
+        for (int __i = 0; __i < _Ndistr; ++__i)
+            __res[__i] = static_cast<scalar_type>(__engine_output[__i]);
+
         __res = ((__res - __engine.min()) / (1 + static_cast<scalar_type>(__engine.max() - __engine.min()))) *
                     (__params.second - __params.first) +
                 __params.first;
@@ -200,7 +194,7 @@ class uniform_real_distribution
     {
         sycl::vec<scalar_type, _Ndistr> __res;
         int __i;
-        int __tail_size = _Ndistr % _Negnine;
+        constexpr int __tail_size = _Ndistr % _Negnine;
         for (__i = 0; __i < _Ndistr; __i += _Negnine)
         {
             auto __engine_output = __engine();
@@ -248,11 +242,11 @@ class uniform_real_distribution
     // Implementation for result_portion function
     template <int _Ndistr, int _Negnine, class _Engine>
     typename ::std::enable_if<((_Ndistr <= _Negnine) & (_Ndistr != 0)), result_type>::type
-    result_portion_internal(_Engine& __engine, const param_type& __params, unsigned int __N)
+    generate_n_elems(_Engine& __engine, const param_type& __params, unsigned int __N)
     {
         auto __engine_output = __engine(__N);
         result_type __res;
-        for (unsigned int __i = 0; __i < __N; ++__i)
+        for (int __i = 0; __i < __N; ++__i)
         {
             __res[__i] = static_cast<scalar_type>(__engine_output[__i]);
             __res[__i] =
@@ -266,15 +260,15 @@ class uniform_real_distribution
 
     template <int _Ndistr, int _Negnine, class _Engine>
     typename ::std::enable_if<((_Ndistr > _Negnine) & (_Negnine != 0)), result_type>::type
-    result_portion_internal(_Engine& __engine, const param_type& __params, unsigned int __N)
+    generate_n_elems(_Engine& __engine, const param_type& __params, unsigned int __N)
     {
         result_type __res;
-        unsigned int __i;
+        int __i;
 
         if (_Negnine >= __N)
         {
             auto __engine_output = __engine(__N);
-            for (unsigned int __i = 0; __i < __N; ++__i)
+            for (__i = 0; __i < __N; ++__i)
             {
                 __res[__i] = static_cast<scalar_type>(__engine_output[__i]);
                 __res[__i] =
@@ -321,10 +315,10 @@ class uniform_real_distribution
 
     template <int _Ndistr, int _Negnine, class _Engine>
     typename ::std::enable_if<((_Ndistr > _Negnine) & (_Negnine == 0)), result_type>::type
-    result_portion_internal(_Engine& __engine, const param_type& __params, unsigned int __N)
+    generate_n_elems(_Engine& __engine, const param_type& __params, unsigned int __N)
     {
         result_type __res;
-        for (unsigned int __i = 0; __i < __N; ++__i)
+        for (int __i = 0; __i < __N; ++__i)
         {
             __res[__i] = static_cast<scalar_type>(__engine());
             __res[__i] =
@@ -335,6 +329,21 @@ class uniform_real_distribution
 
         return __res;
     }
+
+    // result_portion dispatcher: unset vector for __N == 0, full vector for __N >= _Ndistr, else first __N elements
+    template <int _Ndistr, int _Negnine, class _Engine>
+    typename ::std::enable_if<(_Ndistr != 0), result_type>::type
+    result_portion_internal(_Engine& __engine, const param_type __params, unsigned int __N)
+    {
+        result_type __part_vec;
+        if (__N == 0)
+            return __part_vec;
+        else if (__N >= _Ndistr)
+            return operator()(__engine, __params); // whole vector, still driven by the caller's __params
+
+        __part_vec = generate_n_elems<_Ndistr, _Negnine, _Engine>(__engine, __params, __N);
+        return __part_vec;
+    }
 };
 
 } // namespace dpl
diff --git a/test/rng_testsuite/statistics_tests/normal_distribution_test.pass.cpp b/test/rng_testsuite/statistics_tests/normal_distribution_test.pass.cpp
index cf28e6f869f8..4725de219ae2 100644
--- a/test/rng_testsuite/statistics_tests/normal_distribution_test.pass.cpp
+++ b/test/rng_testsuite/statistics_tests/normal_distribution_test.pass.cpp
@@ -215,6 +215,72 @@ int test_portion(oneapi::dpl::internal::element_type_t<RealType> mean, oneapi::d
 }
 
 template<class RealType, class UIntType>
+int test_flag(oneapi::dpl::internal::element_type_t<RealType> mean, oneapi::dpl::internal::element_type_t<RealType> stddev, int nsamples) {
+
+    sycl::queue queue(sycl::default_selector{});
+
+    // memory allocation
+    std::vector<oneapi::dpl::internal::element_type_t<RealType>> std_samples(nsamples);
+    std::vector<oneapi::dpl::internal::element_type_t<RealType>> dpstd_samples(nsamples);
+
+    constexpr int num_elems = oneapi::dpl::internal::type_traits_t<RealType>::num_elems == 0 ? 1 : oneapi::dpl::internal::type_traits_t<RealType>::num_elems;
+    constexpr int num_to_skip = (num_elems + 1) % 2 ? (num_elems + 2) : (num_elems + 1);
+
+    // dpstd generation
+    {
+        sycl::buffer<oneapi::dpl::internal::element_type_t<RealType>, 1> dpstd_buffer(dpstd_samples.data(), nsamples);
+
+        queue.submit([&](sycl::handler &cgh) {
+            auto dpstd_acc = dpstd_buffer.template get_access<sycl::access::mode::write>(cgh);
+
+            cgh.parallel_for<>(sycl::range<1>(nsamples / (num_elems + 1)),
+                    [=](sycl::item<1> idx) {
+
+                unsigned long long offset = idx.get_linear_id() * num_to_skip;
+                oneapi::dpl::linear_congruential_engine<UIntType, a, c, m> engine(seed, offset);
+                oneapi::dpl::normal_distribution<RealType> distr(mean, stddev);
+
+                // Generate the first element
+                auto res_1 = distr(engine, 1);
+
+                // Generate the rest elements
+                auto res = distr(engine);
+
+                dpstd_acc[idx * (num_elems + 1)] = res_1[0];
+                for(int i = 0; i < num_elems; ++i)
+                    dpstd_acc[idx * (num_elems + 1) + i + 1] = res[i];
+            });
+        });
+        queue.wait();
+    }
+
+    // std generation
+    generate_std<oneapi::dpl::internal::element_type_t<UIntType>, oneapi::dpl::internal::element_type_t<RealType>>
+        (num_elems + 1, nsamples, mean, stddev, std_samples);
+
+    // comparison
+    int err = 0;
+    for(int i = 0; i < nsamples; ++i) {
+        if (abs(std_samples[i] - dpstd_samples[i]) > eps) {
+            std::cout << "\nError: std_sample[" << i << "] = " << std_samples[i] << ", dpstd_samples[" << i << "] = " << dpstd_samples[i];
+            err++;
+        }
+    }
+
+    // statistics check
+    err += statistics_check(nsamples, mean, stddev, dpstd_samples);
+
+    if(err) {
+        std::cout << "\tFailed" << std::endl;
+    }
+    else {
+        std::cout << "\tPassed" << std::endl;
+    }
+
+    return err;
+}
+
+template<class RealType, class UIntType>
 int tests_set(int nsamples) {
     constexpr int nparams = 2;
 
@@ -254,6 +320,20 @@ int tests_set_portion(std::int32_t nsamples, unsigned int part) {
     return 0;
 }
 
+template<class RealType, class UIntType>
+int tests_set_flag(int nsamples) {
+
+    int err;
+    // Test for all non-zero parameters
+    std::cout << "normal_distribution test<type>, mean = " << 0.0 << ", stddev = " << 1.0 <<
+    ", nsamples = " << nsamples << ", flag = true, vec_size = " << oneapi::dpl::internal::type_traits_t<RealType>::num_elems;
+    err = test_flag<RealType, UIntType>(0.0, 1.0, nsamples);
+    if (err)
+        return 1;
+
+    return 0;
+}
+
 #endif // _ONEDPL_BACKEND_SYCL
 
 int main() {
@@ -353,6 +433,22 @@ int main() {
         return 1;
     }
 
+
+    // testing flag = true case; results are accumulated so a failure at any vector size fails the run
+    std::cout << "----------------------------------------------------------------------" << std::endl;
+    std::cout << "Float vector sizes = [1, 2, 3, 4, 8, 16], flag = true" << std::endl;
+    std::cout << "----------------------------------------------------------------------" << std::endl;
+    err = tests_set_flag<sycl::vec<float, 1>, sycl::vec<std::uint32_t, 1>>(160);
+    err += tests_set_flag<sycl::vec<float, 2>, sycl::vec<std::uint32_t, 2>>(99);
+    err += tests_set_flag<sycl::vec<float, 3>, sycl::vec<std::uint32_t, 3>>(160);
+    err += tests_set_flag<sycl::vec<float, 4>, sycl::vec<std::uint32_t, 4>>(100);
+    err += tests_set_flag<sycl::vec<float, 8>, sycl::vec<std::uint32_t, 8>>(99);
+    err += tests_set_flag<sycl::vec<float, 16>, sycl::vec<std::uint32_t, 16>>(170);
+    if(err) {
+        std::cout << "Test FAILED" << std::endl;
+        return 1;
+    }
+
 #else
     std::cout << "\tTest is skipped for non-SYCL backend" << std::endl;
 #endif // _ONEDPL_BACKEND_SYCL
author	paveldyakov <70019067+paveldyakov@users.noreply.github.com>	2021-03-03 10:10:38 +0300
committer	GitHub <noreply@github.com>	2021-03-03 10:10:38 +0300
commit	39a58022a0eb67a8a8b63fdd46d9bfc3acaa9425 (patch)
tree	f045494b1f7ae1d84022fbb8173e0572033aeb27
parent	2e03e11e8d481a2d75fe924acbf16925642f303c (diff)
download	llvm-39a58022a0eb67a8a8b63fdd46d9bfc3acaa9425.tar.gz