summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGrigore Lupescu <grigore.lupescu at intel.com>2016-04-11 17:41:30 +0300
committerYang Rong <rong.r.yang@intel.com>2016-05-18 15:10:36 +0800
commit52628691322b9fd5bdba2ecfb9e6bfe17ae940f8 (patch)
tree16299017fb63c4e00fc9aede947d8d0d3162a9ff
parent22d7f7c22654b58d1daf1e10da26235fe4ece3b8 (diff)
downloadbeignet-52628691322b9fd5bdba2ecfb9e6bfe17ae940f8.tar.gz
Benchmark: Add performance tests for workgroup broadcast
Added the following performance tests: benchmark_workgroup_broadcast_1D_int benchmark_workgroup_broadcast_1D_long benchmark_workgroup_broadcast_2D_int benchmark_workgroup_broadcast_2D_long Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com> Reviewed-by: Pan Xiuli <xiuli.pan@intel.com>
-rw-r--r--benchmark/benchmark_workgroup.cpp187
-rw-r--r--kernels/bench_workgroup.cl99
2 files changed, 251 insertions, 35 deletions
diff --git a/benchmark/benchmark_workgroup.cpp b/benchmark/benchmark_workgroup.cpp
index 4e4802e4..3f073bb4 100644
--- a/benchmark/benchmark_workgroup.cpp
+++ b/benchmark/benchmark_workgroup.cpp
@@ -9,13 +9,26 @@
using namespace std;
-/* NDRANGE */
-#define WG_GLOBAL_SIZE (512 * 256)
-#define WG_LOCAL_SIZE 128
-#define WG_LOOP_COUNT 10000
+/* work-group general settings */
+#define WG_GLOBAL_SIZE (512 * 256)
+#define WG_LOCAL_SIZE 128
+#define WG_LOOP_COUNT 1000
+
+/* work-group broadcast only */
+#define WG_GLOBAL_SIZE_X 1024
+#define WG_GLOBAL_SIZE_Y 1024
+
+#define WG_LOCAL_SIZE_X 32
+#define WG_LOCAL_SIZE_Y 2
+
+#define WG_LOCAL_X 5
+#define WG_LOCAL_Y 0
+
enum WG_FUNCTION
{
+ WG_BROADCAST_1D,
+ WG_BROADCAST_2D,
WG_REDUCE_ADD,
WG_REDUCE_MIN,
WG_REDUCE_MAX,
@@ -34,48 +47,62 @@ enum WG_FUNCTION
template<class T>
static void benchmark_expected(WG_FUNCTION wg_func,
T* input,
- T* expected)
+ T* expected,
+ uint32_t wg_global_size,
+ uint32_t wg_local_size)
{
- if(wg_func == WG_REDUCE_ADD)
+ if(wg_func == WG_BROADCAST_1D)
+ {
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] = input[WG_LOCAL_X];
+ }
+ else if(wg_func == WG_BROADCAST_2D)
+ {
+ for(uint32_t i = 0; i < wg_local_size; i++)
+ expected[i] =
+ input[WG_LOCAL_X +
+ WG_LOCAL_Y * WG_LOCAL_SIZE_X];
+ }
+ else if(wg_func == WG_REDUCE_ADD)
{
T wg_sum = input[0];
- for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ for(uint32_t i = 1; i < wg_local_size; i++)
wg_sum += input[i];
- for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+ for(uint32_t i = 0; i < wg_local_size; i++)
expected[i] = wg_sum;
}
else if(wg_func == WG_REDUCE_MAX)
{
T wg_max = input[0];
- for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ for(uint32_t i = 1; i < wg_local_size; i++)
wg_max = max(input[i], wg_max);
- for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+ for(uint32_t i = 0; i < wg_local_size; i++)
expected[i] = wg_max;
}
else if(wg_func == WG_REDUCE_MIN)
{
T wg_min = input[0];
- for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ for(uint32_t i = 1; i < wg_local_size; i++)
wg_min = min(input[i], wg_min);
- for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++)
+ for(uint32_t i = 0; i < wg_local_size; i++)
expected[i] = wg_min;
}
else if(wg_func == WG_SCAN_INCLUSIVE_ADD)
{
expected[0] = input[0];
- for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ for(uint32_t i = 1; i < wg_local_size; i++)
expected[i] = input[i] + expected[i - 1];
}
else if(wg_func == WG_SCAN_INCLUSIVE_MAX)
{
expected[0] = input[0];
- for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ for(uint32_t i = 1; i < wg_local_size; i++)
expected[i] = max(input[i], expected[i - 1]);
}
else if(wg_func == WG_SCAN_INCLUSIVE_MIN)
{
expected[0] = input[0];
- for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++)
+ for(uint32_t i = 1; i < wg_local_size; i++)
expected[i] = min(input[i], expected[i - 1]);
}
}
@@ -87,23 +114,42 @@ static void benchmark_expected(WG_FUNCTION wg_func,
template<class T>
static void benchmark_data(WG_FUNCTION wg_func,
T* &input,
- T* &expected)
+ T* &expected,
+ uint32_t &wg_global_size,
+ uint32_t &wg_local_size)
{
- input = new T[WG_GLOBAL_SIZE];
- expected = new T[WG_GLOBAL_SIZE];
+ if(wg_func == WG_BROADCAST_1D)
+ {
+ wg_global_size = WG_GLOBAL_SIZE_X;
+ wg_local_size = WG_LOCAL_SIZE_X;
+ }
+ else if(wg_func == WG_BROADCAST_2D)
+ {
+ wg_global_size = WG_GLOBAL_SIZE_X * WG_GLOBAL_SIZE_Y;
+ wg_local_size = WG_LOCAL_SIZE_X * WG_LOCAL_SIZE_Y;
+ }
+ else
+ {
+ wg_global_size = WG_GLOBAL_SIZE;
+ wg_local_size = WG_LOCAL_SIZE;
+ }
+
+ input = new T[wg_global_size];
+ expected = new T[wg_global_size];
/* seed for random inputs */
srand (time(NULL));
/* generate inputs and expected values */
- for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += WG_LOCAL_SIZE)
+ for(uint32_t gid = 0; gid < wg_global_size; gid += wg_local_size)
{
/* input values */
- for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++)
- input[gid + lid] = (rand() % 112) / 3.1415f;
+ for(uint32_t lid = 0; lid < wg_local_size; lid++)
+ input[gid + lid] = (rand() % 512) / 3.1415f;
/* expected values */
- benchmark_expected(wg_func, input + gid, expected + gid);
+ benchmark_expected(wg_func, input + gid, expected + gid,
+ wg_global_size, wg_local_size);
}
}
@@ -117,30 +163,60 @@ static double benchmark_generic(WG_FUNCTION wg_func,
T* expected)
{
double elapsed = 0;
- const uint32_t reduce_loop = 10000;
+ const uint32_t reduce_loop = WG_LOOP_COUNT;
struct timeval start,stop;
+ uint32_t wg_global_size = 0;
+ uint32_t wg_local_size = 0;
+
/* input and expected data */
- benchmark_data(wg_func, input, expected);
+ benchmark_data(wg_func, input, expected, wg_global_size, wg_local_size);
/* prepare input for datatype */
- OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
- OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[0], 0, wg_global_size * sizeof(T), NULL);
+ OCL_CREATE_BUFFER(buf[1], 0, wg_global_size * sizeof(T), NULL);
OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop);
+ if(wg_func == WG_BROADCAST_1D ||
+ wg_func == WG_BROADCAST_2D)
+ {
+ cl_uint wg_local_x = WG_LOCAL_X;
+ cl_uint wg_local_y = WG_LOCAL_Y;
+ OCL_SET_ARG(3, sizeof(cl_uint), &wg_local_x);
+ OCL_SET_ARG(4, sizeof(cl_uint), &wg_local_y);
+ }
+
/* set input data for GPU */
OCL_MAP_BUFFER(0);
- memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T));
+ memcpy(buf_data[0], input, wg_global_size * sizeof(T));
OCL_UNMAP_BUFFER(0);
/* run the kernel on GPU */
- globals[0] = WG_GLOBAL_SIZE;
- locals[0] = WG_LOCAL_SIZE;
-
gettimeofday(&start,0);
- OCL_NDRANGE(1);
+
+ if(wg_func == WG_BROADCAST_1D)
+ {
+ globals[0] = WG_GLOBAL_SIZE_X;
+ locals[0] = WG_LOCAL_SIZE_X;
+ OCL_NDRANGE(1);
+ }
+ else if(wg_func == WG_BROADCAST_2D)
+ {
+ globals[0] = WG_GLOBAL_SIZE_X;
+ locals[0] = WG_LOCAL_SIZE_X;
+ globals[1] = WG_GLOBAL_SIZE_Y;
+ locals[1] = WG_LOCAL_SIZE_Y;
+ OCL_NDRANGE(2);
+ }
+ else
+ { /* reduce, scan inclulsive, scan exclusive */
+ globals[0] = WG_GLOBAL_SIZE;
+ locals[0] = WG_LOCAL_SIZE;
+ OCL_NDRANGE(1);
+ }
+
clFinish(queue);
gettimeofday(&stop,0);
elapsed = time_subtract(&stop, &start, 0);
@@ -148,18 +224,59 @@ static double benchmark_generic(WG_FUNCTION wg_func,
/* check if mistmatch, display execution time */
OCL_MAP_BUFFER(1);
uint32_t mistmatches = 0;
- for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++)
+ for (uint32_t i = 0; i < wg_global_size; i++)
if(((T *)buf_data[1])[i] != *(expected + i)){
- cout << "Err at " << i << ", " <<
- ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl;
+ /* uncomment bellow for DEBUG */
+ /* cout << "Err at " << i << ", " <<
+ ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl; */
mistmatches++;
}
cout << endl << endl << "Mistmatches " << mistmatches << endl;
cout << "Exec time " << elapsed << endl << endl;
OCL_UNMAP_BUFFER(1);
- return BANDWIDTH(WG_GLOBAL_SIZE * WG_LOOP_COUNT, elapsed);
+ return BANDWIDTH(sizeof(T) * wg_global_size * reduce_loop, elapsed);
+}
+
+/*
+ * Benchmark workgroup broadcast
+ */
+double benchmark_workgroup_broadcast_1D_int(void)
+{
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_broadcast_1D_int");
+ return benchmark_generic(WG_BROADCAST_1D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_1D_int, "GB/S");
+double benchmark_workgroup_broadcast_1D_long(void)
+{
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_broadcast_1D_long");
+ return benchmark_generic(WG_BROADCAST_1D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_1D_long, "GB/S");
+double benchmark_workgroup_broadcast_2D_int(void)
+{
+ cl_int *input = NULL;
+ cl_int *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_broadcast_2D_int");
+ return benchmark_generic(WG_BROADCAST_2D, input, expected);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_2D_int, "GB/S");
+double benchmark_workgroup_broadcast_2D_long(void)
+{
+ cl_long *input = NULL;
+ cl_long *expected = NULL;
+ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup",
+ "bench_workgroup_broadcast_2D_long");
+ return benchmark_generic(WG_BROADCAST_2D, input, expected);
}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_2D_long, "GB/S");
/*
* Benchmark workgroup reduce add
diff --git a/kernels/bench_workgroup.cl b/kernels/bench_workgroup.cl
index 8a525de0..87986fca 100644
--- a/kernels/bench_workgroup.cl
+++ b/kernels/bench_workgroup.cl
@@ -1,4 +1,103 @@
/*
+ * Benchmark broadcast 1D
+ */
+kernel void bench_workgroup_broadcast_1D_int(global int *src,
+ global int *dst,
+ int reduce_loop,
+ uint wg_local_x,
+ uint wg_local_y)
+{
+ uint offset = 0;
+ uint index = offset + get_global_id(0);
+
+ int val = src[index];
+ /* depending on generated ASM, volatile may be removed */
+ volatile int result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ result = work_group_broadcast(val,
+ wg_local_x);
+ }
+
+ dst[index] = result;
+}
+
+kernel void bench_workgroup_broadcast_1D_long(global long *src,
+ global long *dst,
+ int reduce_loop,
+ uint wg_local_x,
+ uint wg_local_y)
+{
+ uint offset = 0;
+ uint index = offset + get_global_id(0);
+
+ long val = src[index];
+ /* depending on generated ASM, volatile may be removed */
+ volatile long result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ result = work_group_broadcast(val,
+ wg_local_x);
+ }
+
+ dst[index] = result;
+}
+
+
+/*
+ * Benchmark broadcast 2D
+ */
+kernel void bench_workgroup_broadcast_2D_int(global int *src,
+ global int *dst,
+ int reduce_loop,
+ uint wg_local_x,
+ uint wg_local_y)
+{
+ uint lsize = get_local_size(0) * get_local_size(1);
+ uint offset = get_group_id(0) * lsize +
+ get_group_id(1) * get_num_groups(0) * lsize;
+ uint index = offset + get_local_id(0) +
+ get_local_id(1) * get_local_size(0);
+
+ int val = src[index];
+ /* depending on generated ASM, volatile may be removed */
+ int result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ result = work_group_broadcast(val,
+ wg_local_x,
+ wg_local_y);
+ }
+
+ dst[index] = result;
+}
+
+kernel void bench_workgroup_broadcast_2D_long(global long *src,
+ global long *dst,
+ int reduce_loop,
+ uint wg_local_x,
+ uint wg_local_y)
+{
+ uint lsize = get_local_size(0) * get_local_size(1);
+ uint offset = get_group_id(0) * lsize +
+ get_group_id(1) * get_num_groups(0) * lsize;
+ uint index = offset + get_local_id(0) +
+ get_local_id(1) * get_local_size(0);
+
+ long val = src[index];
+ /* depending on generated ASM, volatile may be removed */
+ long result;
+
+ for(; reduce_loop > 0; reduce_loop--){
+ result = work_group_broadcast(val,
+ wg_local_x,
+ wg_local_y);
+ }
+
+ dst[index] = result;
+}
+
+/*
* Benchmark workgroup reduce add
*/
kernel void bench_workgroup_reduce_add_int(