diff options
author | Grigore Lupescu <grigore.lupescu at intel.com> | 2016-04-11 17:41:30 +0300 |
---|---|---|
committer | Yang Rong <rong.r.yang@intel.com> | 2016-05-18 15:10:36 +0800 |
commit | 52628691322b9fd5bdba2ecfb9e6bfe17ae940f8 (patch) | |
tree | 16299017fb63c4e00fc9aede947d8d0d3162a9ff | |
parent | 22d7f7c22654b58d1daf1e10da26235fe4ece3b8 (diff) | |
download | beignet-52628691322b9fd5bdba2ecfb9e6bfe17ae940f8.tar.gz |
Benchmark: Add performance tests for workgroup broadcast
Added the following performance tests:
benchmark_workgroup_broadcast_1D_int
benchmark_workgroup_broadcast_1D_long
benchmark_workgroup_broadcast_2D_int
benchmark_workgroup_broadcast_2D_long
Signed-off-by: Grigore Lupescu <grigore.lupescu at intel.com>
Reviewed-by: Pan Xiuli <xiuli.pan@intel.com>
-rw-r--r-- | benchmark/benchmark_workgroup.cpp | 187 | ||||
-rw-r--r-- | kernels/bench_workgroup.cl | 99 |
2 files changed, 251 insertions, 35 deletions
diff --git a/benchmark/benchmark_workgroup.cpp b/benchmark/benchmark_workgroup.cpp index 4e4802e4..3f073bb4 100644 --- a/benchmark/benchmark_workgroup.cpp +++ b/benchmark/benchmark_workgroup.cpp @@ -9,13 +9,26 @@ using namespace std; -/* NDRANGE */ -#define WG_GLOBAL_SIZE (512 * 256) -#define WG_LOCAL_SIZE 128 -#define WG_LOOP_COUNT 10000 +/* work-group general settings */ +#define WG_GLOBAL_SIZE (512 * 256) +#define WG_LOCAL_SIZE 128 +#define WG_LOOP_COUNT 1000 + +/* work-group broadcast only */ +#define WG_GLOBAL_SIZE_X 1024 +#define WG_GLOBAL_SIZE_Y 1024 + +#define WG_LOCAL_SIZE_X 32 +#define WG_LOCAL_SIZE_Y 2 + +#define WG_LOCAL_X 5 +#define WG_LOCAL_Y 0 + enum WG_FUNCTION { + WG_BROADCAST_1D, + WG_BROADCAST_2D, WG_REDUCE_ADD, WG_REDUCE_MIN, WG_REDUCE_MAX, @@ -34,48 +47,62 @@ enum WG_FUNCTION template<class T> static void benchmark_expected(WG_FUNCTION wg_func, T* input, - T* expected) + T* expected, + uint32_t wg_global_size, + uint32_t wg_local_size) { - if(wg_func == WG_REDUCE_ADD) + if(wg_func == WG_BROADCAST_1D) + { + for(uint32_t i = 0; i < wg_local_size; i++) + expected[i] = input[WG_LOCAL_X]; + } + else if(wg_func == WG_BROADCAST_2D) + { + for(uint32_t i = 0; i < wg_local_size; i++) + expected[i] = + input[WG_LOCAL_X + + WG_LOCAL_Y * WG_LOCAL_SIZE_X]; + } + else if(wg_func == WG_REDUCE_ADD) { T wg_sum = input[0]; - for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++) + for(uint32_t i = 1; i < wg_local_size; i++) wg_sum += input[i]; - for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++) + for(uint32_t i = 0; i < wg_local_size; i++) expected[i] = wg_sum; } else if(wg_func == WG_REDUCE_MAX) { T wg_max = input[0]; - for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++) + for(uint32_t i = 1; i < wg_local_size; i++) wg_max = max(input[i], wg_max); - for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++) + for(uint32_t i = 0; i < wg_local_size; i++) expected[i] = wg_max; } else if(wg_func == WG_REDUCE_MIN) { T wg_min = input[0]; - for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++) + for(uint32_t i = 1; i < wg_local_size; i++) wg_min = min(input[i], wg_min); - for(uint32_t i = 0; i < WG_LOCAL_SIZE; i++) + for(uint32_t i = 0; i < wg_local_size; i++) expected[i] = wg_min; } else if(wg_func == WG_SCAN_INCLUSIVE_ADD) { expected[0] = input[0]; - for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++) + for(uint32_t i = 1; i < wg_local_size; i++) expected[i] = input[i] + expected[i - 1]; } else if(wg_func == WG_SCAN_INCLUSIVE_MAX) { expected[0] = input[0]; - for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++) + for(uint32_t i = 1; i < wg_local_size; i++) expected[i] = max(input[i], expected[i - 1]); } else if(wg_func == WG_SCAN_INCLUSIVE_MIN) { expected[0] = input[0]; - for(uint32_t i = 1; i < WG_LOCAL_SIZE; i++) + for(uint32_t i = 1; i < wg_local_size; i++) expected[i] = min(input[i], expected[i - 1]); } } @@ -87,23 +114,42 @@ static void benchmark_expected(WG_FUNCTION wg_func, template<class T> static void benchmark_data(WG_FUNCTION wg_func, T* &input, - T* &expected) + T* &expected, + uint32_t &wg_global_size, + uint32_t &wg_local_size) { - input = new T[WG_GLOBAL_SIZE]; - expected = new T[WG_GLOBAL_SIZE]; + if(wg_func == WG_BROADCAST_1D) + { + wg_global_size = WG_GLOBAL_SIZE_X; + wg_local_size = WG_LOCAL_SIZE_X; + } + else if(wg_func == WG_BROADCAST_2D) + { + wg_global_size = WG_GLOBAL_SIZE_X * WG_GLOBAL_SIZE_Y; + wg_local_size = WG_LOCAL_SIZE_X * WG_LOCAL_SIZE_Y; + } + else + { + wg_global_size = WG_GLOBAL_SIZE; + wg_local_size = WG_LOCAL_SIZE; + } + + input = new T[wg_global_size]; + expected = new T[wg_global_size]; /* seed for random inputs */ srand (time(NULL)); /* generate inputs and expected values */ - for(uint32_t gid = 0; gid < WG_GLOBAL_SIZE; gid += WG_LOCAL_SIZE) + for(uint32_t gid = 0; gid < wg_global_size; gid += wg_local_size) { /* input values */ - for(uint32_t lid = 0; lid < WG_LOCAL_SIZE; lid++) - input[gid + lid] = (rand() % 112) / 3.1415f; + for(uint32_t lid = 0; lid < wg_local_size; lid++) + input[gid + lid] = (rand() % 512) / 3.1415f; /* expected values */ - benchmark_expected(wg_func, input + gid, expected + gid); + benchmark_expected(wg_func, input + gid, expected + gid, + wg_global_size, wg_local_size); } } @@ -117,30 +163,60 @@ static double benchmark_generic(WG_FUNCTION wg_func, T* expected) { double elapsed = 0; - const uint32_t reduce_loop = 10000; + const uint32_t reduce_loop = WG_LOOP_COUNT; struct timeval start,stop; + uint32_t wg_global_size = 0; + uint32_t wg_local_size = 0; + /* input and expected data */ - benchmark_data(wg_func, input, expected); + benchmark_data(wg_func, input, expected, wg_global_size, wg_local_size); /* prepare input for datatype */ - OCL_CREATE_BUFFER(buf[0], 0, WG_GLOBAL_SIZE * sizeof(T), NULL); - OCL_CREATE_BUFFER(buf[1], 0, WG_GLOBAL_SIZE * sizeof(T), NULL); + OCL_CREATE_BUFFER(buf[0], 0, wg_global_size * sizeof(T), NULL); + OCL_CREATE_BUFFER(buf[1], 0, wg_global_size * sizeof(T), NULL); OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop); + if(wg_func == WG_BROADCAST_1D || + wg_func == WG_BROADCAST_2D) + { + cl_uint wg_local_x = WG_LOCAL_X; + cl_uint wg_local_y = WG_LOCAL_Y; + OCL_SET_ARG(3, sizeof(cl_uint), &wg_local_x); + OCL_SET_ARG(4, sizeof(cl_uint), &wg_local_y); + } + /* set input data for GPU */ OCL_MAP_BUFFER(0); - memcpy(buf_data[0], input, WG_GLOBAL_SIZE * sizeof(T)); + memcpy(buf_data[0], input, wg_global_size * sizeof(T)); OCL_UNMAP_BUFFER(0); /* run the kernel on GPU */ - globals[0] = WG_GLOBAL_SIZE; - locals[0] = WG_LOCAL_SIZE; - gettimeofday(&start,0); - OCL_NDRANGE(1); + + if(wg_func == WG_BROADCAST_1D) + { + globals[0] = WG_GLOBAL_SIZE_X; + locals[0] = WG_LOCAL_SIZE_X; + OCL_NDRANGE(1); + } + else if(wg_func == WG_BROADCAST_2D) + { + globals[0] = WG_GLOBAL_SIZE_X; + locals[0] = WG_LOCAL_SIZE_X; + globals[1] = WG_GLOBAL_SIZE_Y; + locals[1] = WG_LOCAL_SIZE_Y; + OCL_NDRANGE(2); + } + else + { /* reduce, scan inclulsive, scan exclusive */ + globals[0] = WG_GLOBAL_SIZE; + locals[0] = WG_LOCAL_SIZE; + OCL_NDRANGE(1); + } + clFinish(queue); gettimeofday(&stop,0); elapsed = time_subtract(&stop, &start, 0); @@ -148,18 +224,59 @@ static double benchmark_generic(WG_FUNCTION wg_func, /* check if mistmatch, display execution time */ OCL_MAP_BUFFER(1); uint32_t mistmatches = 0; - for (uint32_t i = 0; i < WG_GLOBAL_SIZE; i++) + for (uint32_t i = 0; i < wg_global_size; i++) if(((T *)buf_data[1])[i] != *(expected + i)){ - cout << "Err at " << i << ", " << - ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl; + /* uncomment bellow for DEBUG */ + /* cout << "Err at " << i << ", " << + ((T *)buf_data[1])[i] << " != " << *(expected + i) << endl; */ mistmatches++; } cout << endl << endl << "Mistmatches " << mistmatches << endl; cout << "Exec time " << elapsed << endl << endl; OCL_UNMAP_BUFFER(1); - return BANDWIDTH(WG_GLOBAL_SIZE * WG_LOOP_COUNT, elapsed); + return BANDWIDTH(sizeof(T) * wg_global_size * reduce_loop, elapsed); +} + +/* + * Benchmark workgroup broadcast + */ +double benchmark_workgroup_broadcast_1D_int(void) +{ + cl_int *input = NULL; + cl_int *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup", + "bench_workgroup_broadcast_1D_int"); + return benchmark_generic(WG_BROADCAST_1D, input, expected); +} +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_1D_int, "GB/S"); +double benchmark_workgroup_broadcast_1D_long(void) +{ + cl_long *input = NULL; + cl_long *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup", + "bench_workgroup_broadcast_1D_long"); + return benchmark_generic(WG_BROADCAST_1D, input, expected); +} +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_1D_long, "GB/S"); +double benchmark_workgroup_broadcast_2D_int(void) +{ + cl_int *input = NULL; + cl_int *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup", + "bench_workgroup_broadcast_2D_int"); + return benchmark_generic(WG_BROADCAST_2D, input, expected); +} +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_2D_int, "GB/S"); +double benchmark_workgroup_broadcast_2D_long(void) +{ + cl_long *input = NULL; + cl_long *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup", + "bench_workgroup_broadcast_2D_long"); + return benchmark_generic(WG_BROADCAST_2D, input, expected); } +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_broadcast_2D_long, "GB/S"); /* * Benchmark workgroup reduce add diff --git a/kernels/bench_workgroup.cl b/kernels/bench_workgroup.cl index 8a525de0..87986fca 100644 --- a/kernels/bench_workgroup.cl +++ b/kernels/bench_workgroup.cl @@ -1,4 +1,103 @@ /* + * Benchmark broadcast 1D + */ +kernel void bench_workgroup_broadcast_1D_int(global int *src, + global int *dst, + int reduce_loop, + uint wg_local_x, + uint wg_local_y) +{ + uint offset = 0; + uint index = offset + get_global_id(0); + + int val = src[index]; + /* depending on generated ASM, volatile may be removed */ + volatile int result; + + for(; reduce_loop > 0; reduce_loop--){ + result = work_group_broadcast(val, + wg_local_x); + } + + dst[index] = result; +} + +kernel void bench_workgroup_broadcast_1D_long(global long *src, + global long *dst, + int reduce_loop, + uint wg_local_x, + uint wg_local_y) +{ + uint offset = 0; + uint index = offset + get_global_id(0); + + long val = src[index]; + /* depending on generated ASM, volatile may be removed */ + volatile long result; + + for(; reduce_loop > 0; reduce_loop--){ + result = work_group_broadcast(val, + wg_local_x); + } + + dst[index] = result; +} + + +/* + * Benchmark broadcast 2D + */ +kernel void bench_workgroup_broadcast_2D_int(global int *src, + global int *dst, + int reduce_loop, + uint wg_local_x, + uint wg_local_y) +{ + uint lsize = get_local_size(0) * get_local_size(1); + uint offset = get_group_id(0) * lsize + + get_group_id(1) * get_num_groups(0) * lsize; + uint index = offset + get_local_id(0) + + get_local_id(1) * get_local_size(0); + + int val = src[index]; + /* depending on generated ASM, volatile may be removed */ + int result; + + for(; reduce_loop > 0; reduce_loop--){ + result = work_group_broadcast(val, + wg_local_x, + wg_local_y); + } + + dst[index] = result; +} + +kernel void bench_workgroup_broadcast_2D_long(global long *src, + global long *dst, + int reduce_loop, + uint wg_local_x, + uint wg_local_y) +{ + uint lsize = get_local_size(0) * get_local_size(1); + uint offset = get_group_id(0) * lsize + + get_group_id(1) * get_num_groups(0) * lsize; + uint index = offset + get_local_id(0) + + get_local_id(1) * get_local_size(0); + + long val = src[index]; + /* depending on generated ASM, volatile may be removed */ + long result; + + for(; reduce_loop > 0; reduce_loop--){ + result = work_group_broadcast(val, + wg_local_x, + wg_local_y); + } + + dst[index] = result; +} + +/* * Benchmark workgroup reduce add */ kernel void bench_workgroup_reduce_add_int( |