Optimizing Simple CUDA and OpenCL Kernels
Chris Szalwinski
December 5, 2013
Workshop on Reduction
Overview
• Reduction Algorithm
• Terminology – CUDA and OpenCL
• Test Problem – Dot Product
• Test Parameters
• Execution Configuration
• Divergence
• Register Storage
Reduction Algorithm
• A reduction collapses a set of values into a single result by applying a binary operation pairwise, halving the number of candidates on each pass
• Example: select the largest value
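A minimal host-side sketch of that tree reduction for the largest value, in plain C; the function name reduce_max and the power-of-two assumption are illustrative, not from the slides.

#include <stdio.h>

// Tree reduction on the host: each pass keeps, in element i, the larger
// of elements i and i + stride; after log2(n) passes the maximum is in
// element 0. Assumes n is a power of two.
float reduce_max(float* a, int n) {
    for (int stride = n >> 1; stride > 0; stride >>= 1)
        for (int i = 0; i < stride; i++)
            if (a[i + stride] > a[i])
                a[i] = a[i + stride];
    return a[0];
}

int main(void) {
    float a[8] = {3, 7, 1, 9, 4, 6, 2, 8};
    printf("max = %g\n", reduce_max(a, 8));   // prints max = 9
    return 0;
}

The kernels that follow apply the same pass structure, one block or workgroup at a time, with addition instead of the maximum.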
Terminology – Configuration

CUDA        OpenCL
Grid        Workspace
Blocks      Workgroups
Threads     Work Items
Terminology – Memory

CUDA        OpenCL
Global      Global
Shared      Local
Local       Private
Dot Product
• Vector a – n elements – a[0], a[1], …, a[n-1]
• Vector b – n elements – b[0], b[1], …, b[n-1]
• Dot product = a[0]*b[0] + a[1]*b[1] + … + a[n-1]*b[n-1]
• Two stages
  • Multiply matching elements
  • Sum their products
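On the host the two stages collapse into one loop. This sequential version (the name dot is illustrative) is the baseline the kernels below parallelize.

// Sequential reference: multiply matching elements and sum the products.
float dot(const float* a, const float* b, int n) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += a[i] * b[i];
    return sum;
}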
CUDA – Naive Dot Product

const int ntpb = 128;   // number of threads per block

// Shared Memory
//
__global__ void dot_D(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    // store product in shared memory
    if (gid < n)
        s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0;
    __syncthreads();
    // reduce shared memory entries
    for (int stride = 1; stride < blockDim.x; stride *= 2) {
        if (tid % (2 * stride) == 0)
            s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = s[0];
}
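The slides omit the host side. One plausible driver, assuming it sits in the same .cu file as dot_D and ntpb above, covers n elements with ceil(n/ntpb) blocks and finishes the reduction of the per-block partials on the host; dot_on_device is an illustrative name and error checking is omitted.

#include <cuda_runtime.h>
#include <stdlib.h>

float dot_on_device(const float* h_a, const float* h_b, int n) {
    int nblks = (n + ntpb - 1) / ntpb;          // blocks needed to cover n
    float *d_a, *d_b, *d_c;
    cudaMalloc((void**)&d_a, n * sizeof(float));
    cudaMalloc((void**)&d_b, n * sizeof(float));
    cudaMalloc((void**)&d_c, nblks * sizeof(float));
    cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, n * sizeof(float), cudaMemcpyHostToDevice);

    dot_D<<<nblks, ntpb>>>(d_a, d_b, d_c, n);   // execution configuration

    float* h_c = (float*)malloc(nblks * sizeof(float));
    cudaMemcpy(h_c, d_c, nblks * sizeof(float), cudaMemcpyDeviceToHost);
    float sum = 0.0f;
    for (int i = 0; i < nblks; i++)             // finish on the host
        sum += h_c[i];
    free(h_c);
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return sum;
}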
OpenCL – Naive Dot Product

#define NWUPWG 128   // number of work-items per workgroup

// Local Memory
//
__kernel void dot_D(__global const float* a, __global const float* b,
                    __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    // store product in local memory
    if (gid < n)
        s[i] = a[gid] * b[gid];
    else
        s[i] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries
    for (int stride = 1; stride < size; stride *= 2) {
        if (i % (2 * stride) == 0)
            s[i] += s[i + stride];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = s[0];
}
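A comparable host-side sketch for OpenCL, again not from the slides: it assumes the context, queue, and built kernel already exist, rounds the global size up to a multiple of the workgroup size (the kernel's gid < n guard handles the padding), and sums the per-group partials on the host. Error checking is omitted.

#include <CL/cl.h>
#include <stdlib.h>

#define NWUPWG 128                    // must match the kernel source

float dot_on_device(cl_context ctx, cl_command_queue q, cl_kernel k,
                    const float* h_a, const float* h_b, cl_int n) {
    size_t local  = NWUPWG;
    size_t global = ((n + NWUPWG - 1) / NWUPWG) * NWUPWG;   // round up
    size_t ngrps  = global / local;

    cl_mem d_a = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                n * sizeof(float), (void*)h_a, NULL);
    cl_mem d_b = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                n * sizeof(float), (void*)h_b, NULL);
    cl_mem d_c = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
                                ngrps * sizeof(float), NULL, NULL);
    clSetKernelArg(k, 0, sizeof(cl_mem), &d_a);
    clSetKernelArg(k, 1, sizeof(cl_mem), &d_b);
    clSetKernelArg(k, 2, sizeof(cl_mem), &d_c);
    clSetKernelArg(k, 3, sizeof(cl_int), &n);
    clEnqueueNDRangeKernel(q, k, 1, NULL, &global, &local, 0, NULL, NULL);

    float* h_c = (float*)malloc(ngrps * sizeof(float));
    clEnqueueReadBuffer(q, d_c, CL_TRUE, 0, ngrps * sizeof(float), h_c,
                        0, NULL, NULL);
    float sum = 0.0f;
    for (size_t i = 0; i < ngrps; i++)          // finish on the host
        sum += h_c[i];
    free(h_c);
    clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c);
    return sum;
}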
CUDA – Minimize Divergence

const int ntpb = 128;   // number of threads per block

// Shared Memory
//
__global__ void dot_DM(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    // store product in shared memory
    if (gid < n)
        s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0;
    __syncthreads();
    // reduce shared memory entries
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride)
            s[tid] += s[tid + stride];
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = s[0];
}
OpenCL – Minimize Divergence

#define NWUPWG 128   // number of work-items per workgroup

// Local Memory
//
__kernel void dot_DM(__global const float* a, __global const float* b,
                     __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    // store product in local memory
    if (gid < n)
        s[i] = a[gid] * b[gid];
    else
        s[i] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries
    for (int stride = size >> 1; stride > 0; stride >>= 1) {
        if (i < stride)
            s[i] += s[i + stride];
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (i == 0)
        c[wgrp] = s[0];
}
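Why this version diverges less: in the naive kernel the active threads are scattered (every 2*stride-th thread), so every 32-thread warp keeps at least one active lane for most passes; with tid < stride the active threads are contiguous, so whole warps retire early. This small host program (illustrative, not from the slides) counts, per pass, how many of a 128-thread block's four warps still hold an active thread under each scheme.

#include <stdio.h>

int main(void) {
    const int nthreads = 128, warp = 32;
    printf("pass  interleaved  sequential\n");
    int pass = 1;
    // interleaved: tid % (2*s1) == 0, s1 doubling
    // sequential:  tid < s2, s2 halving
    for (int s1 = 1, s2 = nthreads / 2; s1 < nthreads;
         s1 *= 2, s2 /= 2, pass++) {
        int w1 = 0, w2 = 0;                      // warps with active lanes
        for (int w = 0; w < nthreads / warp; w++) {
            int a1 = 0, a2 = 0;
            for (int t = w * warp; t < (w + 1) * warp; t++) {
                if (t % (2 * s1) == 0) a1 = 1;
                if (t < s2)            a2 = 1;
            }
            w1 += a1; w2 += a2;
        }
        printf("%4d  %11d  %10d\n", pass, w1, w2);
    }
    return 0;
}

Interleaved addressing keeps all four warps busy for five of the seven passes; sequential addressing drops to a single active warp after the first pass.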
CUDA – Register Accumulator

const int ntpb = 128;   // number of threads per block

// Shared Memory
//
__global__ void dot_DMR(const float* a, const float* b, float* c, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;
    __shared__ float s[ntpb];
    float x = 0;
    // store product in shared memory
    if (gid < n)
        x = s[tid] = a[gid] * b[gid];
    else
        s[tid] = 0;   // zero the padding so partner reads are defined
    __syncthreads();
    // reduce shared memory entries, accumulating in a register
    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
        if (tid < stride) {
            x += s[tid + stride];
            s[tid] = x;
        }
        __syncthreads();
    }
    if (tid == 0)
        c[blockIdx.x] = x;
}
OpenCL – Register Accumulator

#define NWUPWG 128   // number of work-items per workgroup

// Local Memory
//
__kernel void dot_DMR(__global const float* a, __global const float* b,
                      __global float* c, int n) {
    int gid  = get_global_id(0);
    int i    = get_local_id(0);
    int size = get_local_size(0);
    int wgrp = get_group_id(0);
    __local float s[NWUPWG];
    float x = 0;
    // store product in local memory
    if (gid < n)
        x = s[i] = a[gid] * b[gid];
    else
        s[i] = 0;   // zero the padding so partner reads are defined
    barrier(CLK_LOCAL_MEM_FENCE);
    // reduce local memory entries, accumulating in a register
    for (int stride = size >> 1; stride > 0; stride >>= 1) {
        if (i < stride) {
            x += s[i + stride];
            s[i] = x;
        }
        barrier(CLK_LOCAL_MEM_FENCE);   // make partner updates visible
    }
    if (i == 0)
        c[wgrp] = x;
}
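A throwaway harness, hypothetical and relying on the dot and dot_on_device sketches above, can check any of the kernels against the sequential reference; bit-exact agreement is not expected because the summation order differs.

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    const int n = 1 << 20;
    float* a = (float*)malloc(n * sizeof(float));
    float* b = (float*)malloc(n * sizeof(float));
    for (int i = 0; i < n; i++) {
        a[i] = (float)rand() / RAND_MAX;
        b[i] = (float)rand() / RAND_MAX;
    }
    float d = dot_on_device(a, b, n);   // CUDA driver sketched earlier
    float h = dot(a, b, n);             // sequential reference
    printf("device %.4f  host %.4f  diff %g\n", d, h, d - h);
    free(a); free(b);
    return 0;
}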