
Workshop on Reduction


Presentation Transcript


  1. Optimizing Simple CUDA and OpenCL Kernels • Chris Szalwinski • December 5, 2013 • Workshop on Reduction

  2. Overview • Reduction Algorithm • Terminology – CUDA and OpenCL • Test Problem – Dot Product • Test Parameters • Execution Configuration • Divergence • Register Storage

  3. Reduction Algorithm • Select the Largest Value

  4. Minimum – Naive Reduction
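Sequentially, a reduction is a single loop that folds n values into one with an associative operation, here minimum; the parallel versions below compute the same result in roughly log2(n) rounds by combining pairs of survivors at each level. A minimal host-side reference sketch (the function name and types are my own choices, not from the slides):

     // Sequential reference: fold n values into their minimum in n - 1 steps.
     float reduceMin(const float* a, int n) {
         float m = a[0];                // assumes n >= 1
         for (int i = 1; i < n; i++)
             if (a[i] < m)
                 m = a[i];
         return m;
     }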

  5. Terminology – Configuration • CUDA: Grid, Blocks, Threads • OpenCL: Workspace, Workgroups, Work Items

  6. Terminology – Memory • CUDA: Global, Shared, Local • OpenCL: Global, Local, Private
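The two vocabularies name the same things. As a cheat sheet, here is a hypothetical CUDA kernel with the OpenCL equivalent of each construct in the comments (my own illustration; note the naming trap: CUDA shared memory corresponds to OpenCL local memory, while CUDA local memory corresponds to OpenCL private memory):

     // CUDA construct                      // OpenCL equivalent
     __global__ void terms(float* out) {    // __kernel void terms(__global float* out)
         int tid  = threadIdx.x;            // get_local_id(0)
         int bid  = blockIdx.x;             // get_group_id(0)
         int bdim = blockDim.x;             // get_local_size(0)
         int gid  = bid * bdim + tid;       // get_global_id(0)
         __shared__ float s[128];           // __local float s[128];
         float x = 1.0f;                    // private variable (held in a register)
         s[tid] = x;                        // assumes at most 128 threads per block
         __syncthreads();                   // barrier(CLK_LOCAL_MEM_FENCE)
         out[gid] = s[tid];                 // out must hold one float per thread
     }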

  7. Dot Product • Vector a – n elements – a[0] a[1] … a[n-1] • Vector b – n elements – b[0] b[1] … b[n-1] • Dot Product = a[0]*b[0] + a[1]*b[1] + … + a[n-1]*b[n-1] • 2 stages • multiply matching elements • sum their products
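Sequentially, the two stages fuse into a single loop. A reference implementation to check the kernel results against (a sketch of my own, not from the slides):

     // Sequential reference: multiply matching elements and sum the products.
     float dotHost(const float* a, const float* b, int n) {
         float sum = 0.0f;
         for (int i = 0; i < n; i++)
             sum += a[i] * b[i];
         return sum;
     }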

  8. CUDA – Naive Dot Product

     const int ntpb = 128;   // number of threads per block

     // Shared Memory
     __global__ void dot_D(const float* a, const float* b, float* c, int n) {
         int gid = blockIdx.x * blockDim.x + threadIdx.x;
         int tid = threadIdx.x;
         __shared__ float s[ntpb];
         // store product in shared memory
         if (gid < n)
             s[tid] = a[gid] * b[gid];
         else
             s[tid] = 0;
         __syncthreads();
         // reduce shared memory entries
         for (int stride = 1; stride < blockDim.x; stride *= 2) {
             if (tid % (2 * stride) == 0)
                 s[tid] += s[tid + stride];
             __syncthreads();
         }
         if (tid == 0)
             c[blockIdx.x] = s[0];
     }
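The kernel leaves one partial sum per block in c, so the host finishes the reduction. A minimal launch sketch, assuming d_a, d_b and d_c are device buffers that have already been allocated (and, for a and b, filled); error checking is omitted and the final host-side loop is my addition, not part of the slides:

     // Host side: launch one thread per element, then sum the per-block partials.
     int nblks = (n + ntpb - 1) / ntpb;           // enough blocks to cover n elements
     dot_D<<<nblks, ntpb>>>(d_a, d_b, d_c, n);    // execution configuration: grid x block
     float* h_c = (float*)malloc(nblks * sizeof(float));
     cudaMemcpy(h_c, d_c, nblks * sizeof(float), cudaMemcpyDeviceToHost);
     float sum = 0.0f;
     for (int i = 0; i < nblks; i++)              // reduce the nblks partial sums
         sum += h_c[i];
     free(h_c);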

  9. OpenCL – Naive Dot Product

     #define NWUPWG 128   // number of work-items per workgroup

     // Local Memory
     __kernel void dot_D(__global const float* a, __global const float* b,
                         __global float* c, int n) {
         int gid = get_global_id(0);
         int i = get_local_id(0);
         int size = get_local_size(0);
         int wgrp = get_group_id(0);
         __local float s[NWUPWG];
         // store product in local memory
         if (gid < n)
             s[i] = a[gid] * b[gid];
         else
             s[i] = 0;
         barrier(CLK_LOCAL_MEM_FENCE);
         // reduce local memory entries
         for (int stride = 1; stride < size; stride *= 2) {
             if (i % (2 * stride) == 0)
                 s[i] += s[i + stride];
             barrier(CLK_LOCAL_MEM_FENCE);
         }
         if (i == 0)
             c[wgrp] = s[0];
     }

  10. Minimize Divergence
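What the revision on the next two slides changes is where the active threads sit. In the naive loop, the test tid % (2 * stride) == 0 keeps threads 0, 2, 4, … active at stride 1, so active and idle lanes are interleaved within every 32-thread warp and every warp executes both sides of the branch. With tid < stride the active threads form one contiguous block, so most warps are either fully active or fully idle, and divergence appears only once stride drops below the warp size. The two loops side by side (fragments of the kernels on slides 8 and 11, not a standalone program):

     // Naive: at stride 1 the active threads are 0, 2, 4, ..., scattered
     // through every warp, so every warp pays for both branch paths.
     for (int stride = 1; stride < blockDim.x; stride *= 2) {
         if (tid % (2 * stride) == 0)
             s[tid] += s[tid + stride];
         __syncthreads();
     }

     // Revised: the active threads are the contiguous range 0 .. stride - 1,
     // so a warp is fully active or fully idle until stride < 32 (warp size).
     for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
         if (tid < stride)
             s[tid] += s[tid + stride];
         __syncthreads();
     }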

  11. CUDA – Minimize Divergence

     const int ntpb = 128;   // number of threads per block

     // Shared Memory
     __global__ void dot_DM(const float* a, const float* b, float* c, int n) {
         int gid = blockIdx.x * blockDim.x + threadIdx.x;
         int tid = threadIdx.x;
         __shared__ float s[ntpb];
         // store product in shared memory
         if (gid < n)
             s[tid] = a[gid] * b[gid];
         else
             s[tid] = 0;
         __syncthreads();
         // reduce shared memory entries
         for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
             if (tid < stride)
                 s[tid] += s[tid + stride];
             __syncthreads();
         }
         if (tid == 0)
             c[blockIdx.x] = s[0];
     }

  12. OpenCL – Minimize Divergence

     #define NWUPWG 128   // number of work-items per workgroup

     // Local Memory
     __kernel void dot_DM(__global const float* a, __global const float* b,
                          __global float* c, int n) {
         int gid = get_global_id(0);
         int i = get_local_id(0);
         int size = get_local_size(0);
         int wgrp = get_group_id(0);
         __local float s[NWUPWG];
         // store product in local memory
         if (gid < n)
             s[i] = a[gid] * b[gid];
         else
             s[i] = 0;
         barrier(CLK_LOCAL_MEM_FENCE);
         // reduce local memory entries
         for (int stride = size >> 1; stride > 0; stride >>= 1) {
             if (i < stride)
                 s[i] += s[i + stride];
             barrier(CLK_LOCAL_MEM_FENCE);
         }
         if (i == 0)
             c[wgrp] = s[0];
     }

  13. CUDA – Register Accumulator

     const int ntpb = 128;   // number of threads per block

     // Shared Memory
     __global__ void dot_DMR(const float* a, const float* b, float* c, int n) {
         int gid = blockIdx.x * blockDim.x + threadIdx.x;
         int tid = threadIdx.x;
         __shared__ float s[ntpb];
         float x = 0;
         // store product in shared memory and in a register
         if (gid < n)
             x = s[tid] = a[gid] * b[gid];
         else
             s[tid] = 0;   // zero the padding so the reduction reads defined values
         __syncthreads();
         // reduce shared memory entries, accumulating in the register
         for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
             if (tid < stride) {
                 x += s[tid + stride];
                 s[tid] = x;
             }
             __syncthreads();
         }
         if (tid == 0)
             c[blockIdx.x] = x;
     }

  14. OpenCL – Register Accumulator

     #define NWUPWG 128   // number of work-items per workgroup

     // Local Memory
     __kernel void dot_DMR(__global const float* a, __global const float* b,
                           __global float* c, int n) {
         int gid = get_global_id(0);
         int i = get_local_id(0);
         int size = get_local_size(0);
         int wgrp = get_group_id(0);
         __local float s[NWUPWG];
         float x = 0;
         // store product in local memory and in a private variable
         if (gid < n)
             x = s[i] = a[gid] * b[gid];
         else
             s[i] = 0;   // zero the padding so the reduction reads defined values
         barrier(CLK_LOCAL_MEM_FENCE);
         // reduce local memory entries, accumulating privately
         for (int stride = size >> 1; stride > 0; stride >>= 1) {
             if (i < stride) {
                 x += s[i + stride];
                 s[i] = x;
             }
             barrier(CLK_LOCAL_MEM_FENCE);
         }
         if (i == 0)
             c[wgrp] = x;
     }

  15. CUDA Results

  16. OpenCL Results
