1 / 9

GPGPU Labor 15.

GPGPU Labor 15. Párhuzamos primitívek. Map Reduce Scan Histogram Compact. Map. // TODO // // ID := threadIdx.x + blockIdx.x * blockDim.x // IF ID >= dataSize THEN return // data[ID] := square(data[ID]) __global__ void mapKernel(int* data, unsigned int dataSize) {

avedis
Download Presentation

GPGPU Labor 15.

An Image/Link below is provided (as is) to download presentation Download Policy: Content on the Website is provided to you AS IS for your information and personal use and may not be sold / licensed / shared on other websites without getting consent from its author. Content is provided to you AS IS for your information and personal use only. Download presentation by click this link. While downloading, if for some reason you are not able to download a presentation, the publisher may have deleted the file from their server. During download, if you can't get a presentation, the file might be deleted by the publisher.

E N D

Presentation Transcript


  1. GPGPU Labor 15.

  2. Párhuzamos primitívek • Map • Reduce • Scan • Histogram • Compact

  3. Map // TODO // // ID := threadIdx.x + blockIdx.x * blockDim.x // IF ID >= dataSize THEN return // data[ID] := square(data[ID]) __global__ void mapKernel(int* data, unsigned int dataSize) { // TODO }

  4. Reduce // TODO // // FOR s = dataSize / 2 ; s > 0 ; s >>= 1 DO: // IF (ID < s) // data[ID] = max(data[ID], data[ID + s]) // SYNCHRONIZE THREADS // __global__ void reduceKernel(float* data, int dataSize) { int id = threadIdx.x + blockIdx.x * blockDim.x; // TODO }

  5. Histogram // TODO // // histogram[data[id]] := histogram[data[id]] + 1 // SYNCHRONIZATION! (atomicAdd) __global__ void histogramGlobalKernel(int* data, int* histogram, int dataSize) { // TODO }

  6. Histogram (shared) // TODO // // IF LID < histogramSize DO: // lhistogram[LID] := 0 // SYNCHRONIZE THREADS // // Add data to local histogram // // SYNCHRONIZE THREADS // // IF LID < histogramSize DO: // histogram[LID] = lhistogram[LID] __global__ void histogramLocalKernel(int* data, int* histogram, int dataSize, int histogramSize) { extern __shared__ int lhistogram[]; int id = threadIdx.x + blockIdx.x * blockDim.x; int lid = threadIdx.x; // TODO }

  7. Scan (exclusive) // TODO // // IF ID > 0 THEN data[ID] = data[ID - 1] // ELSE data[ID] = 0 // SYNCHRONIZE THREADS // // FOR s = 1; s < dataSize; s *= 2 DO: // tmp := data[ID] // IF (ID + s < dataSize) THEN // data[ID + s] += tmp; // SYNCHRONIZE THREADS // // IF (ID = 0) THEN data[ID] = 0; __global__ void exscanKernel(int* data, int dataSize) { int id = threadIdx.x + blockIdx.x * blockDim.x; // TODO }

  8. Scan (exclusive) • Módosítsuk a scan-t, hogy tetszőleges méretű adathalmazon működjön

  9. Profiling • Nézzük meg az Nsight Performance Analysis segítségével a histogram kerneleket! • Nézzük meg a Visual Profilerrel is! • Milyen módon optimalizálható a histogram algoritmus?

More Related