
Programming of multiple GPUs with CUDA and Qt library

Explore the world of programming multiple GPUs using CUDA and Qt library, learning about context establishment, data configuration, computation, and comparing GPU & CPU results.




1. Lecture: Programming of multiple GPUs with CUDA and Qt library
   Alexey Abramov (abramov _at_ physik3.gwdg.de)
   Georg-August University, Bernstein Center for Computational Neuroscience,
   III. Physikalisches Institut, Göttingen, Germany

2. Multi-GPU programming
   A host system can have multiple devices. Several host threads can execute device code on the same device, but by design, a host thread can execute device code on only one device at any given time. As a consequence, multiple host threads are required to execute device code on multiple devices.
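
   As an aside, the number of devices in the host system can be queried through the runtime API. A minimal standalone sketch (not part of the lecture's example):

       // Standalone sketch: enumerate the CUDA-capable devices in a host system
       #include <stdio.h>
       #include <cuda_runtime_api.h>

       int main(void)
       {
           int deviceCount = 0;
           cudaGetDeviceCount(&deviceCount);
           for(int i = 0; i < deviceCount; i++){
               cudaDeviceProp prop;
               cudaGetDeviceProperties(&prop, i);
               printf("Device %d: %s\n", i, prop.name);
           }
           return 0;
       }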

3. Multi-GPU programming
   In order to issue work to a GPU, a context is established between a CPU thread and the GPU. Only one context can be active on a GPU at a time.
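
   With the runtime API used throughout these slides, the context is created implicitly by the first CUDA call after cudaSetDevice(). The driver API makes this step explicit; a minimal sketch of explicit context establishment (driver API, shown for illustration only; it is not used elsewhere in this lecture):

       // Driver-API sketch: explicitly establish a context between the
       // calling CPU thread and one GPU (illustration only)
       #include <cuda.h>

       void establishContext(int deviceId)
       {
           CUdevice  dev;
           CUcontext ctx;
           cuInit(0);                    // initialize the driver API
           cuDeviceGet(&dev, deviceId);  // get a handle for the chosen GPU
           cuCtxCreate(&ctx, 0, dev);    // create a context and make it current
           // ... issue work to the GPU here ...
           cuCtxDestroy(ctx);            // tear the context down again
       }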

4. Multi-GPU programming
   Even though a GPU can execute calls from only one context at a time, it can belong to multiple contexts. For example, it is possible for several CPU threads to establish contexts with the same GPU, as in the sketch below.
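
   A hedged sketch of that situation, using the cutil threading helpers that appear later in these slides: two host threads bind to the same device and thereby each establish their own context on it.

       // Sketch: two host threads establish contexts on the same GPU
       static CUT_THREADPROC sharedDeviceThread(void *arg)
       {
           cudaSetDevice(0);  // both threads bind to device 0
           // ... each thread now owns a separate context on the same GPU ...
           CUT_THREADEND;
       }

       void twoThreadsOneGPU(void)
       {
           CUTThread t[2];
           t[0] = cutStartThread((CUT_THREADROUTINE)sharedDeviceThread, NULL);
           t[1] = cutStartThread((CUT_THREADROUTINE)sharedDeviceThread, NULL);
           cutWaitForThreads(t, 2);
       }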

5. Multi-GPU programming
   A host thread can execute device code on only one device at any given time. (Driving multiple devices from a single host thread will become possible in CUDA 4.0.)
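
   For comparison, once a single host thread may drive several devices (as announced for CUDA 4.0), the multi-threaded pattern of the following slides could collapse into a plain loop. A hypothetical sketch; the kernel is a placeholder, not part of the lecture's example:

       // Hypothetical single-thread multi-GPU loop in the style announced
       // for CUDA 4.0 (placeholder kernel, illustration only)
       __global__ void fillKernel(float *d_Out, float value)
       {
           d_Out[threadIdx.x] = value;  // trivial placeholder work
       }

       void singleThreadMultiGPU(int deviceCount)
       {
           for(int i = 0; i < deviceCount; i++){
               float *d_Out;
               cudaSetDevice(i);  // switch the active device, same host thread
               cudaMalloc((void **)&d_Out, 256 * sizeof(float));
               fillKernel<<<1, 256>>>(d_Out, (float)i);  // launch on device i
               cudaFree(d_Out);   // implicitly synchronizes with device i
           }
       }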

6. #include <stdlib.h>
   #include <stdio.h>
   #include <math.h>
   #include <multithreading.h>
   #include <cutil_inline.h>
   #include <cuda_runtime_api.h>
   #include "simpleMultiGPU.h"

   typedef struct {
       // Device id
       int device;
       // Host-side input data
       int dataN;
       float *h_Data;
       // Partial sum for this GPU
       float *h_Sum;
   } TGPUplan;

7. // Data configuration
   const int MAX_GPU_COUNT = 32;
   const int DATA_N        = 1048576 * 32;

   int main(int argc, char **argv)
   {
       // Solver config
       TGPUplan plan[MAX_GPU_COUNT];
       // GPU reduction results
       float h_SumGPU[MAX_GPU_COUNT];
       bzero(h_SumGPU, MAX_GPU_COUNT * sizeof(float));
       // OS thread ID
       CUTThread threadID[MAX_GPU_COUNT];
       // create a timer to measure runtime
       unsigned int hTimer;
       cutCreateTimer(&hTimer);

8. // get number of available CUDA-capable devices
   int deviceCount = 0;
   cudaGetDeviceCount(&deviceCount);
   if(deviceCount > MAX_GPU_COUNT)
       deviceCount = MAX_GPU_COUNT;
   printf("CUDA-capable device count: %i\n", deviceCount);

   printf("Generating input data...\n\n");
   float *h_Data = (float *)malloc(DATA_N * sizeof(float));
   for(int i = 0; i < DATA_N; i++)
       h_Data[i] = (float)rand() / (float)RAND_MAX;

   // subdivide input data across GPUs:
   // get data sizes for each GPU
   for(int i = 0; i < deviceCount; i++)
       plan[i].dataN = DATA_N / deviceCount;

9. // take into account "odd" data sizes
   for(int i = 0; i < DATA_N % deviceCount; i++)
       plan[i].dataN++;

   // assign data ranges to GPUs
   int gpuBase = 0;
   for(int i = 0; i < deviceCount; i++){
       plan[i].device = i;
       plan[i].h_Data = h_Data + gpuBase;
       plan[i].h_Sum  = h_SumGPU + i;
       gpuBase += plan[i].dataN;
   }

   // start timing and compute on GPU(s)
   printf("Computing with %d GPUs...\n", deviceCount);
   cutResetTimer(hTimer);
   cutStartTimer(hTimer);

10. // create deviceCount threads
    for(int i = 0; i < deviceCount; i++)
        threadID[i] = cutStartThread((CUT_THREADROUTINE)solverThread, (void *)(plan + i));
    cutWaitForThreads(threadID, deviceCount);

    float sumGPU = 0;
    // get the final sum
    for(int i = 0; i < deviceCount; i++)
        sumGPU += h_SumGPU[i];
    cutStopTimer(hTimer);
    printf("GPU Processing time: %f (ms)\n\n", cutGetTimerValue(hTimer));

11. // compute on Host CPU
    printf("Computing with Host CPU...\n\n");
    double sumCPU = 0;
    for(int i = 0; i < DATA_N; i++)
        sumCPU += h_Data[i];

    // compare GPU and CPU results
    printf("Comparing GPU and Host CPU results...\n");
    double diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
    printf("  GPU sum: %f\n  CPU sum: %f\n", sumGPU, sumCPU);
    printf("  Relative difference: %E \n\n", diff);
    printf((diff < 1e-5) ? "PASSED\n\n" : "FAILED\n\n");

    // cleanup and shutdown
    printf("Shutting down...\n");
    cutDeleteTimer(hTimer);
    free(h_Data);
    cudaThreadExit();

12. static CUT_THREADPROC solverThread(TGPUplan *plan)
    {
        const int BLOCK_N  = 32;
        const int THREAD_N = 256;
        const int ACCUM_N  = BLOCK_N * THREAD_N;
        float *d_Data, *d_Sum;
        float *h_Sum;
        float sum;
        int i;

        // set device
        cudaSetDevice(plan->device);

        // allocate memory
        cudaMalloc((void **)&d_Data, plan->dataN * sizeof(float));
        cudaMalloc((void **)&d_Sum, ACCUM_N * sizeof(float));
        h_Sum = (float *)malloc(ACCUM_N * sizeof(float));

13. // copy input data from CPU
    cudaMemcpy(d_Data, plan->h_Data, plan->dataN * sizeof(float), cudaMemcpyHostToDevice);

    // perform GPU computations
    launch_reduceKernel(d_Sum, d_Data, plan->dataN, BLOCK_N, THREAD_N);

    // read back GPU results
    cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost);

    sum = 0;
    for(i = 0; i < ACCUM_N; i++)
        sum += h_Sum[i];
    *(plan->h_Sum) = (float)sum;

    // shut down this GPU
    free(h_Sum);
    cudaFree(d_Sum);
    cudaFree(d_Data);
    CUT_THREADEND;
    }

14. void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N)
    {
        reduceKernel<<<BLOCK_N, THREAD_N>>>(d_Result, d_Input, N);
        cudaThreadSynchronize();
    }

    __global__ static void reduceKernel(float *d_Result, float *d_Input, int N)
    {
        const int tid     = blockIdx.x * blockDim.x + threadIdx.x;
        const int threadN = gridDim.x * blockDim.x;
        float sum = 0;

        for(int pos = tid; pos < N; pos += threadN)
            sum += d_Input[pos];
        d_Result[tid] = sum;
    }
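
    Note how the work is split: each of the ACCUM_N = 32 * 256 = 8192 threads walks the input with a stride of 8192 (a grid-stride loop) and writes one partial sum; the host then adds up the 8192 partials. A sequential CPU sketch of the same two-stage reduction (hypothetical helper, for illustration only):

        // CPU sketch of the two-stage reduction computed by reduceKernel
        // plus the host-side loop (hypothetical helper, illustration only)
        float twoStageSum(const float *data, int N, int accumN)
        {
            float total = 0;
            for(int tid = 0; tid < accumN; tid++){          // one pass per "thread"
                float partial = 0;
                for(int pos = tid; pos < N; pos += accumN)  // grid-stride walk
                    partial += data[pos];
                total += partial;                           // host-side final sum
            }
            return total;
        }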

15. QThread class for multi-GPU programming
    The QThread class provides platform-independent threads.

    #include <QThread>  // full definition needed to derive from QThread

    // class for a Qt thread with a GPU context
    class CDeviceThread : public QThread
    {
    private:
        TGPUplan *m_pPlan;
    protected:
        void run();
    public:
        CDeviceThread(){};
        ~CDeviceThread(){};
        void Init(TGPUplan *plan){ m_pPlan = plan; };
    };

16. int main(int argc, char **argv)
    {
        CDeviceThread *pThreads[MAX_GPU_COUNT];
        …
        // create deviceCount threads
        for(int i = 0; i < deviceCount; i++){
            CDeviceThread *pDevice = new CDeviceThread;
            pDevice->Init(plan + i);
            pThreads[i] = pDevice;
        }

        // start threads
        for(int i = 0; i < deviceCount; i++)
            pThreads[i]->start();

        // wait for threads
        for(int i = 0; i < deviceCount; i++)
            pThreads[i]->wait();
        …

17. // cleanup
    for(int i = 0; i < deviceCount; i++)
        delete pThreads[i];
    …
    }

    void CDeviceThread::run()
    {
        std::cout << "CDeviceThread thread ID = " << QThread::currentThreadId() << std::endl;
        std::cout << "Device = " << m_pPlan->device << std::endl;
        std::cout << "DataN = " << m_pPlan->dataN << std::endl;

        const int BLOCK_N  = 32;
        const int THREAD_N = 256;
        const int ACCUM_N  = BLOCK_N * THREAD_N;
        float *d_Data, *d_Sum;
        float *h_Sum;
        float sum;

18. int i;

    // set device
    cudaSetDevice(m_pPlan->device);

    // allocate memory
    cudaMalloc((void **)&d_Data, m_pPlan->dataN * sizeof(float));
    cudaMalloc((void **)&d_Sum, ACCUM_N * sizeof(float));
    h_Sum = (float *)malloc(ACCUM_N * sizeof(float));

    // copy input data from CPU
    cudaMemcpy(d_Data, m_pPlan->h_Data, m_pPlan->dataN * sizeof(float), cudaMemcpyHostToDevice);

    // perform GPU computations
    launch_reduceKernel(d_Sum, d_Data, m_pPlan->dataN, BLOCK_N, THREAD_N);

19. // read back GPU results
    cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost);

    // finalize GPU reduction for the current subvector
    sum = 0;
    for(i = 0; i < ACCUM_N; i++)
        sum += h_Sum[i];
    *(m_pPlan->h_Sum) = (float)sum;

    // shut down this GPU
    free(h_Sum);
    cudaFree(d_Sum);
    cudaFree(d_Data);
    }
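
    The slides do not show how the Qt and CUDA parts are compiled together. One common approach is a qmake custom compiler rule that sends .cu files through nvcc; the following .pro sketch is a hypothetical minimal setup (file names and the CUDA install path are assumptions):

        # Hypothetical qmake project file for a Qt + CUDA build
        TEMPLATE  = app
        SOURCES  += main.cpp devicethread.cpp
        HEADERS  += simpleMultiGPU.h

        CUDA_SOURCES += reduce.cu
        INCLUDEPATH  += /usr/local/cuda/include
        LIBS         += -L/usr/local/cuda/lib64 -lcudart

        # custom rule: compile every .cu file with nvcc into an object file
        cuda.input    = CUDA_SOURCES
        cuda.output   = ${QMAKE_FILE_BASE}_cuda.o
        cuda.commands = nvcc -c ${QMAKE_FILE_NAME} -o ${QMAKE_FILE_OUT}
        QMAKE_EXTRA_COMPILERS += cuda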

20. Bibliography
    • NVIDIA CUDA Programming Guide
    • CUDA C Best Practices Guide
    • Qt documentation: http://qt.nokia.com/

21. Thank you for your attention! Questions?
    Göttingen, 11.03.2011
