210 likes | 254 Views
Explore the world of programming multiple GPUs using CUDA and Qt library, learning about context establishment, data configuration, computation, and comparing GPU & CPU results.
E N D
Lecture Programming of multiple GPUs with CUDAand Qt library Alexey Abramov abramov _at_ physik3.gwdg.de Georg-August University, Bernstein Center for Computational Neuroscience, III Physikalisches Institut, Göttingen, Germany
Multi-GPU programming A host system can have multiple devices. Several host threads can execute device code on the same device, but by design, a host thread can execute device code on only one device at any given time. As a consequence, multiple host threads are required to execute device code on multiple devices. Alexey Abramov (BCCN, Göttingen) 11-03-11 2/21
Multi-GPU programming In order to issue work to a GPU, a context is established between a CPU thread and the GPU. Only one context can be active on GPU at a time. Alexey Abramov (BCCN, Göttingen) 11-03-11 3/21
Multi-GPU programming Even though a GPU can execute calls from one context at a time, it can belong to multiple contexts. For example, it is possible for several CPU threads to establish contexts with the same GPU. Alexey Abramov (BCCN, Göttingen) 11-03-11 4/21
Multi-GPU programming A host thread can execute device code on only one device at any given time. (it will be possible in CUDA 4.0) Alexey Abramov (BCCN, Göttingen) 11-03-11 5/21
#include <stdlib.h>#include <stdio.h>#include <math.h>#include <multithreading.h>#include <cutil_inline.h>#include <cuda_runtime_api.h>#include "simpleMultiGPU.h" typedef struct {// Device idint device;// Host-side input dataint dataN;float *h_Data;// Partial sum for this GPUfloat *h_Sum;} TGPUplan; Alexey Abramov (BCCN, Göttingen) 11-03-11 6/21
// Data configurationconst intMAX_GPU_COUNT = 32;const intDATA_N = 1048576 * 32;intmain(int argc, char **argv){// Solver configTGPUplan plan[MAX_GPU_COUNT];// GPU reduction resultsfloat h_SumGPU[MAX_GPU_COUNT];bzero(h_SumGPU, MAX_GPU_COUNT * sizeof(float));// OS thread ID CUTThread threadID[MAX_GPU_COUNT];// create a timer to measure runtimeunsigned int hTimer; cutCreateTimer(&hTimer); Alexey Abramov (BCCN, Göttingen) 11-03-11 7/21
// get number of available CUDA-capable devicesint deviceCount = 0;cudaGetDeviceCount(&deviceCount); if(deviceCount > MAX_GPU_COUNT) deviceCount = MAX_GPU_COUNT;printf("CUDA-capable device count: %i\n", deviceCount); printf("Generating input data...\n\n");float *h_Data = (float *)malloc(DATA_N * sizeof(float));for(int i = 0; i < DATA_N; i++) h_Data[i] = (float)rand() / (float)RAND_MAX;// subdividing input data across GPUs// get data sizes for each GPUfor(int i = 0; i < deviceCount; i++) plan[i].dataN = DATA_N / deviceCount; Alexey Abramov (BCCN, Göttingen) 11-03-11 8/21
// take into account "odd" data sizesfor(int i = 0; i < DATA_N % deviceCount; i++) plan[i].dataN++;// assign data ranges to GPUsint gpuBase = 0;for(int i = 0; i < deviceCount; i++){ plan[i].device = i; plan[i].h_Data = h_Data + gpuBase; plan[i].h_Sum = h_SumGPU + i; gpuBase += plan[i].dataN; }// start timing and compute on GPU(s)printf("Computing with %d GPU's...\n", deviceCount);cutResetTimer(hTimer);cutStartTimer(hTimer); Alexey Abramov (BCCN, Göttingen) 11-03-11 9/21
// create deviceCount threadsfor(int i = 0; i < deviceCount; i++) threadID[i] = cutStartThread((CUT_THREADROUTINE)solverThread, (void*) (plan + i)); cutWaitForThreads(threadID, deviceCount);float sumGPU = 0;// get the final sum for(int i = 0; i < deviceCount; i++) sumGPU += h_SumGPU[i];cutStopTimer(hTimer);printf("GPU Processing time: %f (ms)\n\n", cutGetTimerValue(hTimer)); Alexey Abramov (BCCN, Göttingen) 11-03-11 10/21
// compute on Host CPU printf("Computing with Host CPU...\n\n");double sumCPU = 0;for(int i = 0; i < DATA_N; i++) sumCPU += h_Data[i];// compare GPU and CPU results printf("Comparing GPU and Host CPU results...\n");double diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);printf(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);printf(" Relative difference: %E \n\n", diff);printf((diff < 1e-5) ? "PASSED\n\n" : "FAILED\n\n");// cleanup and shutdownprintf("Shutting down...\n");cutDeleteTimer(hTimer);free(h_Data);cudaThreadExit(); Alexey Abramov (BCCN, Göttingen) 11-03-11 11/21
staticCUT_THREADPROCsolverThread(TGPUplan *plan){const intBLOCK_N = 32;const intTHREAD_N = 256;const intACCUM_N = BLOCK_N * THREAD_N;float *d_Data,*d_Sum;float *h_Sum;float sum;int i;// set devicecudaSetDevice(plan->device);// allocate memorycudaMalloc((void**)&d_Data, plan->dataN * sizeof(float));cudaMalloc((void**)&d_Sum, ACCUM_N * sizeof(float)); h_Sum = (float *)malloc(ACCUM_N * sizeof(float); Alexey Abramov (BCCN, Göttingen) 11-03-11 12/21
// copy input data from CPUcudaMemcpy(d_Data, plan->h_Data, plan->dataN * sizeof(float), cudaMemcpyHostToDevice);// perform GPU computations launch_reduceKernel(d_Sum, d_Data, plan->dataN, BLOCK_N, THREAD_N);// read back GPU resultscudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost) ); sum = 0;for(i = 0; i < ACCUM_N; i++) sum += h_Sum[i]; *(plan->h_Sum) = (float)sum;// shut down this GPUfree(h_Sum);cudaFree(d_Sum);cudaFree(d_Data);CUT_THREADEND; } Alexey Abramov (BCCN, Göttingen) 11-03-11 13/21
void launch_reduceKernel(float *d_Result, float *d_Input, int N, int BLOCK_N, int THREAD_N) { reduceKernel<<<BLOCK_N, THREAD_N>>>(d_Result, d_Input, N) cudaThreadSynchronize(); } __global__ static voidreduceKernel(float *d_Result, float *d_Input, int N){const int tid = blockIdx.x * blockDim.x + threadIdx.x;const int threadN = gridDim.x * blockDim.x; float sum = 0; for(int pos = tid; pos < N; pos += threadN) sum += d_Input[pos]; d_Result[tid] = sum; } Alexey Abramov (BCCN, Göttingen) 11-03-11 14/21
QThread class for multi-GPU programming The QThread class provides platform-independent threads. class QThread;// class for Qt thread with a GPU contextclass CDeviceThread: public QThread{private:TGPUplan *m_pPlan;protected:void run();public: CDeviceThread(){}; ~CDeviceThread(){};void Init(TGPUplan *plan){ m_pPlan = plan; };}; Alexey Abramov (BCCN, Göttingen) 11-03-11 15/21
intmain(int argc, char **argv){CDeviceThread *pThreads[MAX_GPU_COUNT]; … // create deviceCount threadsfor(int i = 0; i < deviceCount; i++){CDeviceThread *pDevice = newCDeviceThread; pDevice->Init(plan+i); pThreads[i] = pDevice; }// start threadsfor(int i = 0; i < deviceCount; i++) pThreads[i]->start();// wait for threadsfor(int i = 0; i < deviceCount; i++) pThreads[i]->wait(); … Alexey Abramov (BCCN, Göttingen) 11-03-11 16/21
// cleanup for(int i = 0; i < deviceCount; i++) delete pThreads[i]; … } void CDeviceThread::run(){ std::cout << "CDeviceThread thread ID = " << QThread::currentThreadId() << std::endl; std::cout << "Device = " << m_pPlan->device << std::endl; std::cout << "DataN = " << m_pPlan->dataN << std::endl;const intBLOCK_N = 32;const intTHREAD_N = 256;const intACCUM_N = BLOCK_N * THREAD_N;float *d_Data,*d_Sum;float *h_Sum;float sum; Alexey Abramov (BCCN, Göttingen) 11-03-11 17/21
int i; // set device cudaSetDevice(m_pPlan->device); // allocate memory cudaMalloc((void**)&d_Data, m_pPlan->dataN * sizeof(float)); cudaMalloc((void**)&d_Sum, ACCUM_N * sizeof(float)); h_Sum = (float *)malloc(ACCUM_N * sizeof(float)); // copy input data from CPU cudaMemcpy(d_Data, m_pPlan->h_Data, m_pPlan->dataN * sizeof(float), cudaMemcpyHostToDevice); // perform GPU computations launch_reduceKernel(d_Sum, d_Data, m_pPlan->dataN, BLOCK_N, THREAD_N); Alexey Abramov (BCCN, Göttingen) 11-03-11 18/21
// read back GPU results cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost); // finalize GPU reduction for current subvector sum = 0; for(i = 0; i < ACCUM_N; i++) sum += h_Sum[i]; *(m_pPlan->h_Sum) = (float)sum; // shut down this GPU free(h_Sum); cudaFree(d_Sum); cudaFree(d_Data); } Alexey Abramov (BCCN, Göttingen) 11-03-11 19/21
Bibliography • NVIDIA CUDA Programming Guide • CUDA C Best Practices Guide • Qt documentationhttp://qt.nokia.com/ Alexey Abramov (BCCN, Göttingen) 11-03-11 20/21
Thank you for your attention ! QUESTIONS ? Göttingen, 11.03.2011