500 likes | 648 Views
Portable Operating System Interface Thread. Yukai Hung a0934147@gmail.com Department of Mathematics National Taiwan University. POSIX Thread Basic. POSIX Thread Basic. What is process? What is thread? - a thread of execution is the smallest unit of processing that can be
E N D
Portable Operating System Interface Thread YukaiHung a0934147@gmail.comDepartment of MathematicsNational Taiwan University
POSIX Thread Basic • What is process? What is thread? • - a thread of execution is the smallest unit of processing that can be • scheduled by operating system, which is contained inside a process • - multiple threads can exist within the same process and share • resources, while different processes do not share the resources • How to create new process? • - use system function fork(), which creates a copy of itself • - parent and child process can tell each other apart by examining • the return value of fork() system function (non-zero or zero value) 3
POSIX Thread Basic • int pthread_create(…) • create new thread with specified thread attributes and • execute thread function with specified function arguments • http://opengroup.org/onlinepubs/007908799/xsh/pthread_create.html • void pthread_exit(…) • terminate the current calling thread and makes the return value • pointer available to any successful join with the terminating thread • http://opengroup.org/onlinepubs/007908799/xsh/pthread_exit.html • int pthread_join(…) • suspend the execution of the current calling thread or process until the • target thread terminates, unless the target thread has already terminated • http://opengroup.org/onlinepubs/007908799/xsh/pthread_join.html 4
POSIX Thread Basic #include <stdio.h> #include <stdlib.h> #include <pthread.h> • int main(int argc,char** argv) • { • int error1; • int error2; • int input1; • int input2; • int return1; • int return2; • pthread_tthread1; • pthread_tthread2; • input1=1; • input2=2; • error1=pthread_create(&thread1,NULL,tfunction,(void*)&input1); • error2=pthread_create(&thread2,NULL,tfunction,(void*)&input2); 5
POSIX Thread Basic • if(error1!=0||error2!=0) • printf(“Error:thread create\n”); • error1=pthread_join(thread1,(void*)&return1); • error2=pthread_join(thread2,(void*)&return2); • if(error1!=0||error2!=0) • printf(“Error:thread join\n”); • printf(“thread 1 return %d\n”,return1)); • printf(“thread 2 return %d\n”,return2)); • return 0; • } 6
POSIX Thread Basic • void* tfunction(void* input) • { • printf(“thread %d is executing\n”,*((int*)input)); • pthread_exit((void*)1); • } 7
POSIX Thread Basic • int pthread_equal(…) • compare two threads from two thread handles • http://opengroup.org/onlinepubs/007908799/xsh/pthread_equal.html • pthread_tpthread_self(…) • return the thread handle of the current calling thread • http://opengroup.org/onlinepubs/007908775/xsh/pthread_self.html • int pthread_cancel(…) • request the thread be canceled, the target threads cancelability • states and types determines when the cancellation takes effects • http://opengroup.org/onlinepubs/007908775/xsh/pthread_cancel.html 8
POSIX Thread Basic • void pthread_cleanup_push(…) • the function shall push the specified cancellation cleanup handler • handler routine onto the calling threads cancellation cleanup stack • http://linux.die.net/man/3/pthread_cleanup_push • void pthread_cleanup_pop(…) • the function shall remove the routine at the top of calling cleanup • thread cancellation stack and optionally invoke it (if input is non-zero) • http://linux.die.net/man/3/pthread_cleanup_pop 9
POSIX Thread Basic #include <stdio.h> #include <stdlib.h> #include <pthread.h> • int main(int argc,char** argv) • { • int rvalue; • pthread_tthread; • if(pthread_create(&thread,NULL,tfunction,(void*)1)!=0) • printf(“Error:thread create\n”); • if(pthread_join(thread,(void*)&rvalue)!=0) • printf(“Error:thread join\n”); • printf(“thread return %d\n”,rvalue)); • return 0; • } 10
POSIX Thread Basic void* tfunction(void* input) { printf(“thread start\n”); • pthread_cleanup_push(cleanup,"thread first handler"); • pthread_cleanup_push(cleanup,"thread second handler"); • printf("thread push complete\n"); • pthread_cleanup_pop(1); • pthread_cleanup_pop(1); • return (void*)1; } • void cleanup(void* string) • { • printf(“cleanup:%s\n”,(char*)string); • return; • } 11
Race Condition and Mutex Lock • Consider the following parallel program • - threads are almost impossibly executed at the same time 13
Race Condition and Mutex Lock • Scenario 1 • - the result value R is 2 if the initial value R is 1 14
Race Condition and Mutex Lock • Scenario 2 • - the result value R is 2 if the initial value R is 1 15
Race Condition and Mutex Lock • Scenario 3 • - the result value R is 3 if the initial value R is 1 16
Race Condition and Mutex Lock • Solve the race condition by Locking • - manage the shared resource between threads • - avoid the deadlock or unbalanced problems 17
Race Condition and Mutex Lock • Guarantee the executed instruction order is correct • - the problem is back to the sequential procedure • - lock and release procedure have high overhead 18
Race Condition and Mutex Lock • Solve the race condition by Semaphore • - multi-value locking method (binary locking extension) • - instructions in procedure P and V are atomic operations 19
Race Condition and Mutex Lock #include <stdio.h> #include <stdlib.h> #include <pthread.h> • int main(int argc,char** argv) • { • int value; • int error1; • int error2; • pthread_tthread1; • pthread_tthread2; • value=0; • error1=pthread_create(&thread1,NULL,tfunction,(void*)&value); • error2=pthread_create(&thread2,NULL,tfunction,(void*)&value); • if(error1!=0||error2!=0) • printf(“Error:thread create\n”); 20
Race Condition and Mutex Lock • error1=pthread_join(thread1,NULL); • error2=pthread_join(thread2,NULL); • if(error1!=0||error2!=0) • printf(“Error:thread join\n”); • printf(“final result is %d\n”,value)); • return 0; • } • void* tfunction(void* input) • { • *((int*)input)=*((int*)input)+1; • return NULL; • } 21
Race Condition and Mutex Lock • int pthread_mutex_init(…) • initialize the mutex referenced by mutex with specified attributes • initialize an already initialized mutex results in undefined behavior • http://opengroup.org/onlinepubs/007908775/xsh/pthread_mutex_init.html • int pthread_mutex_destroy(…) • destroy the previously initialized mutex lock • the mutex must not be used after it has been destroyed • http://www.mkssoftware.com/docs/man3/pthread_mutex_destroy.3.asp 22
Race Condition and Mutex Lock • int pthread_mutex_lock(…) • lock the specified initialized mutex. if the mutex is already locked, • the calling thread blocks until he mutex becomes available or unlock • http://www.mkssoftware.com/docs/man3/pthread_mutex_lock.3.asp • int pthread_mutex_unlock(…) • attempt to unlock the specified mutex. If there are threads blocked on the mutex • object when unlock function is calling, resulting in the mutex becoming available • the scheduling policy is used to determine which thread acquire the mutex • http://www.mkssoftware.com/docs/man3/pthread_mutex_unlock.3.asp • int pthread_mutex_trylock(…) • try to lock the specified mutex. If the mutex is already locked, an error is • returned, otherwise, the operation returns with the mutex in the locked • state with the calling thread as its owner • http://www.mkssoftware.com/docs/man3/pthread_mutex_trylock.3.asp 23
Race Condition and Mutex Lock #include <stdio.h> #include <stdlib.h> #include <pthread.h> • pthread_mutex_twork_mutex; • int main(int argc,char** argv) • { • int value; • int error1; • int error2; • pthread_tthread1; • pthread_tthread2; • value=0; • if(pthread_mutex_init(&work_mutex,NULL)!=0) • printf(“Error:work mutex create\n”); 24
Race Condition and Mutex Lock • error1=pthread_create(&thread1,NULL,tfunction,(void*)&value); • error2=pthread_create(&thread2,NULL,tfunction,(void*)&value); • if(error1!=0||error2!=0) • printf(“Error:thread create\n”); • error1=pthread_join(thread1,NULL); • error2=pthread_join(thread2,NULL); • if(error1!=0||error2!=0) • printf(“Error:thread join\n”); • printf(“final result is %d\n”,value); • if(pthread_mutex_destroy(&work_mutex)!=0) • printf(“Error:work mutex destroy\n”); • return 0; • } 25
Race Condition and Mutex Lock • void* tfunction(void* input) • { • int* value; • if(pthread_mutex_lock(&work_mutex)!=0) • printf(“Error:lock work mutex\n”); • *((int*)input)=*((int*)input)+1; • if(pthread_mutex_unlock(&work_mutex)!=0) • printf(“Error:work mutex unlock\n”); • return NULL; • } 26
Signal and Condition Variable • int pthread_cond_init(…) • initialize the condition variable referenced by cond with specified attributes • initialize an already initialized condition variable results in undefined behavior • http://opengroup.org/onlinepubs/007908775/xsh/pthread_cond_init.html • int pthread_cond_destroy(…) • destroy the previously initialized condition variable • the condition variable must not be used after it has been destroyed 28
Signal and Condition Variable • int loop=1; • pthread_cond_tcond; • pthread_mutex_tmutex; • int main(int argc,char** argv) • { • pthread_tthread1; • pthread_tthread2; • pthread_cond_init(&cond,NULL); • pthread_mutex_init(&mutex,NULL); • pthread_create(&thread1,NULL,fthread1,(void *)NULL); • pthread_create(&thread2,NULL,fthread2,(void *)NULL); • pthread_join(thread1,NULL); • pthread_join(thread2,NULL); • pthread_cond_destroy(&cond); • pthread_mutex_destroy(&mutex); • return 0; • } 29
Signal and Condition Variable void* fthread1(void* input) { for(loop=1;loop<=9;loop++) { pthread_mutex_lock(&mutex); if(loop%3==0) pthread_cond_signal(&cond); else printf("thread1:%d\n",loop); pthread_mutex_unlock(&mutex); sleep(1); } return NULL; }; 30
Signal and Condition Variable void* fthread2(void* input) { while(loop<9) { pthread_mutex_lock(&mutex); if(loop%3!=0) pthread_cond_wait(&cond,&mutex); printf("thread2:%d\n",loop); pthread_mutex_unlock(&mutex); sleep(1); } return NULL; }; 31
Multiple Thread and Multiple GPU • A host thread can maintain one context at a time • - need as many host threads as GPUs to maintain all device • - multiple host threads can establish context with the same GPU • hardware diver handles time-sharing and resource partitioning device 0 device 1 device 2 host thread 0 host thread 1 host thread 2 host memory 33
Multiple Thread and Multiple GPU • cudaGetDeviceCount() • returns the number of devices on the current system with compute, • capability greater or equal to 1.0, that are available for execution • cudaSetDevice() • set the specific device on which the active host thread executes the • device code. If the host thread has already initialized he cuda runtime • by calling non-device management runtime functions, returns error • must be called prior to context creation, fails if the context has already • been established, one can forces the context creation with cudaFree(0) • cudaGetDevice(…) • returns the device on which the active host thread executes the code 34
Multiple Thread and Multiple GPU #include <cuda.h> #include <stdio.h> #include <stdlib.h> #include <pthread.h> • #define MaxDevice 8 • int main(int argc,char** argv) • { • int size; • int loop; • int devicecount; • float* h_veca; • float* h_vecb; • float* h_vecc; • pthread_tthreadt[MaxDevice]; • pthread_cthreadc[MaxDevice]; • size=32000*4; • h_veca=(float*)malloc(sizeof(float)*size); • h_vecb=(float*)malloc(sizeof(float)*size); • h_vecc=(float*)malloc(sizeof(float)*size); 35
Multiple Thread and Multiple GPU • for(loop=0;loop<size;loop++) • { • h_veca[loop]=1.0f; • h_vecb[loop]=2.0f; • h_vecc[loop]=0.0f; • } • cudaGetDeviceCount(&devicecount); • devicecount=(devicecount>MaxDevice)?MaxDevice:devicecount; • printf(“device number is %d\n”,devicecount); • for(loop=0;loop<devicecount;loop++) • { • threadc[loop].index=loop; • threadc[loop].subsz=size/devicecount; • threadc[loop].hveca=h_veca+loop*subsz; • threadc[loop].hvecb=h_vecb+loop*subsz; • threadc[loop].hvecc=h_vecc+loop*subsz; • } • for(loop=0;loop<devicecount;loop++) • pthread_create(threadt+loop,NULL,tfunction,(void*)(threadc+loop)); 36
Multiple Thread and Multiple GPU • for(loop=0;loop<devicecount;loop++) • pthread_join(threadt[loop],NULL); • for(loop=0;loop<size;loop++) • if(h_vecc[loop]!=3.0f) • printf(“Error:check result\n”); • free(h_veca); • free(h_vecb); • free(h_vecc); • return 0; • }; • structpthread_c • { • int index; • int subsz; • float* hveca; • float* hvecb; • float* hvecc; • }; 37
Multiple Thread and Multiple GPU • void* tfunction(void* content) • { • int index; • int subsz; • int gsize; • int bsize; • float *hveca,*dveca; • float *hvecb,*dvecb; • float *hvecc,*dvecc; • index=(*((pthread_c*)content)).index; • subsz=(*((pthread_c*)content)).subsz; • hveca=(*((pthread_c*)content)).hveca; • hvecb=(*((pthread_c*)content)).hvecb; • hvecc=(*((pthread_c*)content)).hvecc; • printf(“thread %d start!\n”,index); • //for(int loop=0;loop<subsz;loop++) • //hvecc[loop]=hveca[loop]+hvecb[loop]; • cudaSetDevice(index); 38
Multiple Thread and Multiple GPU • cudaMalloc((void**)&dveca,sizeof(float)*subsz); • cudaMalloc((void**)&dvecb,sizeof(float)*subsz); • cudaMalloc((void**)&dvecc,sizeof(float)*subsz); • cudaMemcpy(dveca,hveca,sizeof(float)*subsz,cudaMemcpyHostToDevice); • cudaMemcpy(dvecb,hvecb,sizeof(float)*subsz,cudaMemcpyHostToDevice); • bsize=256; • gsize=(int)ceil((float)subsz/256); • vecAdd<<<gsize,bsize>>>(dveca,dvecb,dvecc,subsz); • cudaMemcpy(hvecc,dvecc,sizeof(float)*subsz,cudaMemcpyDeviceToHost); • cudaFree(dveca); • cudaFree(dvecb); • cudaFree(dvecc); • cudaError_t error; • if((error=cudaGetLastError())!=cudaSuccess) • printf(“cudaError:%s\n”,cudaGetErrorString(error)); • printf(“thread %d finish!\n”,index); • return NULL; • }; 39
Multiple Thread and Multiple GPU • __global__ void vecAdd(float* veca,float* vecb,float* vecc,int size) • { • int index; • index=blockIdx.x*blockDim.x+threadIdx.x; • if(index<size) • vecc[index]=veca[index]+vecb[index]; • return; • }; 40
Multiple Thread and Multiple GPU • Where is constant memory? • - data is stored in the device global memory • - read data through multiprocessor constant cache • - 64KB constant memory and 8KB cache for each multiprocessor • How about the performance? • - optimized when warp of threads read same location • - 4 bytes per cycle through broadcasting to warp of threads • - serialized when warp of threads read in different location • - very slow when cache miss (read data from global memory) • - access latency can range from one to hundreds clock cycles 41
Multiple Thread and Multiple GPU • How to use constant memory? • - declare constant memory on the file scope (global variable) • - copy data to constant memory by host (because it is constant!!) //declare constant memory __constant__ float cst_ptr[size]; //copy data from host to constant memory • cudaMemcpyToSymbol(cst_ptr,host_ptr,data_size); 42
Multiple Thread and Multiple GPU //declare constant memory __constant__ float cangle[360]; int main(int argc,char** argv) { int size=3200; float* darray; • float hangle[360]; //allocate device memory cudaMalloc((void**)&darray,sizeof(float)*size); //initialize allocated memory cudaMemset(darray,0,sizeof(float)*size); //initialize angle array on host for(int loop=0;loop<360;loop++) hangle[loop]=acos(-1.0f)*loop/180.0f; //copy host angle data to constant memory cudaMemcpyToSymbol(cangle,hangle,sizeof(float)*360); 43
Constant Memory //execute device kernel test_kernel<<<size/64,64>>>(darray); //free device memory cudaFree(darray); return 0; } __global__ void test_kernel(float* darray) { int index; //calculate each thread global index index=blockIdx.x*blockDim.x+threadIdx.x; #pragma unroll 10 for(int loop=0;loop<360;loop++) darray[index]=darray[index]+cangle[loop]; return; }; 44
Multiple Thread and Multiple GPU #include <cuda.h> #include <stdio.h> #include <stdlib.h> #include <pthread.h> • #define MaxDevice 8 • __constant__ float cangle[360]; • int main(int argc,char** argv) • { • int loop; • int devicecount; • float summation; • float hangle[360]; • pthread_tthreadt[MaxDevice]; • pthread_cthreadc[MaxDevice]; • for(loop=0;loop<360;loop++) • hangle[loop]=acos(-1.0f)*loop/180.0f; • for(loop=0,summation=0.0f;loop<360;loop++) • summation=summation+hangle[loop]; 45
Multiple Thread and Multiple GPU • cudaGetDeviceCount(&devicecount); • devicecount=(devicecount>MaxDevice)?MaxDevice:devicecount; • for(loop=0;loop<devicecount;loop++) • { • threadc[loop].index=loop; • threadc[loop].hangle=hangle; • threadc[loop].summation=summation; • } • for(loop=0;loop<devicecount;loop++) • pthread_create(threadt+loop,NULL,tfunction,(void*)(threadc+loop)); • for(loop=0;loop<devicecount;loop++) • pthread_join(threadt[loop],NULL); • return 0; • } • structpthread_c • { • int index; • float* hangle; • float summation; • }; 46
Multiple Thread and Multiple GPU • void* tfunction(void* content) • { • int size; • int index; • int gsize; • int bsize; • float summation; • float* hangle; • float* hvector; • float* dvector; • size=32000; • index=(*((pthread_c*)content)).index; • hangle=(*((pthread_c*)content)).hangle; • summation=(*((pthread_c*)content)).summation; • printf(“thread %d start!\n”,index); • cudaSetDevice(index); • cudaMemcpyToSymbol(cangle,hangle,sizeof(float)*360); 47
Multiple Thread and Multiple GPU • hvector=(float*)malloc(sizeof(float)*size); • cudaMalloc((void**)&dvector,sizeof(float)*size); • bsize=256; • gsize=(int)ceil((float)size/256); • kernel<<<gsize,bsize>>>(dvector,size); • cudaMemcpy(hvector,dvector,sizeof(float)*size,cudaMemcpyDeviceToHost); • for(loop=0;loop<size;loop++) • if(hvector[loop]!=summation) • printf("Error: check result\n"); • free(hvector); • cudaFree(dvector); • cudaError_t error; • if((error=cudaGetLastError())!=cudaSuccess) • printf(“cudaError:%s\n”,cudaGetErrorString(error)); • printf(“thread %d finish!\n”,index); • return NULL; • }; 48
Multiple Thread and Multiple GPU __global__ void kernel(float* dvector,int size) { int loop; int index; float temp; index=blockIdx.x*blockDim.x+threadIdx.x; if(index<size) { for(loop=0,temp=0.0f;loop<360;loop++) temp=temp+cangle[loop]; *(dvector+index)=temp; } return; }; 49