90 likes | 216 Views
Introduction to Computer Systems. 15-213 “The Class That Gives CMU Its Zip!”. Randal E. Bryant August 30, 2005. Class 01. refbug. double fun(int i) { volatile double d[1] = {3.14}; volatile long int a[2]; a[i] = 1073741824; /* Possibly out of bounds */ return d[0]; }. _fun:
E N D
Introduction to Computer Systems 15-213 “The Class That Gives CMU Its Zip!” Randal E. Bryant August 30, 2005 Class 01
refbug
/*
 * fun - store a constant into a[i] and then return d[0].
 *
 * i: index into the two-element array a. Only 0 and 1 are in bounds; no
 *    range check is performed, so any other value writes outside a
 *    (undefined behavior in C).
 *
 * Returns the value read back from d[0], which was initialized to 3.14.
 *
 * In the listing that follows, d[0] is stored at -8(%ebp) and a[i] is
 * addressed as -16(%ebp,%eax,4). With that layout a[2] and a[3] fall on
 * the eight bytes of d[0], and a[4] falls on 0(%ebp), where the prologue
 * saved the caller's %ebp.
 */
double fun(int i)
{
    /* NOTE(review): volatile presumably keeps both arrays in memory so the
     * store and the reload are really emitted - confirm intent. */
    volatile double d[1] = {3.14};
    volatile long int a[2];
    /* 1073741824 is 2^30. */
    a[i] = 1073741824; /* Possibly out of bounds */
    return d[0];
}
_fun:
pushl %ebp
movl $1073741824, %edx
movl %esp, %ebp
subl $16, %esp
movl 8(%ebp), %eax
fldl LC0
fstpl -8(%ebp)
movl %edx, -16(%ebp,%eax,4)
fldl -8(%ebp)
leave
ret
/* copyij & copyji */

/*
 * copyij - copy all 2048 x 2048 ints from src to dst, one row at a time.
 *
 * The column index varies fastest, so successive accesses touch adjacent
 * ints of the same row in both arrays.
 */
void copyij(int src[2048][2048], int dst[2048][2048])
{
    int row = 0;

    while (row < 2048) {
        int *to = dst[row];
        int *from = src[row];
        int col = 0;

        while (col < 2048) {
            to[col] = from[col];
            col++;
        }
        row++;
    }
}

/*
 * copyji - copy all 2048 x 2048 ints from src to dst, one column at a time.
 *
 * The row index varies fastest, so successive accesses are a full row
 * (2048 ints) apart in both arrays. The result is the same as copyij;
 * only the traversal order differs.
 */
void copyji(int src[2048][2048], int dst[2048][2048])
{
    int col = 0;

    while (col < 2048) {
        int row = 0;

        while (row < 2048) {
            dst[row][col] = src[row][col];
            row++;
        }
        col++;
    }
}
The Memory Mountain. Machine: Pentium III Xeon, 550 MHz; 16 KB on-chip L1 d-cache; 16 KB on-chip L1 i-cache; 512 KB off-chip unified L2 cache. Vertical axis: read throughput (MB/s), 0 to 1200 in steps of 200. Horizontal axes: stride (words), ticks s1, s3, s5, s7, s9, s11, s13, s15; working set size (bytes), ticks 2k, 8k, 32k, 128k, 512k, 2m, 8m. Labels on the surface: L1, L2, Mem, and "xe". Marked points: copyij and copyji.
L21: movl %ebx, 4(%esp) leal -16(%ebp), %eax incl %ebx movl %eax, 8(%esp) movl %edi, (%esp) call _get_vec_element movl -16(%ebp), %eax movl (%esi), %edx imull %edx, %eax movl %eax, (%esi) movl %edi, (%esp) call _vec_length cmpl %ebx, %eax jg L21 abs_combine void abs_combine(vec_ptr v, long int *dest) { int i; *dest = 1; for (i = 0; i < vec_length(v); i++) { long int val; get_vec_element(v, i, &val); *dest = *dest * val; } }
/* direct_combine */

/*
 * direct_combine - accumulate a product into *dest by indexing the
 * vector's storage directly.
 *
 * v:    vector handle; vec_length(v) and get_vec_start(v) are each called
 *       exactly once, before the loop.
 * dest: written once, after the loop, with the product of the first
 *       vec_length(v) long ints found at get_vec_start(v) (1 if that
 *       length is not positive).
 *
 * The running product is kept in a local variable rather than in *dest.
 */
void direct_combine(vec_ptr v, long int *dest)
{
    int count = vec_length(v);
    long int *elems = get_vec_start(v);
    long int product = 1;
    int k = 0;

    while (k < count) {
        product *= elems[k];
        k++;
    }

    *dest = product;
}

/*
 * Loop listing shown with direct_combine:
 *
 * L30:
 *     movl  (%eax,%edx,4), %ebx
 *     incl  %edx
 *     imull %ebx, %ecx
 *     cmpl  %esi, %edx
 *     jl    L30
 */
/* parallel_combine */

/*
 * parallel_combine - accumulate a product into *dest, consuming eight
 * elements per iteration of the main loop.
 *
 * v:    vector handle; vec_length(v) and get_vec_start(v) are each called
 *       exactly once, before the loops.
 * dest: written once, at the end, with the product of the first
 *       vec_length(v) long ints found at get_vec_start(v) (1 if that
 *       length is not positive).
 *
 * Within each group of eight, the values are multiplied pairwise, the four
 * pair products are multiplied pairwise again, and the two results are
 * multiplied together before being folded into the running product. The
 * pair products of one group therefore do not depend on each other.
 */
void parallel_combine(vec_ptr v, long int *dest)
{
    int n = vec_length(v);
    int stop = n - 7; /* a group may start at pos only while pos + 7 < n */
    long int *base = get_vec_start(v);
    long int acc = 1;
    int pos = 0;

    /* Full groups of eight. */
    while (pos < stop) {
        const long int *g = base + pos;
        long int first_half = (g[0] * g[1]) * (g[2] * g[3]);
        long int second_half = (g[4] * g[5]) * (g[6] * g[7]);

        acc = acc * (first_half * second_half);
        pos += 8;
    }

    /* Leftover elements, at most seven, one at a time. */
    while (pos < n) {
        acc = acc * base[pos];
        pos++;
    }

    *dest = acc;
}
Role within Curriculum. Courses shown: CS 441 Networks; CS 412 Operating Systems; CS 411 Compilers; ECE 447 Architecture; ECE 349 Embedded Systems; CS 212 Execution Models; CS 213 Systems; CS 211 Fundamental Structures; CS 113 C Programming. Diagram labels, in extraction order: Network Protocols; Processes; Mem. Mgmt; Machine Code Optimization; Exec. Model; Memory System; Data Structures Applications Programming.