530 likes | 619 Views
Refreshing Memory. Memory Hierarchy Part 2. Writing Cache-Conscious Programs. Problem: Write C code for a function that computes the sum of the elements of a two dimensional array, a[M][N], of integers. int SumArray ( int a[][], int M, int N).
E N D
Refreshing Memory Memory Hierarchy Part 2 CMPUT 229
CMPUT 229 Writing Cache-Conscious Programs Problem: Write C code for a function that computes the sum of the elements of a two dimensional array, a[M][N], of integers. int SumArray(int a[][], int M, int N) 1 int SumArrayCols(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (j=0 ; j<N ; j++) 7 for (i=0 ; i<M ; i++) 8 sum += a[i][j]; 9 return sum; 10 } 1 int SumArrayRows(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (i=0 ; i<M ; i++) 7 for (j=0 ; j<N ; j++) 8 sum += a[i][j]; 9 return sum; 10 } Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayRows Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 a[0][2] 1 int SumArrayRows(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (i=0 ; i<M ; i++) 7 for (j=0 ; j<N ; j++) 8 sum += a[i][j]; 9 return sum; 10 } 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] 0x8000 4048 a[3][0] 0x8000 404C a[3][1] 0x8000 4050 a[3][2] 0x8000 4054 Cache a[3][3] ••• 0x8000 4058 a[3][4] ••• ••• Memory Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayRows Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 a[0][2] 1 int SumArrayRows(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (i=0 ; i<M ; i++) 7 for (j=0 ; j<N ; j++) 8 sum += a[i][j]; 9 return sum; 10 } 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] 0x8000 404C a[3][1] 0x8000 4050 a[3][2] 0x8000 4054 Cache a[3][3] ••• 0x8000 4058 a[3][4] ••• ••• Memory Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayRows Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 a[0][2] 1 int SumArrayRows(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (i=0 ; i<M ; i++) 7 for (j=0 ; j<N ; j++) 8 sum += a[i][j]; 9 return sum; 10 } 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] 0x8000 404C a[3][1] 0x8000 4050 a[3][2] 0x8000 4054 Cache a[3][3] ••• 0x8000 4058 a[3][4] ••• ••• Memory Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayRows Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 a[0][2] 1 int SumArrayRows(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (i=0 ; i<M ; i++) 7 for (j=0 ; j<N ; j++) 8 sum += a[i][j]; 9 return sum; 10 } 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] 0x8000 404C a[3][1] 0x8000 4050 a[3][2] 0x8000 4054 Cache a[3][3] ••• 0x8000 4058 a[3][4] ••• ••• Memory Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayRows Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 a[0][2] 1 int SumArrayRows(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (i=0 ; i<M ; i++) 7 for (j=0 ; j<N ; j++) 8 sum += a[i][j]; 9 return sum; 10 } 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] 0x8000 404C a[3][1] 0x8000 4050 a[3][2] 0x8000 4054 Cache a[3][3] ••• 0x8000 4058 a[3][4] ••• ••• Memory Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayRows Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 a[0][2] 1 int SumArrayRows(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (i=0 ; i<M ; i++) 7 for (j=0 ; j<N ; j++) 8 sum += a[i][j]; 9 return sum; 10 } 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] 0x8000 404C a[0][4] a[0][5] a[1][0] a[1][1] a[3][1] 0x8000 4050 a[3][2] 0x8000 4054 Cache a[3][3] ••• 0x8000 4058 a[3][4] ••• ••• Memory Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayRows Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 a[0][2] 1 int SumArrayRows(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (i=0 ; i<M ; i++) 7 for (j=0 ; j<N ; j++) 8 sum += a[i][j]; 9 return sum; 10 } 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] 0x8000 404C a[0][4] a[0][5] a[1][0] a[1][1] a[3][1] 0x8000 4050 a[3][2] 0x8000 4054 Cache a[3][3] ••• 0x8000 4058 a[3][4] ••• ••• Memory Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayRows Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 a[0][2] 1 int SumArrayRows(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (i=0 ; i<M ; i++) 7 for (j=0 ; j<N ; j++) 8 sum += a[i][j]; 9 return sum; 10 } 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] 0x8000 404C a[0][4] a[0][5] a[1][0] a[1][1] a[3][1] 0x8000 4050 a[3][2] 0x8000 4054 Cache a[3][3] ••• 0x8000 4058 a[3][4] ••• ••• Memory Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayCols Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 1 int SumArrayCols(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (j=0 ; j<N ; j++) 7 for (i=0 ; i<M ; i++) 8 sum += a[i][j]; 9 return sum; 10 } a[0][2] 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] 0x8000 404C a[3][1] 0x8000 4050 a[3][2] Cache ••• 0x8000 4054 a[3][3] 0x8000 4058 a[3][4] ••• ••• Bryant/O’Hallaron, pp. 508 Memory
CMPUT 229 SumArrayCols Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 1 int SumArrayCols(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (j=0 ; j<N ; j++) 7 for (i=0 ; i<M ; i++) 8 sum += a[i][j]; 9 return sum; 10 } a[0][2] 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] a[0][4] a[0][5] a[1][0] a[1][1] 0x8000 404C a[3][1] 0x8000 4050 a[3][2] Cache ••• 0x8000 4054 a[3][3] 0x8000 4058 a[3][4] ••• ••• Bryant/O’Hallaron, pp. 508
CMPUT 229 SumArrayCols Data Access Order 0x8000 4000 a[0][0] 0x8000 4004 a[0][1] 0x8000 4008 1 int SumArrayCols(int a[][], int M, int N) 2 { 3 int i, j; 4 int sum = 0; 5 6 for (j=0 ; j<N ; j++) 7 for (i=0 ; i<M ; i++) 8 sum += a[i][j]; 9 return sum; 10 } a[0][2] 0x8000 400C a[0][3] 0x8000 4010 a[0][4] 0x8000 4014 a[0][5] 0x8000 4018 a[1][0] 0x8000 401C a[1][1] 0x8000 4020 a[1][2] 0x8000 4024 a[1][3] 0x8000 4028 a[1][4] 0x8000 402C a[1][5] 0x8000 4030 a[2][0] 0x8000 4034 a[2][1] 0x8000 4038 a[2][2] 0x8000 403C a[2][3] 0x8000 4040 a[2][4] 0x8000 4044 a[2][5] a[0][0] a[0][1] a[0][2] a[0][3] 0x8000 4048 a[3][0] a[0][4] a[0][5] a[1][0] a[1][1] 0x8000 404C a[3][1] a[2][0] a[2][1] a[2][2] a[2][3] 0x8000 4050 a[3][2] Cache ••• 0x8000 4054 a[3][3] 0x8000 4058 a[3][4] ••• ••• Bryant/O’Hallaron, pp. 508
CMPUT 229 Student Class University The Cost of Programming Productivity • Easy-to-read and easy-to-maintain code often results in lower runtime performance.
CMPUT 229 Student The Cost of Programming Productivity • Abstraction • Inheritance Person Support Staff Professor
CMPUT 229 Univ. ID Date of Adm Faculty Department Program Classes Enr. Grades Name Gender Date of Birth Address Citizenship Student Driver Lic. The Cost of Programming Productivity • Data Encapsulation Person
CMPUT 229 Data Locality Primer AMD Athlon 64 X2
CMPUT 229 Data Locality Primer: Cache Organization • POWER5 Cache Organization • L1 Data Cache: 32 Kbytes, 128-byte cache lines • L2 Cache: 1.44 Mbytes, 128-byte cache lines • L3 Cache: 32 Mbytes, 512-byte cache lines
CMPUT 229 Univ. ID Date of Adm Faculty Department Program Classes Enr. Grades 0 Student: 1 2 3 4 5 Name 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ••• 127 4 bytes Person: Gender 32 bytes 4 bytes Date of Birth 1 byte 1 byte Address 4 bytes 1 byte Citizenship 32 bytes 2 bytes Driver Lic. 16 bytes 4 bytes 4 bytes 4 bytes 4 bytes Data Locality Primer: Cache Organization Bytes 0 2 Cache Lines ••• 255
CMPUT 229 Fa. Fa. De De Progr. Progr. Classes Enr. Classes Enr. Grades Grades Univ. ID Date of Adm Faculty Department Program Classes Enr. Grades 0 Student: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ••• 127 4 bytes 4 bytes Univ. ID Univ. ID Date of Adm. Date of Adm. Fa. Fa. De De Progr. Progr. Classes Enr. Classes Enr. Grades Grades ••• ••• 1 byte 1 byte Univ. ID Univ. ID ••• ••• 2 bytes 4 bytes 4 bytes 4 bytes Data Locality Primer: Data in Memory Bytes 0 Memory Address 128 256 384
CMPUT 229 0 ••• 30 31 32 33 Name ••• 36 37 ••• 47 48 ••• 51 52 ••• 69 ••• 84 85 ••• 89 Person: Gender 32 bytes Name Name Ge Ge DofB DofB Address Address Citizens. Citizens. Dr. Lic. Dr. Lic. 768 Date of Birth 1 byte dress dress Citizens. Citizens. Dr. Lic. Dr. Lic. Name Name Ge Ge DofB DofB Address 4 bytes 1024 Memory Address Citizenship 32 bytes 1152 Driver Lic. 16 bytes 1280 4 bytes Data Locality Primer: Data in Memory
CMPUT 229 Fa. Fa. De De Progr. Progr. Classes Enr. Classes Enr. Grades Grades 0 0 1 ••• 30 2 31 3 32 4 5 33 6 ••• 7 36 37 8 ••• 9 10 47 11 48 ••• 12 51 13 52 14 15 ••• 69 16 17 ••• 84 18 85 19 ••• ••• 89 127 Name Name Ge Ge DofB DofB Address Address Citizens. Citizens. Dr. Lic. Dr. Lic. 768 Univ. ID Univ. ID Date of Adm. Date of Adm. Fa. Fa. De De Progr. Progr. Classes Enr. Classes Enr. Grades Grades ••• ••• dress dress Citizens. Citizens. Dr. Lic. Dr. Lic. Name Name Ge Ge DofB DofB 1024 Univ. ID Univ. ID ••• ••• Memory Address 1152 1280 Data Locality Primer: Data in Memory Bytes 0 Memory Address 128 256 384
CMPUT 229 Bytes 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ••• 127 2 ••• Univ. ID Date of Adm. Fa. De Progr. Classes Enr. Grades ••• 255 Example: A search through the data structures • How many Computing Science students are younger than 23 years old? Cache Lines
CMPUT 229 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ••• 127 Example: A search through the data structures • How many Computing Science students are younger than 23 years old? • Loads 128 bytes and uses 5 bytes! Bytes 0 Univ. ID Date of Adm. Fa. De Progr. Classes Enr. Grades ••• 2 Cache Lines ••• 255
CMPUT 229 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ••• 127 Example: A search through the data structures • How many Computing Science students are younger than 23 years old? • Loads 128 bytes and uses 5 bytes! Bytes 0 Univ. ID Date of Adm. Fa. De Progr. Classes Enr. Grades ••• 2 Name Ge DofB Address Citizens. Dr. Lic. Cache Lines ••• 255
CMPUT 229 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ••• 127 Example: A search through the data structures • How many Computing Science students are younger than 23 years old? • Loads 128 bytes and uses 5.3 bytes! • Loads 128 bytes and uses 5.8 bytes! Bytes 0 Univ. ID Date of Adm. Fa. De Progr. Classes Enr. Grades ••• 2 Name Ge DofB Address Citizens. Dr. Lic. Cache Lines ••• 255
CMPUT 229 ••• ••• Univ. ID Univ. ID Univ. ID Univ. ID Univ. ID Univ. ID Date of Adm. Date of Adm. Date of Adm. Fa. Fa. Fa. De De De Progr. Progr. Progr. Classes Enr. Classes Enr. Classes Enr. Grades Grades Grades ••• ••• ••• ••• Date of Adm. Date of Adm. Date of Adm. ••• Fa. Fa. Fa. ••• De De De ••• Progr. Progr. Progr. ••• Data Reshaping for Arrays of Structures Student *ListOfStudents; …. ListOfStudents = (Student*)malloc(….); •••
CMPUT 229 Reshaping Linked Data Structures E.g. A linked list of students struct student { int age; int studentNumber; int studentProgram; float averageGrade; struct student *next; }; age num prog gpa age num prog gpa …
CMPUT 229 Maximal Structure Splitting age1 num1 prog1 gpa1 age1 age2 age3 num1 num2 num3 age2 num2 prog2 gpa2 prog1 prog2 prog3 age3 num3 prog3 gpa3 gpa1 gpa2 gpa3 … next1 next2 next3
CMPUT 229 Is it safe to transform a given data structure? • Build alias set • If a pointer P points to the structure • Then all the objects in the points-to set of P must have the same layout. • The layout of two structures is the same if each field has the same offset and the same length.
CMPUT 229 Pool Allocation • Intercept mallocs and replace by pool allocation: each structure layout gets its own pool. • If pool is full another pool can be allocated
CMPUT 229 Pool Allocation age1 • Intercept mallocs and replace by pool allocation: each structure layout gets its own pool. num1 prog1 gpa1 next1
CMPUT 229 Pool Allocation age1 age2 • Intercept mallocs and replace by pool allocation: each structure layout gets its own pool. num1 num2 prog1 prog2 gpa1 gpa2 next1 next2
CMPUT 229 Pool Allocation age1 age2 age3 • Intercept mallocs and replace by pool allocation: each structure layout gets its own pool. num1 num2 num3 prog1 prog2 prog3 gpa1 gpa2 gpa3 next1 next2 next3
CMPUT 229 Pool Allocation age1 age2 age3 age4 • Intercept mallocs and replace by pool allocation: each structure layout gets its own pool. num1 num2 num3 num4 prog1 prog2 prog3 prog4 gpa1 gpa2 gpa3 gpa4 next1 next2 next3 next4
CMPUT 229 Pool Allocation age1 age2 age3 age4 age5 • Intercept mallocs and replace by pool allocation: each structure layout gets its own pool. num1 num2 num3 num4 num5 prog1 prog2 prog3 prog4 prog5 gpa1 gpa2 gpa3 gpa4 gpa5 next1 next2 next3 next4 next5
CMPUT 229 Pool Allocation age1 age2 age3 age4 age5 • Intercept mallocs and replace by pool allocation: each structure layout gets its own pool. • If pool is full another pool can be allocated num1 num2 num3 num4 num5 prog1 prog2 prog3 prog4 prog5 gpa1 gpa2 gpa3 gpa4 gpa5 next1 next2 next3 next4 next5
CMPUT 229 Pool Allocation age1 age2 age3 age4 age5 • Intercept mallocs and replace by pool allocation: each structure layout gets its own pool. • If pool is full another pool can be allocated num1 num2 num3 num4 num5 prog1 prog2 prog3 prog4 prog5 gpa1 gpa2 gpa3 gpa4 gpa5 next1 next2 next3 next4 next5 age7 num7 prog7 gpa7 next7
CMPUT 229 age num prog gpa age num prog gpa … 0 4 8 12 16 0 4 8 12 16 s Pointer Dereferencing - Before struct student { int age; int studentNumber; int studentProgram; float averageGrade; struct student *next; }; struct student *s = malloc (sizeof (struct student)); s->age = 21; s->averageGrade = 3.8; s->age == *(s + 0) s->averageGrade == *(s + 12)
CMPUT 229 Uniform Structure Splitting • Requires that all fields in the structure have the same number of bytes • Advantage • Simpler address computation • Disadvantage • Either restrict the application of the technique • Or waste memory with padding to create same-length fields
CMPUT 229 pool_field_len 3 * pool_field_len Uniform Splitting Pointer Transformation s1 Pool_field_len is the same for each field age1 age2 age3 num1 num2 num3 prog1 prog2 prog3 gpa1 gpa2 gpa3 next1 next2 next3 s1->age == *(s1 + 0) s1->gpa == *(s1 + (3 * pool_field_len))
CMPUT 229 Non-Uniform Structure Splitting • Requires pools to be aligned by the size of the pool. E.g. If the pools are 4k then they must be aligned on 4k boundaries. • More general • Address calculation is more involved
CMPUT 229 s Non-Uniform Example struct example { type_2 a; /* 2 bytes */ type_8 b; /* 8 bytes */ type_4 c; /* 4 bytes */ }; How can the compiler find the address to access: s->c
CMPUT 229 s Non-Uniform Example struct example { type_2 a; /* 2 bytes */ type_8 b; /* 8 bytes */ type_4 c; /* 4 bytes */ }; pool_base = s & 0xF…F000 index = (s – pool_base) / 2 field_base = (2+8)*num_structs_per_pool s->c = *(s + field_base + 4*index - index*2) How can the compiler find the address to access: s->c s->c = *(s + field_base + 4*index - s + pool_base) s->c = *(field_base + 4*index + pool_base)
CMPUT 229 Experiments - Micro Benchmarks (Speedup) Power 4 Power 5 Linked List 1A Linked List 2 Binary Tree Linked List 1A Linked List 2 Binary Tree Linked List 1B Linked List 1B Linked List 2 w/ alloc Binary Tree w/ alloc Linked List 2 w/ alloc Binary Tree w/ alloc
CMPUT 229 Experiments - Micro Benchmarks (Instruction Count) Power 4 Power 5 Linked List 1A Linked List 2 Binary Tree Linked List 1A Linked List 2 Binary Tree Linked List 1B Linked List 1B Linked List 2 w/ alloc Binary Tree w/ alloc Linked List 2 w/ alloc Binary Tree w/ alloc
CMPUT 229 Experiments - Micro Benchmarks (CPI) Power 4 Power 5 Linked List 1A Linked List 2 Binary Tree Linked List 1A Linked List 2 Binary Tree Linked List 1B Linked List 1B Linked List 2 w/ alloc Binary Tree w/ alloc Linked List 2 w/ alloc Binary Tree w/ alloc
CMPUT 229 Experiments - Micro Benchmarks (DTLB Misses) Power 4 Power 5 Linked List 1A Linked List 2 Binary Tree Linked List 1A Linked List 2 Binary Tree Linked List 1B Linked List 1B Linked List 2 w/ alloc Binary Tree w/ alloc Linked List 2 w/ alloc Binary Tree w/ alloc
CMPUT 229 Experiments - Micro Benchmarks (L1D Misses) Power 4 Power 5 Linked List 1A Linked List 2 Binary Tree Linked List 1A Linked List 2 Binary Tree Linked List 1B Linked List 1B Linked List 2 w/ alloc Binary Tree w/ alloc Linked List 2 w/ alloc Binary Tree w/ alloc
CMPUT 229 Experiments - Micro Benchmarks (L2 Misses) Power 4 Power 5 Linked List 1A Linked List 2 Binary Tree Linked List 1A Linked List 2 Binary Tree Linked List 1B Linked List 1B Linked List 2 w/ alloc Binary Tree w/ alloc Linked List 2 w/ alloc Binary Tree w/ alloc