|
ViennaCL - The Vienna Computing Library
1.4.2
|
00001 #ifndef VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_ 00002 #define VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_ 00003 //Automatically generated file from auxiliary-directory, do not edit manually! 00006 namespace viennacl 00007 { 00008 namespace linalg 00009 { 00010 namespace kernels 00011 { 00012 const char * const compressed_matrix_align8_vec_mul = 00013 "__kernel void vec_mul(\n" 00014 " __global const unsigned int * row_indices,\n" 00015 " __global const uint8 * column_indices, \n" 00016 " __global const float8 * elements,\n" 00017 " __global const float * vector, \n" 00018 " __global float * result,\n" 00019 " unsigned int size)\n" 00020 "{ \n" 00021 " float dot_prod;\n" 00022 " unsigned int start, next_stop;\n" 00023 " uint8 col_idx;\n" 00024 " float8 tmp_vec;\n" 00025 " float8 tmp_entries;\n" 00026 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00027 " {\n" 00028 " dot_prod = 0.0f;\n" 00029 " start = row_indices[row] / 8;\n" 00030 " next_stop = row_indices[row+1] / 8;\n" 00031 " for (unsigned int i = start; i < next_stop; ++i)\n" 00032 " {\n" 00033 " col_idx = column_indices[i];\n" 00034 " tmp_entries = elements[i];\n" 00035 " tmp_vec.s0 = vector[col_idx.s0];\n" 00036 " tmp_vec.s1 = vector[col_idx.s1];\n" 00037 " tmp_vec.s2 = vector[col_idx.s2];\n" 00038 " tmp_vec.s3 = vector[col_idx.s3];\n" 00039 " tmp_vec.s4 = vector[col_idx.s4];\n" 00040 " tmp_vec.s5 = vector[col_idx.s5];\n" 00041 " tmp_vec.s6 = vector[col_idx.s6];\n" 00042 " tmp_vec.s7 = vector[col_idx.s7];\n" 00043 " dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n" 00044 " dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n" 00045 " }\n" 00046 " result[row] = dot_prod;\n" 00047 " }\n" 00048 "}\n" 00049 ; //compressed_matrix_align8_vec_mul 00050 00051 const char * const compressed_matrix_align1_trans_unit_lu_forward = 00052 " \n" 00053 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n" 00054 "__kernel void trans_unit_lu_forward(\n" 00055 " __global const unsigned int * row_indices,\n" 00056 " __global const unsigned int * column_indices, \n" 00057 " __global const float * elements,\n" 00058 " __global float * vector,\n" 00059 " unsigned int size) \n" 00060 "{\n" 00061 " __local unsigned int row_index_lookahead[256];\n" 00062 " __local unsigned int row_index_buffer[256];\n" 00063 " \n" 00064 " unsigned int row_index;\n" 00065 " unsigned int col_index;\n" 00066 " float matrix_entry;\n" 00067 " unsigned int nnz = row_indices[size];\n" 00068 " unsigned int row_at_window_start = 0;\n" 00069 " unsigned int row_at_window_end = 0;\n" 00070 " unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0);\n" 00071 " \n" 00072 " for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0))\n" 00073 " {\n" 00074 " col_index = (i < nnz) ? column_indices[i] : 0;\n" 00075 " matrix_entry = (i < nnz) ? elements[i] : 0;\n" 00076 " row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1;\n" 00077 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00078 " \n" 00079 " if (i < nnz)\n" 00080 " {\n" 00081 " unsigned int row_index_inc = 0;\n" 00082 " while (i >= row_index_lookahead[row_index_inc + 1])\n" 00083 " ++row_index_inc;\n" 00084 " row_index = row_at_window_start + row_index_inc;\n" 00085 " row_index_buffer[get_local_id(0)] = row_index;\n" 00086 " }\n" 00087 " else\n" 00088 " {\n" 00089 " row_index = size+1;\n" 00090 " row_index_buffer[get_local_id(0)] = size - 1;\n" 00091 " }\n" 00092 " \n" 00093 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00094 " \n" 00095 " row_at_window_start = row_index_buffer[0];\n" 00096 " row_at_window_end = row_index_buffer[get_local_size(0) - 1];\n" 00097 " \n" 00098 " //forward elimination\n" 00099 " for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n" 00100 " { \n" 00101 " float result_entry = vector[row];\n" 00102 " \n" 00103 " if ( (row_index == row) && (col_index > row) )\n" 00104 " vector[col_index] -= result_entry * matrix_entry; \n" 00105 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00106 " }\n" 00107 " \n" 00108 " row_at_window_start = row_at_window_end;\n" 00109 " }\n" 00110 "}\n" 00111 ; //compressed_matrix_align1_trans_unit_lu_forward 00112 00113 const char * const compressed_matrix_align1_unit_lu_forward = 00114 " \n" 00115 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n" 00116 "__kernel void unit_lu_forward(\n" 00117 " __global const unsigned int * row_indices,\n" 00118 " __global const unsigned int * column_indices, \n" 00119 " __global const float * elements,\n" 00120 " __global float * vector,\n" 00121 " unsigned int size) \n" 00122 "{\n" 00123 " __local unsigned int col_index_buffer[128];\n" 00124 " __local float element_buffer[128];\n" 00125 " __local float vector_buffer[128];\n" 00126 " unsigned int nnz = row_indices[size];\n" 00127 " unsigned int current_row = 0;\n" 00128 " unsigned int row_at_window_start = 0;\n" 00129 " float current_vector_entry = vector[0];\n" 00130 " unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0);\n" 00131 " unsigned int next_row = row_indices[1];\n" 00132 " \n" 00133 " for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0))\n" 00134 " {\n" 00135 " //load into shared memory (coalesced access):\n" 00136 " if (i < nnz)\n" 00137 " {\n" 00138 " element_buffer[get_local_id(0)] = elements[i];\n" 00139 " unsigned int tmp = column_indices[i];\n" 00140 " col_index_buffer[get_local_id(0)] = tmp;\n" 00141 " vector_buffer[get_local_id(0)] = vector[tmp];\n" 00142 " }\n" 00143 " \n" 00144 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00145 " \n" 00146 " //now a single thread does the remaining work in shared memory:\n" 00147 " if (get_local_id(0) == 0)\n" 00148 " {\n" 00149 " // traverse through all the loaded data:\n" 00150 " for (unsigned int k=0; k<get_local_size(0); ++k)\n" 00151 " {\n" 00152 " if (i+k == next_row) //current row is finished. Write back result\n" 00153 " {\n" 00154 " vector[current_row] = current_vector_entry;\n" 00155 " ++current_row;\n" 00156 " if (current_row < size) //load next row's data\n" 00157 " {\n" 00158 " next_row = row_indices[current_row+1];\n" 00159 " current_vector_entry = vector[current_row];\n" 00160 " }\n" 00161 " }\n" 00162 " \n" 00163 " if (current_row < size && col_index_buffer[k] < current_row) //substitute\n" 00164 " {\n" 00165 " if (col_index_buffer[k] < row_at_window_start) //use recently computed results\n" 00166 " current_vector_entry -= element_buffer[k] * vector_buffer[k];\n" 00167 " else if (col_index_buffer[k] < current_row) //use buffered data\n" 00168 " current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];\n" 00169 " }\n" 00170 " } // for k\n" 00171 " \n" 00172 " row_at_window_start = current_row;\n" 00173 " } // if (get_local_id(0) == 0)\n" 00174 " \n" 00175 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00176 " } //for i\n" 00177 "}\n" 00178 ; //compressed_matrix_align1_unit_lu_forward 00179 00180 const char * const compressed_matrix_align1_trans_lu_backward = 00181 " \n" 00182 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n" 00183 "__kernel void trans_lu_backward(\n" 00184 " __global const unsigned int * row_indices,\n" 00185 " __global const unsigned int * column_indices, \n" 00186 " __global const float * elements,\n" 00187 " __global const float * diagonal_entries,\n" 00188 " __global float * vector,\n" 00189 " unsigned int size) \n" 00190 "{\n" 00191 " __local unsigned int row_index_lookahead[256];\n" 00192 " __local unsigned int row_index_buffer[256];\n" 00193 " \n" 00194 " unsigned int row_index;\n" 00195 " unsigned int col_index;\n" 00196 " float matrix_entry;\n" 00197 " unsigned int nnz = row_indices[size];\n" 00198 " unsigned int row_at_window_start = size;\n" 00199 " unsigned int row_at_window_end;\n" 00200 " unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0);\n" 00201 " \n" 00202 " for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0))\n" 00203 " {\n" 00204 " unsigned int i = (nnz - i2) - 1;\n" 00205 " col_index = (i2 < nnz) ? column_indices[i] : 0;\n" 00206 " matrix_entry = (i2 < nnz) ? elements[i] : 0;\n" 00207 " row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0;\n" 00208 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00209 " \n" 00210 " if (i2 < nnz)\n" 00211 " {\n" 00212 " unsigned int row_index_dec = 0;\n" 00213 " while (row_index_lookahead[row_index_dec] > i)\n" 00214 " ++row_index_dec;\n" 00215 " row_index = row_at_window_start - row_index_dec;\n" 00216 " row_index_buffer[get_local_id(0)] = row_index;\n" 00217 " }\n" 00218 " else\n" 00219 " {\n" 00220 " row_index = size+1;\n" 00221 " row_index_buffer[get_local_id(0)] = 0;\n" 00222 " }\n" 00223 " \n" 00224 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00225 " \n" 00226 " row_at_window_start = row_index_buffer[0];\n" 00227 " row_at_window_end = row_index_buffer[get_local_size(0) - 1];\n" 00228 " \n" 00229 " //backward elimination\n" 00230 " for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n" 00231 " { \n" 00232 " unsigned int row = row_at_window_start - row2;\n" 00233 " float result_entry = vector[row] / diagonal_entries[row];\n" 00234 " \n" 00235 " if ( (row_index == row) && (col_index < row) )\n" 00236 " vector[col_index] -= result_entry * matrix_entry; \n" 00237 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00238 " }\n" 00239 " \n" 00240 " row_at_window_start = row_at_window_end;\n" 00241 " }\n" 00242 " \n" 00243 " // final step: Divide vector by diagonal entries:\n" 00244 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n" 00245 " vector[i] /= diagonal_entries[i];\n" 00246 "}\n" 00247 ; //compressed_matrix_align1_trans_lu_backward 00248 00249 const char * const compressed_matrix_align1_trans_unit_lu_backward = 00250 " \n" 00251 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n" 00252 "__kernel void trans_unit_lu_backward(\n" 00253 " __global const unsigned int * row_indices,\n" 00254 " __global const unsigned int * column_indices, \n" 00255 " __global const float * elements,\n" 00256 " __global float * vector,\n" 00257 " unsigned int size) \n" 00258 "{\n" 00259 " __local unsigned int row_index_lookahead[256];\n" 00260 " __local unsigned int row_index_buffer[256];\n" 00261 " \n" 00262 " unsigned int row_index;\n" 00263 " unsigned int col_index;\n" 00264 " float matrix_entry;\n" 00265 " unsigned int nnz = row_indices[size];\n" 00266 " unsigned int row_at_window_start = size;\n" 00267 " unsigned int row_at_window_end;\n" 00268 " unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0);\n" 00269 " \n" 00270 " for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0))\n" 00271 " {\n" 00272 " unsigned int i = (nnz - i2) - 1;\n" 00273 " col_index = (i2 < nnz) ? column_indices[i] : 0;\n" 00274 " matrix_entry = (i2 < nnz) ? elements[i] : 0;\n" 00275 " row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0;\n" 00276 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00277 " \n" 00278 " if (i2 < nnz)\n" 00279 " {\n" 00280 " unsigned int row_index_dec = 0;\n" 00281 " while (row_index_lookahead[row_index_dec] > i)\n" 00282 " ++row_index_dec;\n" 00283 " row_index = row_at_window_start - row_index_dec;\n" 00284 " row_index_buffer[get_local_id(0)] = row_index;\n" 00285 " }\n" 00286 " else\n" 00287 " {\n" 00288 " row_index = size+1;\n" 00289 " row_index_buffer[get_local_id(0)] = 0;\n" 00290 " }\n" 00291 " \n" 00292 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00293 " \n" 00294 " row_at_window_start = row_index_buffer[0];\n" 00295 " row_at_window_end = row_index_buffer[get_local_size(0) - 1];\n" 00296 " \n" 00297 " //backward elimination\n" 00298 " for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n" 00299 " { \n" 00300 " unsigned int row = row_at_window_start - row2;\n" 00301 " float result_entry = vector[row];\n" 00302 " \n" 00303 " if ( (row_index == row) && (col_index < row) )\n" 00304 " vector[col_index] -= result_entry * matrix_entry; \n" 00305 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00306 " }\n" 00307 " \n" 00308 " row_at_window_start = row_at_window_end;\n" 00309 " }\n" 00310 "}\n" 00311 ; //compressed_matrix_align1_trans_unit_lu_backward 00312 00313 const char * const compressed_matrix_align1_trans_unit_lu_forward_slow = 00314 " \n" 00315 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n" 00316 "__kernel void trans_unit_lu_forward_slow(\n" 00317 " __global const unsigned int * row_indices,\n" 00318 " __global const unsigned int * column_indices, \n" 00319 " __global const float * elements,\n" 00320 " __global float * vector,\n" 00321 " unsigned int size) \n" 00322 "{\n" 00323 " for (unsigned int row = 0; row < size; ++row) \n" 00324 " { \n" 00325 " float result_entry = vector[row]; \n" 00326 " \n" 00327 " unsigned int row_start = row_indices[row]; \n" 00328 " unsigned int row_stop = row_indices[row + 1];\n" 00329 " for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n" 00330 " {\n" 00331 " unsigned int col_index = column_indices[entry_index];\n" 00332 " if (col_index > row)\n" 00333 " vector[col_index] -= result_entry * elements[entry_index]; \n" 00334 " }\n" 00335 " \n" 00336 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00337 " } \n" 00338 "}\n" 00339 ; //compressed_matrix_align1_trans_unit_lu_forward_slow 00340 00341 const char * const compressed_matrix_align1_row_info_extractor = 00342 "__kernel void row_info_extractor(\n" 00343 " __global const unsigned int * row_indices,\n" 00344 " __global const unsigned int * column_indices, \n" 00345 " __global const float * elements,\n" 00346 " __global float * result,\n" 00347 " unsigned int size,\n" 00348 " unsigned int option\n" 00349 " ) \n" 00350 "{ \n" 00351 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00352 " {\n" 00353 " float value = 0;\n" 00354 " unsigned int row_end = row_indices[row+1];\n" 00355 " \n" 00356 " switch (option)\n" 00357 " {\n" 00358 " case 0: //inf-norm\n" 00359 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00360 " value = max(value, fabs(elements[i]));\n" 00361 " break;\n" 00362 " \n" 00363 " case 1: //1-norm\n" 00364 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00365 " value += fabs(elements[i]);\n" 00366 " break;\n" 00367 " \n" 00368 " case 2: //2-norm\n" 00369 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00370 " value += elements[i] * elements[i];\n" 00371 " value = sqrt(value);\n" 00372 " break;\n" 00373 " \n" 00374 " case 3: //diagonal entry\n" 00375 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00376 " {\n" 00377 " if (column_indices[i] == row)\n" 00378 " {\n" 00379 " value = elements[i];\n" 00380 " break;\n" 00381 " }\n" 00382 " }\n" 00383 " break;\n" 00384 " \n" 00385 " default:\n" 00386 " break;\n" 00387 " }\n" 00388 " result[row] = value;\n" 00389 " }\n" 00390 "}\n" 00391 ; //compressed_matrix_align1_row_info_extractor 00392 00393 const char * const compressed_matrix_align1_vec_mul_cpu = 00394 "__kernel void vec_mul_cpu(\n" 00395 " __global const unsigned int * row_indices,\n" 00396 " __global const unsigned int * column_indices, \n" 00397 " __global const float * elements,\n" 00398 " __global const float * vector, \n" 00399 " __global float * result,\n" 00400 " unsigned int size) \n" 00401 "{ \n" 00402 " unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1);\n" 00403 " unsigned int row_start = get_global_id(0) * work_per_item;\n" 00404 " unsigned int row_stop = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size);\n" 00405 " for (unsigned int row = row_start; row < row_stop; ++row)\n" 00406 " {\n" 00407 " float dot_prod = 0.0f;\n" 00408 " unsigned int row_end = row_indices[row+1];\n" 00409 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00410 " dot_prod += elements[i] * vector[column_indices[i]];\n" 00411 " result[row] = dot_prod;\n" 00412 " }\n" 00413 "}\n" 00414 ; //compressed_matrix_align1_vec_mul_cpu 00415 00416 const char * const compressed_matrix_align1_jacobi = 00417 "__kernel void jacobi(\n" 00418 " __global const unsigned int * row_indices,\n" 00419 " __global const unsigned int * column_indices,\n" 00420 " __global const float * elements,\n" 00421 " float weight,\n" 00422 " __global const float * old_result,\n" 00423 " __global float * new_result,\n" 00424 " __global const float * rhs,\n" 00425 " unsigned int size)\n" 00426 " {\n" 00427 " float sum, diag=1;\n" 00428 " int col;\n" 00429 " for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n" 00430 " {\n" 00431 " sum = 0;\n" 00432 " for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++)\n" 00433 " {\n" 00434 " col = column_indices[j];\n" 00435 " if (i == col)\n" 00436 " diag = elements[j];\n" 00437 " else \n" 00438 " sum += elements[j] * old_result[col]; \n" 00439 " } \n" 00440 " new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n" 00441 " } \n" 00442 " } \n" 00443 ; //compressed_matrix_align1_jacobi 00444 00445 const char * const compressed_matrix_align1_unit_lu_backward = 00446 "// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n" 00447 "__kernel void unit_lu_backward(\n" 00448 " __global const unsigned int * row_indices,\n" 00449 " __global const unsigned int * column_indices, \n" 00450 " __global const float * elements,\n" 00451 " __global float * vector,\n" 00452 " unsigned int size) \n" 00453 "{\n" 00454 " __local unsigned int col_index_buffer[128];\n" 00455 " __local float element_buffer[128];\n" 00456 " __local float vector_buffer[128];\n" 00457 " \n" 00458 " unsigned int nnz = row_indices[size];\n" 00459 " unsigned int current_row = size-1;\n" 00460 " unsigned int row_at_window_start = size-1;\n" 00461 " float current_vector_entry = vector[size-1];\n" 00462 " unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0);\n" 00463 " unsigned int next_row = row_indices[size-1];\n" 00464 " \n" 00465 " unsigned int i = loop_end + get_local_id(0);\n" 00466 " while (1)\n" 00467 " {\n" 00468 " //load into shared memory (coalesced access):\n" 00469 " if (i < nnz)\n" 00470 " {\n" 00471 " element_buffer[get_local_id(0)] = elements[i];\n" 00472 " unsigned int tmp = column_indices[i];\n" 00473 " col_index_buffer[get_local_id(0)] = tmp;\n" 00474 " vector_buffer[get_local_id(0)] = vector[tmp];\n" 00475 " }\n" 00476 " \n" 00477 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00478 " \n" 00479 " //now a single thread does the remaining work in shared memory:\n" 00480 " if (get_local_id(0) == 0)\n" 00481 " {\n" 00482 " // traverse through all the loaded data from back to front:\n" 00483 " for (unsigned int k2=0; k2<get_local_size(0); ++k2)\n" 00484 " {\n" 00485 " unsigned int k = (get_local_size(0) - k2) - 1;\n" 00486 " \n" 00487 " if (i+k >= nnz)\n" 00488 " continue;\n" 00489 " \n" 00490 " if (col_index_buffer[k] > row_at_window_start) //use recently computed results\n" 00491 " current_vector_entry -= element_buffer[k] * vector_buffer[k];\n" 00492 " else if (col_index_buffer[k] > current_row) //use buffered data\n" 00493 " current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];\n" 00494 " \n" 00495 " if (i+k == next_row) //current row is finished. Write back result\n" 00496 " {\n" 00497 " vector[current_row] = current_vector_entry;\n" 00498 " if (current_row > 0) //load next row's data\n" 00499 " {\n" 00500 " --current_row;\n" 00501 " next_row = row_indices[current_row];\n" 00502 " current_vector_entry = vector[current_row];\n" 00503 " }\n" 00504 " }\n" 00505 " \n" 00506 " \n" 00507 " } // for k\n" 00508 " \n" 00509 " row_at_window_start = current_row;\n" 00510 " } // if (get_local_id(0) == 0)\n" 00511 " \n" 00512 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00513 " \n" 00514 " if (i < get_local_size(0))\n" 00515 " break;\n" 00516 " \n" 00517 " i -= get_local_size(0);\n" 00518 " } //for i\n" 00519 "}\n" 00520 ; //compressed_matrix_align1_unit_lu_backward 00521 00522 const char * const compressed_matrix_align1_block_trans_lu_backward = 00523 " __kernel void block_trans_lu_backward(\n" 00524 " __global const unsigned int * row_jumper_U, //U part (note that U is transposed in memory)\n" 00525 " __global const unsigned int * column_indices_U,\n" 00526 " __global const float * elements_U,\n" 00527 " __global const float * diagonal_U,\n" 00528 " __global const unsigned int * block_offsets,\n" 00529 " __global float * result,\n" 00530 " unsigned int size)\n" 00531 " {\n" 00532 " unsigned int col_start = block_offsets[2*get_group_id(0)];\n" 00533 " unsigned int col_stop = block_offsets[2*get_group_id(0)+1];\n" 00534 " unsigned int row_start;\n" 00535 " unsigned int row_stop;\n" 00536 " float result_entry = 0;\n" 00537 " if (col_start >= col_stop)\n" 00538 " return;\n" 00539 " //backward elimination, using U and diagonal_U\n" 00540 " for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n" 00541 " { \n" 00542 " unsigned int col = (col_stop - iter) - 1;\n" 00543 " result_entry = result[col] / diagonal_U[col];\n" 00544 " row_start = row_jumper_U[col]; \n" 00545 " row_stop = row_jumper_U[col + 1]; \n" 00546 " for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n" 00547 " result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];\n" 00548 " barrier(CLK_GLOBAL_MEM_FENCE); \n" 00549 " } \n" 00550 " //divide result vector by diagonal:\n" 00551 " for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n" 00552 " result[col] /= diagonal_U[col];\n" 00553 " };\n" 00554 ; //compressed_matrix_align1_block_trans_lu_backward 00555 00556 const char * const compressed_matrix_align1_block_trans_unit_lu_forward = 00557 " __kernel void block_trans_unit_lu_forward(\n" 00558 " __global const unsigned int * row_jumper_L, //L part (note that L is transposed in memory)\n" 00559 " __global const unsigned int * column_indices_L, \n" 00560 " __global const float * elements_L,\n" 00561 " __global const unsigned int * block_offsets,\n" 00562 " __global float * result,\n" 00563 " unsigned int size)\n" 00564 " {\n" 00565 " unsigned int col_start = block_offsets[2*get_group_id(0)];\n" 00566 " unsigned int col_stop = block_offsets[2*get_group_id(0)+1];\n" 00567 " unsigned int row_start = row_jumper_L[col_start];\n" 00568 " unsigned int row_stop;\n" 00569 " float result_entry = 0;\n" 00570 " if (col_start >= col_stop)\n" 00571 " return;\n" 00572 " //forward elimination, using L:\n" 00573 " for (unsigned int col = col_start; col < col_stop; ++col)\n" 00574 " {\n" 00575 " result_entry = result[col];\n" 00576 " row_stop = row_jumper_L[col + 1];\n" 00577 " for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0))\n" 00578 " result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n" 00579 " row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM)\n" 00580 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00581 " } \n" 00582 " };\n" 00583 ; //compressed_matrix_align1_block_trans_unit_lu_forward 00584 00585 const char * const compressed_matrix_align1_trans_lu_forward = 00586 " \n" 00587 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n" 00588 "__kernel void trans_lu_forward(\n" 00589 " __global const unsigned int * row_indices,\n" 00590 " __global const unsigned int * column_indices, \n" 00591 " __global const float * elements,\n" 00592 " __global const float * diagonal_entries,\n" 00593 " __global float * vector,\n" 00594 " unsigned int size) \n" 00595 "{\n" 00596 " __local unsigned int row_index_lookahead[256];\n" 00597 " __local unsigned int row_index_buffer[256];\n" 00598 " \n" 00599 " unsigned int row_index;\n" 00600 " unsigned int col_index;\n" 00601 " float matrix_entry;\n" 00602 " unsigned int nnz = row_indices[size];\n" 00603 " unsigned int row_at_window_start = 0;\n" 00604 " unsigned int row_at_window_end = 0;\n" 00605 " unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0);\n" 00606 " \n" 00607 " for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0))\n" 00608 " {\n" 00609 " col_index = (i < nnz) ? column_indices[i] : 0;\n" 00610 " matrix_entry = (i < nnz) ? elements[i] : 0;\n" 00611 " row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1;\n" 00612 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00613 " \n" 00614 " if (i < nnz)\n" 00615 " {\n" 00616 " unsigned int row_index_inc = 0;\n" 00617 " while (i >= row_index_lookahead[row_index_inc + 1])\n" 00618 " ++row_index_inc;\n" 00619 " row_index = row_at_window_start + row_index_inc;\n" 00620 " row_index_buffer[get_local_id(0)] = row_index;\n" 00621 " }\n" 00622 " else\n" 00623 " {\n" 00624 " row_index = size+1;\n" 00625 " row_index_buffer[get_local_id(0)] = size - 1;\n" 00626 " }\n" 00627 " \n" 00628 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00629 " \n" 00630 " row_at_window_start = row_index_buffer[0];\n" 00631 " row_at_window_end = row_index_buffer[get_local_size(0) - 1];\n" 00632 " \n" 00633 " //forward elimination\n" 00634 " for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n" 00635 " { \n" 00636 " float result_entry = vector[row] / diagonal_entries[row];\n" 00637 " \n" 00638 " if ( (row_index == row) && (col_index > row) )\n" 00639 " vector[col_index] -= result_entry * matrix_entry; \n" 00640 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00641 " }\n" 00642 " \n" 00643 " row_at_window_start = row_at_window_end;\n" 00644 " }\n" 00645 " \n" 00646 " // final step: Divide vector by diagonal entries:\n" 00647 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n" 00648 " vector[i] /= diagonal_entries[i];\n" 00649 "}\n" 00650 ; //compressed_matrix_align1_trans_lu_forward 00651 00652 const char * const compressed_matrix_align1_lu_backward = 00653 "// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n" 00654 "__kernel void lu_backward(\n" 00655 " __global const unsigned int * row_indices,\n" 00656 " __global const unsigned int * column_indices, \n" 00657 " __global const float * elements,\n" 00658 " __global float * vector,\n" 00659 " unsigned int size) \n" 00660 "{\n" 00661 " __local unsigned int col_index_buffer[128];\n" 00662 " __local float element_buffer[128];\n" 00663 " __local float vector_buffer[128];\n" 00664 " \n" 00665 " unsigned int nnz = row_indices[size];\n" 00666 " unsigned int current_row = size-1;\n" 00667 " unsigned int row_at_window_start = size-1;\n" 00668 " float current_vector_entry = vector[size-1];\n" 00669 " float diagonal_entry = 0;\n" 00670 " unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0);\n" 00671 " unsigned int next_row = row_indices[size-1];\n" 00672 " \n" 00673 " unsigned int i = loop_end + get_local_id(0);\n" 00674 " while (1)\n" 00675 " {\n" 00676 " //load into shared memory (coalesced access):\n" 00677 " if (i < nnz)\n" 00678 " {\n" 00679 " element_buffer[get_local_id(0)] = elements[i];\n" 00680 " unsigned int tmp = column_indices[i];\n" 00681 " col_index_buffer[get_local_id(0)] = tmp;\n" 00682 " vector_buffer[get_local_id(0)] = vector[tmp];\n" 00683 " }\n" 00684 " \n" 00685 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00686 " \n" 00687 " //now a single thread does the remaining work in shared memory:\n" 00688 " if (get_local_id(0) == 0)\n" 00689 " {\n" 00690 " // traverse through all the loaded data from back to front:\n" 00691 " for (unsigned int k2=0; k2<get_local_size(0); ++k2)\n" 00692 " {\n" 00693 " unsigned int k = (get_local_size(0) - k2) - 1;\n" 00694 " \n" 00695 " if (i+k >= nnz)\n" 00696 " continue;\n" 00697 " \n" 00698 " if (col_index_buffer[k] > row_at_window_start) //use recently computed results\n" 00699 " current_vector_entry -= element_buffer[k] * vector_buffer[k];\n" 00700 " else if (col_index_buffer[k] > current_row) //use buffered data\n" 00701 " current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];\n" 00702 " else if (col_index_buffer[k] == current_row)\n" 00703 " diagonal_entry = element_buffer[k];\n" 00704 " \n" 00705 " if (i+k == next_row) //current row is finished. Write back result\n" 00706 " {\n" 00707 " vector[current_row] = current_vector_entry / diagonal_entry;\n" 00708 " if (current_row > 0) //load next row's data\n" 00709 " {\n" 00710 " --current_row;\n" 00711 " next_row = row_indices[current_row];\n" 00712 " current_vector_entry = vector[current_row];\n" 00713 " }\n" 00714 " }\n" 00715 " \n" 00716 " \n" 00717 " } // for k\n" 00718 " \n" 00719 " row_at_window_start = current_row;\n" 00720 " } // if (get_local_id(0) == 0)\n" 00721 " \n" 00722 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00723 " \n" 00724 " if (i < get_local_size(0))\n" 00725 " break;\n" 00726 " \n" 00727 " i -= get_local_size(0);\n" 00728 " } //for i\n" 00729 "}\n" 00730 ; //compressed_matrix_align1_lu_backward 00731 00732 const char * const compressed_matrix_align1_vec_mul = 00733 "__kernel void vec_mul(\n" 00734 " __global const unsigned int * row_indices,\n" 00735 " __global const unsigned int * column_indices, \n" 00736 " __global const float * elements,\n" 00737 " __global const float * vector, \n" 00738 " __global float * result,\n" 00739 " unsigned int size) \n" 00740 "{ \n" 00741 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00742 " {\n" 00743 " float dot_prod = 0.0f;\n" 00744 " unsigned int row_end = row_indices[row+1];\n" 00745 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n" 00746 " dot_prod += elements[i] * vector[column_indices[i]];\n" 00747 " result[row] = dot_prod;\n" 00748 " }\n" 00749 "}\n" 00750 ; //compressed_matrix_align1_vec_mul 00751 00752 const char * const compressed_matrix_align1_lu_forward = 00753 " \n" 00754 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n" 00755 "__kernel void lu_forward(\n" 00756 " __global const unsigned int * row_indices,\n" 00757 " __global const unsigned int * column_indices, \n" 00758 " __global const float * elements,\n" 00759 " __global float * vector,\n" 00760 " unsigned int size) \n" 00761 "{\n" 00762 " __local unsigned int col_index_buffer[128];\n" 00763 " __local float element_buffer[128];\n" 00764 " __local float vector_buffer[128];\n" 00765 " \n" 00766 " unsigned int nnz = row_indices[size];\n" 00767 " unsigned int current_row = 0;\n" 00768 " unsigned int row_at_window_start = 0;\n" 00769 " float current_vector_entry = vector[0];\n" 00770 " float diagonal_entry;\n" 00771 " unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0);\n" 00772 " unsigned int next_row = row_indices[1];\n" 00773 " \n" 00774 " for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0))\n" 00775 " {\n" 00776 " //load into shared memory (coalesced access):\n" 00777 " if (i < nnz)\n" 00778 " {\n" 00779 " element_buffer[get_local_id(0)] = elements[i];\n" 00780 " unsigned int tmp = column_indices[i];\n" 00781 " col_index_buffer[get_local_id(0)] = tmp;\n" 00782 " vector_buffer[get_local_id(0)] = vector[tmp];\n" 00783 " }\n" 00784 " \n" 00785 " barrier(CLK_LOCAL_MEM_FENCE);\n" 00786 " \n" 00787 " //now a single thread does the remaining work in shared memory:\n" 00788 " if (get_local_id(0) == 0)\n" 00789 " {\n" 00790 " // traverse through all the loaded data:\n" 00791 " for (unsigned int k=0; k<get_local_size(0); ++k)\n" 00792 " {\n" 00793 " if (current_row < size && i+k == next_row) //current row is finished. Write back result\n" 00794 " {\n" 00795 " vector[current_row] = current_vector_entry / diagonal_entry;\n" 00796 " ++current_row;\n" 00797 " if (current_row < size) //load next row's data\n" 00798 " {\n" 00799 " next_row = row_indices[current_row+1];\n" 00800 " current_vector_entry = vector[current_row];\n" 00801 " }\n" 00802 " }\n" 00803 " \n" 00804 " if (current_row < size && col_index_buffer[k] < current_row) //substitute\n" 00805 " {\n" 00806 " if (col_index_buffer[k] < row_at_window_start) //use recently computed results\n" 00807 " current_vector_entry -= element_buffer[k] * vector_buffer[k];\n" 00808 " else if (col_index_buffer[k] < current_row) //use buffered data\n" 00809 " current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];\n" 00810 " }\n" 00811 " else if (col_index_buffer[k] == current_row)\n" 00812 " diagonal_entry = element_buffer[k];\n" 00813 " } // for k\n" 00814 " \n" 00815 " row_at_window_start = current_row;\n" 00816 " } // if (get_local_id(0) == 0)\n" 00817 " \n" 00818 " barrier(CLK_GLOBAL_MEM_FENCE);\n" 00819 " } //for i\n" 00820 "}\n" 00821 ; //compressed_matrix_align1_lu_forward 00822 00823 const char * const compressed_matrix_align4_vec_mul = 00824 "__kernel void vec_mul(\n" 00825 " __global const unsigned int * row_indices,\n" 00826 " __global const uint4 * column_indices, \n" 00827 " __global const float4 * elements,\n" 00828 " __global const float * vector, \n" 00829 " __global float * result,\n" 00830 " unsigned int size)\n" 00831 "{ \n" 00832 " float dot_prod;\n" 00833 " unsigned int start, next_stop;\n" 00834 " uint4 col_idx;\n" 00835 " float4 tmp_vec;\n" 00836 " float4 tmp_entries;\n" 00837 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n" 00838 " {\n" 00839 " dot_prod = 0.0f;\n" 00840 " start = row_indices[row] / 4;\n" 00841 " next_stop = row_indices[row+1] / 4;\n" 00842 " for (unsigned int i = start; i < next_stop; ++i)\n" 00843 " {\n" 00844 " col_idx = column_indices[i];\n" 00845 " tmp_entries = elements[i];\n" 00846 " tmp_vec.x = vector[col_idx.x];\n" 00847 " tmp_vec.y = vector[col_idx.y];\n" 00848 " tmp_vec.z = vector[col_idx.z];\n" 00849 " tmp_vec.w = vector[col_idx.w];\n" 00850 " dot_prod += dot(tmp_entries, tmp_vec);\n" 00851 " }\n" 00852 " result[row] = dot_prod;\n" 00853 " }\n" 00854 "}\n" 00855 ; //compressed_matrix_align4_vec_mul 00856 00857 } //namespace kernels 00858 } //namespace linalg 00859 } //namespace viennacl 00860 #endif 00861
1.7.6.1