ViennaCL - The Vienna Computing Library  1.4.2
viennacl/linalg/kernels/compressed_matrix_source.h
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
00002 #define VIENNACL_LINALG_KERNELS_COMPRESSED_MATRIX_SOURCE_HPP_
00003 //Automatically generated file from auxiliary-directory, do not edit manually!
00006 namespace viennacl
00007 {
00008  namespace linalg
00009  {
00010   namespace kernels
00011   {
00012 const char * const compressed_matrix_align8_vec_mul = 
00013 "__kernel void vec_mul(\n"
00014 "          __global const unsigned int * row_indices,\n"
00015 "          __global const uint8 * column_indices, \n"
00016 "          __global const float8 * elements,\n"
00017 "          __global const float * vector,  \n"
00018 "          __global float * result,\n"
00019 "          unsigned int size)\n"
00020 "{ \n"
00021 "  float dot_prod;\n"
00022 "  unsigned int start, next_stop;\n"
00023 "  uint8 col_idx;\n"
00024 "  float8 tmp_vec;\n"
00025 "  float8 tmp_entries;\n"
00026 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00027 "  {\n"
00028 "    dot_prod = 0.0f;\n"
00029 "    start = row_indices[row] / 8;\n"
00030 "    next_stop = row_indices[row+1] / 8;\n"
00031 "    for (unsigned int i = start; i < next_stop; ++i)\n"
00032 "    {\n"
00033 "      col_idx = column_indices[i];\n"
00034 "      tmp_entries = elements[i];\n"
00035 "      tmp_vec.s0 = vector[col_idx.s0];\n"
00036 "      tmp_vec.s1 = vector[col_idx.s1];\n"
00037 "      tmp_vec.s2 = vector[col_idx.s2];\n"
00038 "      tmp_vec.s3 = vector[col_idx.s3];\n"
00039 "      tmp_vec.s4 = vector[col_idx.s4];\n"
00040 "      tmp_vec.s5 = vector[col_idx.s5];\n"
00041 "      tmp_vec.s6 = vector[col_idx.s6];\n"
00042 "      tmp_vec.s7 = vector[col_idx.s7];\n"
00043 "      dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n"
00044 "      dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n"
00045 "    }\n"
00046 "    result[row] = dot_prod;\n"
00047 "  }\n"
00048 "}\n"
00049 ; //compressed_matrix_align8_vec_mul
00050 
00051 const char * const compressed_matrix_align1_trans_unit_lu_forward = 
00052 " \n"
00053 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
00054 "__kernel void trans_unit_lu_forward(\n"
00055 "          __global const unsigned int * row_indices,\n"
00056 "          __global const unsigned int * column_indices, \n"
00057 "          __global const float * elements,\n"
00058 "          __global float * vector,\n"
00059 "          unsigned int size) \n"
00060 "{\n"
00061 "  __local unsigned int row_index_lookahead[256];\n"
00062 "  __local unsigned int row_index_buffer[256];\n"
00063 "  \n"
00064 "  unsigned int row_index;\n"
00065 "  unsigned int col_index;\n"
00066 "  float matrix_entry;\n"
00067 "  unsigned int nnz = row_indices[size];\n"
00068 "  unsigned int row_at_window_start = 0;\n"
00069 "  unsigned int row_at_window_end = 0;\n"
00070 "  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
00071 "  \n"
00072 "  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0))\n"
00073 "  {\n"
00074 "    col_index    = (i < nnz) ? column_indices[i] : 0;\n"
00075 "    matrix_entry = (i < nnz) ? elements[i]       : 0;\n"
00076 "    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1;\n"
00077 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00078 "    \n"
00079 "    if (i < nnz)\n"
00080 "    {\n"
00081 "      unsigned int row_index_inc = 0;\n"
00082 "      while (i >= row_index_lookahead[row_index_inc + 1])\n"
00083 "        ++row_index_inc;\n"
00084 "      row_index = row_at_window_start + row_index_inc;\n"
00085 "      row_index_buffer[get_local_id(0)] = row_index;\n"
00086 "    }\n"
00087 "    else\n"
00088 "    {\n"
00089 "      row_index = size+1;\n"
00090 "      row_index_buffer[get_local_id(0)] = size - 1;\n"
00091 "    }\n"
00092 "    \n"
00093 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00094 "    \n"
00095 "    row_at_window_start = row_index_buffer[0];\n"
00096 "    row_at_window_end   = row_index_buffer[get_local_size(0) - 1];\n"
00097 "    \n"
00098 "    //forward elimination\n"
00099 "    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n"
00100 "    { \n"
00101 "      float result_entry = vector[row];\n"
00102 "      \n"
00103 "      if ( (row_index == row) && (col_index > row) )\n"
00104 "        vector[col_index] -= result_entry * matrix_entry; \n"
00105 "      barrier(CLK_GLOBAL_MEM_FENCE);\n"
00106 "    }\n"
00107 "    \n"
00108 "    row_at_window_start = row_at_window_end;\n"
00109 "  }\n"
00110 "}\n"
00111 ; //compressed_matrix_align1_trans_unit_lu_forward
00112 
00113 const char * const compressed_matrix_align1_unit_lu_forward = 
00114 " \n"
00115 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
00116 "__kernel void unit_lu_forward(\n"
00117 "          __global const unsigned int * row_indices,\n"
00118 "          __global const unsigned int * column_indices, \n"
00119 "          __global const float * elements,\n"
00120 "          __global float * vector,\n"
00121 "          unsigned int size) \n"
00122 "{\n"
00123 "  __local  unsigned int col_index_buffer[128];\n"
00124 "  __local  float element_buffer[128];\n"
00125 "  __local  float vector_buffer[128];\n"
00126 "  unsigned int nnz = row_indices[size];\n"
00127 "  unsigned int current_row = 0;\n"
00128 "  unsigned int row_at_window_start = 0;\n"
00129 "  float current_vector_entry = vector[0];\n"
00130 "  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0);\n"
00131 "  unsigned int next_row = row_indices[1];\n"
00132 "  \n"
00133 "  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0))\n"
00134 "  {\n"
00135 "    //load into shared memory (coalesced access):\n"
00136 "    if (i < nnz)\n"
00137 "    {\n"
00138 "      element_buffer[get_local_id(0)] = elements[i];\n"
00139 "      unsigned int tmp = column_indices[i];\n"
00140 "      col_index_buffer[get_local_id(0)] = tmp;\n"
00141 "      vector_buffer[get_local_id(0)] = vector[tmp];\n"
00142 "    }\n"
00143 "    \n"
00144 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00145 "    \n"
00146 "    //now a single thread does the remaining work in shared memory:\n"
00147 "    if (get_local_id(0) == 0)\n"
00148 "    {\n"
00149 "      // traverse through all the loaded data:\n"
00150 "      for (unsigned int k=0; k<get_local_size(0); ++k)\n"
00151 "      {\n"
00152 "        if (i+k == next_row) //current row is finished. Write back result\n"
00153 "        {\n"
00154 "          vector[current_row] = current_vector_entry;\n"
00155 "          ++current_row;\n"
00156 "          if (current_row < size) //load next row's data\n"
00157 "          {\n"
00158 "            next_row = row_indices[current_row+1];\n"
00159 "            current_vector_entry = vector[current_row];\n"
00160 "          }\n"
00161 "        }\n"
00162 "        \n"
00163 "        if (current_row < size && col_index_buffer[k] < current_row) //substitute\n"
00164 "        {\n"
00165 "          if (col_index_buffer[k] < row_at_window_start) //use recently computed results\n"
00166 "            current_vector_entry -= element_buffer[k] * vector_buffer[k];\n"
00167 "          else if (col_index_buffer[k] < current_row) //use buffered data\n"
00168 "            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];\n"
00169 "        }\n"
00170 "      } // for k\n"
00171 "      \n"
00172 "      row_at_window_start = current_row;\n"
00173 "    } // if (get_local_id(0) == 0)\n"
00174 "    \n"
00175 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00176 "  } //for i\n"
00177 "}\n"
00178 ; //compressed_matrix_align1_unit_lu_forward
00179 
00180 const char * const compressed_matrix_align1_trans_lu_backward = 
00181 " \n"
00182 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
00183 "__kernel void trans_lu_backward(\n"
00184 "          __global const unsigned int * row_indices,\n"
00185 "          __global const unsigned int * column_indices, \n"
00186 "          __global const float * elements,\n"
00187 "          __global const float * diagonal_entries,\n"
00188 "          __global float * vector,\n"
00189 "          unsigned int size) \n"
00190 "{\n"
00191 "  __local unsigned int row_index_lookahead[256];\n"
00192 "  __local unsigned int row_index_buffer[256];\n"
00193 "  \n"
00194 "  unsigned int row_index;\n"
00195 "  unsigned int col_index;\n"
00196 "  float matrix_entry;\n"
00197 "  unsigned int nnz = row_indices[size];\n"
00198 "  unsigned int row_at_window_start = size;\n"
00199 "  unsigned int row_at_window_end;\n"
00200 "  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
00201 "  \n"
00202 "  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0))\n"
00203 "  {\n"
00204 "    unsigned int i = (nnz - i2) - 1;\n"
00205 "    col_index    = (i2 < nnz) ? column_indices[i] : 0;\n"
00206 "    matrix_entry = (i2 < nnz) ? elements[i]       : 0;\n"
00207 "    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0;\n"
00208 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00209 "    \n"
00210 "    if (i2 < nnz)\n"
00211 "    {\n"
00212 "      unsigned int row_index_dec = 0;\n"
00213 "      while (row_index_lookahead[row_index_dec] > i)\n"
00214 "        ++row_index_dec;\n"
00215 "      row_index = row_at_window_start - row_index_dec;\n"
00216 "      row_index_buffer[get_local_id(0)] = row_index;\n"
00217 "    }\n"
00218 "    else\n"
00219 "    {\n"
00220 "      row_index = size+1;\n"
00221 "      row_index_buffer[get_local_id(0)] = 0;\n"
00222 "    }\n"
00223 "    \n"
00224 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00225 "    \n"
00226 "    row_at_window_start = row_index_buffer[0];\n"
00227 "    row_at_window_end   = row_index_buffer[get_local_size(0) - 1];\n"
00228 "    \n"
00229 "    //backward elimination\n"
00230 "    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n"
00231 "    { \n"
00232 "      unsigned int row = row_at_window_start - row2;\n"
00233 "      float result_entry = vector[row] / diagonal_entries[row];\n"
00234 "      \n"
00235 "      if ( (row_index == row) && (col_index < row) )\n"
00236 "        vector[col_index] -= result_entry * matrix_entry; \n"
00237 "      barrier(CLK_GLOBAL_MEM_FENCE);\n"
00238 "    }\n"
00239 "    \n"
00240 "    row_at_window_start = row_at_window_end;\n"
00241 "  }\n"
00242 "  \n"
00243 "  // final step: Divide vector by diagonal entries:\n"
00244 "  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
00245 "    vector[i] /= diagonal_entries[i];\n"
00246 "}\n"
00247 ; //compressed_matrix_align1_trans_lu_backward
00248 
00249 const char * const compressed_matrix_align1_trans_unit_lu_backward = 
00250 " \n"
00251 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
00252 "__kernel void trans_unit_lu_backward(\n"
00253 "          __global const unsigned int * row_indices,\n"
00254 "          __global const unsigned int * column_indices, \n"
00255 "          __global const float * elements,\n"
00256 "          __global float * vector,\n"
00257 "          unsigned int size) \n"
00258 "{\n"
00259 "  __local unsigned int row_index_lookahead[256];\n"
00260 "  __local unsigned int row_index_buffer[256];\n"
00261 "  \n"
00262 "  unsigned int row_index;\n"
00263 "  unsigned int col_index;\n"
00264 "  float matrix_entry;\n"
00265 "  unsigned int nnz = row_indices[size];\n"
00266 "  unsigned int row_at_window_start = size;\n"
00267 "  unsigned int row_at_window_end;\n"
00268 "  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
00269 "  \n"
00270 "  for (unsigned int i2 = get_local_id(0); i2 < loop_end; i2 += get_local_size(0))\n"
00271 "  {\n"
00272 "    unsigned int i = (nnz - i2) - 1;\n"
00273 "    col_index    = (i2 < nnz) ? column_indices[i] : 0;\n"
00274 "    matrix_entry = (i2 < nnz) ? elements[i]       : 0;\n"
00275 "    row_index_lookahead[get_local_id(0)] = (row_at_window_start >= get_local_id(0)) ? row_indices[row_at_window_start - get_local_id(0)] : 0;\n"
00276 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00277 "    \n"
00278 "    if (i2 < nnz)\n"
00279 "    {\n"
00280 "      unsigned int row_index_dec = 0;\n"
00281 "      while (row_index_lookahead[row_index_dec] > i)\n"
00282 "        ++row_index_dec;\n"
00283 "      row_index = row_at_window_start - row_index_dec;\n"
00284 "      row_index_buffer[get_local_id(0)] = row_index;\n"
00285 "    }\n"
00286 "    else\n"
00287 "    {\n"
00288 "      row_index = size+1;\n"
00289 "      row_index_buffer[get_local_id(0)] = 0;\n"
00290 "    }\n"
00291 "    \n"
00292 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00293 "    \n"
00294 "    row_at_window_start = row_index_buffer[0];\n"
00295 "    row_at_window_end   = row_index_buffer[get_local_size(0) - 1];\n"
00296 "    \n"
00297 "    //backward elimination\n"
00298 "    for (unsigned int row2 = 0; row2 <= (row_at_window_start - row_at_window_end); ++row2) \n"
00299 "    { \n"
00300 "      unsigned int row = row_at_window_start - row2;\n"
00301 "      float result_entry = vector[row];\n"
00302 "      \n"
00303 "      if ( (row_index == row) && (col_index < row) )\n"
00304 "        vector[col_index] -= result_entry * matrix_entry; \n"
00305 "      barrier(CLK_GLOBAL_MEM_FENCE);\n"
00306 "    }\n"
00307 "    \n"
00308 "    row_at_window_start = row_at_window_end;\n"
00309 "  }\n"
00310 "}\n"
00311 ; //compressed_matrix_align1_trans_unit_lu_backward
00312 
00313 const char * const compressed_matrix_align1_trans_unit_lu_forward_slow = 
00314 " \n"
00315 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
00316 "__kernel void trans_unit_lu_forward_slow(\n"
00317 "          __global const unsigned int * row_indices,\n"
00318 "          __global const unsigned int * column_indices, \n"
00319 "          __global const float * elements,\n"
00320 "          __global float * vector,\n"
00321 "          unsigned int size) \n"
00322 "{\n"
00323 "  for (unsigned int row = 0; row < size; ++row) \n"
00324 "  { \n"
00325 "    float result_entry = vector[row]; \n"
00326 "    \n"
00327 "    unsigned int row_start = row_indices[row]; \n"
00328 "    unsigned int row_stop  = row_indices[row + 1];\n"
00329 "    for (unsigned int entry_index = row_start + get_local_id(0); entry_index < row_stop; entry_index += get_local_size(0)) \n"
00330 "    {\n"
00331 "      unsigned int col_index = column_indices[entry_index];\n"
00332 "      if (col_index > row)\n"
00333 "        vector[col_index] -= result_entry * elements[entry_index]; \n"
00334 "    }\n"
00335 "    \n"
00336 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00337 "  } \n"
00338 "}\n"
00339 ; //compressed_matrix_align1_trans_unit_lu_forward_slow
00340 
00341 const char * const compressed_matrix_align1_row_info_extractor = 
00342 "__kernel void row_info_extractor(\n"
00343 "          __global const unsigned int * row_indices,\n"
00344 "          __global const unsigned int * column_indices, \n"
00345 "          __global const float * elements,\n"
00346 "          __global float * result,\n"
00347 "          unsigned int size,\n"
00348 "          unsigned int option\n"
00349 "          ) \n"
00350 "{ \n"
00351 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00352 "  {\n"
00353 "    float value = 0;\n"
00354 "    unsigned int row_end = row_indices[row+1];\n"
00355 "    \n"
00356 "    switch (option)\n"
00357 "    {\n"
00358 "      case 0: //inf-norm\n"
00359 "        for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00360 "          value = max(value, fabs(elements[i]));\n"
00361 "        break;\n"
00362 "        \n"
00363 "      case 1: //1-norm\n"
00364 "        for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00365 "          value += fabs(elements[i]);\n"
00366 "        break;\n"
00367 "        \n"
00368 "      case 2: //2-norm\n"
00369 "        for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00370 "          value += elements[i] * elements[i];\n"
00371 "        value = sqrt(value);\n"
00372 "        break;\n"
00373 "        \n"
00374 "      case 3: //diagonal entry\n"
00375 "        for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00376 "        {\n"
00377 "          if (column_indices[i] == row)\n"
00378 "          {\n"
00379 "            value = elements[i];\n"
00380 "            break;\n"
00381 "          }\n"
00382 "        }\n"
00383 "        break;\n"
00384 "        \n"
00385 "      default:\n"
00386 "        break;\n"
00387 "    }\n"
00388 "    result[row] = value;\n"
00389 "  }\n"
00390 "}\n"
00391 ; //compressed_matrix_align1_row_info_extractor
00392 
00393 const char * const compressed_matrix_align1_vec_mul_cpu = 
00394 "__kernel void vec_mul_cpu(\n"
00395 "          __global const unsigned int * row_indices,\n"
00396 "          __global const unsigned int * column_indices, \n"
00397 "          __global const float * elements,\n"
00398 "          __global const float * vector,  \n"
00399 "          __global float * result,\n"
00400 "          unsigned int size) \n"
00401 "{ \n"
00402 "  unsigned int work_per_item = max((uint) (size / get_global_size(0)), (uint) 1);\n"
00403 "  unsigned int row_start = get_global_id(0) * work_per_item;\n"
00404 "  unsigned int row_stop  = min( (uint) ((get_global_id(0) + 1) * work_per_item), (uint) size);\n"
00405 "  for (unsigned int row = row_start; row < row_stop; ++row)\n"
00406 "  {\n"
00407 "    float dot_prod = 0.0f;\n"
00408 "    unsigned int row_end = row_indices[row+1];\n"
00409 "    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00410 "      dot_prod += elements[i] * vector[column_indices[i]];\n"
00411 "    result[row] = dot_prod;\n"
00412 "  }\n"
00413 "}\n"
00414 ; //compressed_matrix_align1_vec_mul_cpu
00415 
00416 const char * const compressed_matrix_align1_jacobi = 
00417 "__kernel void jacobi(\n"
00418 " __global const unsigned int * row_indices,\n"
00419 " __global const unsigned int * column_indices,\n"
00420 " __global const float * elements,\n"
00421 " float weight,\n"
00422 " __global const float * old_result,\n"
00423 " __global float * new_result,\n"
00424 " __global const float * rhs,\n"
00425 " unsigned int size)\n"
00426 " {\n"
00427 "  float sum, diag=1;\n"
00428 "  int col;\n"
00429 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00430 "  {\n"
00431 "    sum = 0;\n"
00432 "    for (unsigned int j = row_indices[i]; j<row_indices[i+1]; j++)\n"
00433 "    {\n"
00434 "      col = column_indices[j];\n"
00435 "      if (i == col)\n"
00436 "       diag = elements[j];\n"
00437 "      else \n"
00438 "       sum += elements[j] * old_result[col]; \n"
00439 "    } \n"
00440 "      new_result[i] = weight * (rhs[i]-sum) / diag + (1-weight) * old_result[i]; \n"
00441 "   } \n"
00442 " } \n"
00443 ; //compressed_matrix_align1_jacobi
00444 
00445 const char * const compressed_matrix_align1_unit_lu_backward = 
00446 "// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n"
00447 "__kernel void unit_lu_backward(\n"
00448 "          __global const unsigned int * row_indices,\n"
00449 "          __global const unsigned int * column_indices, \n"
00450 "          __global const float * elements,\n"
00451 "          __global float * vector,\n"
00452 "          unsigned int size) \n"
00453 "{\n"
00454 "  __local  unsigned int col_index_buffer[128];\n"
00455 "  __local  float element_buffer[128];\n"
00456 "  __local  float vector_buffer[128];\n"
00457 "  \n"
00458 "  unsigned int nnz = row_indices[size];\n"
00459 "  unsigned int current_row = size-1;\n"
00460 "  unsigned int row_at_window_start = size-1;\n"
00461 "  float current_vector_entry = vector[size-1];\n"
00462 "  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0);\n"
00463 "  unsigned int next_row = row_indices[size-1];\n"
00464 "  \n"
00465 "  unsigned int i = loop_end + get_local_id(0);\n"
00466 "  while (1)\n"
00467 "  {\n"
00468 "    //load into shared memory (coalesced access):\n"
00469 "    if (i < nnz)\n"
00470 "    {\n"
00471 "      element_buffer[get_local_id(0)] = elements[i];\n"
00472 "      unsigned int tmp = column_indices[i];\n"
00473 "      col_index_buffer[get_local_id(0)] = tmp;\n"
00474 "      vector_buffer[get_local_id(0)] = vector[tmp];\n"
00475 "    }\n"
00476 "    \n"
00477 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00478 "    \n"
00479 "    //now a single thread does the remaining work in shared memory:\n"
00480 "    if (get_local_id(0) == 0)\n"
00481 "    {\n"
00482 "      // traverse through all the loaded data from back to front:\n"
00483 "      for (unsigned int k2=0; k2<get_local_size(0); ++k2)\n"
00484 "      {\n"
00485 "        unsigned int k = (get_local_size(0) - k2) - 1;\n"
00486 "        \n"
00487 "        if (i+k >= nnz)\n"
00488 "          continue;\n"
00489 "        \n"
00490 "        if (col_index_buffer[k] > row_at_window_start) //use recently computed results\n"
00491 "          current_vector_entry -= element_buffer[k] * vector_buffer[k];\n"
00492 "        else if (col_index_buffer[k] > current_row) //use buffered data\n"
00493 "          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];\n"
00494 "        \n"
00495 "        if (i+k == next_row) //current row is finished. Write back result\n"
00496 "        {\n"
00497 "          vector[current_row] = current_vector_entry;\n"
00498 "          if (current_row > 0) //load next row's data\n"
00499 "          {\n"
00500 "            --current_row;\n"
00501 "            next_row = row_indices[current_row];\n"
00502 "            current_vector_entry = vector[current_row];\n"
00503 "          }\n"
00504 "        }\n"
00505 "        \n"
00506 "        \n"
00507 "      } // for k\n"
00508 "      \n"
00509 "      row_at_window_start = current_row;\n"
00510 "    } // if (get_local_id(0) == 0)\n"
00511 "    \n"
00512 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00513 "    \n"
00514 "    if (i < get_local_size(0))\n"
00515 "      break;\n"
00516 "    \n"
00517 "    i -= get_local_size(0);\n"
00518 "  } //for i\n"
00519 "}\n"
00520 ; //compressed_matrix_align1_unit_lu_backward
00521 
00522 const char * const compressed_matrix_align1_block_trans_lu_backward = 
00523 " __kernel void block_trans_lu_backward(\n"
00524 "           __global const unsigned int * row_jumper_U,      //U part (note that U is transposed in memory)\n"
00525 "           __global const unsigned int * column_indices_U,\n"
00526 "           __global const float * elements_U,\n"
00527 "           __global const float * diagonal_U,\n"
00528 "           __global const unsigned int * block_offsets,\n"
00529 "           __global float * result,\n"
00530 "           unsigned int size)\n"
00531 " {\n"
00532 "   unsigned int col_start = block_offsets[2*get_group_id(0)];\n"
00533 "   unsigned int col_stop  = block_offsets[2*get_group_id(0)+1];\n"
00534 "   unsigned int row_start;\n"
00535 "   unsigned int row_stop;\n"
00536 "   float result_entry = 0;\n"
00537 "   if (col_start >= col_stop)\n"
00538 "     return;\n"
00539 "   //backward elimination, using U and diagonal_U\n"
00540 "   for (unsigned int iter = 0; iter < col_stop - col_start; ++iter) \n"
00541 "   { \n"
00542 "     unsigned int col = (col_stop - iter) - 1;\n"
00543 "     result_entry = result[col] / diagonal_U[col];\n"
00544 "     row_start = row_jumper_U[col]; \n"
00545 "     row_stop  = row_jumper_U[col + 1]; \n"
00546 "     for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0)) \n"
00547 "       result[column_indices_U[buffer_index]] -= result_entry * elements_U[buffer_index];\n"
00548 "     barrier(CLK_GLOBAL_MEM_FENCE); \n"
00549 "   } \n"
00550 "   //divide result vector by diagonal:\n"
00551 "   for (unsigned int col = col_start + get_local_id(0); col < col_stop; col += get_local_size(0)) \n"
00552 "     result[col] /= diagonal_U[col];\n"
00553 " };\n"
00554 ; //compressed_matrix_align1_block_trans_lu_backward
00555 
00556 const char * const compressed_matrix_align1_block_trans_unit_lu_forward = 
00557 " __kernel void block_trans_unit_lu_forward(\n"
00558 "           __global const unsigned int * row_jumper_L,      //L part (note that L is transposed in memory)\n"
00559 "           __global const unsigned int * column_indices_L, \n"
00560 "           __global const float * elements_L,\n"
00561 "           __global const unsigned int * block_offsets,\n"
00562 "           __global float * result,\n"
00563 "           unsigned int size)\n"
00564 " {\n"
00565 "   unsigned int col_start = block_offsets[2*get_group_id(0)];\n"
00566 "   unsigned int col_stop  = block_offsets[2*get_group_id(0)+1];\n"
00567 "   unsigned int row_start = row_jumper_L[col_start];\n"
00568 "   unsigned int row_stop;\n"
00569 "   float result_entry = 0;\n"
00570 "   if (col_start >= col_stop)\n"
00571 "     return;\n"
00572 "   //forward elimination, using L:\n"
00573 "   for (unsigned int col = col_start; col < col_stop; ++col)\n"
00574 "   {\n"
00575 "     result_entry = result[col];\n"
00576 "     row_stop = row_jumper_L[col + 1];\n"
00577 "     for (unsigned int buffer_index = row_start + get_local_id(0); buffer_index < row_stop; buffer_index += get_local_size(0))\n"
00578 "       result[column_indices_L[buffer_index]] -= result_entry * elements_L[buffer_index]; \n"
00579 "     row_start = row_stop; //for next iteration (avoid unnecessary loads from GPU RAM)\n"
00580 "     barrier(CLK_GLOBAL_MEM_FENCE);\n"
00581 "   } \n"
00582 " };\n"
00583 ; //compressed_matrix_align1_block_trans_unit_lu_forward
00584 
00585 const char * const compressed_matrix_align1_trans_lu_forward = 
00586 " \n"
00587 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
00588 "__kernel void trans_lu_forward(\n"
00589 "          __global const unsigned int * row_indices,\n"
00590 "          __global const unsigned int * column_indices, \n"
00591 "          __global const float * elements,\n"
00592 "          __global const float * diagonal_entries,\n"
00593 "          __global float * vector,\n"
00594 "          unsigned int size) \n"
00595 "{\n"
00596 "  __local unsigned int row_index_lookahead[256];\n"
00597 "  __local unsigned int row_index_buffer[256];\n"
00598 "  \n"
00599 "  unsigned int row_index;\n"
00600 "  unsigned int col_index;\n"
00601 "  float matrix_entry;\n"
00602 "  unsigned int nnz = row_indices[size];\n"
00603 "  unsigned int row_at_window_start = 0;\n"
00604 "  unsigned int row_at_window_end = 0;\n"
00605 "  unsigned int loop_end = ( (nnz - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
00606 "  \n"
00607 "  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0))\n"
00608 "  {\n"
00609 "    col_index    = (i < nnz) ? column_indices[i] : 0;\n"
00610 "    matrix_entry = (i < nnz) ? elements[i]       : 0;\n"
00611 "    row_index_lookahead[get_local_id(0)] = (row_at_window_start + get_local_id(0) < size) ? row_indices[row_at_window_start + get_local_id(0)] : size - 1;\n"
00612 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00613 "    \n"
00614 "    if (i < nnz)\n"
00615 "    {\n"
00616 "      unsigned int row_index_inc = 0;\n"
00617 "      while (i >= row_index_lookahead[row_index_inc + 1])\n"
00618 "        ++row_index_inc;\n"
00619 "      row_index = row_at_window_start + row_index_inc;\n"
00620 "      row_index_buffer[get_local_id(0)] = row_index;\n"
00621 "    }\n"
00622 "    else\n"
00623 "    {\n"
00624 "      row_index = size+1;\n"
00625 "      row_index_buffer[get_local_id(0)] = size - 1;\n"
00626 "    }\n"
00627 "    \n"
00628 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00629 "    \n"
00630 "    row_at_window_start = row_index_buffer[0];\n"
00631 "    row_at_window_end   = row_index_buffer[get_local_size(0) - 1];\n"
00632 "    \n"
00633 "    //forward elimination\n"
00634 "    for (unsigned int row = row_at_window_start; row <= row_at_window_end; ++row) \n"
00635 "    { \n"
00636 "      float result_entry = vector[row] / diagonal_entries[row];\n"
00637 "      \n"
00638 "      if ( (row_index == row) && (col_index > row) )\n"
00639 "        vector[col_index] -= result_entry * matrix_entry; \n"
00640 "      barrier(CLK_GLOBAL_MEM_FENCE);\n"
00641 "    }\n"
00642 "    \n"
00643 "    row_at_window_start = row_at_window_end;\n"
00644 "  }\n"
00645 "  \n"
00646 "  // final step: Divide vector by diagonal entries:\n"
00647 "  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
00648 "    vector[i] /= diagonal_entries[i];\n"
00649 "}\n"
00650 ; //compressed_matrix_align1_trans_lu_forward
00651 
00652 const char * const compressed_matrix_align1_lu_backward = 
00653 "// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n"
00654 "__kernel void lu_backward(\n"
00655 "          __global const unsigned int * row_indices,\n"
00656 "          __global const unsigned int * column_indices, \n"
00657 "          __global const float * elements,\n"
00658 "          __global float * vector,\n"
00659 "          unsigned int size) \n"
00660 "{\n"
00661 "  __local unsigned int col_index_buffer[128];\n"
00662 "  __local float element_buffer[128];\n"
00663 "  __local float vector_buffer[128];\n"
00664 "          \n"
00665 "  unsigned int nnz = row_indices[size];\n"
00666 "  unsigned int current_row = size-1;\n"
00667 "  unsigned int row_at_window_start = size-1;\n"
00668 "  float current_vector_entry = vector[size-1];\n"
00669 "  float diagonal_entry = 0;\n"
00670 "  unsigned int loop_end = ( (nnz - 1) / get_local_size(0)) * get_local_size(0);\n"
00671 "  unsigned int next_row = row_indices[size-1];\n"
00672 "  \n"
00673 "  unsigned int i = loop_end + get_local_id(0);\n"
00674 "  while (1)\n"
00675 "  {\n"
00676 "    //load into shared memory (coalesced access):\n"
00677 "    if (i < nnz)\n"
00678 "    {\n"
00679 "      element_buffer[get_local_id(0)] = elements[i];\n"
00680 "      unsigned int tmp = column_indices[i];\n"
00681 "      col_index_buffer[get_local_id(0)] = tmp;\n"
00682 "      vector_buffer[get_local_id(0)] = vector[tmp];\n"
00683 "    }\n"
00684 "    \n"
00685 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00686 "    \n"
00687 "    //now a single thread does the remaining work in shared memory:\n"
00688 "    if (get_local_id(0) == 0)\n"
00689 "    {\n"
00690 "      // traverse through all the loaded data from back to front:\n"
00691 "      for (unsigned int k2=0; k2<get_local_size(0); ++k2)\n"
00692 "      {\n"
00693 "        unsigned int k = (get_local_size(0) - k2) - 1;\n"
00694 "        \n"
00695 "        if (i+k >= nnz)\n"
00696 "          continue;\n"
00697 "        \n"
00698 "        if (col_index_buffer[k] > row_at_window_start) //use recently computed results\n"
00699 "          current_vector_entry -= element_buffer[k] * vector_buffer[k];\n"
00700 "        else if (col_index_buffer[k] > current_row) //use buffered data\n"
00701 "          current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];\n"
00702 "        else if (col_index_buffer[k] == current_row)\n"
00703 "          diagonal_entry = element_buffer[k];\n"
00704 "        \n"
00705 "        if (i+k == next_row) //current row is finished. Write back result\n"
00706 "        {\n"
00707 "          vector[current_row] = current_vector_entry / diagonal_entry;\n"
00708 "          if (current_row > 0) //load next row's data\n"
00709 "          {\n"
00710 "            --current_row;\n"
00711 "            next_row = row_indices[current_row];\n"
00712 "            current_vector_entry = vector[current_row];\n"
00713 "          }\n"
00714 "        }\n"
00715 "        \n"
00716 "        \n"
00717 "      } // for k\n"
00718 "      \n"
00719 "      row_at_window_start = current_row;\n"
00720 "    } // if (get_local_id(0) == 0)\n"
00721 "    \n"
00722 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00723 "    \n"
00724 "    if (i < get_local_size(0))\n"
00725 "      break;\n"
00726 "    \n"
00727 "    i -= get_local_size(0);\n"
00728 "  } //for i\n"
00729 "}\n"
00730 ; //compressed_matrix_align1_lu_backward
00731 
00732 const char * const compressed_matrix_align1_vec_mul = 
00733 "__kernel void vec_mul(\n"
00734 "          __global const unsigned int * row_indices,\n"
00735 "          __global const unsigned int * column_indices, \n"
00736 "          __global const float * elements,\n"
00737 "          __global const float * vector,  \n"
00738 "          __global float * result,\n"
00739 "          unsigned int size) \n"
00740 "{ \n"
00741 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00742 "  {\n"
00743 "    float dot_prod = 0.0f;\n"
00744 "    unsigned int row_end = row_indices[row+1];\n"
00745 "    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00746 "      dot_prod += elements[i] * vector[column_indices[i]];\n"
00747 "    result[row] = dot_prod;\n"
00748 "  }\n"
00749 "}\n"
00750 ; //compressed_matrix_align1_vec_mul
00751 
00752 const char * const compressed_matrix_align1_lu_forward = 
00753 " \n"
00754 "// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
00755 "__kernel void lu_forward(\n"
00756 "          __global const unsigned int * row_indices,\n"
00757 "          __global const unsigned int * column_indices, \n"
00758 "          __global const float * elements,\n"
00759 "          __global float * vector,\n"
00760 "          unsigned int size) \n"
00761 "{\n"
00762 "  __local unsigned int col_index_buffer[128];\n"
00763 "  __local float element_buffer[128];\n"
00764 "  __local float vector_buffer[128];\n"
00765 "          \n"
00766 "  unsigned int nnz = row_indices[size];\n"
00767 "  unsigned int current_row = 0;\n"
00768 "  unsigned int row_at_window_start = 0;\n"
00769 "  float current_vector_entry = vector[0];\n"
00770 "  float diagonal_entry;\n"
00771 "  unsigned int loop_end = (nnz / get_local_size(0) + 1) * get_local_size(0);\n"
00772 "  unsigned int next_row = row_indices[1];\n"
00773 "  \n"
00774 "  for (unsigned int i = get_local_id(0); i < loop_end; i += get_local_size(0))\n"
00775 "  {\n"
00776 "    //load into shared memory (coalesced access):\n"
00777 "    if (i < nnz)\n"
00778 "    {\n"
00779 "      element_buffer[get_local_id(0)] = elements[i];\n"
00780 "      unsigned int tmp = column_indices[i];\n"
00781 "      col_index_buffer[get_local_id(0)] = tmp;\n"
00782 "      vector_buffer[get_local_id(0)] = vector[tmp];\n"
00783 "    }\n"
00784 "    \n"
00785 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00786 "    \n"
00787 "    //now a single thread does the remaining work in shared memory:\n"
00788 "    if (get_local_id(0) == 0)\n"
00789 "    {\n"
00790 "      // traverse through all the loaded data:\n"
00791 "      for (unsigned int k=0; k<get_local_size(0); ++k)\n"
00792 "      {\n"
00793 "        if (current_row < size && i+k == next_row) //current row is finished. Write back result\n"
00794 "        {\n"
00795 "          vector[current_row] = current_vector_entry / diagonal_entry;\n"
00796 "          ++current_row;\n"
00797 "          if (current_row < size) //load next row's data\n"
00798 "          {\n"
00799 "            next_row = row_indices[current_row+1];\n"
00800 "            current_vector_entry = vector[current_row];\n"
00801 "          }\n"
00802 "        }\n"
00803 "        \n"
00804 "        if (current_row < size && col_index_buffer[k] < current_row) //substitute\n"
00805 "        {\n"
00806 "          if (col_index_buffer[k] < row_at_window_start) //use recently computed results\n"
00807 "            current_vector_entry -= element_buffer[k] * vector_buffer[k];\n"
00808 "          else if (col_index_buffer[k] < current_row) //use buffered data\n"
00809 "            current_vector_entry -= element_buffer[k] * vector[col_index_buffer[k]];\n"
00810 "        }\n"
00811 "        else if (col_index_buffer[k] == current_row)\n"
00812 "          diagonal_entry = element_buffer[k];\n"
00813 "      } // for k\n"
00814 "      \n"
00815 "      row_at_window_start = current_row;\n"
00816 "    } // if (get_local_id(0) == 0)\n"
00817 "    \n"
00818 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00819 "  } //for i\n"
00820 "}\n"
00821 ; //compressed_matrix_align1_lu_forward
00822 
00823 const char * const compressed_matrix_align4_vec_mul = 
00824 "__kernel void vec_mul(\n"
00825 "          __global const unsigned int * row_indices,\n"
00826 "          __global const uint4 * column_indices, \n"
00827 "          __global const float4 * elements,\n"
00828 "          __global const float * vector,  \n"
00829 "          __global float * result,\n"
00830 "          unsigned int size)\n"
00831 "{ \n"
00832 "  float dot_prod;\n"
00833 "  unsigned int start, next_stop;\n"
00834 "  uint4 col_idx;\n"
00835 "  float4 tmp_vec;\n"
00836 "  float4 tmp_entries;\n"
00837 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00838 "  {\n"
00839 "    dot_prod = 0.0f;\n"
00840 "    start = row_indices[row] / 4;\n"
00841 "    next_stop = row_indices[row+1] / 4;\n"
00842 "    for (unsigned int i = start; i < next_stop; ++i)\n"
00843 "    {\n"
00844 "      col_idx = column_indices[i];\n"
00845 "      tmp_entries = elements[i];\n"
00846 "      tmp_vec.x = vector[col_idx.x];\n"
00847 "      tmp_vec.y = vector[col_idx.y];\n"
00848 "      tmp_vec.z = vector[col_idx.z];\n"
00849 "      tmp_vec.w = vector[col_idx.w];\n"
00850 "      dot_prod += dot(tmp_entries, tmp_vec);\n"
00851 "    }\n"
00852 "    result[row] = dot_prod;\n"
00853 "  }\n"
00854 "}\n"
00855 ; //compressed_matrix_align4_vec_mul
00856 
00857   }  //namespace kernels
00858  }  //namespace linalg
00859 }  //namespace viennacl
00860 #endif
00861