00001 #ifndef _VIENNACL_COMPRESSED_MATRIX_SOURCE_HPP_
00002 #define _VIENNACL_COMPRESSED_MATRIX_SOURCE_HPP_
00003
00004 namespace viennacl
00005 {
00006 namespace linalg
00007 {
00008 namespace kernels
00009 {
/**
 * OpenCL C source of the `vec_mul` kernel operating on 4-wide packets.
 *
 * Kernel arguments:
 *  - row_indices    : per-row offsets; row_indices[row] and row_indices[row+1]
 *                     are both read, so the array holds at least size+1 entries.
 *  - column_indices : column indices, read as uint4 packets.
 *  - elements       : matrix values, read as float4 packets.
 *  - vector         : dense input vector, gathered through the column indices.
 *  - result         : output; result[row] receives the accumulated sum.
 *  - size           : number of rows to process.
 *
 * Each work-item starts at row get_global_id(0) and advances by
 * get_global_size(0) until row >= size.  For every packet of a row the four
 * vector entries addressed by the column packet are gathered into a float4 and
 * combined with the value packet via dot().
 *
 * The packet range of a row is [row_indices[row] / 4, row_indices[row+1] / 4).
 * NOTE(review): the integer division presumably relies on every row offset
 * being a multiple of 4 (rows padded to full packets) -- confirm against the
 * host-side matrix layout.
 */
00010 const char * const compressed_matrix_align4_vec_mul =
00011 "__kernel void vec_mul(\n"
00012 " __global const unsigned int * row_indices,\n"
00013 " __global const uint4 * column_indices, \n"
00014 " __global const float4 * elements,\n"
00015 " __global const float * vector, \n"
00016 " __global float * result,\n"
00017 " unsigned int size)\n"
00018 "{ \n"
00019 " float dot_prod;\n"
00020 " unsigned int start, next_stop;\n"
00021 " uint4 col_idx;\n"
00022 " float4 tmp_vec;\n"
00023 " float4 tmp_entries;\n"
00024 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00025 " {\n"
00026 " dot_prod = 0.0f;\n"
00027 " start = row_indices[row] / 4;\n"
00028 " next_stop = row_indices[row+1] / 4;\n"
00029 " for (unsigned int i = start; i < next_stop; ++i)\n"
00030 " {\n"
00031 " col_idx = column_indices[i];\n"
00032 " tmp_entries = elements[i];\n"
00033 " tmp_vec.x = vector[col_idx.x];\n"
00034 " tmp_vec.y = vector[col_idx.y];\n"
00035 " tmp_vec.z = vector[col_idx.z];\n"
00036 " tmp_vec.w = vector[col_idx.w];\n"
00037 " dot_prod += dot(tmp_entries, tmp_vec);\n"
00038 " }\n"
00039 " result[row] = dot_prod;\n"
00040 " }\n"
00041 "}\n"
00042 ;
00043
/**
 * OpenCL C source of the scalar (one entry at a time) `vec_mul` kernel.
 *
 * Kernel arguments:
 *  - row_indices    : per-row offsets; entries of a row occupy the index range
 *                     [row_indices[row], row_indices[row+1]), so at least
 *                     size+1 offsets are read.
 *  - column_indices : column index of each stored entry.
 *  - elements       : value of each stored entry.
 *  - vector         : dense input vector, indexed by column_indices[i].
 *  - result         : output; result[row] receives the sum of
 *                     elements[i] * vector[column_indices[i]] over the row.
 *  - size           : number of rows to process.
 *
 * Each work-item starts at row get_global_id(0) and advances by
 * get_global_size(0) until row >= size.
 */
00044 const char * const compressed_matrix_align1_vec_mul =
00045 "__kernel void vec_mul(\n"
00046 " __global const unsigned int * row_indices,\n"
00047 " __global const unsigned int * column_indices, \n"
00048 " __global const float * elements,\n"
00049 " __global const float * vector, \n"
00050 " __global float * result,\n"
00051 " unsigned int size) \n"
00052 "{ \n"
00053 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00054 " {\n"
00055 " float dot_prod = 0.0f;\n"
00056 " unsigned int row_end = row_indices[row+1];\n"
00057 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00058 " dot_prod += elements[i] * vector[column_indices[i]];\n"
00059 " result[row] = dot_prod;\n"
00060 " }\n"
00061 "}\n"
00062 ;
00063
/**
 * OpenCL C source of the `row_scaling_2` kernel.
 *
 * For every row the kernel sums the squares of the stored entries
 * elements[row_indices[row] .. row_indices[row+1]-1] and writes
 * 1.0f / sqrt(sum) to diag_M_inv[row], i.e. the reciprocal of the Euclidean
 * norm of the stored row entries.
 *
 * Kernel arguments:
 *  - row_indices    : per-row offsets (at least size+1 entries are read).
 *  - column_indices : part of the signature but never read by this kernel.
 *  - elements       : value of each stored entry.
 *  - diag_M_inv     : output, one value per row.
 *  - size           : number of rows to process.
 *
 * Each work-item starts at row get_global_id(0) and advances by
 * get_global_size(0).  A row whose stored entries are all zero (or that has no
 * entries) makes the divisor zero; the kernel does not guard against that.
 */
00064 const char * const compressed_matrix_align1_row_scaling_2 =
00065 "__kernel void row_scaling_2(\n"
00066 " __global const unsigned int * row_indices,\n"
00067 " __global const unsigned int * column_indices, \n"
00068 " __global const float * elements,\n"
00069 " __global float * diag_M_inv,\n"
00070 " unsigned int size) \n"
00071 "{ \n"
00072 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00073 " {\n"
00074 " float dot_prod = 0.0f;\n"
00075 " float temp = 0.0f;\n"
00076 " unsigned int row_end = row_indices[row+1];\n"
00077 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00078 " {\n"
00079 " temp = elements[i];\n"
00080 " dot_prod += temp * temp;\n"
00081 " }\n"
00082 " diag_M_inv[row] = 1.0f / sqrt(dot_prod);\n"
00083 " }\n"
00084 "}\n"
00085 ;
00086
/**
 * OpenCL C source of the `jacobi_precond` kernel.
 *
 * For every row the kernel scans the stored entries in storage order and takes
 * the first one whose column index equals the row index as the diagonal.  It
 * then writes 1.0f / diag to diag_M_inv[row].  If the row stores no diagonal
 * entry, diag keeps its initial value 1.0f and 1.0f is written.
 *
 * Kernel arguments:
 *  - row_indices    : per-row offsets (at least size+1 entries are read).
 *  - column_indices : column index of each stored entry.
 *  - elements       : value of each stored entry.
 *  - diag_M_inv     : output, one value per row.
 *  - size           : number of rows to process.
 *
 * Each work-item starts at row get_global_id(0) and advances by
 * get_global_size(0).  A stored diagonal of exactly zero is not guarded
 * against.
 */
00087 const char * const compressed_matrix_align1_jacobi_precond =
00088 "__kernel void jacobi_precond(\n"
00089 " __global const unsigned int * row_indices,\n"
00090 " __global const unsigned int * column_indices, \n"
00091 " __global const float * elements,\n"
00092 " __global float * diag_M_inv,\n"
00093 " unsigned int size) \n"
00094 "{ \n"
00095 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00096 " {\n"
00097 " float diag = 1.0f;\n"
00098 " unsigned int row_end = row_indices[row+1];\n"
00099 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00100 " {\n"
00101 " if (row == column_indices[i])\n"
00102 " {\n"
00103 " diag = elements[i];\n"
00104 " break;\n"
00105 " }\n"
00106 " }\n"
00107 " diag_M_inv[row] = 1.0f / diag;\n"
00108 " }\n"
00109 "}\n"
00110 ;
00111
/**
 * OpenCL C source of the `row_scaling_1` kernel.
 *
 * For every row the kernel sums fabs() of the stored entries
 * elements[row_indices[row] .. row_indices[row+1]-1] and writes
 * 1.0f / sum to diag_M_inv[row], i.e. the reciprocal of the 1-norm of the
 * stored row entries.
 *
 * Kernel arguments:
 *  - row_indices    : per-row offsets (at least size+1 entries are read).
 *  - column_indices : part of the signature but never read by this kernel.
 *  - elements       : value of each stored entry.
 *  - diag_M_inv     : output, one value per row.
 *  - size           : number of rows to process.
 *
 * Each work-item starts at row get_global_id(0) and advances by
 * get_global_size(0).  A row without non-zero stored entries makes the divisor
 * zero; the kernel does not guard against that.
 */
00112 const char * const compressed_matrix_align1_row_scaling_1 =
00113 "__kernel void row_scaling_1(\n"
00114 " __global const unsigned int * row_indices,\n"
00115 " __global const unsigned int * column_indices, \n"
00116 " __global const float * elements,\n"
00117 " __global float * diag_M_inv,\n"
00118 " unsigned int size) \n"
00119 "{ \n"
00120 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00121 " {\n"
00122 " float dot_prod = 0.0f;\n"
00123 " unsigned int row_end = row_indices[row+1];\n"
00124 " for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00125 " dot_prod += fabs(elements[i]);\n"
00126 " diag_M_inv[row] = 1.0f / dot_prod;\n"
00127 " }\n"
00128 "}\n"
00129 ;
00130
/**
 * OpenCL C source of the `lu_forward` kernel: in-place solve of L y = z, where
 * only the stored entries with column < row are used (no division by a
 * diagonal takes place, i.e. L is treated as having a unit diagonal).
 *
 * Kernel arguments:
 *  - row_indices    : per-row offsets (row_indices[row] and row_indices[row+1]
 *                     are read for every row < size).
 *  - column_indices : column index of each stored entry.
 *  - elements       : value of each stored entry.
 *  - buffer         : __local int scratch; slots [0, get_global_size(0)) are
 *                     per-row 'done' flags, slot get_global_size(0) is the
 *                     shared 'all rows of this block are done' flag, so
 *                     get_global_size(0)+1 ints are required.
 *  - vec_entries    : __local float scratch, get_global_size(0) entries, holds
 *                     the block of the vector currently being solved.
 *  - vector         : right-hand side on entry, overwritten with the solution.
 *  - size           : number of rows.
 *
 * Rows are processed in blocks of get_global_size(0), one work-item per row.
 * Inside a block a work-item whose row depends on a not yet finished row of the
 * same block records that dependency and retries in the next sweep.
 *
 * NOTE(review): __local memory is indexed with get_global_id(0), which only
 * works if the kernel is enqueued with a single work-group -- confirm at the
 * call site.
 *
 * Changes compared to the previous revision of this kernel:
 *  - the shared completion flag is copied into a private variable and an extra
 *    barrier separates that read from the reset done by work-item 0.  Before,
 *    work-item 0 could reset the flag to 1 while other work-items had not yet
 *    evaluated the break condition, so work-items could disagree on leaving
 *    the sweep loop and then wait at different barriers;
 *  - the in-kernel comment called this "backward substitution";
 *  - block_offset is unsigned (as in lu_backward), avoiding signed/unsigned
 *    comparisons against col.
 */
const char * const compressed_matrix_align1_lu_forward =
" \n"
"// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_forward(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices, \n"
" __global const float * elements,\n"
" __local int * buffer, \n"
" __local float * vec_entries, //a memory block from vector\n"
" __global float * vector,\n"
" unsigned int size) \n"
"{\n"
" int waiting_for; //block index that must be finished before the current thread can start\n"
" int all_done; //private copy of the shared 'all rows finished' flag\n"
" unsigned int waiting_for_index;\n"
" unsigned int block_offset;\n"
" unsigned int col;\n"
" unsigned int row;\n"
" unsigned int row_index_end;\n"
" \n"
" //forward substitution: one thread per row in blocks of get_global_size(0)\n"
" for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)\n"
" {\n"
" block_offset = block_num * get_global_size(0);\n"
" row = block_offset + get_global_id(0);\n"
" buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
" waiting_for = -1;\n"
" if (row < size)\n"
" {\n"
" vec_entries[get_global_id(0)] = vector[row];\n"
" waiting_for_index = row_indices[row];\n"
" row_index_end = row_indices[row+1];\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" buffer[get_global_size(0)] = 1;\n"
" //try to eliminate all lines in the block. \n"
" //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
" for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (row < size) //valid index?\n"
" {\n"
" if (waiting_for >= 0)\n"
" {\n"
" if (buffer[waiting_for] == 1)\n"
" waiting_for = -1;\n"
" }\n"
" \n"
" if (waiting_for == -1) //substitution not yet done, check whether possible\n"
" {\n"
" //check whether reduction is possible:\n"
" for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
" {\n"
" col = column_indices[j];\n"
" if (col < block_offset) //index valid, but not from current block\n"
" vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
" else if (col < row) //index is from current block\n"
" {\n"
" if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
" {\n"
" waiting_for = col - block_offset;\n"
" waiting_for_index = j;\n"
" break;\n"
" }\n"
" else //updated entry is available in shared memory:\n"
" vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
" }\n"
" }\n"
" \n"
" if (waiting_for == -1) //this row is done\n"
" {\n"
" buffer[get_global_id(0)] = 1;\n"
" waiting_for = -2; //magic number: thread is finished\n"
" }\n"
" } \n"
" } //row < size\n"
" else\n"
" buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n"
" ///////// check whether all threads are done. If yes, exit loop /////////////\n"
" \n"
" if (buffer[get_global_id(0)] == 0)\n"
" buffer[get_global_size(0)] = 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
// Every work-item takes a private snapshot of the flag; the second barrier
// guarantees all snapshots are taken before work-item 0 re-arms the flag, so
// the break below is taken by all work-items or by none.
" all_done = buffer[get_global_size(0)];\n"
" barrier(CLK_LOCAL_MEM_FENCE); //every thread has read the flag before thread 0 resets it\n"
" if (all_done > 0) //all threads break this loop simultaneously\n"
" break;\n"
" if (get_global_id(0) == 0)\n"
" buffer[get_global_size(0)] = 1;\n"
" } //for k\n"
" \n"
" //write to vector:\n"
" if (row < size)\n"
" vector[row] = vec_entries[get_global_id(0)];\n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" } //for block_num\n"
"}\n"
;
00229
/**
 * OpenCL C source of the second fused BiCGStab update kernel together with its
 * two helper functions.
 *
 * helper_bicgstab_kernel2_parallel_reduction(tmp_buffer)
 *   Pairwise summation of tmp_buffer[0 .. get_local_size(0)) with a stride
 *   that starts at get_local_size(0)/2 and is halved each step.  Every slot is
 *   only folded into slot 0 when the local size is a power of two.
 *
 * bicgstab_kernel2_inner_prod(vec1, vec2, size, tmp_buffer)
 *   Each work-item accumulates vec1[i]*vec2[i] for i = local id, local id +
 *   local size, ...; the partial sums are reduced in tmp_buffer and the total
 *   is returned to every work-item.  tmp_buffer needs get_local_size(0) floats.
 *
 * bicgstab_kernel2(...)
 *   omega  = <tmp1, s> / <tmp1, tmp1>
 *   result   += alpha[0] * p + omega * s
 *   residual  = s - omega * tmp1
 *   beta   = (<residual, r0star> / ip_rr0star[0]) * (alpha[0] / omega)
 *   p         = residual + beta * (p - omega * tmp0)
 *   Finally the work-item with global id 0 stores <residual, residual> in
 *   error_estimate[0] and <residual, r0star> in ip_rr0star[0].
 *
 * NOTE(review): all loops are strided by local id / local size, so each
 * work-group walks the complete vectors; presumably the kernel is meant to be
 * enqueued with a single work-group -- confirm at the call site.
 *
 * Changes compared to the previous revision of this kernel:
 *  - the inner product returned tmp_buffer[0] without a barrier after the read.
 *    Since the kernel calls it four times in a row, a fast work-item could
 *    already store its next partial sum into tmp_buffer[0] while a slow one
 *    had not yet read the previous result.  The result is now copied into a
 *    private variable and a barrier follows the read;
 *  - the two inner products forming omega are evaluated in separate statements
 *    so that the order of the barrier-containing calls is fixed by the source.
 */
const char * const compressed_matrix_align1_bicgstab_kernel2 =
"void helper_bicgstab_kernel2_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (get_local_id(0) < stride)\n"
" tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
" }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel2_inner_prod(\n"
" __global const float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size,\n"
" __local float * tmp_buffer)\n"
"{\n"
" float tmp = 0;\n"
" unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
" for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
" {\n"
" if (i < size)\n"
" tmp += vec1[i] * vec2[i];\n"
" }\n"
" tmp_buffer[get_local_id(0)] = tmp;\n"
" \n"
" helper_bicgstab_kernel2_parallel_reduction(tmp_buffer);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
// Snapshot the total, then synchronise once more: only after every work-item
// has read slot 0 may a subsequent call start overwriting tmp_buffer.
" float result = tmp_buffer[0];\n"
" barrier(CLK_LOCAL_MEM_FENCE); //tmp_buffer may be reused by the next call only after all reads\n"
" return result;\n"
"}\n"
"__kernel void bicgstab_kernel2(\n"
" __global const float * tmp0,\n"
" __global const float * tmp1,\n"
" __global const float * r0star, \n"
" __global const float * s, \n"
" __global float * p, \n"
" __global float * result,\n"
" __global float * residual,\n"
" __global const float * alpha,\n"
" __global float * ip_rr0star,\n"
" __global float * error_estimate,\n"
" __local float * tmp_buffer,\n"
" unsigned int size) \n"
"{ \n"
" float ip_tmp1_s = bicgstab_kernel2_inner_prod(tmp1, s, size, tmp_buffer);\n"
" float ip_tmp1_tmp1 = bicgstab_kernel2_inner_prod(tmp1, tmp1, size, tmp_buffer);\n"
" float omega_local = ip_tmp1_s / ip_tmp1_tmp1;\n"
" float alpha_local = alpha[0];\n"
" \n"
" //result += alpha * p + omega * s;\n"
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
" result[i] += alpha_local * p[i] + omega_local * s[i];\n"
" //residual = s - omega * tmp1;\n"
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
" residual[i] = s[i] - omega_local * tmp1[i];\n"
" //new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);\n"
" float new_ip_rr0star = bicgstab_kernel2_inner_prod(residual, r0star, size, tmp_buffer);\n"
" float beta = (new_ip_rr0star / ip_rr0star[0]) * (alpha_local / omega_local);\n"
" \n"
" //p = residual + beta * (p - omega*tmp0);\n"
" for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
" p[i] = residual[i] + beta * (p[i] - omega_local * tmp0[i]);\n"
" //compute norm of residual:\n"
" float new_error_estimate = bicgstab_kernel2_inner_prod(residual, residual, size, tmp_buffer);\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" //update values:\n"
" if (get_global_id(0) == 0)\n"
" {\n"
" error_estimate[0] = new_error_estimate;\n"
" ip_rr0star[0] = new_ip_rr0star;\n"
" }\n"
"}\n"
;
00301
/**
 * OpenCL C source of the `lu_backward` kernel: in-place solve of U x = y, where
 * the stored entries with column > row are eliminated and the entry with
 * column == row is used as the divisor.
 *
 * Kernel arguments:
 *  - row_indices    : per-row offsets (row_indices[row] and row_indices[row+1]
 *                     are read for every row < size).
 *  - column_indices : column index of each stored entry.
 *  - elements       : value of each stored entry.
 *  - buffer         : __local int scratch; slots [0, get_global_size(0)) are
 *                     per-row 'done' flags, slot get_global_size(0) is the
 *                     shared 'all rows of this block are done' flag, so
 *                     get_global_size(0)+1 ints are required.
 *  - vec_entries    : __local float scratch, get_global_size(0) entries, holds
 *                     the block of the vector currently being solved.
 *  - vector         : right-hand side on entry, overwritten with the solution.
 *  - size           : number of rows.
 *
 * Rows are processed in blocks of get_global_size(0), last block first, one
 * work-item per row.  Inside a block a work-item whose row depends on a not yet
 * finished row of the same block records that dependency and retries in the
 * next sweep.  Entries with column < row are skipped.
 *
 * NOTE(review): __local memory is indexed with get_global_id(0), which only
 * works if the kernel is enqueued with a single work-group -- confirm at the
 * call site.
 * NOTE(review): row 0 is divided by elements[0] instead of the diagonal found
 * during the scan; this is kept unchanged and coincides with the diagonal when
 * the first stored entry of the matrix is (0,0).
 *
 * Changes compared to the previous revision of this kernel:
 *  - a barrier() inside the per-row entry loop was removed.  That loop has a
 *    different trip count per work-item and sits inside divergent conditionals,
 *    so not all work-items reached the barrier, which OpenCL leaves undefined;
 *    lu_forward never had this barrier;
 *  - diagonal_entry was pre-loaded with a *column index*
 *    (column_indices[row_indices[row]]).  It is now reset to 1.0f per row, so a
 *    row without a stored diagonal is left unscaled instead of being divided by
 *    an index (possibly zero);
 *  - the shared completion flag is copied into a private variable and an extra
 *    barrier separates that read from the reset done by work-item 0, so all
 *    work-items agree on leaving the sweep loop;
 *  - the in-kernel comment called this "forward substitution".
 */
const char * const compressed_matrix_align1_lu_backward =
"// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n"
"__kernel void lu_backward(\n"
" __global const unsigned int * row_indices,\n"
" __global const unsigned int * column_indices, \n"
" __global const float * elements,\n"
" __local int * buffer, \n"
" __local float * vec_entries, //a memory block from vector\n"
" __global float * vector,\n"
" unsigned int size) \n"
"{\n"
" int waiting_for; //block index that must be finished before the current thread can start\n"
" int all_done; //private copy of the shared 'all rows finished' flag\n"
" unsigned int waiting_for_index;\n"
" unsigned int block_offset;\n"
" unsigned int col;\n"
" unsigned int row;\n"
" unsigned int row_index_end;\n"
" float diagonal_entry = 1.0f;\n"
" \n"
" //backward substitution: one thread per row in blocks of get_global_size(0)\n"
" for (int block_num = size / get_global_size(0); block_num > -1; --block_num)\n"
" {\n"
" block_offset = block_num * get_global_size(0);\n"
" row = block_offset + get_global_id(0);\n"
" buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
" waiting_for = -1;\n"
" \n"
" if (row < size)\n"
" {\n"
" vec_entries[get_global_id(0)] = vector[row];\n"
" waiting_for_index = row_indices[row];\n"
" row_index_end = row_indices[row+1];\n"
" diagonal_entry = 1.0f; //replaced by the diagonal of this row once it is visited\n"
" }\n"
" \n"
" if (get_global_id(0) == 0)\n"
" buffer[get_global_size(0)] = 1;\n"
" //try to eliminate all lines in the block. \n"
" //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
" for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if (row < size) //valid index?\n"
" {\n"
" if (waiting_for >= 0)\n"
" {\n"
" if (buffer[waiting_for] == 1)\n"
" waiting_for = -1;\n"
" }\n"
" \n"
" if (waiting_for == -1) //substitution not yet done, check whether possible\n"
" {\n"
" //check whether reduction is possible:\n"
// No barrier inside this loop: its trip count differs per work-item.
" for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
" {\n"
" col = column_indices[j];\n"
" if (col >= block_offset + get_global_size(0)) //index valid, but not from current block\n"
" vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
" else if (col > row) //index is from current block\n"
" {\n"
" if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
" {\n"
" waiting_for = col - block_offset;\n"
" waiting_for_index = j;\n"
" break;\n"
" }\n"
" else //updated entry is available in shared memory:\n"
" vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
" }\n"
" else if (col == row)\n"
" diagonal_entry = elements[j];\n"
" }\n"
" \n"
" if (waiting_for == -1) //this row is done\n"
" {\n"
" if (row == 0)\n"
" vec_entries[get_global_id(0)] /= elements[0];\n"
" else\n"
" vec_entries[get_global_id(0)] /= diagonal_entry;\n"
" buffer[get_global_id(0)] = 1;\n"
" waiting_for = -2; //magic number: thread is finished\n"
" }\n"
" } \n"
" } //row < size\n"
" else\n"
" buffer[get_global_id(0)] = 1; //work done (because there is no work to be done at all...)\n"
" \n"
" ///////// check whether all threads are done. If yes, exit loop /////////////\n"
" if (buffer[get_global_id(0)] == 0)\n"
" buffer[get_global_size(0)] = 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
// Every work-item takes a private snapshot of the flag; the second barrier
// guarantees all snapshots are taken before work-item 0 re-arms the flag, so
// the break below is taken by all work-items or by none.
" all_done = buffer[get_global_size(0)];\n"
" barrier(CLK_LOCAL_MEM_FENCE); //every thread has read the flag before thread 0 resets it\n"
" if (all_done > 0) //all threads break the loop simultaneously\n"
" break;\n"
" if (get_global_id(0) == 0)\n"
" buffer[get_global_size(0)] = 1;\n"
" } //for k\n"
" if (row < size)\n"
" vector[row] = vec_entries[get_global_id(0)];\n"
" //vector[row] = diagonal_entry;\n"
" \n"
" //if (row == 0)\n"
" //vector[0] = diagonal_entry;\n"
" //vector[0] = elements[0];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" } //for block_num\n"
"}\n"
;
00411
/**
 * OpenCL C source of the first fused BiCGStab update kernel together with its
 * two helper functions.
 *
 * helper_bicgstab_kernel1_parallel_reduction(tmp_buffer)
 *   Pairwise summation of tmp_buffer[0 .. get_local_size(0)) with a stride
 *   that starts at get_local_size(0)/2 and is halved each step.  Every slot is
 *   only folded into slot 0 when the local size is a power of two.
 *
 * bicgstab_kernel1_inner_prod(vec1, vec2, size, tmp_buffer)
 *   Each work-item accumulates vec1[i]*vec2[i] for i = local id, local id +
 *   local size, ...; the partial sums are reduced in tmp_buffer and
 *   tmp_buffer[0] is returned to every work-item.  tmp_buffer needs
 *   get_local_size(0) floats.
 *
 * bicgstab_kernel1(...)
 *   alpha = ip_rr0star[0] / <tmp0, r0star>
 *   s     = residual - alpha * tmp0
 *   The work-item with global id 0 additionally stores alpha in alpha[0].
 *
 * NOTE(review): all loops are strided by local id / local size, so each
 * work-group walks the complete vectors; presumably the kernel is meant to be
 * enqueued with a single work-group -- confirm at the call site.
 */
00412 const char * const compressed_matrix_align1_bicgstab_kernel1 =
00413 "void helper_bicgstab_kernel1_parallel_reduction( __local float * tmp_buffer )\n"
00414 "{\n"
00415 " for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
00416 " {\n"
00417 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00418 " if (get_local_id(0) < stride)\n"
00419 " tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
00420 " }\n"
00421 "}\n"
00422 "//////// inner products:\n"
00423 "float bicgstab_kernel1_inner_prod(\n"
00424 " __global const float * vec1,\n"
00425 " __global const float * vec2,\n"
00426 " unsigned int size,\n"
00427 " __local float * tmp_buffer)\n"
00428 "{\n"
00429 " float tmp = 0;\n"
00430 " unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
00431 " for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
00432 " {\n"
00433 " if (i < size)\n"
00434 " tmp += vec1[i] * vec2[i];\n"
00435 " }\n"
00436 " tmp_buffer[get_local_id(0)] = tmp;\n"
00437 " \n"
00438 " helper_bicgstab_kernel1_parallel_reduction(tmp_buffer);\n"
00439 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00440 " return tmp_buffer[0];\n"
00441 "}\n"
00442 "__kernel void bicgstab_kernel1(\n"
00443 " __global const float * tmp0,\n"
00444 " __global const float * r0star, \n"
00445 " __global const float * residual,\n"
00446 " __global float * s,\n"
00447 " __global float * alpha,\n"
00448 " __global const float * ip_rr0star,\n"
00449 " __local float * tmp_buffer,\n"
00450 " unsigned int size) \n"
00451 "{ \n"
00452 " float alpha_local = ip_rr0star[0] / bicgstab_kernel1_inner_prod(tmp0, r0star, size, tmp_buffer);\n"
00453 " \n"
00454 " for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
00455 " s[i] = residual[i] - alpha_local * tmp0[i];\n"
00456 " \n"
00457 " if (get_global_id(0) == 0)\n"
00458 " alpha[0] = alpha_local;\n"
00459 "}\n"
00460 ;
00461
/**
 * OpenCL C source of the `vec_mul` kernel operating on 8-wide packets.
 *
 * Kernel arguments:
 *  - row_indices    : per-row offsets; row_indices[row] and row_indices[row+1]
 *                     are both read, so the array holds at least size+1 entries.
 *  - column_indices : column indices, read as uint8 packets.
 *  - elements       : matrix values, read as float8 packets.
 *  - vector         : dense input vector, gathered through the column indices.
 *  - result         : output; result[row] receives the accumulated sum.
 *  - size           : number of rows to process.
 *
 * Each work-item starts at row get_global_id(0) and advances by
 * get_global_size(0) until row >= size.  For every packet of a row the eight
 * vector entries addressed by the column packet are gathered into a float8;
 * the product sum is formed with two dot() calls, one on the .lo and one on
 * the .hi half of the packets.
 *
 * The packet range of a row is [row_indices[row] / 8, row_indices[row+1] / 8).
 * NOTE(review): the integer division presumably relies on every row offset
 * being a multiple of 8 (rows padded to full packets) -- confirm against the
 * host-side matrix layout.
 */
00462 const char * const compressed_matrix_align8_vec_mul =
00463 "__kernel void vec_mul(\n"
00464 " __global const unsigned int * row_indices,\n"
00465 " __global const uint8 * column_indices, \n"
00466 " __global const float8 * elements,\n"
00467 " __global const float * vector, \n"
00468 " __global float * result,\n"
00469 " unsigned int size)\n"
00470 "{ \n"
00471 " float dot_prod;\n"
00472 " unsigned int start, next_stop;\n"
00473 " uint8 col_idx;\n"
00474 " float8 tmp_vec;\n"
00475 " float8 tmp_entries;\n"
00476 " for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00477 " {\n"
00478 " dot_prod = 0.0f;\n"
00479 " start = row_indices[row] / 8;\n"
00480 " next_stop = row_indices[row+1] / 8;\n"
00481 " for (unsigned int i = start; i < next_stop; ++i)\n"
00482 " {\n"
00483 " col_idx = column_indices[i];\n"
00484 " tmp_entries = elements[i];\n"
00485 " tmp_vec.s0 = vector[col_idx.s0];\n"
00486 " tmp_vec.s1 = vector[col_idx.s1];\n"
00487 " tmp_vec.s2 = vector[col_idx.s2];\n"
00488 " tmp_vec.s3 = vector[col_idx.s3];\n"
00489 " tmp_vec.s4 = vector[col_idx.s4];\n"
00490 " tmp_vec.s5 = vector[col_idx.s5];\n"
00491 " tmp_vec.s6 = vector[col_idx.s6];\n"
00492 " tmp_vec.s7 = vector[col_idx.s7];\n"
00493 " dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n"
00494 " dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n"
00495 " }\n"
00496 " result[row] = dot_prod;\n"
00497 " }\n"
00498 "}\n"
00499 ;
00500
00501 }
00502 }
00503 }
00504 #endif