• Main Page
  • Namespaces
  • Data Structures
  • Files
  • File List
  • Globals

/data/development/ViennaCL/ViennaCL-1.1.2/viennacl/linalg/kernels/compressed_matrix_source.h

Go to the documentation of this file.
00001 #ifndef _VIENNACL_COMPRESSED_MATRIX_SOURCE_HPP_
00002 #define _VIENNACL_COMPRESSED_MATRIX_SOURCE_HPP_
00003 //Automatically generated file from aux-directory, do not edit manually!
00004 namespace viennacl
00005 {
00006  namespace linalg
00007  {
00008   namespace kernels
00009   {
// OpenCL C source text of the kernel `vec_mul` for matrices whose column indices
// and values are read in packets of four (uint4 / float4).
//
// Kernel arguments:
//   row_indices    - per-row start offsets; row r spans [row_indices[r], row_indices[r+1])
//   column_indices - column indices, read as uint4 packets
//   elements       - matrix values, read as float4 packets
//   vector         - input vector, gathered through the column indices
//   result         - output; result[row] receives the dot product of the row with `vector`
//   size           - number of rows to process
//
// Each work-item handles the rows get_global_id(0), get_global_id(0) + get_global_size(0), ...
// NOTE(review): the row offsets are divided by 4 using integer division, so every row
// presumably starts and ends on a multiple of four entries -- confirm against the host code.
00010 const char * const compressed_matrix_align4_vec_mul = 
00011 "__kernel void vec_mul(\n"
00012 "          __global const unsigned int * row_indices,\n"
00013 "          __global const uint4 * column_indices, \n"
00014 "          __global const float4 * elements,\n"
00015 "          __global const float * vector,  \n"
00016 "          __global float * result,\n"
00017 "          unsigned int size)\n"
00018 "{ \n"
00019 "  float dot_prod;\n"
00020 "  unsigned int start, next_stop;\n"
00021 "  uint4 col_idx;\n"
00022 "  float4 tmp_vec;\n"
00023 "  float4 tmp_entries;\n"
00024 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00025 "  {\n"
00026 "    dot_prod = 0.0f;\n"
// Convert the scalar row offsets into packet indices (four entries per packet).
00027 "    start = row_indices[row] / 4;\n"
00028 "    next_stop = row_indices[row+1] / 4;\n"
00029 "    for (unsigned int i = start; i < next_stop; ++i)\n"
00030 "    {\n"
00031 "      col_idx = column_indices[i];\n"
00032 "      tmp_entries = elements[i];\n"
// Gather the four vector entries addressed by this packet's column indices.
00033 "      tmp_vec.x = vector[col_idx.x];\n"
00034 "      tmp_vec.y = vector[col_idx.y];\n"
00035 "      tmp_vec.z = vector[col_idx.z];\n"
00036 "      tmp_vec.w = vector[col_idx.w];\n"
00037 "      dot_prod += dot(tmp_entries, tmp_vec);\n"
00038 "    }\n"
00039 "    result[row] = dot_prod;\n"
00040 "  }\n"
00041 "}\n"
00042 ; //compressed_matrix_align4_vec_mul
00043 
// OpenCL C source text of the scalar (one entry at a time) kernel `vec_mul`.
//
// Kernel arguments:
//   row_indices    - per-row start offsets; row r spans [row_indices[r], row_indices[r+1])
//   column_indices - column index of every stored entry
//   elements       - value of every stored entry
//   vector         - input vector, gathered through the column indices
//   result         - output; result[row] = sum over the row of elements[i] * vector[column_indices[i]]
//   size           - number of rows to process
//
// Each work-item handles the rows get_global_id(0), get_global_id(0) + get_global_size(0), ...
00044 const char * const compressed_matrix_align1_vec_mul = 
00045 "__kernel void vec_mul(\n"
00046 "          __global const unsigned int * row_indices,\n"
00047 "          __global const unsigned int * column_indices, \n"
00048 "          __global const float * elements,\n"
00049 "          __global const float * vector,  \n"
00050 "          __global float * result,\n"
00051 "          unsigned int size) \n"
00052 "{ \n"
00053 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00054 "  {\n"
00055 "    float dot_prod = 0.0f;\n"
00056 "    unsigned int row_end = row_indices[row+1];\n"
00057 "    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00058 "      dot_prod += elements[i] * vector[column_indices[i]];\n"
00059 "    result[row] = dot_prod;\n"
00060 "  }\n"
00061 "}\n"
00062 ; //compressed_matrix_align1_vec_mul
00063 
// OpenCL C source text of the kernel `row_scaling_2`.
//
// For every row the kernel sums the squares of the stored entries and writes
// 1 / sqrt(sum) to diag_M_inv[row], i.e. the reciprocal of the row's Euclidean norm.
//
// Kernel arguments:
//   row_indices    - per-row start offsets; row r spans [row_indices[r], row_indices[r+1])
//   column_indices - not read by this kernel
//   elements       - value of every stored entry
//   diag_M_inv     - output, one value per row
//   size           - number of rows to process
//
// NOTE(review): a row without entries (or with only zeros) yields a division by zero;
// presumably the caller guarantees non-empty rows -- verify.
00064 const char * const compressed_matrix_align1_row_scaling_2 = 
00065 "__kernel void row_scaling_2(\n"
00066 "          __global const unsigned int * row_indices,\n"
00067 "          __global const unsigned int * column_indices, \n"
00068 "          __global const float * elements,\n"
00069 "          __global float * diag_M_inv,\n"
00070 "          unsigned int size) \n"
00071 "{ \n"
00072 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00073 "  {\n"
00074 "    float dot_prod = 0.0f;\n"
00075 "    float temp = 0.0f;\n"
00076 "    unsigned int row_end = row_indices[row+1];\n"
00077 "    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00078 "    {\n"
00079 "      temp = elements[i];\n"
00080 "      dot_prod += temp * temp;\n"
00081 "    }\n"
00082 "    diag_M_inv[row] = 1.0f / sqrt(dot_prod);\n"
00083 "  }\n"
00084 "}\n"
00085 ; //compressed_matrix_align1_row_scaling_2
00086 
// OpenCL C source text of the kernel `jacobi_precond`.
//
// For every row the kernel searches the stored entries for the one whose column
// index equals the row index and writes its reciprocal to diag_M_inv[row].
// If no such entry is stored, 1.0f is used as the diagonal value.
//
// Kernel arguments:
//   row_indices    - per-row start offsets; row r spans [row_indices[r], row_indices[r+1])
//   column_indices - column index of every stored entry
//   elements       - value of every stored entry
//   diag_M_inv     - output, one value per row
//   size           - number of rows to process
//
// NOTE(review): a stored diagonal entry equal to zero yields a division by zero.
00087 const char * const compressed_matrix_align1_jacobi_precond = 
00088 "__kernel void jacobi_precond(\n"
00089 "          __global const unsigned int * row_indices,\n"
00090 "          __global const unsigned int * column_indices, \n"
00091 "          __global const float * elements,\n"
00092 "          __global float * diag_M_inv,\n"
00093 "          unsigned int size) \n"
00094 "{ \n"
00095 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00096 "  {\n"
// Fallback value used when the row has no stored diagonal entry.
00097 "    float diag = 1.0f;\n"
00098 "    unsigned int row_end = row_indices[row+1];\n"
00099 "    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00100 "    {\n"
00101 "      if (row == column_indices[i])\n"
00102 "      {\n"
00103 "        diag = elements[i];\n"
00104 "        break;\n"
00105 "      }\n"
00106 "    }\n"
00107 "    diag_M_inv[row] = 1.0f / diag;\n"
00108 "  }\n"
00109 "}\n"
00110 ; //compressed_matrix_align1_jacobi_precond
00111 
// OpenCL C source text of the kernel `row_scaling_1`.
//
// For every row the kernel sums the absolute values of the stored entries and
// writes the reciprocal of that sum to diag_M_inv[row].
//
// Kernel arguments:
//   row_indices    - per-row start offsets; row r spans [row_indices[r], row_indices[r+1])
//   column_indices - not read by this kernel
//   elements       - value of every stored entry
//   diag_M_inv     - output, one value per row
//   size           - number of rows to process
//
// NOTE(review): a row without entries (or with only zeros) yields a division by zero;
// presumably the caller guarantees non-empty rows -- verify.
00112 const char * const compressed_matrix_align1_row_scaling_1 = 
00113 "__kernel void row_scaling_1(\n"
00114 "          __global const unsigned int * row_indices,\n"
00115 "          __global const unsigned int * column_indices, \n"
00116 "          __global const float * elements,\n"
00117 "          __global float * diag_M_inv,\n"
00118 "          unsigned int size) \n"
00119 "{ \n"
00120 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00121 "  {\n"
00122 "    float dot_prod = 0.0f;\n"
00123 "    unsigned int row_end = row_indices[row+1];\n"
00124 "    for (unsigned int i = row_indices[row]; i < row_end; ++i)\n"
00125 "      dot_prod += fabs(elements[i]);\n"
00126 "    diag_M_inv[row] = 1.0f / dot_prod;\n"
00127 "  }\n"
00128 "}\n"
00129 ; //compressed_matrix_align1_row_scaling_1
00130 
// OpenCL C source text of the kernel `lu_forward`: in-place solution of L y = z,
// where only the entries left of the diagonal (col < row) of the compressed matrix
// are used and no division by a diagonal entry takes place.
//
// Kernel arguments:
//   row_indices    - per-row start offsets; row r spans [row_indices[r], row_indices[r+1])
//   column_indices - column index of every stored entry
//   elements       - value of every stored entry
//   buffer         - __local int scratch, get_global_size(0) + 1 entries:
//                    one 'row done' flag per work-item plus one 'all done' flag
//   vec_entries    - __local float scratch, get_global_size(0) entries (current block of `vector`)
//   vector         - right hand side on entry, solution on exit
//   size           - number of rows
//
// The __local buffers are indexed with get_global_id(0), so all work-items have to
// belong to one single work-group.
//
// Changes compared to the previous revision of this kernel source:
//   * 'row done' flags are published only after a barrier, so no work-item reads a flag
//     (or the matching vec_entries value) in the same barrier phase in which it is written;
//   * the 'all done' flag is copied to a private variable and followed by a barrier before
//     work-item 0 resets it, so that all work-items take the same decision on leaving the loop;
//   * the barrier at the end of a block also fences local memory;
//   * the misleading 'backward substitution' comment was corrected.
const char * const compressed_matrix_align1_lu_forward = 
" \n"
"// compute y in Ly = z for incomplete LU factorizations of a sparse matrix in compressed format\n"
"// NOTE: buffer and vec_entries are indexed by get_global_id(0): launch with one work-group only.\n"
"__kernel void lu_forward(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const unsigned int * column_indices, \n"
"          __global const float * elements,\n"
"          __local  int * buffer,                              \n"
"          __local  float * vec_entries,   //a memory block from vector\n"
"          __global float * vector,\n"
"          unsigned int size) \n"
"{\n"
"  int waiting_for; //block index that must be finished before the current thread can start\n"
"  unsigned int waiting_for_index;\n"
"  unsigned int block_offset;\n"
"  unsigned int col;\n"
"  unsigned int row;\n"
"  unsigned int row_index_end;\n"
"  int row_finished; //1 if this thread has to raise its 'done' flag in the current round\n"
"  int all_done;     //private copy of the shared 'all done' flag\n"
"  \n"
"  //forward substitution: one thread per row in blocks of get_global_size(0)\n"
"  for (unsigned int block_num = 0; block_num <= size / get_global_size(0); ++block_num)\n"
"  {\n"
"    block_offset = block_num * get_global_size(0);\n"
"    row = block_offset + get_global_id(0);\n"
"    buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
"    waiting_for = -1;\n"
"    if (row < size)\n"
"    {\n"
"      vec_entries[get_global_id(0)] = vector[row];\n"
"      waiting_for_index = row_indices[row];\n"
"      row_index_end = row_indices[row+1];\n"
"    }\n"
"    \n"
"    if (get_global_id(0) == 0)\n"
"      buffer[get_global_size(0)] = 1;\n"
"    //try to eliminate all lines in the block. \n"
"    //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
"    for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
"    {\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      row_finished = 0;\n"
"      if (row < size) //valid index?\n"
"      {\n"
"        if (waiting_for >= 0)\n"
"        {\n"
"          if (buffer[waiting_for] == 1)\n"
"            waiting_for = -1;\n"
"        }\n"
"        \n"
"        if (waiting_for == -1) //substitution not yet done, check whether possible\n"
"        {\n"
"          //check whether reduction is possible:\n"
"          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
"          {\n"
"            col = column_indices[j];\n"
"            if (col < block_offset) //index valid, but not from current block\n"
"              vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
"            else if (col < row)  //index is from current block\n"
"            {\n"
"              if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
"              {\n"
"                waiting_for = col - block_offset;\n"
"                waiting_for_index = j;\n"
"                break;\n"
"              }\n"
"              else  //updated entry is available in shared memory:\n"
"                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
"            }\n"
"          }\n"
"          \n"
"          if (waiting_for == -1)  //this row is done\n"
"          {\n"
"            row_finished = 1;\n"
"            waiting_for = -2; //magic number: thread is finished\n"
"          }\n"
"        } \n"
"      } //row < size\n"
"      else\n"
"        row_finished = 1; //work done (because there is no work to be done at all...)\n"
"      \n"
"      //publish flags only after all threads have finished reading them:\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      if (row_finished == 1)\n"
"        buffer[get_global_id(0)] = 1;\n"
"      \n"
"      ///////// check whether all threads are done. If yes, exit loop /////////////\n"
"      if (buffer[get_global_id(0)] == 0)\n"
"        buffer[get_global_size(0)] = 0;\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      \n"
"      all_done = buffer[get_global_size(0)];\n"
"      barrier(CLK_LOCAL_MEM_FENCE); //every thread holds its copy before thread 0 resets the flag\n"
"      if (all_done > 0)  //all threads break this loop simultaneously\n"
"        break;\n"
"      if (get_global_id(0) == 0)\n"
"        buffer[get_global_size(0)] = 1;\n"
"    } //for k\n"
"    \n"
"    //write to vector:\n"
"    if (row < size)\n"
"      vector[row] = vec_entries[get_global_id(0)];\n"
"    \n"
"    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
"  } //for block_num\n"
"}\n"
; //compressed_matrix_align1_lu_forward
00229 
// OpenCL C source text of the kernel `bicgstab_kernel2` together with its two helpers.
//
// helper_bicgstab_kernel2_parallel_reduction(tmp_buffer)
//   Sums tmp_buffer[0 .. get_local_size(0)) into tmp_buffer[0] by repeatedly halving
//   the stride; a local size that is not a power of two would skip entries.
//
// bicgstab_kernel2_inner_prod(vec1, vec2, size, tmp_buffer)
//   Returns the inner product of vec1 and vec2 (same value in every work-item of the
//   work-group). The result is copied to a private variable and a barrier is issued
//   before returning, so the function may be called repeatedly with the same tmp_buffer.
//   (The previous revision returned tmp_buffer[0] directly, which allowed a fast
//   work-item to overwrite tmp_buffer[0] in the next call before a slow one had read it.)
//
// bicgstab_kernel2(...), all vectors of length `size`:
//   omega   = <tmp1, s> / <tmp1, tmp1>
//   result += alpha[0] * p + omega * s
//   residual = s - omega * tmp1
//   beta    = (<residual, r0star> / ip_rr0star[0]) * (alpha[0] / omega)
//   p        = residual + beta * (p - omega * tmp0)
//   error_estimate[0] = <residual, residual>,  ip_rr0star[0] = <residual, r0star>
//   tmp_buffer is __local scratch with get_local_size(0) entries.
//
// Every work-item strides over the whole vector by get_local_size(0), hence the kernel
// is presumably meant to be launched with a single work-group -- verify against the host code.
const char * const compressed_matrix_align1_bicgstab_kernel2 = 
"void helper_bicgstab_kernel2_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
"  }\n"
"}\n"
"//////// inner products:\n"
"float bicgstab_kernel2_inner_prod(\n"
"          __global const float * vec1,\n"
"          __global const float * vec2,\n"
"          unsigned int size,\n"
"          __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    tmp += vec1[i] * vec2[i];\n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"  \n"
"  helper_bicgstab_kernel2_parallel_reduction(tmp_buffer);\n"
"  barrier(CLK_LOCAL_MEM_FENCE);\n"
"  float result = tmp_buffer[0];\n"
"  //all threads must hold the result before tmp_buffer is reused by a subsequent call:\n"
"  barrier(CLK_LOCAL_MEM_FENCE);\n"
"  return result;\n"
"}\n"
"__kernel void bicgstab_kernel2(\n"
"          __global const float * tmp0,\n"
"          __global const float * tmp1,\n"
"          __global const float * r0star, \n"
"          __global const float * s, \n"
"          __global float * p, \n"
"          __global float * result,\n"
"          __global float * residual,\n"
"          __global const float * alpha,\n"
"          __global float * ip_rr0star,\n"
"          __global float * error_estimate,\n"
"          __local float * tmp_buffer,\n"
"          unsigned int size) \n"
"{ \n"
"  float ip_tmp1_s = bicgstab_kernel2_inner_prod(tmp1, s, size, tmp_buffer);\n"
"  float ip_tmp1_tmp1 = bicgstab_kernel2_inner_prod(tmp1, tmp1, size, tmp_buffer);\n"
"  float omega_local = ip_tmp1_s / ip_tmp1_tmp1;\n"
"  float alpha_local = alpha[0];\n"
"  \n"
"  //result += alpha * p + omega * s;\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    result[i] += alpha_local * p[i] + omega_local * s[i];\n"
"  //residual = s - omega * tmp1;\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    residual[i] = s[i] - omega_local * tmp1[i];\n"
"  //new_ip_rr0star = viennacl::linalg::inner_prod(residual, r0star);\n"
"  float new_ip_rr0star = bicgstab_kernel2_inner_prod(residual, r0star, size, tmp_buffer);\n"
"  float beta = (new_ip_rr0star / ip_rr0star[0]) * (alpha_local / omega_local);\n"
"  \n"
"  //p = residual + beta * (p - omega*tmp0);\n"
"  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
"    p[i] = residual[i] + beta * (p[i] - omega_local * tmp0[i]);\n"
"  //compute norm of residual:\n"
"  float new_error_estimate = bicgstab_kernel2_inner_prod(residual, residual, size, tmp_buffer);\n"
"  barrier(CLK_GLOBAL_MEM_FENCE);\n"
"  //update values:\n"
"  if (get_global_id(0) == 0)\n"
"  {\n"
"    error_estimate[0] = new_error_estimate;\n"
"    ip_rr0star[0] = new_ip_rr0star;\n"
"  }\n"
"}\n"
; //compressed_matrix_align1_bicgstab_kernel2
00301 
// OpenCL C source text of the kernel `lu_backward`: in-place solution of U x = y,
// where the entries right of the diagonal (col > row) are eliminated and the result
// is divided by the stored diagonal entry (col == row).
//
// Kernel arguments:
//   row_indices    - per-row start offsets; row r spans [row_indices[r], row_indices[r+1])
//   column_indices - column index of every stored entry
//   elements       - value of every stored entry
//   buffer         - __local int scratch, get_global_size(0) + 1 entries:
//                    one 'row done' flag per work-item plus one 'all done' flag
//   vec_entries    - __local float scratch, get_global_size(0) entries (current block of `vector`)
//   vector         - right hand side on entry, solution on exit
//   size           - number of rows
//
// The __local buffers are indexed with get_global_id(0), so all work-items have to
// belong to one single work-group.
//
// Changes compared to the previous revision of this kernel source:
//   * removed the barrier inside the per-row entry loop: the loop sits inside divergent
//     conditionals and has a different trip count per work-item, so not every work-item
//     reached that barrier equally often;
//   * the diagonal value is no longer seeded with a column index; a row without a stored
//     diagonal entry is now treated as having a unit diagonal;
//   * row 0 is no longer special-cased to divide by elements[0]; it uses the diagonal
//     entry found in the row like every other row;
//   * 'row done' flags are published only after a barrier, and the 'all done' flag is
//     copied to a private variable (followed by a barrier) before work-item 0 resets it;
//   * the barrier at the end of a block also fences local memory;
//   * the misleading 'forward substitution' comment was corrected and left-over
//     debugging comments were removed.
const char * const compressed_matrix_align1_lu_backward = 
"// compute x in Ux = y for incomplete LU factorizations of a sparse matrix in compressed format\n"
"// NOTE: buffer and vec_entries are indexed by get_global_id(0): launch with one work-group only.\n"
"__kernel void lu_backward(\n"
"          __global const unsigned int * row_indices,\n"
"          __global const unsigned int * column_indices, \n"
"          __global const float * elements,\n"
"          __local  int * buffer,                              \n"
"          __local  float * vec_entries,   //a memory block from vector\n"
"          __global float * vector,\n"
"          unsigned int size) \n"
"{\n"
"  int waiting_for; //block index that must be finished before the current thread can start\n"
"  unsigned int waiting_for_index;\n"
"  unsigned int block_offset;\n"
"  unsigned int col;\n"
"  unsigned int row;\n"
"  unsigned int row_index_end;\n"
"  float diagonal_entry = 1.0f;\n"
"  int row_finished; //1 if this thread has to raise its 'done' flag in the current round\n"
"  int all_done;     //private copy of the shared 'all done' flag\n"
"  \n"
"  //backward substitution: one thread per row in blocks of get_global_size(0), last block first\n"
"  for (int block_num = size / get_global_size(0); block_num > -1; --block_num)\n"
"  {\n"
"    block_offset = block_num * get_global_size(0);\n"
"    row = block_offset + get_global_id(0);\n"
"    buffer[get_global_id(0)] = 0; //set flag to 'undone'\n"
"    waiting_for = -1;\n"
"    \n"
"    if (row < size)\n"
"    {\n"
"      vec_entries[get_global_id(0)] = vector[row];\n"
"      waiting_for_index = row_indices[row];\n"
"      row_index_end = row_indices[row+1];\n"
"      diagonal_entry = 1.0f; //replaced by the stored diagonal entry once it is encountered\n"
"    }\n"
"    \n"
"    if (get_global_id(0) == 0)\n"
"       buffer[get_global_size(0)] = 1;\n"
"    //try to eliminate all lines in the block. \n"
"    //in worst case scenarios, in each step only one line can be substituted, thus loop\n"
"    for (unsigned int k = 0; k<get_global_size(0); ++k)\n"
"    {\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      row_finished = 0;\n"
"      if (row < size) //valid index?\n"
"      {\n"
"        if (waiting_for >= 0)\n"
"        {\n"
"          if (buffer[waiting_for] == 1)\n"
"            waiting_for = -1;\n"
"        }\n"
"        \n"
"        if (waiting_for == -1) //substitution not yet done, check whether possible\n"
"        {\n"
"          //check whether reduction is possible (no barrier in here: trip count differs per thread):\n"
"          for (unsigned int j = waiting_for_index; j < row_index_end; ++j)\n"
"          {\n"
"            col = column_indices[j];\n"
"            if (col >= block_offset + get_global_size(0))  //index valid, but not from current block\n"
"              vec_entries[get_global_id(0)] -= elements[j] * vector[col];\n"
"            else if (col > row)  //index is from current block\n"
"            {\n"
"              if (buffer[col - block_offset] == 0) //entry is not yet calculated\n"
"              {\n"
"                waiting_for = col - block_offset;\n"
"                waiting_for_index = j;\n"
"                break;\n"
"              }\n"
"              else  //updated entry is available in shared memory:\n"
"                vec_entries[get_global_id(0)] -= elements[j] * vec_entries[col - block_offset];\n"
"            }\n"
"            else if (col == row)\n"
"              diagonal_entry = elements[j];\n"
"          }\n"
"          \n"
"          if (waiting_for == -1)  //this row is done\n"
"          {\n"
"            vec_entries[get_global_id(0)] /= diagonal_entry;\n"
"            row_finished = 1;\n"
"            waiting_for = -2; //magic number: thread is finished\n"
"          }\n"
"        } \n"
"      } //row < size\n"
"      else\n"
"        row_finished = 1; //work done (because there is no work to be done at all...)\n"
"      \n"
"      //publish flags only after all threads have finished reading them:\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      if (row_finished == 1)\n"
"        buffer[get_global_id(0)] = 1;\n"
"      \n"
"      ///////// check whether all threads are done. If yes, exit loop /////////////\n"
"      if (buffer[get_global_id(0)] == 0)\n"
"        buffer[get_global_size(0)] = 0;\n"
"      barrier(CLK_LOCAL_MEM_FENCE);\n"
"      \n"
"      all_done = buffer[get_global_size(0)];\n"
"      barrier(CLK_LOCAL_MEM_FENCE); //every thread holds its copy before thread 0 resets the flag\n"
"      if (all_done > 0)  //all threads break the loop simultaneously\n"
"        break;\n"
"      if (get_global_id(0) == 0)\n"
"        buffer[get_global_size(0)] = 1;\n"
"    } //for k\n"
"    \n"
"    //write to vector:\n"
"    if (row < size)\n"
"      vector[row] = vec_entries[get_global_id(0)];\n"
"    \n"
"    barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
"  } //for block_num\n"
"}\n"
; //compressed_matrix_align1_lu_backward
00411 
// OpenCL C source text of the kernel `bicgstab_kernel1` together with its two helpers.
//
// helper_bicgstab_kernel1_parallel_reduction(tmp_buffer)
//   Sums tmp_buffer[0 .. get_local_size(0)) into tmp_buffer[0] by repeatedly halving
//   the stride; a local size that is not a power of two would skip entries.
//
// bicgstab_kernel1_inner_prod(vec1, vec2, size, tmp_buffer)
//   Returns the inner product of vec1 and vec2, read from tmp_buffer[0] after the reduction.
//   NOTE(review): there is no barrier after the final read of tmp_buffer[0], so calling the
//   function twice in a row with the same tmp_buffer looks unsafe; this kernel calls it once.
//
// bicgstab_kernel1(...), all vectors of length `size`:
//   alpha_local = ip_rr0star[0] / <tmp0, r0star>
//   s           = residual - alpha_local * tmp0
//   alpha[0]    = alpha_local   (written by the work-item with global id 0)
//   tmp_buffer is __local scratch with get_local_size(0) entries.
//
// Every work-item strides over the whole vector by get_local_size(0), hence the kernel
// is presumably meant to be launched with a single work-group -- verify against the host code.
00412 const char * const compressed_matrix_align1_bicgstab_kernel1 = 
00413 "void helper_bicgstab_kernel1_parallel_reduction( __local float * tmp_buffer )\n"
00414 "{\n"
00415 "  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
00416 "  {\n"
00417 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00418 "    if (get_local_id(0) < stride)\n"
00419 "      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
00420 "  }\n"
00421 "}\n"
00422 "//////// inner products:\n"
00423 "float bicgstab_kernel1_inner_prod(\n"
00424 "          __global const float * vec1,\n"
00425 "          __global const float * vec2,\n"
00426 "          unsigned int size,\n"
00427 "          __local float * tmp_buffer)\n"
00428 "{\n"
00429 "  float tmp = 0;\n"
// i_end is `size` rounded up to a multiple of the local size; the `i < size` test
// below filters the padding iterations.
00430 "  unsigned int i_end = ((size - 1) / get_local_size(0) + 1) * get_local_size(0);\n"
00431 "  for (unsigned int i = get_local_id(0); i < i_end; i += get_local_size(0))\n"
00432 "  {\n"
00433 "    if (i < size)\n"
00434 "      tmp += vec1[i] * vec2[i];\n"
00435 "  }\n"
00436 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00437 "  \n"
00438 "  helper_bicgstab_kernel1_parallel_reduction(tmp_buffer);\n"
00439 "  barrier(CLK_LOCAL_MEM_FENCE);\n"
00440 "  return tmp_buffer[0];\n"
00441 "}\n"
00442 "__kernel void bicgstab_kernel1(\n"
00443 "          __global const float * tmp0,\n"
00444 "          __global const float * r0star, \n"
00445 "          __global const float * residual,\n"
00446 "          __global float * s,\n"
00447 "          __global float * alpha,\n"
00448 "          __global const float * ip_rr0star,\n"
00449 "          __local float * tmp_buffer,\n"
00450 "          unsigned int size) \n"
00451 "{ \n"
00452 "  float alpha_local = ip_rr0star[0] / bicgstab_kernel1_inner_prod(tmp0, r0star, size, tmp_buffer);\n"
00453 "  \n"
00454 "  for (unsigned int i = get_local_id(0); i < size; i += get_local_size(0))\n"
00455 "    s[i] = residual[i] - alpha_local * tmp0[i];\n"
00456 "  \n"
00457 "  if (get_global_id(0) == 0)\n"
00458 "    alpha[0] = alpha_local;\n"
00459 "}\n"
00460 ; //compressed_matrix_align1_bicgstab_kernel1
00461 
// OpenCL C source text of the kernel `vec_mul` for matrices whose column indices
// and values are read in packets of eight (uint8 / float8).
//
// Kernel arguments:
//   row_indices    - per-row start offsets; row r spans [row_indices[r], row_indices[r+1])
//   column_indices - column indices, read as uint8 packets
//   elements       - matrix values, read as float8 packets
//   vector         - input vector, gathered through the column indices
//   result         - output; result[row] receives the dot product of the row with `vector`
//   size           - number of rows to process
//
// Each work-item handles the rows get_global_id(0), get_global_id(0) + get_global_size(0), ...
// NOTE(review): the row offsets are divided by 8 using integer division, so every row
// presumably starts and ends on a multiple of eight entries -- confirm against the host code.
00462 const char * const compressed_matrix_align8_vec_mul = 
00463 "__kernel void vec_mul(\n"
00464 "          __global const unsigned int * row_indices,\n"
00465 "          __global const uint8 * column_indices, \n"
00466 "          __global const float8 * elements,\n"
00467 "          __global const float * vector,  \n"
00468 "          __global float * result,\n"
00469 "          unsigned int size)\n"
00470 "{ \n"
00471 "  float dot_prod;\n"
00472 "  unsigned int start, next_stop;\n"
00473 "  uint8 col_idx;\n"
00474 "  float8 tmp_vec;\n"
00475 "  float8 tmp_entries;\n"
00476 "  for (unsigned int row = get_global_id(0); row < size; row += get_global_size(0))\n"
00477 "  {\n"
00478 "    dot_prod = 0.0f;\n"
// Convert the scalar row offsets into packet indices (eight entries per packet).
00479 "    start = row_indices[row] / 8;\n"
00480 "    next_stop = row_indices[row+1] / 8;\n"
00481 "    for (unsigned int i = start; i < next_stop; ++i)\n"
00482 "    {\n"
00483 "      col_idx = column_indices[i];\n"
00484 "      tmp_entries = elements[i];\n"
// Gather the eight vector entries addressed by this packet's column indices.
00485 "      tmp_vec.s0 = vector[col_idx.s0];\n"
00486 "      tmp_vec.s1 = vector[col_idx.s1];\n"
00487 "      tmp_vec.s2 = vector[col_idx.s2];\n"
00488 "      tmp_vec.s3 = vector[col_idx.s3];\n"
00489 "      tmp_vec.s4 = vector[col_idx.s4];\n"
00490 "      tmp_vec.s5 = vector[col_idx.s5];\n"
00491 "      tmp_vec.s6 = vector[col_idx.s6];\n"
00492 "      tmp_vec.s7 = vector[col_idx.s7];\n"
// The packet is accumulated as two dot products over its lower and upper halves.
00493 "      dot_prod += dot(tmp_entries.lo, tmp_vec.lo);\n"
00494 "      dot_prod += dot(tmp_entries.hi, tmp_vec.hi);\n"
00495 "    }\n"
00496 "    result[row] = dot_prod;\n"
00497 "  }\n"
00498 "}\n"
00499 ; //compressed_matrix_align8_vec_mul
00500 
00501   }  //namespace kernels
00502  }  //namespace linalg
00503 }  //namespace viennacl
00504 #endif

Generated on Sat May 21 2011 20:36:50 for ViennaCL - The Vienna Computing Library by  doxygen 1.7.1