• Main Page
  • Namespaces
  • Data Structures
  • Files
  • File List
  • Globals

/data/development/ViennaCL/ViennaCL-1.1.2/viennacl/linalg/kernels/vector_source.h

Go to the documentation of this file.
00001 #ifndef _VIENNACL_VECTOR_SOURCE_HPP_
00002 #define _VIENNACL_VECTOR_SOURCE_HPP_
00003 //Automatically generated file from aux-directory, do not edit manually!
00004 namespace viennacl
00005 {
00006  namespace linalg
00007  {
00008   namespace kernels
00009   {
// OpenCL C source for kernel `inplace_sub` on float16-packed data:
// vec1[i] -= vec2[i] for every float16 index i < size/16. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0). Because of the integer
// division, a size % 16 remainder is not processed (size is presumably the scalar
// element count -- confirm against the host code).
00010 const char * const vector_align16_inplace_sub = 
00011 "__kernel void inplace_sub(\n"
00012 "          __global float16 * vec1,\n"
00013 "          __global const float16 * vec2,\n"
00014 "          unsigned int size) \n"
00015 "{ \n"
00016 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00017 "    vec1[i] -= vec2[i];\n"
00018 "}\n"
00019 ; //vector_align16_inplace_sub
00020 
// OpenCL C source for kernel `mult` on float16-packed data:
// result[i] = vec[i] * factor for every float16 index i < size/16, where factor is
// loaded once from the __global pointer fac. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0); a size % 16 remainder is
// not processed.
00021 const char * const vector_align16_mult = 
00022 "__kernel void mult(\n"
00023 "          __global const float16 * vec,\n"
00024 "          __global const float * fac, \n"
00025 "          __global float16 * result,\n"
00026 "          unsigned int size) \n"
00027 "{ \n"
00028 "  float factor = *fac;\n"
00029 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00030 "    result[i] = vec[i] * factor;\n"
00031 "}\n"
00032 ; //vector_align16_mult
00033 
// OpenCL C source for kernel `sub` on float16-packed data:
// result[i] = vec1[i] - vec2[i] for every float16 index i < size/16. Each work-item
// starts at get_global_id(0) and advances by get_global_size(0); a size % 16
// remainder is not processed.
00034 const char * const vector_align16_sub = 
00035 "__kernel void sub(\n"
00036 "          __global const float16 * vec1,\n"
00037 "          __global const float16 * vec2, \n"
00038 "          __global float16 * result,\n"
00039 "          unsigned int size)\n"
00040 "{ \n"
00041 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00042 "    result[i] = vec1[i] - vec2[i];\n"
00043 "}\n"
00044 ; //vector_align16_sub
00045 
// OpenCL C source for kernel `cpu_inplace_mult` on float16-packed data:
// vec[i] *= factor for every float16 index i < size/16, with factor passed by value
// as a kernel argument. Each work-item starts at get_global_id(0) and advances by
// get_global_size(0); a size % 16 remainder is not processed.
// NOTE(review): this identifier ends in `_mul` while the kernel it holds is named
// `cpu_inplace_mult`; the mismatch presumably comes from the generator's input file
// name -- confirm before relying on a name-based lookup.
00046 const char * const vector_align16_cpu_inplace_mul = 
00047 "\n"
00048 "__kernel void cpu_inplace_mult(\n"
00049 "          __global float16 * vec,\n"
00050 "          float factor, \n"
00051 "          unsigned int size) \n"
00052 "{ \n"
00053 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00054 "    vec[i] *= factor;\n"
00055 "}\n"
00056 "\n"
00057 ; //vector_align16_cpu_inplace_mul
00058 
// OpenCL C source for kernel `add` on float16-packed data:
// result[i] = vec1[i] + vec2[i] for every float16 index i < size/16. Each work-item
// starts at get_global_id(0) and advances by get_global_size(0); a size % 16
// remainder is not processed.
00059 const char * const vector_align16_add = 
00060 "__kernel void add(\n"
00061 "          __global const float16 * vec1,\n"
00062 "          __global const float16 * vec2, \n"
00063 "          __global float16 * result,\n"
00064 "          unsigned int size)\n"
00065 "{ \n"
00066 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00067 "    result[i] = vec1[i] + vec2[i];\n"
00068 "}\n"
00069 ; //vector_align16_add
00070 
// OpenCL C source for kernel `cpu_mult` on float16-packed data:
// result[i] = vec[i] * factor for every float16 index i < size/16, with factor
// passed by value as a kernel argument. Each work-item starts at get_global_id(0)
// and advances by get_global_size(0); a size % 16 remainder is not processed.
00071 const char * const vector_align16_cpu_mult = 
00072 "__kernel void cpu_mult(\n"
00073 "          __global const float16 * vec,\n"
00074 "          float factor, \n"
00075 "          __global float16 * result,\n"
00076 "          unsigned int size) \n"
00077 "{ \n"
00078 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00079 "    result[i] = vec[i] * factor;\n"
00080 "}\n"
00081 ; //vector_align16_cpu_mult
00082 
// OpenCL C source for kernel `inplace_divide` on float16-packed data:
// vec[i] /= factor for every float16 index i < size/16, where factor is loaded once
// from the __global pointer fac. Each work-item starts at get_global_id(0) and
// advances by get_global_size(0); a size % 16 remainder is not processed.
00083 const char * const vector_align16_inplace_divide = 
00084 "__kernel void inplace_divide(\n"
00085 "          __global float16 * vec,\n"
00086 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00087 "          unsigned int size) \n"
00088 "{ \n"
00089 "  float factor = *fac;\n"
00090 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00091 "    vec[i] /= factor;\n"
00092 "}\n"
00093 ; //vector_align16_inplace_divide
00094 
// OpenCL C source for kernel `inplace_add` on float16-packed data:
// vec1[i] += vec2[i] for every float16 index i < size/16. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0); a size % 16 remainder is
// not processed.
00095 const char * const vector_align16_inplace_add = 
00096 "__kernel void inplace_add(\n"
00097 "          __global float16 * vec1,\n"
00098 "          __global const float16 * vec2,\n"
00099 "          unsigned int size) \n"
00100 "{ \n"
00101 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00102 "    vec1[i] += vec2[i];\n"
00103 "}\n"
00104 ; //vector_align16_inplace_add
00105 
// OpenCL C source for kernel `divide` on float16-packed data:
// result[i] = vec[i] / factor for every float16 index i < size/16, where factor is
// loaded once from the __global pointer fac. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0); a size % 16 remainder is
// not processed. The kernel is called `divide` rather than `div` (see the note
// embedded at the top of the kernel source).
00106 const char * const vector_align16_divide = 
00107 "//Note: 'div' cannot be used because of complaints by the jit-compiler\n"
00108 "__kernel void divide(\n"
00109 "          __global const float16 * vec,\n"
00110 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00111 "          __global float16 * result,\n"
00112 "          unsigned int size)  \n"
00113 "{ \n"
00114 "  float factor = *fac;\n"
00115 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00116 "    result[i] = vec[i] / factor;\n"
00117 "}\n"
00118 ; //vector_align16_divide
00119 
// OpenCL C source for kernel `inplace_mult` on float16-packed data:
// vec[i] *= factor for every float16 index i < size/16, where factor is loaded once
// from the __global pointer fac. Each work-item starts at get_global_id(0) and
// advances by get_global_size(0); a size % 16 remainder is not processed.
00120 const char * const vector_align16_inplace_mult = 
00121 "__kernel void inplace_mult(\n"
00122 "          __global float16 * vec,\n"
00123 "          __global const float * fac, \n"
00124 "          unsigned int size) \n"
00125 "{ \n"
00126 "  float factor = *fac;\n"
00127 "  for (unsigned int i = get_global_id(0); i < size/16; i += get_global_size(0))\n"
00128 "    vec[i] *= factor;\n"
00129 "}\n"
00130 ; //vector_align16_inplace_mult
00131 
// OpenCL C source for kernel `inplace_div_add` on float4-packed data:
// vec1[i] += vec2[i] / factor for every float4 index i < size/4, where factor is
// loaded once from the __global pointer fac. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0); a size % 4 remainder is
// not processed.
//
// Fix: the kernel body used `-=`, which made it identical to inplace_div_sub and
// subtracted the quotient instead of adding it.
// NOTE: this header is marked as generated ("do not edit manually"); the same fix
// has to be applied to the generator's input or it will be lost on regeneration.
const char * const vector_align4_inplace_div_add = 
"__kernel void inplace_div_add(\n"
"          __global float4 * vec1,\n"
"          __global const float4 * vec2,\n"
"          __global const float * fac,   //CPU variant is mapped to mult_add\n"
"          unsigned int size\n"
"          ) \n"
"{ \n"
"  float factor = *fac;\n"
"  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
"    vec1[i] += vec2[i] / factor;\n"
"}\n"
; //vector_align4_inplace_div_add
00145 
// OpenCL C source for kernel `cpu_mul_add` on float4-packed data:
// result[i] = vec1[i] * factor + vec2[i] for every float4 index i < size/4, with
// factor passed by value as a kernel argument. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0); a size % 4 remainder is
// not processed.
00146 const char * const vector_align4_cpu_mul_add = 
00147 "__kernel void cpu_mul_add(\n"
00148 "          __global const float4 * vec1,\n"
00149 "          float factor,\n"
00150 "          __global const float4 * vec2,\n"
00151 "          __global float4 * result,\n"
00152 "          unsigned int size\n"
00153 "          ) \n"
00154 "{ \n"
00155 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00156 "    result[i] = vec1[i] * factor + vec2[i];\n"
00157 "}\n"
00158 ; //vector_align4_cpu_mul_add
00159 
// OpenCL C source for kernel `inplace_mul_sub` on float4-packed data:
// vec1[i] -= vec2[i] * factor for every float4 index i < size/4, where factor is
// loaded once from the __global pointer fac. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0); a size % 4 remainder is
// not processed.
00160 const char * const vector_align4_inplace_mul_sub = 
00161 "__kernel void inplace_mul_sub(\n"
00162 "          __global float4 * vec1,\n"
00163 "          __global const float4 * vec2,\n"
00164 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00165 "          unsigned int size\n"
00166 "          ) \n"
00167 "{ \n"
00168 "  float factor = *fac;\n"
00169 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00170 "    vec1[i] -= vec2[i] * factor;\n"
00171 "}\n"
00172 ; //vector_align4_inplace_mul_sub
00173 
// OpenCL C source for kernel `inplace_mul_add` on float4-packed data:
// vec1[i] += vec2[i] * factor for every float4 index i < size_div_4, where factor is
// loaded once from the __global pointer fac and size_div_4 = size >> 2 (the same
// bound as the size/4 used by the sibling float4 kernels, computed once up front).
// Each work-item starts at get_global_id(0) and advances by get_global_size(0);
// a size % 4 remainder is not processed.
00174 const char * const vector_align4_inplace_mul_add = 
00175 "__kernel void inplace_mul_add(\n"
00176 "          __global float4 * vec1,\n"
00177 "          __global const float4 * vec2,\n"
00178 "          __global const float * fac,\n"
00179 "          unsigned int size\n"
00180 "          ) \n"
00181 "{ \n"
00182 "  float factor = *fac;\n"
00183 "  unsigned int size_div_4 = size >> 2;\n"
00184 "  for (unsigned int i = get_global_id(0); i < size_div_4; i += get_global_size(0))\n"
00185 "    vec1[i] += vec2[i] * factor;\n"
00186 "}\n"
00187 ; //vector_align4_inplace_mul_add
00188 
// OpenCL C source for kernel `mul_add` on float4-packed data:
// result[i] = vec1[i] * factor + vec2[i] for every float4 index i < size/4, where
// factor is loaded once from the __global pointer fac. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0); a size % 4 remainder is
// not processed.
00189 const char * const vector_align4_mul_add = 
00190 "__kernel void mul_add(\n"
00191 "          __global const float4 * vec1,\n"
00192 "          __global const float * fac,\n"
00193 "          __global const float4 * vec2,\n"
00194 "          __global float4 * result,\n"
00195 "          unsigned int size\n"
00196 "          ) \n"
00197 "{ \n"
00198 "  float factor = *fac;\n"
00199 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00200 "    result[i] = vec1[i] * factor + vec2[i];\n"
00201 "}\n"
00202 ; //vector_align4_mul_add
00203 
// OpenCL C source for kernel `cpu_inplace_mul_add` on float4-packed data:
// vec1[i] += vec2[i] * factor for every float4 index i < size/4, with factor passed
// by value as a kernel argument. Each work-item starts at get_global_id(0) and
// advances by get_global_size(0); a size % 4 remainder is not processed.
00204 const char * const vector_align4_cpu_inplace_mul_add = 
00205 "__kernel void cpu_inplace_mul_add(\n"
00206 "          __global float4 * vec1,\n"
00207 "          __global const float4 * vec2,\n"
00208 "          float factor,\n"
00209 "          unsigned int size\n"
00210 "          ) \n"
00211 "{ \n"
00212 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00213 "    vec1[i] += vec2[i] * factor;\n"
00214 "}\n"
00215 ; //vector_align4_cpu_inplace_mul_add
00216 
// OpenCL C source for kernel `inplace_div_sub` on float4-packed data:
// vec1[i] -= vec2[i] / factor for every float4 index i < size/4, where factor is
// loaded once from the __global pointer fac. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0); a size % 4 remainder is
// not processed.
00217 const char * const vector_align4_inplace_div_sub = 
00218 "__kernel void inplace_div_sub(\n"
00219 "          __global float4 * vec1,\n"
00220 "          __global const float4 * vec2,\n"
00221 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00222 "          unsigned int size\n"
00223 "          ) \n"
00224 "{ \n"
00225 "  float factor = *fac;\n"
00226 "  for (unsigned int i = get_global_id(0); i < size/4; i += get_global_size(0))\n"
00227 "    vec1[i] -= vec2[i] / factor;\n"
00228 "}\n"
00229 ; //vector_align4_inplace_div_sub
00230 
// OpenCL C source for kernel `inplace_sub` on scalar float data:
// vec1[i] -= vec2[i] for every i < size. Each work-item starts at get_global_id(0)
// and advances by get_global_size(0).
00231 const char * const vector_align1_inplace_sub = 
00232 "__kernel void inplace_sub(\n"
00233 "          __global float * vec1,\n"
00234 "          __global const float * vec2,\n"
00235 "          unsigned int size) \n"
00236 "{ \n"
00237 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00238 "    vec1[i] -= vec2[i];\n"
00239 "}\n"
00240 ; //vector_align1_inplace_sub
00241 
// OpenCL C source for kernel `norm_inf`: per-work-group maximum of |vec[i]|.
//
//  * impl_norm_inf: every work-item folds fmax(fabs(vec[i]), tmp) over
//    [start_index, end_index), starting at its local id and stepping by the
//    work-group size, stores its partial maximum in tmp_buffer[local id], and the
//    work-group then tree-reduces tmp_buffer so that tmp_buffer[0] holds the maximum.
//  * norm_inf: work-group g handles the slice [g*size/G, (g+1)*size/G) with
//    G = get_num_groups(0); the work-item with local id 0 writes the group's result
//    to group_buffer[g]. Combining the per-group values is left to the caller.
//
// Fix: the reduction used get_global_size(0)/get_global_id(0) to index the __local
// tmp_buffer although it was filled by get_local_id(0). With more than one
// work-group that reads and writes past the work-group's slots. It now uses
// get_local_size(0)/get_local_id(0); behavior with a single work-group is unchanged.
// The halving stride folds every slot only when the work-group size is a power of two.
// NOTE: this header is marked as generated ("do not edit manually"); the same fix
// has to be applied to the generator's input or it will be lost on regeneration.
const char * const vector_align1_norm_inf = 
"\n"
"////// norm_inf\n"
"float impl_norm_inf(\n"
"          __global const float * vec,\n"
"          unsigned int start_index,\n"
"          unsigned int end_index,\n"
"          __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
"    tmp = fmax(fabs(vec[i]), tmp);\n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"  \n"
"  //step 2: parallel reduction:\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]);\n"
"  }\n"
"  \n"
"  return tmp_buffer[0];\n"
"}\n"
"\n"
"__kernel void norm_inf(\n"
"          __global const float * vec,\n"
"          unsigned int size,\n"
"          __local float * tmp_buffer,\n"
"          global float * group_buffer)\n"
"{\n"
"  float tmp = impl_norm_inf(vec,\n"
"                          (      get_group_id(0) * size) / get_num_groups(0),\n"
"                          ((get_group_id(0) + 1) * size) / get_num_groups(0),\n"
"                          tmp_buffer);\n"
"  \n"
"  if (get_local_id(0) == 0)\n"
"    group_buffer[get_group_id(0)] = tmp;  \n"
"}\n"
; //vector_align1_norm_inf
00282 
// OpenCL C source for kernel `index_norm_inf`: index of an entry of vec with the
// largest absolute value.
//
//  * float_vector1_index_norm_inf_impl: each work-item scans i = global id,
//    global id + global size, ... < size, tracking the largest fabs(vec[i]) and
//    its index, publishes both into float_buffer/index_buffer at its global id,
//    and the buffers are then tree-reduced so that index_buffer[0] holds the result.
//  * index_norm_inf: work-item 0 writes that index to *result.
//
// Fix: the per-work-item maximum and index were stored only inside the
// `if (cur_max < tmp)` branch, so a work-item that saw no element (global id >= size)
// or only zeros left its float_buffer/index_buffer slot uninitialized and the
// reduction consumed garbage. Both slots are now written unconditionally after
// the scan (0.0f / index 0 when nothing larger was found).
//
// NOTE(review): the __local buffers are indexed by get_global_id(0), so this
// presumably has to be launched as a single work-group -- confirm against the
// host code. On equal magnitudes the reduction keeps the lower work-item's entry,
// which is not necessarily the smallest index.
// NOTE: this header is marked as generated ("do not edit manually"); the same fix
// has to be applied to the generator's input or it will be lost on regeneration.
const char * const vector_align1_index_norm_inf = 
"//index_norm_inf:\n"
"unsigned int float_vector1_index_norm_inf_impl(\n"
"          __global const float * vec,\n"
"          unsigned int size,\n"
"          __local float * float_buffer,\n"
"          __local unsigned int * index_buffer)\n"
"{\n"
"  //step 1: fill buffer:\n"
"  float cur_max = 0.0f;\n"
"  unsigned int cur_index = 0;\n"
"  float tmp;\n"
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
"  {\n"
"    tmp = fabs(vec[i]);\n"
"    if (cur_max < tmp)\n"
"    {\n"
"      cur_max = tmp;\n"
"      cur_index = i;\n"
"    }\n"
"  }\n"
"  //always publish, so that no slot is left uninitialized:\n"
"  float_buffer[get_global_id(0)] = cur_max;\n"
"  index_buffer[get_global_id(0)] = cur_index;\n"
"  \n"
"  //step 2: parallel reduction:\n"
"  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_global_id(0) < stride)\n"
"    {\n"
"      //find the first occurring index\n"
"      if (float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride])\n"
"      {\n"
"        index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride];\n"
"        float_buffer[get_global_id(0)] = float_buffer[get_global_id(0)+stride];\n"
"      }\n"
"      \n"
"      //index_buffer[get_global_id(0)] = float_buffer[get_global_id(0)] < float_buffer[get_global_id(0)+stride] ? index_buffer[get_global_id(0)+stride] : index_buffer[get_global_id(0)];\n"
"      //float_buffer[get_global_id(0)] = max(float_buffer[get_global_id(0)], float_buffer[get_global_id(0)+stride]);\n"
"    }\n"
"  }\n"
"  \n"
"  return index_buffer[0];\n"
"}\n"
"\n"
"__kernel void index_norm_inf(\n"
"          __global float * vec,\n"
"          unsigned int size,\n"
"          __local float * float_buffer,\n"
"          __local unsigned int * index_buffer,\n"
"          global unsigned int * result) \n"
"{ \n"
"  unsigned int tmp = float_vector1_index_norm_inf_impl(vec, size, float_buffer, index_buffer);\n"
"  if (get_global_id(0) == 0) *result = tmp;\n"
"}\n"
"\n"
"\n"
; //vector_align1_index_norm_inf
00339 
// OpenCL C source for kernel `mult` on scalar float data:
// result[i] = vec[i] * factor for every i < size, where factor is loaded once from
// the __global pointer fac. Each work-item starts at get_global_id(0) and advances
// by get_global_size(0).
00340 const char * const vector_align1_mult = 
00341 "__kernel void mult(\n"
00342 "          __global const float * vec,\n"
00343 "          __global const float * fac, \n"
00344 "          __global float * result,\n"
00345 "          unsigned int size) \n"
00346 "{ \n"
00347 "  float factor = *fac;\n"
00348 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00349 "    result[i] = vec[i] * factor;\n"
00350 "}\n"
00351 ; //vector_align1_mult
00352 
// OpenCL C source for kernel `swap` on scalar float data: exchanges vec1[i] and
// vec2[i] through a temporary for every i < size. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0).
00353 const char * const vector_align1_swap = 
00354 "////// swap:\n"
00355 "__kernel void swap(\n"
00356 "          __global float * vec1,\n"
00357 "          __global float * vec2,\n"
00358 "          unsigned int size\n"
00359 "          ) \n"
00360 "{ \n"
00361 "  float tmp;\n"
00362 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00363 "  {\n"
00364 "    tmp = vec2[i];\n"
00365 "    vec2[i] = vec1[i];\n"
00366 "    vec1[i] = tmp;\n"
00367 "  }\n"
00368 "}\n"
00369 " \n"
00370 ; //vector_align1_swap
00371 
// OpenCL C source for kernel `inplace_div_add` on scalar float data:
// vec1[i] += vec2[i] / factor for every i < size, where factor is loaded once from
// the __global pointer fac. Each work-item starts at get_global_id(0) and advances
// by get_global_size(0).
//
// Fix: the kernel body used `-=`, which made it identical to inplace_div_sub and
// subtracted the quotient instead of adding it.
// NOTE: this header is marked as generated ("do not edit manually"); the same fix
// has to be applied to the generator's input or it will be lost on regeneration.
const char * const vector_align1_inplace_div_add = 
"///// divide add:\n"
"__kernel void inplace_div_add(\n"
"          __global float * vec1,\n"
"          __global const float * vec2,\n"
"          __global const float * fac,   //CPU variant is mapped to mult_add\n"
"          unsigned int size\n"
"          ) \n"
"{ \n"
"  float factor = *fac;\n"
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
"    vec1[i] += vec2[i] / factor;\n"
"}\n"
; //vector_align1_inplace_div_add
00386 
// OpenCL C source for kernel `norm_2`: per-work-group sum of squares of vec.
// No square root is taken in this kernel; group_buffer receives the partial sums.
//
//  * helper_norm2_parallel_reduction: tree-reduces (adds) tmp_buffer within the
//    work-group so that tmp_buffer[0] holds the group's total.
//  * impl_norm_2: every work-item accumulates vec[i]*vec[i] over
//    [start_index, end_index), starting at its local id and stepping by the
//    work-group size, stores the partial sum in tmp_buffer[local id] and reduces.
//  * norm_2: work-group g handles the slice [g*size/G, (g+1)*size/G) with
//    G = get_num_groups(0); the work-item with local id 0 writes the group's sum
//    to group_buffer[g].
//
// Fixes:
//  - The reduction used get_global_size(0)/get_global_id(0) to index the __local
//    tmp_buffer although it was filled by get_local_id(0). With more than one
//    work-group that reads and writes past the work-group's slots. It now uses
//    get_local_size(0)/get_local_id(0); single-work-group behavior is unchanged.
//  - Removed the stray ';' after the closing brace of impl_norm_2.
// The halving stride folds every slot only when the work-group size is a power of two.
// NOTE: this header is marked as generated ("do not edit manually"); the same fix
// has to be applied to the generator's input or it will be lost on regeneration.
const char * const vector_align1_norm_2 = 
"//helper:\n"
"void helper_norm2_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
"  }\n"
"}\n"
"\n"
"////// norm_2\n"
"float impl_norm_2(\n"
"          __global const float * vec,\n"
"          unsigned int start_index,\n"
"          unsigned int end_index,\n"
"          __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  float vec_entry = 0;\n"
"  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
"  {\n"
"    vec_entry = vec[i];\n"
"    tmp += vec_entry * vec_entry;\n"
"  }\n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"  \n"
"  helper_norm2_parallel_reduction(tmp_buffer);\n"
"  \n"
"  return tmp_buffer[0];\n"
"}\n"
"\n"
"__kernel void norm_2(\n"
"          __global const float * vec,\n"
"          unsigned int size,\n"
"          __local float * tmp_buffer,\n"
"          global float * group_buffer)\n"
"{\n"
"  float tmp = impl_norm_2(vec,\n"
"                          (      get_group_id(0) * size) / get_num_groups(0),\n"
"                          ((get_group_id(0) + 1) * size) / get_num_groups(0),\n"
"                          tmp_buffer);\n"
"  \n"
"  if (get_local_id(0) == 0)\n"
"    group_buffer[get_group_id(0)] = tmp;  \n"
"}\n"
"\n"
; //vector_align1_norm_2
00436 
// OpenCL C source for kernel `sub` on scalar float data:
// result[i] = vec1[i] - vec2[i] for every i < size. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0).
00437 const char * const vector_align1_sub = 
00438 "__kernel void sub(\n"
00439 "          __global const float * vec1,\n"
00440 "          __global const float * vec2, \n"
00441 "          __global float * result,\n"
00442 "          unsigned int size)\n"
00443 "{ \n"
00444 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00445 "    result[i] = vec1[i] - vec2[i];\n"
00446 "}\n"
00447 ; //vector_align1_sub
00448 
// OpenCL C source for kernel `cpu_mul_add` on scalar float data:
// result[i] = vec1[i] * factor + vec2[i] for every i < size, with factor passed by
// value as a kernel argument. Each work-item starts at get_global_id(0) and
// advances by get_global_size(0).
00449 const char * const vector_align1_cpu_mul_add = 
00450 "__kernel void cpu_mul_add(\n"
00451 "          __global const float * vec1,\n"
00452 "          float factor,\n"
00453 "          __global const float * vec2,\n"
00454 "          __global float * result,\n"
00455 "          unsigned int size\n"
00456 "          ) \n"
00457 "{ \n"
00458 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00459 "    result[i] = vec1[i] * factor + vec2[i];\n"
00460 "}\n"
00461 ; //vector_align1_cpu_mul_add
00462 
// OpenCL C source for kernel `vmax`: in-place tree reduction with fmax over the
// first get_global_size(0) entries of vec1 in global memory; work-item 0 then
// writes vec1[0] to *result. vec1 is overwritten by the reduction, and the `size`
// argument is not used by the kernel body.
// The halving stride folds every entry only when the global size is a power of two.
// NOTE(review): barrier() presumably synchronizes only the work-items of one
// work-group, so this looks like it must be launched as a single work-group whose
// size equals the number of entries to reduce -- confirm against the host code.
00463 const char * const vector_align1_vmax = 
00464 "__kernel void vmax(\n"
00465 "          __global float * vec1,\n"
00466 "          __global float * result,\n"
00467 "          unsigned int size) \n"
00468 "{ \n"
00469 "  //parallel reduction on global memory:\n"
00470 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00471 "  {\n"
00472 "    if (get_global_id(0) < stride)\n"
00473 "      vec1[get_global_id(0)] = fmax(vec1[get_global_id(0)+stride], vec1[get_global_id(0)]);\n"
00474 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00475 "  }\n"
00476 "  \n"
00477 "  if (get_global_id(0) == 0)\n"
00478 "    *result = vec1[0];  \n"
00479 "}\n"
00480 ; //vector_align1_vmax
00481 
// OpenCL C source for kernel `inner_prod`: per-work-group partial dot product of
// vec1 and vec2.
//
//  * helper_inner_prod_parallel_reduction: tree-reduces (adds) tmp_buffer within
//    the work-group using local ids, with a local-memory barrier before each step,
//    so that tmp_buffer[0] holds the group's total.
//  * impl_inner_prod: every work-item accumulates vec1[i]*vec2[i] over
//    [start_index, end_index), starting at its local id and stepping by the
//    work-group size, stores the partial sum in tmp_buffer[local id] and reduces.
//  * inner_prod: work-group g handles the slice [g*size/G, (g+1)*size/G) with
//    G = get_num_groups(0); the work-item with local id 0 writes the group's sum
//    to group_buffer[g]. Adding up group_buffer is left to the caller.
//
// The halving stride folds every slot only when the work-group size is a power of two.
00482 const char * const vector_align1_inner_prod = 
00483 "//helper:\n"
00484 "void helper_inner_prod_parallel_reduction( __local float * tmp_buffer )\n"
00485 "{\n"
00486 "  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
00487 "  {\n"
00488 "    barrier(CLK_LOCAL_MEM_FENCE);\n"
00489 "    if (get_local_id(0) < stride)\n"
00490 "      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
00491 "  }\n"
00492 "}\n"
00493 "//////// inner products:\n"
00494 "float impl_inner_prod(\n"
00495 "          __global const float * vec1,\n"
00496 "          __global const float * vec2,\n"
00497 "          unsigned int start_index,\n"
00498 "          unsigned int end_index,\n"
00499 "          __local float * tmp_buffer)\n"
00500 "{\n"
00501 "  float tmp = 0;\n"
00502 "  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
00503 "    tmp += vec1[i] * vec2[i];\n"
00504 "  tmp_buffer[get_local_id(0)] = tmp;\n"
00505 "  \n"
00506 "  helper_inner_prod_parallel_reduction(tmp_buffer);\n"
00507 "  \n"
00508 "  return tmp_buffer[0];\n"
00509 "}\n"
00510 "__kernel void inner_prod(\n"
00511 "          __global const float * vec1,\n"
00512 "          __global const float * vec2,\n"
00513 "          unsigned int size,\n"
00514 "          __local float * tmp_buffer,\n"
00515 "          global float * group_buffer)\n"
00516 "{\n"
00517 "  float tmp = impl_inner_prod(vec1,\n"
00518 "                              vec2,\n"
00519 "                              (      get_group_id(0) * size) / get_num_groups(0),\n"
00520 "                              ((get_group_id(0) + 1) * size) / get_num_groups(0),\n"
00521 "                              tmp_buffer);\n"
00522 "  \n"
00523 "  if (get_local_id(0) == 0)\n"
00524 "    group_buffer[get_group_id(0)] = tmp;\n"
00525 "  \n"
00526 "}\n"
00527 ; //vector_align1_inner_prod
00528 
// OpenCL C source for kernel `add` on scalar float data:
// result[i] = vec1[i] + vec2[i] for every i < size. Each work-item starts at
// get_global_id(0) and advances by get_global_size(0).
00529 const char * const vector_align1_add = 
00530 "__kernel void add(\n"
00531 "          __global const float * vec1,\n"
00532 "          __global const float * vec2, \n"
00533 "          __global float * result,\n"
00534 "          unsigned int size) \n"
00535 "{ \n"
00536 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00537 "    result[i] = vec1[i] + vec2[i];\n"
00538 "}\n"
00539 ; //vector_align1_add
00540 
// OpenCL C source for kernel `plane_rotation` on scalar float data: for every
// i < size, reads tmp1 = vec1[i] and tmp2 = vec2[i] and writes
//   vec1[i] = alpha * tmp1 + beta * tmp2
//   vec2[i] = alpha * tmp2 - beta * tmp1
// with alpha and beta passed by value. Each work-item starts at get_global_id(0)
// and advances by get_global_size(0).
//
// Fix: the first line of the kernel source contained `\alpha` / `\beta` unescaped
// inside the C++ string literal, which the C++ compiler parses as the escape
// sequences \a (BEL, 0x07) and \b (backspace, 0x08) followed by "lpha" / "eta".
// The backslashes are now escaped so the OpenCL source carries the intended text
// and no control characters.
// NOTE: this header is marked as generated ("do not edit manually"); the generator
// has to escape backslashes, or the fix will be lost on regeneration.
const char * const vector_align1_plane_rotation = 
"////// plane rotation: (x,y) <- (\\alpha x + \\beta y, -\\beta x + \\alpha y)\n"
"__kernel void plane_rotation(\n"
"          __global float * vec1,\n"
"          __global float * vec2, \n"
"          float alpha,\n"
"          float beta,\n"
"          unsigned int size) \n"
"{ \n"
"  float tmp1 = 0;\n"
"  float tmp2 = 0;\n"
"  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
"  {\n"
"    tmp1 = vec1[i];\n"
"    tmp2 = vec2[i];\n"
"    \n"
"    vec1[i] = alpha * tmp1 + beta * tmp2;\n"
"    vec2[i] = alpha * tmp2 - beta * tmp1;\n"
"  }\n"
"}\n"
; //vector_align1_plane_rotation
00562 
// OpenCL C source for kernel `inplace_mul_sub` on scalar float data:
// vec1[i] -= vec2[i] * factor for every i < size, where factor is loaded once from
// the __global pointer fac. Each work-item starts at get_global_id(0) and advances
// by get_global_size(0).
00563 const char * const vector_align1_inplace_mul_sub = 
00564 "__kernel void inplace_mul_sub(\n"
00565 "          __global float * vec1,\n"
00566 "          __global const float * vec2,\n"
00567 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00568 "          unsigned int size\n"
00569 "          ) \n"
00570 "{ \n"
00571 "  float factor = *fac;\n"
00572 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00573 "    vec1[i] -= vec2[i] * factor;\n"
00574 "}\n"
00575 ; //vector_align1_inplace_mul_sub
00576 
// OpenCL C source for kernel `inplace_mul_add` on scalar float data:
// vec1[i] += vec2[i] * factor for every i < size, where factor is loaded once from
// the __global pointer fac. Each work-item starts at get_global_id(0) and advances
// by get_global_size(0).
00577 const char * const vector_align1_inplace_mul_add = 
00578 "__kernel void inplace_mul_add(\n"
00579 "          __global float * vec1,\n"
00580 "          __global const float * vec2,\n"
00581 "          __global const float * fac,\n"
00582 "          unsigned int size\n"
00583 "          ) \n"
00584 "{ \n"
00585 "  float factor = *fac;\n"
00586 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00587 "    vec1[i] += vec2[i] * factor;\n"
00588 "}\n"
00589 ; //vector_align1_inplace_mul_add
00590 
// OpenCL C source for kernel `mul_add` on scalar float data:
// result[i] = vec1[i] * factor + vec2[i] for every i < size, where factor is loaded
// once from the __global pointer fac. Each work-item starts at get_global_id(0)
// and advances by get_global_size(0).
00591 const char * const vector_align1_mul_add = 
00592 "__kernel void mul_add(\n"
00593 "          __global const float * vec1,\n"
00594 "          __global const float * fac,\n"
00595 "          __global const float * vec2,\n"
00596 "          __global float * result,\n"
00597 "          unsigned int size\n"
00598 "          ) \n"
00599 "{ \n"
00600 "  float factor = *fac;\n"
00601 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00602 "    result[i] = vec1[i] * factor + vec2[i];\n"
00603 "}\n"
00604 ; //vector_align1_mul_add
00605 
// OpenCL C source for kernel `cpu_mult` on scalar float data:
// result[i] = vec[i] * factor for every i < size, with factor passed by value as a
// kernel argument. Each work-item starts at get_global_id(0) and advances by
// get_global_size(0).
00606 const char * const vector_align1_cpu_mult = 
00607 "__kernel void cpu_mult(\n"
00608 "          __global const float * vec,\n"
00609 "          float factor, \n"
00610 "          __global float * result,\n"
00611 "          unsigned int size) \n"
00612 "{ \n"
00613 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00614 "    result[i] = vec[i] * factor;\n"
00615 "}\n"
00616 ; //vector_align1_cpu_mult
00617 
// OpenCL C source for kernel `inplace_divide` on scalar float data:
// vec[i] /= factor for every i < size, where factor is loaded once from the
// __global pointer fac. Each work-item starts at get_global_id(0) and advances by
// get_global_size(0).
00618 const char * const vector_align1_inplace_divide = 
00619 "__kernel void inplace_divide(\n"
00620 "          __global float * vec,\n"
00621 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00622 "          unsigned int size) \n"
00623 "{ \n"
00624 "  float factor = *fac;\n"
00625 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00626 "    vec[i] /= factor;\n"
00627 "}\n"
00628 ; //vector_align1_inplace_divide
00629 
// OpenCL C source for kernel `sqrt_sum`: in-place tree reduction (addition) over
// the first get_global_size(0) entries of vec1 in global memory; work-item 0 then
// writes sqrt(vec1[0]) to *result. vec1 is overwritten by the reduction, and the
// `size` argument is not used by the kernel body.
// The halving stride folds every entry only when the global size is a power of two.
// NOTE(review): barrier() presumably synchronizes only the work-items of one
// work-group, so this looks like it must be launched as a single work-group whose
// size equals the number of entries to reduce -- confirm against the host code.
00630 const char * const vector_align1_sqrt_sum = 
00631 "__kernel void sqrt_sum(\n"
00632 "          __global float * vec1,\n"
00633 "          __global float * result,\n"
00634 "          unsigned int size) \n"
00635 "{ \n"
00636 "  //parallel reduction on global memory:  \n"
00637 "  \n"
00638 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00639 "  {\n"
00640 "    if (get_global_id(0) < stride)\n"
00641 "      vec1[get_global_id(0)] += vec1[get_global_id(0)+stride];\n"
00642 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00643 "  }\n"
00644 "  \n"
00645 "  if (get_global_id(0) == 0)\n"
00646 "    *result = sqrt(vec1[0]);\n"
00647 "  \n"
00648 "}\n"
00649 ; //vector_align1_sqrt_sum
00650 
// OpenCL C source for kernel `cpu_inplace_mul_add` on scalar float data:
// vec1[i] += vec2[i] * factor for every i < size, with factor passed by value as a
// kernel argument. Each work-item starts at get_global_id(0) and advances by
// get_global_size(0).
00651 const char * const vector_align1_cpu_inplace_mul_add = 
00652 "__kernel void cpu_inplace_mul_add(\n"
00653 "          __global float * vec1,\n"
00654 "          __global const float * vec2,\n"
00655 "          float factor,\n"
00656 "          unsigned int size\n"
00657 "          ) \n"
00658 "{ \n"
00659 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00660 "    vec1[i] += vec2[i] * factor;\n"
00661 "}\n"
00662 ; //vector_align1_cpu_inplace_mul_add
00663 
// OpenCL C source for kernel `inplace_add` on scalar float data:
// vec1[i] += vec2[i] for every i < size. Each work-item starts at get_global_id(0)
// and advances by get_global_size(0).
00664 const char * const vector_align1_inplace_add = 
00665 "__kernel void inplace_add(\n"
00666 "          __global float * vec1,\n"
00667 "          __global const float * vec2,\n"
00668 "          unsigned int size) \n"
00669 "{ \n"
00670 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00671 "    vec1[i] += vec2[i];\n"
00672 "}\n"
00673 ; //vector_align1_inplace_add
00674 
// OpenCL C source for kernel `divide` on scalar float data:
// result[i] = vec[i] / factor for every i < size, where factor is loaded once from
// the __global pointer fac. Each work-item starts at get_global_id(0) and advances
// by get_global_size(0). The kernel is called `divide` rather than `div` (see the
// note embedded at the top of the kernel source).
00675 const char * const vector_align1_divide = 
00676 "// Note: name 'div' is not allowed by the jit-compiler\n"
00677 "__kernel void divide(\n"
00678 "          __global const float * vec,\n"
00679 "          __global const float * fac,  //note: CPU variant is mapped to prod_scalar\n"
00680 "          __global float * result,\n"
00681 "          unsigned int size)  \n"
00682 "{ \n"
00683 "  float factor = *fac;\n"
00684 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00685 "    result[i] = vec[i] / factor;\n"
00686 "}\n"
00687 ; //vector_align1_divide
00688 
// OpenCL C source for kernel `norm_1`: per-work-group sum of |vec[i]|.
//
//  * helper_norm1_parallel_reduction: tree-reduces (adds) tmp_buffer within the
//    work-group so that tmp_buffer[0] holds the group's total.
//  * impl_norm_1: every work-item accumulates fabs(vec[i]) over
//    [start_index, end_index), starting at its local id and stepping by the
//    work-group size, stores the partial sum in tmp_buffer[local id] and reduces.
//  * norm_1: work-group g handles the slice [g*size/G, (g+1)*size/G) with
//    G = get_num_groups(0); the work-item with local id 0 writes the group's sum
//    to group_buffer[g]. Adding up group_buffer is left to the caller.
//
// Fixes:
//  - The reduction used get_global_size(0)/get_global_id(0) to index the __local
//    tmp_buffer although it was filled by get_local_id(0). With more than one
//    work-group that reads and writes past the work-group's slots. It now uses
//    get_local_size(0)/get_local_id(0); single-work-group behavior is unchanged.
//  - Removed the stray ';' after the closing brace of impl_norm_1.
// The halving stride folds every slot only when the work-group size is a power of two.
// NOTE: this header is marked as generated ("do not edit manually"); the same fix
// has to be applied to the generator's input or it will be lost on regeneration.
const char * const vector_align1_norm_1 = 
"//helper:\n"
"void helper_norm1_parallel_reduction( __local float * tmp_buffer )\n"
"{\n"
"  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2)\n"
"  {\n"
"    barrier(CLK_LOCAL_MEM_FENCE);\n"
"    if (get_local_id(0) < stride)\n"
"      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride];\n"
"  }\n"
"}\n"
"\n"
"////// norm_1\n"
"float impl_norm_1(\n"
"          __global const float * vec,\n"
"          unsigned int start_index,\n"
"          unsigned int end_index,\n"
"          __local float * tmp_buffer)\n"
"{\n"
"  float tmp = 0;\n"
"  for (unsigned int i = start_index + get_local_id(0); i < end_index; i += get_local_size(0))\n"
"    tmp += fabs(vec[i]);\n"
"  \n"
"  tmp_buffer[get_local_id(0)] = tmp;\n"
"  \n"
"  helper_norm1_parallel_reduction(tmp_buffer);\n"
"  \n"
"  return tmp_buffer[0];\n"
"}\n"
"\n"
"__kernel void norm_1(\n"
"          __global const float * vec,\n"
"          unsigned int size,\n"
"          __local float * tmp_buffer,\n"
"          global float * group_buffer)\n"
"{\n"
"  float tmp = impl_norm_1(vec,\n"
"                          (      get_group_id(0) * size) / get_num_groups(0),\n"
"                          ((get_group_id(0) + 1) * size) / get_num_groups(0),\n"
"                          tmp_buffer);\n"
"  \n"
"  if (get_local_id(0) == 0)\n"
"    group_buffer[get_group_id(0)] = tmp;  \n"
"}\n"
"\n"
; //vector_align1_norm_1
00735 
// OpenCL C source for kernel `clear` on scalar float data: sets vec[i] = 0 for
// every i < size. Each work-item starts at get_global_id(0) and advances by
// get_global_size(0).
00736 const char * const vector_align1_clear = 
00737 "__kernel void clear(\n"
00738 "          __global float * vec,\n"
00739 "          unsigned int size) \n"
00740 "{ \n"
00741 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00742 "    vec[i] = 0;\n"
00743 "}\n"
00744 ; //vector_align1_clear
00745 
// OpenCL C source for kernel `cpu_inplace_mult` on scalar float data:
// vec[i] *= factor for every i < size, with factor passed by value as a kernel
// argument. Each work-item starts at get_global_id(0) and advances by
// get_global_size(0).
00746 const char * const vector_align1_cpu_inplace_mult = 
00747 "__kernel void cpu_inplace_mult(\n"
00748 "          __global float * vec,\n"
00749 "          float factor, \n"
00750 "          unsigned int size) \n"
00751 "{ \n"
00752 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00753 "    vec[i] *= factor;\n"
00754 "}\n"
00755 ; //vector_align1_cpu_inplace_mult
00756 
// OpenCL C source for kernel `inplace_mult` on scalar float data:
// vec[i] *= factor for every i < size, where factor is loaded once from the
// __global pointer fac. Each work-item starts at get_global_id(0) and advances by
// get_global_size(0).
00757 const char * const vector_align1_inplace_mult = 
00758 "__kernel void inplace_mult(\n"
00759 "          __global float * vec,\n"
00760 "          __global const float * fac, \n"
00761 "          unsigned int size) \n"
00762 "{ \n"
00763 "  float factor = *fac;\n"
00764 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00765 "    vec[i] *= factor;\n"
00766 "}\n"
00767 ; //vector_align1_inplace_mult
00768 
// OpenCL C source for kernel `sum`: in-place tree reduction (addition) over the
// first get_global_size(0) entries of vec1 in global memory; work-item 0 then
// writes vec1[0] to *result. vec1 is overwritten by the reduction. Unlike the
// sibling reductions vmax and sqrt_sum, this kernel takes no `size` argument.
// The halving stride folds every entry only when the global size is a power of two.
// NOTE(review): barrier() presumably synchronizes only the work-items of one
// work-group, so this looks like it must be launched as a single work-group whose
// size equals the number of entries to reduce -- confirm against the host code.
00769 const char * const vector_align1_sum = 
00770 "__kernel void sum(\n"
00771 "          __global float * vec1,\n"
00772 "          __global float * result) \n"
00773 "{ \n"
00774 "  //parallel reduction on global memory:  \n"
00775 "  \n"
00776 "  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2)\n"
00777 "  {\n"
00778 "    if (get_global_id(0) < stride)\n"
00779 "      vec1[get_global_id(0)] += vec1[get_global_id(0)+stride];\n"
00780 "    barrier(CLK_GLOBAL_MEM_FENCE);\n"
00781 "  }\n"
00782 "  \n"
00783 "  if (get_global_id(0) == 0)\n"
00784 "    *result = vec1[0];  \n"
00785 "}\n"
00786 ; //vector_align1_sum
00787 
// OpenCL C source for kernel `inplace_div_sub` on scalar float data:
// vec1[i] -= vec2[i] / factor for every i < size, where factor is loaded once from
// the __global pointer fac. Each work-item starts at get_global_id(0) and advances
// by get_global_size(0).
00788 const char * const vector_align1_inplace_div_sub = 
00789 "///// divide substract:\n"
00790 "__kernel void inplace_div_sub(\n"
00791 "          __global float * vec1,\n"
00792 "          __global const float * vec2,\n"
00793 "          __global const float * fac,   //CPU variant is mapped to mult_add\n"
00794 "          unsigned int size\n"
00795 "          ) \n"
00796 "{ \n"
00797 "  float factor = *fac;\n"
00798 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00799 "    vec1[i] -= vec2[i] / factor;\n"
00800 "}\n"
00801 ; //vector_align1_inplace_div_sub
00802 
// OpenCL C source for kernel `diag_precond` on scalar float data:
// x[i] *= diag_A_inv[i] for every i < size (element-wise scaling of x in place).
// Each work-item starts at get_global_id(0) and advances by get_global_size(0).
// NOTE(review): going by the names, diag_A_inv presumably holds the reciprocal
// diagonal of a matrix A -- confirm against the caller.
00803 const char * const vector_align1_diag_precond = 
00804 "__kernel void diag_precond(\n"
00805 "          __global const float * diag_A_inv, \n"
00806 "          __global float * x, \n"
00807 "          unsigned int size) \n"
00808 "{ \n"
00809 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00810 "    x[i] *= diag_A_inv[i];\n"
00811 "}\n"
00812 ; //vector_align1_diag_precond
00813 
// OpenCL C source for kernel `mul_sub` on scalar float data:
// result[i] = vec1[i] * factor - vec2[i] for every i < size, where factor is loaded
// once from the __global pointer fac. Each work-item starts at get_global_id(0)
// and advances by get_global_size(0).
00814 const char * const vector_align1_mul_sub = 
00815 "///// multiply subtract:\n"
00816 "__kernel void mul_sub(\n"
00817 "          __global const float * vec1,\n"
00818 "          __global const float * fac,\n"
00819 "          __global const float * vec2,\n"
00820 "          __global float * result,\n"
00821 "          unsigned int size\n"
00822 "          ) \n"
00823 "{ \n"
00824 "  float factor = *fac;\n"
00825 "  for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
00826 "    result[i] = vec1[i] * factor - vec2[i];\n"
00827 "}\n"
00828 ; //vector_align1_mul_sub
00829 
00830   }  //namespace kernels
00831  }  //namespace linalg
00832 }  //namespace viennacl
00833 #endif

Generated on Sat May 21 2011 20:36:50 for ViennaCL - The Vienna Computing Library by  doxygen 1.7.1