00001 #ifndef _VIENNACL_MATRIX_ROW_SOURCE_HPP_
00002 #define _VIENNACL_MATRIX_ROW_SOURCE_HPP_
00003
00004 namespace viennacl
00005 {
00006 namespace linalg
00007 {
00008 namespace kernels
00009 {
/** @brief OpenCL source of the kernel 'unit_lower_triangular_substitute_inplace'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (updated in place).
*
* For row = 0 .. matrix_rows-1 every work-item reads vector[row] and subtracts
* vector[row] * matrix[elim * matrix_internal_cols + row] from vector[elim] for the
* indices elim > row assigned to it (start offset get_global_id(0), stride get_global_size(0)).
* The diagonal entries are never read and no division is performed.
* matrix_cols and matrix_internal_rows are not used by the kernel body.
*
* NOTE(review): barrier() only synchronizes the work-items of one work-group, so this
* presumably relies on the host launching a single work-group - confirm at the call site.
*/
const char * const matrix_row_align1_unit_lower_triangular_substitute_inplace =
"__kernel void unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
;
00029
/** @brief OpenCL source of the kernel 'inplace_sub'.
*
* Kernel arguments: vec1 (updated in place), vec2 (read-only), size.
* Each work-item performs vec1[i] -= vec2[i] for i starting at get_global_id(0)
* and advancing by get_global_size(0) while i < size.
*/
const char * const matrix_row_align1_inplace_sub =
"__kernel void inplace_sub(\n"
" __global float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec1[i] -= vec2[i];\n"
"}\n"
;
00040
/** @brief OpenCL source of the kernel 'lower_triangular_substitute_inplace'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (updated in place).
*
* For row = 0 .. matrix_rows-1:
*  - the work-item with global id 0 divides vector[row] by the diagonal entry
*    matrix[row + row*matrix_internal_cols];
*  - after a barrier every work-item reads vector[row] and subtracts
*    vector[row] * matrix[elim * matrix_internal_cols + row] from vector[elim]
*    for its share of the indices elim > row.
* The diagonal entry is not checked for zero. matrix_cols and matrix_internal_rows are unused.
*
* NOTE(review): barrier() only synchronizes the work-items of one work-group, so this
* presumably relies on the host launching a single work-group - confirm at the call site.
*/
const char * const matrix_row_align1_lower_triangular_substitute_inplace =
"__kernel void lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row];\n"
" }\n"
"}\n"
;
00063
/** @brief OpenCL source of the kernel 'trans_vec_mul'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (read-only), result (written).
*
* For each index 'row' < matrix_cols handled by a work-item (start get_global_id(0),
* stride get_global_size(0)) the kernel accumulates
* matrix[row + col*matrix_internal_cols] * vector[col] over col = 0 .. matrix_rows-1
* and stores the sum in result[row], i.e. it multiplies the transposed matrix with the vector.
* matrix_internal_rows is not used by the kernel body.
*/
const char * const matrix_row_align1_trans_vec_mul =
"__kernel void trans_vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" //row and col indicate indices within transposed matrix\n"
" for (unsigned int row = get_global_id(0); row < matrix_cols; row += get_global_size(0))\n"
" {\n"
" float dot_prod2 = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_rows; ++col)\n"
" dot_prod2 += matrix[row + col*matrix_internal_cols] * vector[col];\n"
" result[row] = dot_prod2;\n"
" }\n"
"}\n"
;
00084
/** @brief OpenCL source of the kernel 'rank1_update'.
*
* Kernel arguments: matrix (updated in place), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector1 (read-only), vector2 (read-only).
*
* Each work-item handles rows starting at get_global_id(0) with stride get_global_size(0)
* and performs matrix[row*matrix_internal_cols + col] += vector1[row] * vector2[col]
* for col = 0 .. matrix_cols-1.
*
* Note: the comment embedded in the kernel text says "A += x * x^T", but the code uses two
* separate vectors, i.e. it computes A += vector1 * vector2^T.
* matrix_internal_rows is not used by the kernel body.
*/
const char * const matrix_row_align1_rank1_update =
"//perform a rank-1 update of the matrix, i.e. A += x * x^T\n"
"__kernel void rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" unsigned int offset;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = vector1[row];\n"
" offset = row*matrix_internal_cols;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[offset+col] += tmp * vector2[col];\n"
" }\n"
"}\n"
;
00107
/** @brief OpenCL source of the kernel 'sub'.
*
* Kernel arguments: vec1 (read-only), vec2 (read-only), result (written), size.
* Each work-item writes result[i] = vec1[i] - vec2[i] for i starting at get_global_id(0)
* and advancing by get_global_size(0) while i < size.
*/
const char * const matrix_row_align1_sub =
"__kernel void sub(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size)\n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] - vec2[i];\n"
"}\n"
;
00119
/** @brief OpenCL source of the kernel 'trans_unit_upper_triangular_substitute_inplace'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (updated in place).
*
* For row = matrix_rows-1 down to 0 every work-item reads vector[row] and subtracts
* vector[row] * matrix[row * matrix_internal_cols + elim] from vector[elim] for its share
* of the indices elim < row. The matrix is thus accessed with swapped indices (transposed)
* compared to 'unit_upper_triangular_substitute_inplace'; the diagonal is never read.
* matrix_cols and matrix_internal_rows are not used by the kernel body.
*
* NOTE(review): barrier() only synchronizes the work-items of one work-group, so this
* presumably relies on the host launching a single work-group - confirm at the call site.
*/
const char * const matrix_row_align1_trans_unit_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_unit_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
;
00142
/** @brief OpenCL source of the kernel 'lu_factorize'.
*
* Kernel arguments: matrix (updated in place), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols.
*
* For each row i = 1 .. matrix_rows-1 and each k < i:
*  - the work-item with global id 0 divides matrix[i*matrix_internal_cols + k] by the
*    entry matrix[k*matrix_internal_cols + k];
*  - after a barrier every work-item reads that quotient and subtracts
*    quotient * matrix[k*matrix_internal_cols + j] from matrix[i*matrix_internal_cols + j]
*    for its share of the indices j > k.
* No pivoting is performed and the divisor is not checked for zero.
*
* NOTE(review): the inner column loop is bounded by matrix_rows, not matrix_cols
* (matrix_cols and matrix_internal_rows are unused) - presumably a square matrix is
* assumed; confirm against the caller.
* NOTE(review): barrier() only synchronizes the work-items of one work-group, so this
* presumably relies on the host launching a single work-group - confirm at the call site.
*/
const char * const matrix_row_align1_lu_factorize =
"__kernel void lu_factorize(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols) \n"
"{ \n"
" float temp;\n"
" unsigned rowi;\n"
" unsigned rowk;\n"
" for (unsigned int i=1; i<matrix_rows; ++i)\n"
" {\n"
" rowi = i * matrix_internal_cols;\n"
" for (unsigned int k=0; k<i; ++k)\n"
" {\n"
" rowk = k * matrix_internal_cols;\n"
" if (get_global_id(0) == 0)\n"
" matrix[rowi + k] /= matrix[rowk + k];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = matrix[rowi + k];\n"
" \n"
" //parallel subtraction:\n"
" for (unsigned int j=k+1 + get_global_id(0); j<matrix_rows; j += get_global_size(0))\n"
" matrix[rowi + j] -= temp * matrix[rowk + j];\n"
" }\n"
" }\n"
"} \n"
;
00172
/** @brief OpenCL source of the kernel 'add'.
*
* Kernel arguments: vec1 (read-only), vec2 (read-only), result (written), size.
* Each work-item writes result[i] = vec1[i] + vec2[i] for i starting at get_global_id(0)
* and advancing by get_global_size(0) while i < size.
*/
const char * const matrix_row_align1_add =
"__kernel void add(\n"
" __global const float * vec1,\n"
" __global const float * vec2, \n"
" __global float * result,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" result[i] = vec1[i] + vec2[i];\n"
"}\n"
;
00184
/** @brief OpenCL source of the kernel 'vec_mul'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (read-only), result (written).
*
* For each row < matrix_rows handled by a work-item (start get_global_id(0), stride
* get_global_size(0)) the kernel accumulates
* matrix[row*matrix_internal_cols + col] * vector[col] over col = 0 .. matrix_cols-1
* and stores the sum in result[row].
* matrix_internal_rows is not used by the kernel body.
* The source text starts with three and ends with two extra newline characters.
*/
const char * const matrix_row_align1_vec_mul =
"\n"
"\n"
"\n"
"__kernel void vec_mul(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global const float * vector, \n"
" __global float * result) \n"
"{ \n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" float dot_prod = 0.0f;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" dot_prod += matrix[row*matrix_internal_cols + col] * vector[col];\n"
" result[row] = dot_prod;\n"
" }\n"
"}\n"
"\n"
"\n"
;
00209
/** @brief OpenCL source of the kernel 'trans_lower_triangular_substitute_inplace'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (updated in place).
*
* For row = 0 .. matrix_rows-1:
*  - the work-item with global id 0 divides vector[row] by the diagonal entry
*    matrix[row+row*matrix_internal_cols];
*  - after a barrier every work-item reads vector[row] and subtracts
*    vector[row] * matrix[row * matrix_internal_cols + elim] from vector[elim]
*    for its share of the indices elim > row.
* Compared to 'lower_triangular_substitute_inplace' the off-diagonal matrix access uses
* swapped indices (transposed). The diagonal entry is not checked for zero.
* matrix_cols and matrix_internal_rows are not used by the kernel body.
*
* NOTE(review): barrier() only synchronizes the work-items of one work-group, so this
* presumably relies on the host launching a single work-group - confirm at the call site.
*/
const char * const matrix_row_align1_trans_lower_triangular_substitute_inplace =
"__kernel void trans_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" if (get_global_id(0) == 0)\n"
" vector[row] /= matrix[row+row*matrix_internal_cols];\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
" temp = vector[row];\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
;
00232
/** @brief OpenCL source of the kernel 'inplace_divide'.
*
* Kernel arguments: vec (updated in place), fac (pointer to the divisor in global memory), size.
* Each work-item loads the divisor once from *fac and performs vec[i] /= factor for i starting
* at get_global_id(0) and advancing by get_global_size(0) while i < size.
* The divisor is not checked for zero.
*/
const char * const matrix_row_align1_inplace_divide =
"__kernel void inplace_divide(\n"
" __global float * vec,\n"
" __global const float * fac, //note: CPU variant is mapped to prod_scalar\n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] /= factor;\n"
"}\n"
;
00244
/** @brief OpenCL source of the kernel 'trans_upper_triangular_substitute_inplace'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (updated in place).
*
* For row = matrix_rows-1 down to 0:
*  - the work-item with global id 0 divides vector[row] by the diagonal entry
*    matrix[row*matrix_internal_cols + row];
*  - after a barrier every work-item reads vector[row] and subtracts
*    vector[row] * matrix[row * matrix_internal_cols + elim] from vector[elim]
*    for its share of the indices elim < row.
* Compared to 'upper_triangular_substitute_inplace' the off-diagonal matrix access uses
* swapped indices (transposed). The diagonal entry is not checked for zero.
* matrix_cols and matrix_internal_rows are not used by the kernel body.
*
* NOTE(review): barrier() only synchronizes the work-items of one work-group, so this
* presumably relies on the host launching a single work-group - confirm at the call site.
*/
const char * const matrix_row_align1_trans_upper_triangular_substitute_inplace =
"//transposed lower triangular matrix\n"
"__kernel void trans_upper_triangular_substitute_inplace(\n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim]; \n"
" } \n"
" \n"
"}\n"
;
00271
/** @brief OpenCL source of the kernel 'unit_upper_triangular_substitute_inplace'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (updated in place).
*
* For row = matrix_rows-1 down to 0 every work-item reads vector[row] and subtracts
* vector[row] * matrix[elim * matrix_internal_cols + row] from vector[elim] for its share
* of the indices elim < row (start get_global_id(0), stride get_global_size(0)).
* The diagonal entries are never read and no division is performed.
* matrix_cols and matrix_internal_rows are not used by the kernel body.
*
* NOTE(review): barrier() only synchronizes the work-items of one work-group, so this
* presumably relies on the host launching a single work-group - confirm at the call site.
*/
const char * const matrix_row_align1_unit_upper_triangular_substitute_inplace =
"__kernel void unit_upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
;
00293
/** @brief OpenCL source of the kernel 'inplace_add'.
*
* Kernel arguments: vec1 (updated in place), vec2 (read-only), size.
* Each work-item performs vec1[i] += vec2[i] for i starting at get_global_id(0)
* and advancing by get_global_size(0) while i < size.
*/
const char * const matrix_row_align1_inplace_add =
"__kernel void inplace_add(\n"
" __global float * vec1,\n"
" __global const float * vec2,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec1[i] += vec2[i];\n"
"}\n"
;
00304
/** @brief OpenCL source of the kernel 'trans_unit_lower_triangular_substitute_inplace'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (updated in place).
*
* For row = 0 .. matrix_rows-1 every work-item reads vector[row] and subtracts
* vector[row] * matrix[row * matrix_internal_cols + elim] from vector[elim] for its share
* of the indices elim > row. The matrix is thus accessed with swapped indices (transposed)
* compared to 'unit_lower_triangular_substitute_inplace'; the diagonal is never read.
* matrix_cols and matrix_internal_rows are not used by the kernel body.
* The source text starts with one and ends with two extra newline characters.
*
* NOTE(review): barrier() only synchronizes the work-items of one work-group, so this
* presumably relies on the host launching a single work-group - confirm at the call site.
*/
const char * const matrix_row_align1_trans_unit_lower_triangular_substitute_inplace =
"\n"
"__kernel void trans_unit_lower_triangular_substitute_inplace(\n"
" __global const float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector)\n"
"{\n"
" float temp;\n"
" for (int row = 0; row < matrix_rows; ++row)\n"
" {\n"
" barrier(CLK_GLOBAL_MEM_FENCE);\n"
"\n"
" temp = vector[row];\n"
"\n"
" for (int elim = row + get_global_id(0) + 1; elim < matrix_rows; elim += get_global_size(0))\n"
" vector[elim] -= temp * matrix[row * matrix_internal_cols + elim];\n"
" }\n"
"}\n"
"\n"
"\n"
;
00329
/** @brief OpenCL source of the kernel 'scaled_rank1_update'.
*
* Kernel arguments: matrix (updated in place), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, val (scalar passed by value),
* vector1 (read-only), vector2 (read-only).
*
* Each work-item handles rows starting at get_global_id(0) with stride get_global_size(0)
* and performs matrix[row*matrix_internal_cols + col] += val * vector1[row] * vector2[col]
* for col = 0 .. matrix_cols-1 (the product val * vector1[row] is computed once per row).
* matrix_internal_rows is not used by the kernel body.
*/
const char * const matrix_row_align1_scaled_rank1_update =
"__kernel void scaled_rank1_update(\n"
" __global float * matrix,\n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" float val,\n"
" __global const float * vector1, \n"
" __global const float * vector2) \n"
"{ \n"
" float tmp;\n"
" unsigned int offset;\n"
" for (unsigned int row = get_global_id(0); row < matrix_rows; row += get_global_size(0))\n"
" {\n"
" tmp = val * vector1[row];\n"
" offset = row * matrix_internal_cols;\n"
" for (unsigned int col = 0; col < matrix_cols; ++col)\n"
" matrix[offset+col] += tmp * vector2[col];\n"
" }\n"
"}\n"
;
00352
/** @brief OpenCL source of the kernel 'clear'.
*
* Kernel arguments: vec (written), size.
* Each work-item sets vec[i] = 0 for i starting at get_global_id(0)
* and advancing by get_global_size(0) while i < size.
*/
const char * const matrix_row_align1_clear =
"__kernel void clear(\n"
" __global float * vec,\n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] = 0;\n"
"}\n"
;
00362
/** @brief OpenCL source of the kernel 'cpu_inplace_mult'.
*
* Kernel arguments: vec (updated in place), factor (scalar passed by value), size.
* Each work-item performs vec[i] *= factor for i starting at get_global_id(0)
* and advancing by get_global_size(0) while i < size.
*/
const char * const matrix_row_align1_cpu_inplace_mult =
"__kernel void cpu_inplace_mult(\n"
" __global float * vec,\n"
" float factor, \n"
" unsigned int size) \n"
"{ \n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
;
00373
/** @brief OpenCL source of the kernel 'inplace_mult'.
*
* Kernel arguments: vec (updated in place), fac (pointer to the factor in global memory), size.
* Each work-item loads the factor once from *fac and performs vec[i] *= factor for i starting
* at get_global_id(0) and advancing by get_global_size(0) while i < size.
*/
const char * const matrix_row_align1_inplace_mult =
"__kernel void inplace_mult(\n"
" __global float * vec,\n"
" __global const float * fac, \n"
" unsigned int size) \n"
"{ \n"
" float factor = *fac;\n"
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0))\n"
" vec[i] *= factor;\n"
"}\n"
;
00385
/** @brief OpenCL source of the kernel 'upper_triangular_substitute_inplace'.
*
* Kernel arguments: matrix (read-only floats), matrix_rows, matrix_cols,
* matrix_internal_rows, matrix_internal_cols, vector (updated in place).
*
* For row = matrix_rows-1 down to 0:
*  - the work-item with global id 0 divides vector[row] by the diagonal entry
*    matrix[row*matrix_internal_cols + row];
*  - after a barrier every work-item reads vector[row] and subtracts
*    vector[row] * matrix[elim * matrix_internal_cols + row] from vector[elim]
*    for its share of the indices elim < row.
* The diagonal entry is not checked for zero. matrix_cols and matrix_internal_rows are unused.
*
* NOTE(review): barrier() only synchronizes the work-items of one work-group, so this
* presumably relies on the host launching a single work-group - confirm at the call site.
*/
const char * const matrix_row_align1_upper_triangular_substitute_inplace =
"__kernel void upper_triangular_substitute_inplace( \n"
" __global const float * matrix, \n"
" unsigned int matrix_rows,\n"
" unsigned int matrix_cols,\n"
" unsigned int matrix_internal_rows,\n"
" unsigned int matrix_internal_cols,\n"
" __global float * vector) \n"
"{ \n"
" float temp; \n"
" for (int row = matrix_rows-1; row > -1; --row) \n"
" { \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" if (get_global_id(0) == 0) \n"
" vector[row] /= matrix[row*matrix_internal_cols + row]; \n"
" \n"
" barrier(CLK_GLOBAL_MEM_FENCE); \n"
" temp = vector[row]; \n"
" //eliminate column with index 'row' in parallel: \n"
" for (int elim = get_global_id(0); elim < row; elim += get_global_size(0)) \n"
" vector[elim] -= temp * matrix[elim * matrix_internal_cols + row]; \n"
" } \n"
" \n"
"}\n"
;
00411
00412 }
00413 }
00414 }
00415 #endif