00001 #ifndef _VIENNACL_MATRIX_PROD_COL_ROW_ROW_SOURCE_HPP_
00002 #define _VIENNACL_MATRIX_PROD_COL_ROW_ROW_SOURCE_HPP_
00003
00004 namespace viennacl
00005 {
00006 namespace linalg
00007 {
00008 namespace kernels
00009 {
// OpenCL C source text of the kernel prod_TT, kept as one concatenated C string.
//
// Operation (single precision): C = A^T * B^T.
//   - A is indexed column-major: element (r, c) is read at A[r + c * A_internal_rows].
//   - B is indexed row-major:    element (r, c) is read at B[r * B_internal_cols + c].
//   - C is indexed row-major:    element (r, c) is written at C[r * C_internal_cols + c].
//   Work-item (g0, g1) = (get_global_id(0), get_global_id(1)) accumulates
//   sum over k of A(k, g0) * B(g1, k) and stores it to C(g0, g1).
//
// Tiling: the k range [0, A_rows) is walked in ceil(A_rows / block_size) steps.
// In each step every work-item copies one element of A and one element of B
// into the __local buffers bufA / bufB (writing 0 when the element lies outside
// the logical matrix bounds, so partial tiles add nothing to the sum). The first
// barrier makes the tiles visible to the whole work-group before they are read;
// the second keeps any work-item from overwriting a tile that is still in use.
//
// Bounds: the result is stored only if g0 < A_cols and g1 < B_rows.
//
// NOTE(review): block_size comes from get_local_size(0) but is also used as the
// stride for the dimension-1 index, so the kernel appears to expect square
// work-groups and bufA / bufB of at least block_size * block_size floats each;
// confirm against the host-side launch code.
// NOTE(review): A_internal_cols, B_internal_rows, C_rows, C_cols and
// C_internal_rows are part of the signature but are never read in the body.
// NOTE(review): the kernel text declares itself generated (do not edit), so a
// generator presumably owns it; confirm before changing the literal by hand.
00010 const char * const matrix_prod_col_row_row_align1_prod_TT =
00011 "// file automatically generated - do not edit!\n"
00012 "// matrix-matrix multiplication C = A^T * B^T\n"
00013 "// matrix layouts: C...row_major, A...col_major, B...row_major\n"
00014 "__kernel void prod_TT(\n"
00015 " __global const float * A,\n"
00016 " unsigned int A_rows,\n"
00017 " unsigned int A_cols,\n"
00018 " unsigned int A_internal_rows,\n"
00019 " unsigned int A_internal_cols,\n"
00020 " __global const float * B, \n"
00021 " unsigned int B_rows,\n"
00022 " unsigned int B_cols,\n"
00023 " unsigned int B_internal_rows,\n"
00024 " unsigned int B_internal_cols,\n"
00025 " __global float * C,\n"
00026 " unsigned int C_rows,\n"
00027 " unsigned int C_cols,\n"
00028 " unsigned int C_internal_rows,\n"
00029 " unsigned int C_internal_cols,\n"
00030 " __local float * bufA,\n"
00031 " __local float * bufB) \n"
00032 "{ \n"
00033 " int block_size = get_local_size(0);\n"
00034 " int row_block_id = get_group_id(0);\n"
00035 " int col_block_id = get_group_id(1);\n"
00036 " int row_thread_id = get_local_id(0);\n"
00037 " int col_thread_id = get_local_id(1);\n"
00038 " int aBegin = row_block_id * block_size * A_internal_rows;\n"
00039 " int aStep = block_size;\n"
00040 " int bBegin = col_block_id * block_size * B_internal_cols;\n"
00041 " int bStep = block_size;\n"
00042 " int block_num = A_rows / block_size;\n"
00043 " if (block_num * block_size != A_rows)\n"
00044 " ++block_num;\n"
00045 " float Csub = 0;\n"
00046 " int aOffset = row_thread_id * A_internal_rows + col_thread_id;\n"
00047 " int bOffset = row_thread_id + col_thread_id * B_internal_cols;\n"
00048 " for (int block = 0;\n"
00049 " block < block_num;\n"
00050 " ++block)\n"
00051 " {\n"
00052 " if (block * block_size + col_thread_id < A_rows && get_global_id(0) < A_cols)\n"
00053 " bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];\n"
00054 " else\n"
00055 " bufA[row_thread_id * block_size + col_thread_id] = 0;\n"
00056 " if ( (block * block_size + row_thread_id < B_cols) && get_global_id(1) < B_rows )\n"
00057 " bufB[row_thread_id * block_size + col_thread_id] = B[bBegin + bOffset]; \n"
00058 " else\n"
00059 " bufB[row_thread_id * block_size + col_thread_id] = 0;\n"
00060 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00061 " for (int k = 0; k < block_size; ++k)\n"
00062 " Csub += bufA[row_thread_id * block_size + k] * bufB[k * block_size + col_thread_id];\n"
00063 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00064 " aBegin += aStep;\n"
00065 " bBegin += bStep;\n"
00066 " }\n"
00067 " if (get_global_id(0) < A_cols && get_global_id(1) < B_rows)\n"
00068 " C[get_global_id(0) * C_internal_cols + get_global_id(1)] = Csub;\n"
00069 "}\n"
00070 ;
00071
// OpenCL C source text of the kernel prod_TA, kept as one concatenated C string.
//
// Operation (single precision): C = A^T * B.
//   - A is indexed column-major: element (r, c) is read at A[r + c * A_internal_rows].
//   - B is indexed row-major:    element (r, c) is read at B[r * B_internal_cols + c].
//   - C is indexed row-major:    element (r, c) is written at C[r * C_internal_cols + c].
//   Work-item (g0, g1) = (get_global_id(0), get_global_id(1)) accumulates
//   sum over k of A(k, g0) * B(k, g1) and stores it to C(g0, g1).
//
// Tiling: the k range [0, A_rows) is walked in ceil(A_rows / block_size) steps.
// In each step every work-item copies one element of A and one element of B
// into the __local buffers bufA / bufB (writing 0 when the element lies outside
// the logical matrix bounds, so partial tiles add nothing to the sum). The first
// barrier makes the tiles visible to the whole work-group before they are read;
// the second keeps any work-item from overwriting a tile that is still in use.
//
// Bounds: the result is stored only if g0 < A_cols and g1 < B_cols.
//
// NOTE(review): block_size comes from get_local_size(0) but is also used as the
// stride for the dimension-1 index, so the kernel appears to expect square
// work-groups and bufA / bufB of at least block_size * block_size floats each;
// confirm against the host-side launch code.
// NOTE(review): A_internal_cols, B_internal_rows, C_rows, C_cols and
// C_internal_rows are part of the signature but are never read in the body.
// NOTE(review): the kernel text declares itself generated (do not edit), so a
// generator presumably owns it; confirm before changing the literal by hand.
00072 const char * const matrix_prod_col_row_row_align1_prod_TA =
00073 "// file automatically generated - do not edit!\n"
00074 "// matrix-matrix multiplication C = A^T * B\n"
00075 "// matrix layouts: C...row_major, A...col_major, B...row_major\n"
00076 "__kernel void prod_TA(\n"
00077 " __global const float * A,\n"
00078 " unsigned int A_rows,\n"
00079 " unsigned int A_cols,\n"
00080 " unsigned int A_internal_rows,\n"
00081 " unsigned int A_internal_cols,\n"
00082 " __global const float * B, \n"
00083 " unsigned int B_rows,\n"
00084 " unsigned int B_cols,\n"
00085 " unsigned int B_internal_rows,\n"
00086 " unsigned int B_internal_cols,\n"
00087 " __global float * C,\n"
00088 " unsigned int C_rows,\n"
00089 " unsigned int C_cols,\n"
00090 " unsigned int C_internal_rows,\n"
00091 " unsigned int C_internal_cols,\n"
00092 " __local float * bufA,\n"
00093 " __local float * bufB) \n"
00094 "{ \n"
00095 " int block_size = get_local_size(0);\n"
00096 " int row_block_id = get_group_id(0);\n"
00097 " int col_block_id = get_group_id(1);\n"
00098 " int row_thread_id = get_local_id(0);\n"
00099 " int col_thread_id = get_local_id(1);\n"
00100 " int aBegin = row_block_id * block_size * A_internal_rows;\n"
00101 " int aStep = block_size;\n"
00102 " int bBegin = col_block_id * block_size;\n"
00103 " int bStep = block_size * B_internal_cols;\n"
00104 " int block_num = A_rows / block_size;\n"
00105 " if (block_num * block_size != A_rows)\n"
00106 " ++block_num;\n"
00107 " float Csub = 0;\n"
00108 " int aOffset = row_thread_id * A_internal_rows + col_thread_id;\n"
00109 " int bOffset = row_thread_id * B_internal_cols + col_thread_id;\n"
00110 " for (int block = 0;\n"
00111 " block < block_num;\n"
00112 " ++block)\n"
00113 " {\n"
00114 " if (block * block_size + col_thread_id < A_rows && get_global_id(0) < A_cols)\n"
00115 " bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];\n"
00116 " else\n"
00117 " bufA[row_thread_id * block_size + col_thread_id] = 0;\n"
00118 " if ( (block * block_size + row_thread_id < B_rows) && get_global_id(1) < B_cols )\n"
00119 " bufB[row_thread_id * block_size + col_thread_id] = B[bBegin + bOffset]; \n"
00120 " else\n"
00121 " bufB[row_thread_id * block_size + col_thread_id] = 0;\n"
00122 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00123 " for (int k = 0; k < block_size; ++k)\n"
00124 " Csub += bufA[row_thread_id * block_size + k] * bufB[k * block_size + col_thread_id];\n"
00125 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00126 " aBegin += aStep;\n"
00127 " bBegin += bStep;\n"
00128 " }\n"
00129 " if (get_global_id(0) < A_cols && get_global_id(1) < B_cols)\n"
00130 " C[get_global_id(0) * C_internal_cols + get_global_id(1)] = Csub;\n"
00131 "}\n"
00132 ;
00133
// OpenCL C source text of the kernel prod_AA, kept as one concatenated C string.
//
// Operation (single precision): C = A * B.
//   - A is indexed column-major: element (r, c) is read at A[r + c * A_internal_rows].
//   - B is indexed row-major:    element (r, c) is read at B[r * B_internal_cols + c].
//   - C is indexed row-major:    element (r, c) is written at C[r * C_internal_cols + c].
//   Work-item (g0, g1) = (get_global_id(0), get_global_id(1)) accumulates
//   sum over k of A(g0, k) * B(k, g1) and stores it to C(g0, g1).
//
// Tiling: the k range [0, A_cols) is walked in ceil(A_cols / block_size) steps.
// In each step every work-item copies one element of A and one element of B
// into the __local buffers bufA / bufB (writing 0 when the element lies outside
// the logical matrix bounds, so partial tiles add nothing to the sum). The first
// barrier makes the tiles visible to the whole work-group before they are read;
// the second keeps any work-item from overwriting a tile that is still in use.
//
// Bounds: the result is stored only if g0 < A_rows and g1 < B_cols.
//
// NOTE(review): block_size comes from get_local_size(0) but is also used as the
// stride for the dimension-1 index, so the kernel appears to expect square
// work-groups and bufA / bufB of at least block_size * block_size floats each;
// confirm against the host-side launch code.
// NOTE(review): A_internal_cols, B_internal_rows, C_rows, C_cols and
// C_internal_rows are part of the signature but are never read in the body.
// NOTE(review): the kernel text declares itself generated (do not edit), so a
// generator presumably owns it; confirm before changing the literal by hand.
00134 const char * const matrix_prod_col_row_row_align1_prod_AA =
00135 "// file automatically generated - do not edit!\n"
00136 "// matrix-matrix multiplication C = A * B\n"
00137 "// matrix layouts: C...row_major, A...col_major, B...row_major\n"
00138 "__kernel void prod_AA(\n"
00139 " __global const float * A,\n"
00140 " unsigned int A_rows,\n"
00141 " unsigned int A_cols,\n"
00142 " unsigned int A_internal_rows,\n"
00143 " unsigned int A_internal_cols,\n"
00144 " __global const float * B, \n"
00145 " unsigned int B_rows,\n"
00146 " unsigned int B_cols,\n"
00147 " unsigned int B_internal_rows,\n"
00148 " unsigned int B_internal_cols,\n"
00149 " __global float * C,\n"
00150 " unsigned int C_rows,\n"
00151 " unsigned int C_cols,\n"
00152 " unsigned int C_internal_rows,\n"
00153 " unsigned int C_internal_cols,\n"
00154 " __local float * bufA,\n"
00155 " __local float * bufB) \n"
00156 "{ \n"
00157 " int block_size = get_local_size(0);\n"
00158 " int row_block_id = get_group_id(0);\n"
00159 " int col_block_id = get_group_id(1);\n"
00160 " int row_thread_id = get_local_id(0);\n"
00161 " int col_thread_id = get_local_id(1);\n"
00162 " int aBegin = row_block_id * block_size;\n"
00163 " int aStep = block_size * A_internal_rows;\n"
00164 " int bBegin = col_block_id * block_size;\n"
00165 " int bStep = block_size * B_internal_cols;\n"
00166 " int block_num = A_cols / block_size;\n"
00167 " if (block_num * block_size != A_cols)\n"
00168 " ++block_num;\n"
00169 " float Csub = 0;\n"
00170 " int aOffset = row_thread_id + col_thread_id * A_internal_rows;\n"
00171 " int bOffset = row_thread_id * B_internal_cols + col_thread_id;\n"
00172 " for (int block = 0;\n"
00173 " block < block_num;\n"
00174 " ++block)\n"
00175 " {\n"
00176 " if (block * block_size + col_thread_id < A_cols && get_global_id(0) < A_rows)\n"
00177 " bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];\n"
00178 " else\n"
00179 " bufA[row_thread_id * block_size + col_thread_id] = 0;\n"
00180 " if ( (block * block_size + row_thread_id < B_rows) && get_global_id(1) < B_cols )\n"
00181 " bufB[row_thread_id * block_size + col_thread_id] = B[bBegin + bOffset]; \n"
00182 " else\n"
00183 " bufB[row_thread_id * block_size + col_thread_id] = 0;\n"
00184 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00185 " for (int k = 0; k < block_size; ++k)\n"
00186 " Csub += bufA[row_thread_id * block_size + k] * bufB[k * block_size + col_thread_id];\n"
00187 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00188 " aBegin += aStep;\n"
00189 " bBegin += bStep;\n"
00190 " }\n"
00191 " if (get_global_id(0) < A_rows && get_global_id(1) < B_cols)\n"
00192 " C[get_global_id(0) * C_internal_cols + get_global_id(1)] = Csub;\n"
00193 "}\n"
00194 ;
00195
// OpenCL C source text of the kernel prod_AT, kept as one concatenated C string.
//
// Operation (single precision): C = A * B^T.
//   - A is indexed column-major: element (r, c) is read at A[r + c * A_internal_rows].
//   - B is indexed row-major:    element (r, c) is read at B[r * B_internal_cols + c].
//   - C is indexed row-major:    element (r, c) is written at C[r * C_internal_cols + c].
//   Work-item (g0, g1) = (get_global_id(0), get_global_id(1)) accumulates
//   sum over k of A(g0, k) * B(g1, k) and stores it to C(g0, g1).
//
// Tiling: the k range [0, A_cols) is walked in ceil(A_cols / block_size) steps.
// In each step every work-item copies one element of A and one element of B
// into the __local buffers bufA / bufB (writing 0 when the element lies outside
// the logical matrix bounds, so partial tiles add nothing to the sum). The first
// barrier makes the tiles visible to the whole work-group before they are read;
// the second keeps any work-item from overwriting a tile that is still in use.
//
// Bounds: the result is stored only if g0 < A_rows and g1 < B_rows.
//
// NOTE(review): block_size comes from get_local_size(0) but is also used as the
// stride for the dimension-1 index, so the kernel appears to expect square
// work-groups and bufA / bufB of at least block_size * block_size floats each;
// confirm against the host-side launch code.
// NOTE(review): A_internal_cols, B_internal_rows, C_rows, C_cols and
// C_internal_rows are part of the signature but are never read in the body.
// NOTE(review): the kernel text declares itself generated (do not edit), so a
// generator presumably owns it; confirm before changing the literal by hand.
00196 const char * const matrix_prod_col_row_row_align1_prod_AT =
00197 "// file automatically generated - do not edit!\n"
00198 "// matrix-matrix multiplication C = A * B^T\n"
00199 "// matrix layouts: C...row_major, A...col_major, B...row_major\n"
00200 "__kernel void prod_AT(\n"
00201 " __global const float * A,\n"
00202 " unsigned int A_rows,\n"
00203 " unsigned int A_cols,\n"
00204 " unsigned int A_internal_rows,\n"
00205 " unsigned int A_internal_cols,\n"
00206 " __global const float * B, \n"
00207 " unsigned int B_rows,\n"
00208 " unsigned int B_cols,\n"
00209 " unsigned int B_internal_rows,\n"
00210 " unsigned int B_internal_cols,\n"
00211 " __global float * C,\n"
00212 " unsigned int C_rows,\n"
00213 " unsigned int C_cols,\n"
00214 " unsigned int C_internal_rows,\n"
00215 " unsigned int C_internal_cols,\n"
00216 " __local float * bufA,\n"
00217 " __local float * bufB) \n"
00218 "{ \n"
00219 " int block_size = get_local_size(0);\n"
00220 " int row_block_id = get_group_id(0);\n"
00221 " int col_block_id = get_group_id(1);\n"
00222 " int row_thread_id = get_local_id(0);\n"
00223 " int col_thread_id = get_local_id(1);\n"
00224 " int aBegin = row_block_id * block_size;\n"
00225 " int aStep = block_size * A_internal_rows;\n"
00226 " int bBegin = col_block_id * block_size * B_internal_cols;\n"
00227 " int bStep = block_size;\n"
00228 " int block_num = A_cols / block_size;\n"
00229 " if (block_num * block_size != A_cols)\n"
00230 " ++block_num;\n"
00231 " float Csub = 0;\n"
00232 " int aOffset = row_thread_id + col_thread_id * A_internal_rows;\n"
00233 " int bOffset = row_thread_id + col_thread_id * B_internal_cols;\n"
00234 " for (int block = 0;\n"
00235 " block < block_num;\n"
00236 " ++block)\n"
00237 " {\n"
00238 " if (block * block_size + col_thread_id < A_cols && get_global_id(0) < A_rows)\n"
00239 " bufA[row_thread_id * block_size + col_thread_id] = A[aBegin + aOffset];\n"
00240 " else\n"
00241 " bufA[row_thread_id * block_size + col_thread_id] = 0;\n"
00242 " if ( (block * block_size + row_thread_id < B_cols) && get_global_id(1) < B_rows )\n"
00243 " bufB[row_thread_id * block_size + col_thread_id] = B[bBegin + bOffset]; \n"
00244 " else\n"
00245 " bufB[row_thread_id * block_size + col_thread_id] = 0;\n"
00246 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00247 " for (int k = 0; k < block_size; ++k)\n"
00248 " Csub += bufA[row_thread_id * block_size + k] * bufB[k * block_size + col_thread_id];\n"
00249 " barrier(CLK_LOCAL_MEM_FENCE);\n"
00250 " aBegin += aStep;\n"
00251 " bBegin += bStep;\n"
00252 " }\n"
00253 " if (get_global_id(0) < A_rows && get_global_id(1) < B_rows)\n"
00254 " C[get_global_id(0) * C_internal_cols + get_global_id(1)] = Csub;\n"
00255 "}\n"
00256 ;
00257
00258 }
00259 }
00260 }
00261 #endif