/data/development/ViennaCL/ViennaCL-1.1.2/viennacl/linalg/vector_operations.hpp

/* =======================================================================
   Copyright (c) 2010, Institute for Microelectronics, TU Vienna.
   http://www.iue.tuwien.ac.at
                             -----------------
                     ViennaCL - The Vienna Computing Library
                             -----------------

   authors:    Karl Rupp                          rupp@iue.tuwien.ac.at
               Florian Rudolf                     flo.rudy+viennacl@gmail.com
               Josef Weinbub                      weinbub@iue.tuwien.ac.at

   license:    MIT (X11), see file LICENSE in the ViennaCL base directory
======================================================================= */

#ifndef _VIENNACL_VECTOR_OPERATIONS_HPP_
#define _VIENNACL_VECTOR_OPERATIONS_HPP_

#include <algorithm>   //for std::min
#include <cassert>     //for assert

#include "viennacl/forwards.h"
#include "viennacl/ocl/device.hpp"
#include "viennacl/ocl/handle.hpp"
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/kernels/vector_kernels.h"

namespace viennacl
{
  namespace linalg
  {
    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void add(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
             const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
             viennacl::vector<SCALARTYPE, ALIGNMENT> & result)
    {
      assert(vec1.size() == vec2.size() && vec1.size() == result.size());

      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "add");

      viennacl::ocl::enqueue(k(vec1, vec2, result, size));
    }
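
    //Usage sketch for add() / inplace_add(), assuming an initialized OpenCL
    //context and vectors of matching size:
    //
    //  viennacl::vector<float> v1(100), v2(100), v3(100);
    //  //... fill v1, v2 (e.g. via viennacl::copy) ...
    //  viennacl::linalg::add(v1, v2, v3);      // v3 = v1 + v2
    //  viennacl::linalg::inplace_add(v3, v1);  // v3 += v1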

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_add(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                     const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2)
    {
      assert(vec1.size() == vec2.size());
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_add");

      viennacl::ocl::enqueue(k(vec1, vec2, size));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void sub(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
             const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
             viennacl::vector<SCALARTYPE, ALIGNMENT> & result)
    {
      assert(vec1.size() == vec2.size());
      result.resize(vec1.size());
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "sub");

      viennacl::ocl::enqueue(k(vec1, vec2, result, size));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_sub(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                     const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2)
    {
      assert(vec1.size() == vec2.size());
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_sub");

      viennacl::ocl::enqueue(k(vec1, vec2, size));
    }


    //result = vec * scalar
    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void mult(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec,
              scalar<SCALARTYPE> const & alpha,
              viennacl::vector<SCALARTYPE, ALIGNMENT> & result)
    {
      result.resize(vec.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "mult");

      viennacl::ocl::enqueue(k(vec, alpha, result, static_cast<cl_uint>(vec.internal_size())));
    }

    //result = vec * scalar, with the scaling factor passed by value from the host
    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void mult(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec,
              SCALARTYPE alpha,
              viennacl::vector<SCALARTYPE, ALIGNMENT> & result)
    {
      result.resize(vec.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "cpu_mult");

      viennacl::ocl::enqueue(k(vec, alpha, result, static_cast<cl_uint>(vec.internal_size())));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_mult(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec,
                      scalar<SCALARTYPE> const & alpha)
    {
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_mult");

      viennacl::ocl::enqueue(k(vec, alpha, static_cast<cl_uint>(vec.internal_size())));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_mult(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec,
                      SCALARTYPE alpha)
    {
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "cpu_inplace_mult");

      viennacl::ocl::enqueue(k(vec, alpha, static_cast<cl_uint>(vec.internal_size())));
    }
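
    //Usage sketch for the scaling operations: the "mult"/"inplace_mult" kernels
    //read the factor from a scalar<> residing on the device, while the
    //"cpu_mult"/"cpu_inplace_mult" kernels receive it by value from the host:
    //
    //  viennacl::vector<float> v(100), w(100);
    //  viennacl::scalar<float> gpu_alpha = 2.0f;   //scalar stored on the device
    //  viennacl::linalg::mult(v, gpu_alpha, w);    // w = 2 * v
    //  viennacl::linalg::inplace_mult(v, 3.0f);    // v *= 3 (factor from the host)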

    //result = vec / scalar
    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void divide(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec,
                scalar<SCALARTYPE> const & alpha,
                viennacl::vector<SCALARTYPE, ALIGNMENT> & result)
    {
      assert(vec.size() == result.size());
      result.resize(vec.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "divide");

      viennacl::ocl::enqueue(k(vec, alpha, result, static_cast<cl_uint>(vec.internal_size())));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_divide(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec,
                        scalar<SCALARTYPE> const & alpha)
    {
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_divide");

      viennacl::ocl::enqueue(k(vec, alpha, static_cast<cl_uint>(vec.internal_size())));
    }

    //result = factor * vec1 + vec2
    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void mul_add(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                 scalar<SCALARTYPE> const & alpha,
                 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                 viennacl::vector<SCALARTYPE, ALIGNMENT> & result)
    {
      assert(vec1.size() == vec2.size() && result.size() == vec1.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "mul_add");
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));

      viennacl::ocl::enqueue(k(vec1, alpha, vec2, result, size));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void mul_add(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                 SCALARTYPE alpha,
                 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                 viennacl::vector<SCALARTYPE, ALIGNMENT> & result)
    {
      assert(vec1.size() == vec2.size() && result.size() == vec1.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "cpu_mul_add");
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));

      viennacl::ocl::enqueue(k(vec1, alpha, vec2, result, size));
    }

    //vec1 += factor * vec2
    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_mul_add(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                         const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                         scalar<SCALARTYPE> const & alpha)
    {
      assert(vec1.size() == vec2.size());
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_mul_add");

      viennacl::ocl::enqueue(k(vec1, vec2, alpha, size));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_mul_add(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                         const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                         SCALARTYPE alpha)
    {
      assert(vec1.size() == vec2.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "cpu_inplace_mul_add");
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));

      viennacl::ocl::enqueue(k(vec1, vec2, alpha, size));
    }
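
    //Usage sketch: mul_add() and inplace_mul_add() provide axpy-style updates,
    //with the factor given either as a device scalar<> or as a host value:
    //
    //  viennacl::vector<float> x(100), y(100), r(100);
    //  viennacl::linalg::mul_add(x, 2.0f, y, r);       // r = 2 * x + y
    //  viennacl::linalg::inplace_mul_add(y, x, 2.0f);  // y += 2 * x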

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void mul_sub(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                 scalar<SCALARTYPE> const & alpha,
                 const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                 viennacl::vector<SCALARTYPE, ALIGNMENT> & result)
    {
      assert(vec1.size() == vec2.size() && result.size() == vec1.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "mul_sub");
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));

      viennacl::ocl::enqueue(k(vec1, alpha, vec2, result, size));
    }


    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_mul_sub(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                         const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                         scalar<SCALARTYPE> const & alpha)
    {
      assert(vec1.size() == vec2.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_mul_sub");
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));

      viennacl::ocl::enqueue(k(vec1, vec2, alpha, size));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_div_add(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                         const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                         scalar<SCALARTYPE> const & alpha)
    {
      assert(vec1.size() == vec2.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_div_add");
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));

      viennacl::ocl::enqueue(k(vec1, vec2, alpha, size));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void inplace_div_sub(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                         const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                         scalar<SCALARTYPE> const & alpha)
    {
      assert(vec1.size() == vec2.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inplace_div_sub");
      cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));

      viennacl::ocl::enqueue(k(vec1, vec2, alpha, size));
    }


    //implementation of inner product:
    //namespace {
      template<class SCALARTYPE, unsigned int ALIGNMENT>
      void inner_prod_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                           const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                           scalar<SCALARTYPE> & result)
      {
        assert(vec1.size() == vec2.size());
        viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "inner_prod");
        cl_uint size = static_cast<cl_uint>(std::min(vec1.internal_size(), vec2.internal_size()));
        unsigned int work_groups = k.global_work_size() / k.local_work_size();
        //note: 'temp' is static, so the partial-result buffer is reused across calls;
        //      this assumes the number of work groups stays the same between calls.
        static viennacl::vector<SCALARTYPE> temp(work_groups);

        //Note: The number of work groups MUST be a power of two (at most 512)!
        assert( work_groups * k.local_work_size() == k.global_work_size() );
        assert( work_groups >= 1 && work_groups <= 512
               && (work_groups & (work_groups - 1)) == 0 );

        //first pass: each work group reduces its portion into one entry of 'temp'
        viennacl::ocl::enqueue(k(vec1, vec2, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));

        //second pass: fold the partial results into the final scalar
        viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "sum");

        ksum.local_work_size(0, work_groups);
        ksum.global_work_size(0, work_groups);
        viennacl::ocl::enqueue(ksum(temp, result));
      }
    //}
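
    //For reference, the two-pass reduction above computes the same value as the
    //following host-side sketch: the "inner_prod" kernel writes one partial sum
    //per work group into 'temp', and the "sum" kernel folds these into 'result':
    //
    //  SCALARTYPE s = 0;
    //  for (std::size_t i = 0; i < vec1.size(); ++i)
    //    s += vec1[i] * vec2[i];
    //  result = s;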

    //public interface of inner product
    template<class SCALARTYPE, unsigned int ALIGNMENT1, unsigned int ALIGNMENT2>
    viennacl::scalar_expression< const viennacl::vector<SCALARTYPE, ALIGNMENT1>,
                                 const viennacl::vector<SCALARTYPE, ALIGNMENT2>,
                                 viennacl::op_inner_prod >
    inner_prod_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT1> & vec1,
                    const viennacl::vector<SCALARTYPE, ALIGNMENT2> & vec2)
    {
      return viennacl::scalar_expression< const viennacl::vector<SCALARTYPE, ALIGNMENT1>,
                                          const viennacl::vector<SCALARTYPE, ALIGNMENT2>,
                                          viennacl::op_inner_prod >(vec1, vec2);
    }
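
    //Usage sketch: the returned scalar_expression only encodes the operation;
    //the actual kernel is launched when the expression is assigned to a scalar<>:
    //
    //  viennacl::vector<float> v1(100), v2(100);
    //  viennacl::scalar<float> s = viennacl::linalg::inner_prod_impl(v1, v2);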


    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void norm_1_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vcl_vec,
                     scalar<SCALARTYPE> & result)
    {
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "norm_1");
      cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());

      if (k.local_work_size() != k.global_work_size())
      {
        //NOTE: For some reason the kernel could not be launched with multiple work groups on NVIDIA hardware.
        //      We therefore use a single work group with as many parallel threads as possible.
        k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
        k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
      }

      unsigned int work_groups = k.global_work_size() / k.local_work_size();
      viennacl::vector<SCALARTYPE> temp(work_groups);

      //Note: The number of work groups MUST be a power of two (at most 512)!
      assert( work_groups * k.local_work_size() == k.global_work_size() );
      assert( work_groups >= 1 && work_groups <= 512
             && (work_groups & (work_groups - 1)) == 0 );

      viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));

      viennacl::ocl::kernel & ksum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "sum");

      ksum.local_work_size(0, work_groups);
      ksum.global_work_size(0, work_groups);
      viennacl::ocl::enqueue(ksum(temp, result));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void norm_2_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vcl_vec,
                     scalar<SCALARTYPE> & result)
    {
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "norm_2");
      cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());

      if (k.local_work_size() != k.global_work_size())
      {
        //NOTE: For some reason the kernel could not be launched with multiple work groups on NVIDIA hardware.
        //      We therefore use a single work group with as many parallel threads as possible.
        k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
        k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
      }

      unsigned int work_groups = k.global_work_size() / k.local_work_size();
      viennacl::vector<SCALARTYPE> temp(work_groups);

      //Note: The number of work groups MUST be a power of two (at most 512)!
      assert( work_groups * k.local_work_size() == k.global_work_size() );
      assert( work_groups >= 1 && work_groups <= 512
             && (work_groups & (work_groups - 1)) == 0 );

      viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));

      //'sqrt_sum' adds up the partial results and takes the square root of the sum
      viennacl::ocl::kernel & sqrt_sum = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "sqrt_sum");

      sqrt_sum.local_work_size(0, work_groups);
      sqrt_sum.global_work_size(0, work_groups);
      viennacl::ocl::enqueue(sqrt_sum(temp, result, work_groups));
    }

    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void norm_inf_impl(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vcl_vec,
                       scalar<SCALARTYPE> & result)
    {
      cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "norm_inf");

      if (k.local_work_size() != k.global_work_size())
      {
        //NOTE: For some reason the kernel could not be launched with multiple work groups on NVIDIA hardware.
        //      We therefore use a single work group with as many parallel threads as possible.
        k.local_work_size(0, viennacl::ocl::current_device().max_work_group_size());
        k.global_work_size(0, viennacl::ocl::current_device().max_work_group_size());
      }

      unsigned int work_groups = k.global_work_size() / k.local_work_size();
      viennacl::vector<SCALARTYPE> temp(work_groups);

      //Note: The number of work groups MUST be a power of two (at most 512)!
      assert( work_groups * k.local_work_size() == k.global_work_size() );
      assert( work_groups >= 1 && work_groups <= 512
             && (work_groups & (work_groups - 1)) == 0 );

      //part 1: each work group writes its partial result to 'temp'
      viennacl::ocl::enqueue(k(vcl_vec, size, viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()), temp));

      //part 2: parallel reduction of the partial results:
      viennacl::ocl::kernel & max_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "vmax");
      max_kernel.local_work_size(0, work_groups);
      max_kernel.global_work_size(0, work_groups);

      viennacl::ocl::enqueue(max_kernel(temp, result, work_groups));
    }
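
    //For reference, the three norm kernels compute the usual quantities,
    //sketched here in host-side notation for a vector v:
    //
    //  norm_1:   sum_i |v[i]|
    //  norm_2:   sqrt(sum_i v[i] * v[i])
    //  norm_inf: max_i |v[i]|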

    //This function should return a CPU scalar, otherwise statements like
    //  vcl_rhs[index_norm_inf(vcl_rhs)]
    //are ambiguous
    template<class SCALARTYPE, unsigned int ALIGNMENT>
    cl_uint index_norm_inf(const viennacl::vector<SCALARTYPE, ALIGNMENT> & vcl_vec)
    {
      viennacl::ocl::handle<cl_mem> h = viennacl::ocl::current_context().create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "index_norm_inf");
      cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());

      k.global_work_size(0, k.local_work_size());
      viennacl::ocl::enqueue(k(vcl_vec,
                               size,
                               viennacl::ocl::local_mem(sizeof(SCALARTYPE) * k.local_work_size()),
                               viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));

      //read the resulting index back to the host:
      cl_uint result;
      cl_int err = clEnqueueReadBuffer(viennacl::ocl::get_queue().handle(), h, CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
      VIENNACL_ERR_CHECK(err);
      return result;
    }
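
    //Usage sketch, following the comment above: the returned cl_uint can be used
    //directly for element access, yielding the entry of largest absolute value:
    //
    //  viennacl::vector<float> vcl_rhs(100);
    //  float max_entry = vcl_rhs[viennacl::linalg::index_norm_inf(vcl_rhs)];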

    //TODO: Special case vec1 == vec2 allows improvement!!
    template<class SCALARTYPE, unsigned int ALIGNMENT>
    void plane_rotation(viennacl::vector<SCALARTYPE, ALIGNMENT> & vec1,
                        const viennacl::vector<SCALARTYPE, ALIGNMENT> & vec2,
                        SCALARTYPE alpha,
                        SCALARTYPE beta)
    {
      assert(vec1.size() == vec2.size());
      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<SCALARTYPE, ALIGNMENT>::program_name(), "plane_rotation");

      viennacl::ocl::enqueue(k(vec1, vec2, alpha, beta, static_cast<cl_uint>(vec1.size())));
    }
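
    //Entry-wise sketch, assuming the kernel follows the usual BLAS 'rot'
    //convention for a plane (Givens) rotation with coefficients alpha and beta:
    //
    //  (vec1[i], vec2[i]) <- (alpha * vec1[i] + beta * vec2[i],
    //                         alpha * vec2[i] - beta * vec1[i])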

  } //namespace linalg
} //namespace viennacl


#endif
