doc/bisect__kernel__large__onei_8hpp_source.html

 #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_

 #define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_


 /* =========================================================================

    Copyright (c) 2010-2016, Institute for Microelectronics,

                             Institute for Analysis and Scientific Computing,

                             TU Wien.

    Portions of this software are copyright by UChicago Argonne, LLC.


                             -----------------

                   ViennaCL - The Vienna Computing Library

                             -----------------


    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at


    (A list of authors and contributors can be found in the manual)


    License:         MIT (X11), see file LICENSE in the base directory

 ============================================================================= */


 // includes, project

 #include "viennacl/linalg/detail/bisect/config.hpp"

 #include "viennacl/linalg/detail/bisect/util.hpp"

 // additional kernel

 #include "viennacl/linalg/cuda/bisect_util.hpp"


 namespace viennacl

 {

 namespace linalg

 {

 namespace cuda

 {

 /**
  * Bisection kernel for intervals that each contain exactly one eigenvalue.
  *
  * Every thread whose global index is below @p num_intervals loads one interval
  * [left, right] together with its eigenvalue index, and repeatedly halves that
  * interval until it satisfies the convergence test. The block keeps iterating
  * until none of its threads reports an unconverged interval; each in-range
  * thread then stores its eigenvalue estimate to g_left[gtid].
  *
  * Launch expectations: one thread per interval, 1D grid / 1D block (the global
  * index is blockDim.x * blockIdx.x + threadIdx.x). The kernel statically
  * allocates two shared arrays of VIENNACL_BISECT_MAX_THREADS_BLOCK elements;
  * presumably blockDim.x must not exceed that value -- TODO confirm against
  * computeNumSmallerEigenvalsLarge in bisect_util.hpp.
  *
  * @param g_d            Forwarded unchanged to computeNumSmallerEigenvalsLarge
  *                       (presumably the matrix diagonal -- confirm with helper)
  * @param g_s            Forwarded unchanged to computeNumSmallerEigenvalsLarge
  *                       (presumably the off-diagonal -- confirm with helper)
  * @param n              Forwarded unchanged to computeNumSmallerEigenvalsLarge
  * @param num_intervals  Number of intervals to process; threads with a global
  *                       index >= num_intervals neither load nor store data
  * @param g_left         In: left interval limits. Out: converged eigenvalues
  * @param g_right        In: right interval limits (not written by this kernel)
  * @param g_pos          In: per-interval count associated with the right limit
  *                       (index of the eigenvalue inside the interval)
  * @param precision      Tolerance used by the convergence test
  */
 template<typename NumericT>
 __global__
 void
 bisectKernelLarge_OneIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n,
                                unsigned int num_intervals,
                                NumericT *g_left, NumericT *g_right,
                                unsigned int *g_pos,
                                NumericT  precision)
 {
   // scratch space handed to computeNumSmallerEigenvalsLarge
   __shared__  NumericT  s_left_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];
   __shared__  NumericT  s_right_scratch[VIENNACL_BISECT_MAX_THREADS_BLOCK];
   // block-wide flag: stays 1 only if no thread reports an unconverged interval
   __shared__  unsigned int  s_all_done;

   const unsigned int gtid = (blockDim.x * blockIdx.x) + threadIdx.x;
   // true for threads that actually own an interval
   const bool owns_interval = (gtid < num_intervals);

   // limits of the interval currently owned by this thread
   // (only assigned and only read when owns_interval is true)
   NumericT lower, upper;
   // count belonging to the upper limit, i.e. index of the enclosed eigenvalue
   unsigned int upper_count;
   // set to 1 once this thread's interval has collapsed onto its eigenvalue
   unsigned int converged = 0;
   // bisection point and the number of eigenvalues smaller than it
   NumericT pivot = 0.0f;
   unsigned int pivot_count = 0;

   // fetch this thread's interval from global memory
   if (owns_interval)
   {
     lower       = g_left[gtid];
     upper       = g_right[gtid];
     upper_count = g_pos[gtid];
   }

   // one thread initializes the shared flag, the barrier publishes it
   if (0 == threadIdx.x)
     s_all_done = 0;

   __syncthreads();

   // iterate until every interval of the block has converged
   for (;;)
   {
     // optimistic default; any still-active thread clears it further below
     s_all_done = 1;

     // active threads pick a new bisection point
     if (owns_interval && (0 == converged))
       pivot = computeMidpoint(lower, upper);

     // executed by ALL threads of the block, including idle and converged ones
     // NOTE(review): presumably the helper contains block-wide cooperation on
     // the scratch arrays, so this call must stay outside the guard -- confirm
     pivot_count = computeNumSmallerEigenvalsLarge(g_d, g_s, n,
                                                   pivot, gtid, num_intervals,
                                                   s_left_scratch,
                                                   s_right_scratch,
                                                   converged);

     __syncthreads();

     if (owns_interval && (0 == converged))
     {
       // exactly one half survives: the one that still holds the eigenvalue
       if (upper_count == pivot_count)
         upper = pivot;
       else
         lower = pivot;

       // convergence test: width below both the absolute and relative bound
       // NOTE(review): for an interval shrinking towards zero the relative
       // bound also tends to zero -- verify termination for such inputs
       const NumericT width   = upper - lower;
       const NumericT rel_tol = max(abs(upper), abs(lower)) * precision;

       if (width < min(precision, rel_tol))
       {
         // collapse the interval onto its midpoint, the eigenvalue estimate
         const NumericT lambda = computeMidpoint(lower, upper);
         lower = lambda;
         upper = lambda;

         converged = 1;
       }
       else
       {
         // this thread needs another round, so the whole block keeps going
         s_all_done = 0;
       }
     }

     // make every thread's verdict visible before the flag is inspected
     __syncthreads();

     if (1 == s_all_done)
       break;

     // keep the flag read above apart from the reset at the top of the loop
     __syncthreads();
   }

   // write the result back to global memory
   __syncthreads();

   if (owns_interval)
   {
     // both limits coincide after convergence, so storing one of them suffices
     g_left[gtid] = lower;
   }
 }

 } // namespace cuda

 } // namespace linalg

 } // namespace viennacl

 #endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_ONEI_HPP_

VIENNACL_BISECT_MAX_THREADS_BLOCK
#define VIENNACL_BISECT_MAX_THREADS_BLOCK
Definition: config.hpp:32

util.hpp
Utility functions.

bisect_util.hpp

config.hpp
Global configuration parameters.

NumericT
float NumericT
Definition: bisect.cpp:40

viennacl::linalg::cuda::computeMidpoint
__device__ NumericT computeMidpoint(const NumericT left, const NumericT right)
Definition: bisect_util.hpp:89

viennacl::linalg::max
NumericT max(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:47

viennacl::linalg::cuda::bisectKernelLarge_OneIntervals
__global__ void bisectKernelLarge_OneIntervals(const NumericT *g_d, const NumericT *g_s, const unsigned int n, unsigned int num_intervals, NumericT *g_left, NumericT *g_right, unsigned int *g_pos, NumericT precision)
Definition: bisect_kernel_large_onei.hpp:59

viennacl::linalg::cuda::computeNumSmallerEigenvalsLarge
__device__ unsigned int computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)
Definition: bisect_util.hpp:237

viennacl::linalg::min
NumericT min(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:91