Open Qmin  0.8.0
GPU-accelerated Q-tensor-based liquid crystal simulations
Classes | Functions
utility Kernels

CUDA kernels and callers for the utilities base. More...

Classes

struct  SharedMemory< T >
 
struct  SharedMemory< double >
 

Functions

__global__ void gpu_compute_cell_list_kernel (dVec *d_pt, unsigned int *d_elementsPerCell, int *d_particleIndices, dVec *d_cellParticlePos, int Np, unsigned int Nmax, iVec gridCellsPerSide, dVec gridCellSizes, IndexDD cellIndexer, Index2D cellListIndexer, int *d_assist)
 
bool gpu_compute_cell_list (dVec *d_pt, unsigned int *d_cell_sizes, int *d_idx, dVec *d_cellParticlePos, int Np, int &Nmax, iVec gridCellsPerSide, dVec gridCellSizes, BoxPtr Box, IndexDD &ci, Index2D &cli, int *d_assist)
 Find the set indices of points in every cell bucket in the grid. More...
 
__global__ void gpu_compute_neighbor_list_TPP_kernel (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *cellParticlePos, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, int cellListNmax, scalar maxRange, int nmax, int Np, int threadsPerParticle)
 
__global__ void gpu_compute_neighbor_list_TPC_kernel (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *cellParticlePos, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, scalar maxRange2, int nmax, int Np)
 
__global__ void gpu_compute_neighbor_list_kernel (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, scalar maxRange2, int nmax, int Np)
 
bool gpu_compute_neighbor_list (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *cellParticlePos, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions &Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, int cellListNmax, scalar maxRange, int nmax, int Np, int maxBlockSize, bool threadPerCell)
 
__global__ void initialize_RNG_array_kernel (curandState *state, int N, int Timestep, int GlobalSeed)
 
bool gpu_initialize_RNG_array (curandState *states, int N, int Timestep, int GlobalSeed)
 Call the kernel to initialize a different RNG for each particle. More...
 
template<class T , unsigned int blockSize>
__global__ void reduce6 (T *g_idata, T *g_odata, unsigned int n)
 
template<class T >
void reduce (int size, int threads, int blocks, T *d_idata, T *d_odata)
 access cuda sdk reduction6 More...
 
template<class T >
T gpuReduction (int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, T *d_idata, T *d_odata)
 like benchmarkReduce, interfaces with reduce and returns result More...
 
__global__ void gpu_serial_reduction_kernel (scalar *array, scalar *output, int helperIdx, int N)
 
__global__ void gpu_serial_reduction_kernel2 (scalar *array, scalar *output, int helperIdx, int N)
 
__global__ void gpu_parallel_block_reduction_kernel (scalar *input, scalar *output, int N)
 
__global__ void gpu_parallel_block_reduction2_kernel (scalar *input, scalar *output, int N)
 
__global__ void gpu_parallel_block_reduction3_kernel (scalar *input, scalar *output, int N)
 
__global__ void gpu_vec_dot_product_kernel (dVec *input1, dVec *input2, scalar *output, int N)
 
__global__ void gpu_vec_dot_product_unrolled_kernel (dVec *input1, dVec *input2, scalar *output, int N)
 
__global__ void gpu_dVec_dot_products_kernel (dVec *input1, dVec *input2, scalar *output, int N)
 
__global__ void gpu_unrolled_dVec_dot_products_kernel (dVec *input1, dVec *input2, scalar *output, int N)
 
__global__ void gpu_scalar_times_dVec_squared_kernel (dVec *d_vec1, scalar *d_scalars, scalar factor, scalar *d_ans, int n)
 
__global__ void gpu_dot_dVec_vectors_kernel (dVec *d_vec1, dVec *d_vec2, scalar *d_ans, int n)
 
__global__ void gpu_dVec_times_scalar_kernel (dVec *d_vec1, scalar factor, int n)
 
__global__ void gpu_dVec_times_scalar_kernel (dVec *d_vec1, scalar factor, dVec *d_ans, int n)
 
__global__ void gpu_dVec_plusEqual_dVec_kernel (dVec *d_vec1, dVec *d_vec2, scalar factor, int n)
 
bool gpu_dVec_plusEqual_dVec (dVec *d_vec1, dVec *d_vec2, scalar factor, int N, int maxBlockSize=512)
 vec1 += a*vec2 More...
 
bool gpu_dVec_times_scalar (dVec *d_vec1, scalar factor, int N)
 (dVec) input *= factor More...
 
bool gpu_dVec_times_scalar (dVec *d_vec1, scalar factor, dVec *d_ans, int N)
 (dVec) ans = input * factor More...
 
bool gpu_scalar_times_dVec_squared (dVec *d_vec1, scalar *d_scalars, scalar factor, scalar *d_answer, int N)
 ans = a*b[i]*c[i]^2 More...
 
bool gpu_dot_dVec_vectors (dVec *d_vec1, dVec *d_vec2, scalar *d_ans, int N)
 (scalar) ans = (dVec) vec1 . vec2 More...
 
scalar gpu_gpuarray_dVec_dot_products (GPUArray< dVec > &input1, GPUArray< dVec > &input2, GPUArray< scalar > &intermediate, GPUArray< scalar > &intermediate2, int N=0, int maxBlockSize=512)
 A function of convenience: take the gpuarrays themselves and dot the data. More...
 
bool gpu_dVec_dot_products (dVec *input1, dVec *input2, scalar *intermediate, scalar *intermediate2, scalar *output, int helperIdx, int N, int block_size)
 Take two vectors of dVecs and compute the sum of the dot products between them. More...
 
bool gpu_dVec_dot_products (dVec *input1, dVec *input2, scalar *output, int helperIdx, int N)
 Take two vectors of dVecs and compute the sum of the dot products between them using thrust. More...
 
bool gpu_parallel_reduction (scalar *input, scalar *intermediate, scalar *output, int helperIdx, int N, int block_size)
 A straightforward two-step parallel reduction algorithm with block_size declared. More...
 
bool gpu_serial_reduction (scalar *array, scalar *output, int helperIdx, int N)
 A trivial reduction of an array by one thread in serial. Think before you use this. More...
 
template<typename T >
__global__ void gpu_set_array_kernel (T *arr, T value, int N)
 
template<typename T >
bool gpu_set_array (T *arr, T value, int N, int maxBlockSize=512)
 set every element of an array to the specified value More...
 
template<typename T >
__global__ void gpu_copy_gpuarray_kernel (T *copyInto, T *copyFrom, int N)
 
template<typename T >
bool gpu_copy_gpuarray (GPUArray< T > &copyInto, GPUArray< T > &copyFrom, int block_size=512)
 copy data into target on the device More...
 
scalar host_dVec_dot_products (dVec *input1, dVec *input2, int N)
 Take two vectors of dVecs and compute the sum of the dot products between them on the host. More...
 
void host_dVec_plusEqual_dVec (dVec *d_vec1, dVec *d_vec2, scalar factor, int N)
 vec1 += a*vec2... on the host! More...
 
void host_dVec_times_scalar (dVec *d_vec1, scalar factor, dVec *d_ans, int N)
 (dVec) ans = input * factor... on the host More...
 
template scalar gpuReduction< scalar > (int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, scalar *d_idata, scalar *d_odata)
 
template int gpuReduction< int > (int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, int *d_idata, int *d_odata)
 
template void reduce< int > (int size, int threads, int blocks, int *d_idata, int *d_odata)
 
template void reduce< scalar > (int size, int threads, int blocks, scalar *d_idata, scalar *d_odata)
 
template bool gpu_copy_gpuarray< dVec > (GPUArray< dVec > &copyInto, GPUArray< dVec > &copyFrom, int maxBlockSize)
 
template bool gpu_copy_gpuarray< scalar > (GPUArray< scalar > &copyInto, GPUArray< scalar > &copyFrom, int maxBlockSize)
 
template bool gpu_set_array< int > (int *, int, int, int)
 
template bool gpu_set_array< unsigned int > (unsigned int *, unsigned int, int, int)
 
template bool gpu_set_array< int2 > (int2 *, int2, int, int)
 
template bool gpu_set_array< scalar > (scalar *, scalar, int, int)
 
template bool gpu_set_array< dVec > (dVec *, dVec, int, int)
 
template bool gpu_set_array< cubicLatticeDerivativeVector > (cubicLatticeDerivativeVector *, cubicLatticeDerivativeVector, int, int)
 
unsigned int nextPow2 (unsigned int x)
 
void getNumBlocksAndThreads (int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
 

Detailed Description

CUDA kernels and callers for the utilities base.

CUDA kernels and callers for generating rngs on the gpu.

CUDA kernels and callers.

Function Documentation

◆ gpu_compute_cell_list_kernel()

__global__ void gpu_compute_cell_list_kernel ( dVec *  d_pt,
unsigned int *  d_elementsPerCell,
int *  d_particleIndices,
dVec *  d_cellParticlePos,
int  Np,
unsigned int  Nmax,
iVec  gridCellsPerSide,
dVec  gridCellSizes,
IndexDD  cellIndexer,
Index2D  cellListIndexer,
int *  d_assist 
)

Assign particles to bins, keep track of the number of particles per bin, etc.

References Index2D::getNumElements(), idx, and iVec::x.

◆ gpu_compute_cell_list()

bool gpu_compute_cell_list ( dVec *  d_pt,
unsigned int *  d_cell_sizes,
int *  d_idx,
dVec *  d_cellParticlePos,
int  Np,
int &  Nmax,
iVec  gridCellsPerSide,
dVec  gridCellSizes,
BoxPtr  Box,
IndexDD &  ci,
Index2D &  cli,
int *  d_assist 
)

Find the set indices of points in every cell bucket in the grid.

References HANDLE_ERROR.

Referenced by hyperrectangularCellList::computeGPU().

◆ gpu_compute_neighbor_list_TPP_kernel()

__global__ void gpu_compute_neighbor_list_TPP_kernel ( int *  d_idx,
unsigned int *  d_npp,
dVec *  d_vec,
unsigned int *  particlesPerCell,
int *  indices,
dVec *  cellParticlePos,
dVec *  d_pt,
int *  d_assist,
int *  d_adj,
periodicBoundaryConditions  Box,
Index2D  neighborIndexer,
Index2D  cellListIndexer,
IndexDD  cellIndexer,
Index2D  adjacentCellIndexer,
int  adjacentCellsPerCell,
iVec  gridCellsPerSide,
dVec  gridCellSizes,
int  cellListNmax,
scalar  maxRange,
int  nmax,
int  Np,
int  threadsPerParticle 
)

compute a neighbor list with some value of threads per particle (i.e., # threads = N_cells*adjacentCellsPerCell)

References dot(), indices, periodicBoundaryConditions::minDist(), and iVec::x.

◆ gpu_compute_neighbor_list_TPC_kernel()

__global__ void gpu_compute_neighbor_list_TPC_kernel ( int *  d_idx,
unsigned int *  d_npp,
dVec *  d_vec,
unsigned int *  particlesPerCell,
int *  indices,
dVec *  cellParticlePos,
dVec *  d_pt,
int *  d_assist,
int *  d_adj,
periodicBoundaryConditions  Box,
Index2D  neighborIndexer,
Index2D  cellListIndexer,
IndexDD  cellIndexer,
Index2D  adjacentCellIndexer,
int  adjacentCellsPerCell,
iVec  gridCellsPerSide,
dVec  gridCellSizes,
scalar  maxRange2,
int  nmax,
int  Np 
)

compute a neighbor list with one thread for each cell to scan for every particle (i.e., # threads = N_cells*adjacentCellsPerCell)

References dot(), indices, periodicBoundaryConditions::minDist(), and iVec::x.

◆ gpu_compute_neighbor_list_kernel()

__global__ void gpu_compute_neighbor_list_kernel ( int *  d_idx,
unsigned int *  d_npp,
dVec *  d_vec,
unsigned int *  particlesPerCell,
int *  indices,
dVec *  d_pt,
int *  d_assist,
int *  d_adj,
periodicBoundaryConditions  Box,
Index2D  neighborIndexer,
Index2D  cellListIndexer,
IndexDD  cellIndexer,
Index2D  adjacentCellIndexer,
int  adjacentCellsPerCell,
iVec  gridCellsPerSide,
dVec  gridCellSizes,
scalar  maxRange2,
int  nmax,
int  Np 
)

compute a neighbor list with one thread per particle

References dot(), idx, indices, periodicBoundaryConditions::minDist(), and iVec::x.

◆ gpu_compute_neighbor_list()

bool gpu_compute_neighbor_list ( int *  d_idx,
unsigned int *  d_npp,
dVec *  d_vec,
unsigned int *  particlesPerCell,
int *  indices,
dVec *  cellParticlePos,
dVec *  d_pt,
int *  d_assist,
int *  d_adj,
periodicBoundaryConditions &  Box,
Index2D  neighborIndexer,
Index2D  cellListIndexer,
IndexDD  cellIndexer,
Index2D  adjacentCellIndexer,
int  adjacentCellsPerCell,
iVec  gridCellsPerSide,
dVec  gridCellSizes,
int  cellListNmax,
scalar  maxRange,
int  nmax,
int  Np,
int  maxBlockSize,
bool  threadPerCell 
)

compute neighbor list, one particle per thread

References HANDLE_ERROR, and indices.

Referenced by neighborList::computeGPU().

◆ initialize_RNG_array_kernel()

__global__ void initialize_RNG_array_kernel ( curandState *  state,
int  N,
int  Timestep,
int  GlobalSeed 
)

Each thread – most likely corresponding to each cell – is initialized with a different sequence of the same seed of a cudaRNG

References idx.

◆ gpu_initialize_RNG_array()

bool gpu_initialize_RNG_array ( curandState *  states,
int  N,
int  Timestep,
int  GlobalSeed 
)

Call the kernel to initialize a different RNG for each particle.

Initialize the GPU's random number generator.

References HANDLE_ERROR.

Referenced by noiseSource::initializeGPURNGs().

◆ reduce6()

template<class T , unsigned int blockSize>
__global__ void reduce6 ( T *  g_idata,
T *  g_odata,
unsigned int  n 
)

References n.

◆ reduce()

template<class T >
void reduce ( int  size,
int  threads,
int  blocks,
T *  d_idata,
T *  d_odata 
)

access cuda sdk reduction6

◆ gpuReduction()

template<class T >
T gpuReduction ( int  n,
int  numThreads,
int  numBlocks,
int  maxThreads,
int  maxBlocks,
T *  d_idata,
T *  d_odata 
)

like benchmarkReduce, interfaces with reduce and returns result

References getNumBlocksAndThreads(), HANDLE_ERROR, and n.

Referenced by landauDeGennesLC::computeEnergyGPU(), and gpu_gpuarray_dVec_dot_products().

◆ gpu_serial_reduction_kernel()

__global__ void gpu_serial_reduction_kernel ( scalar *  array,
scalar *  output,
int  helperIdx,
int  N 
)

add the first N elements of array and put it in output[helperIdx]

References scalar.

◆ gpu_serial_reduction_kernel2()

__global__ void gpu_serial_reduction_kernel2 ( scalar *  array,
scalar *  output,
int  helperIdx,
int  N 
)

add the first N elements of array and put it in output[helperIdx]...use shared memory a bit

References scalar.

Referenced by gpu_parallel_reduction().

◆ gpu_parallel_block_reduction_kernel()

__global__ void gpu_parallel_block_reduction_kernel ( scalar *  input,
scalar *  output,
int  N 
)

perform a block reduction, storing the partial sums of input into output

References scalar.

◆ gpu_parallel_block_reduction2_kernel()

__global__ void gpu_parallel_block_reduction2_kernel ( scalar *  input,
scalar *  output,
int  N 
)

a slight optimization of the previous block reduction, c.f. M. Harris presentation

References scalar.

◆ gpu_parallel_block_reduction3_kernel()

__global__ void gpu_parallel_block_reduction3_kernel ( scalar *  input,
scalar *  output,
int  N 
)

multiple loads and loop unrolling... a slight optimization of the previous block reduction, c.f. M. Harris presentation

References scalar.

◆ gpu_vec_dot_product_kernel()

__global__ void gpu_vec_dot_product_kernel ( dVec *  input1,
dVec *  input2,
scalar *  output,
int  N 
)

Store the dot product of two dVecs in a scalar vec

References dot(), and idx.

◆ gpu_vec_dot_product_unrolled_kernel()

__global__ void gpu_vec_dot_product_unrolled_kernel ( dVec *  input1,
dVec *  input2,
scalar *  output,
int  N 
)

Store the dot product of two dVecs in a scalar vec, unrolled by dimension

References idx.

◆ gpu_dVec_dot_products_kernel()

__global__ void gpu_dVec_dot_products_kernel ( dVec *  input1,
dVec *  input2,
scalar *  output,
int  N 
)

This kernel basically performs the operation of the "reduction2" kernel, but the shared memory gets dot products...BROKEN

References dot(), and scalar.

◆ gpu_unrolled_dVec_dot_products_kernel()

__global__ void gpu_unrolled_dVec_dot_products_kernel ( dVec *  input1,
dVec *  input2,
scalar *  output,
int  N 
)

This kernel basically performs the operation of the "reduction2" kernel, but the shared memory gets dot products

References scalar.

◆ gpu_scalar_times_dVec_squared_kernel()

__global__ void gpu_scalar_times_dVec_squared_kernel ( dVec *  d_vec1,
scalar *  d_scalars,
scalar  factor,
scalar *  d_ans,
int  n 
)

take a vector of dVecs, a vector of scalars, a factor, and return a vector where every entry is factor*scalar[i]*(dVec[i])^2

References dot(), idx, and n.

◆ gpu_dot_dVec_vectors_kernel()

__global__ void gpu_dot_dVec_vectors_kernel ( dVec *  d_vec1,
dVec *  d_vec2,
scalar *  d_ans,
int  n 
)

take two vectors of dVecs and return a vector of scalars, where each entry is vec1[i].vec2[i]

References dot(), idx, and n.

◆ gpu_dVec_times_scalar_kernel() [1/2]

__global__ void gpu_dVec_times_scalar_kernel ( dVec *  d_vec1,
scalar  factor,
int  n 
)

multiply every element of an array of dVecs by the same scalar

References idx, and n.

◆ gpu_dVec_times_scalar_kernel() [2/2]

__global__ void gpu_dVec_times_scalar_kernel ( dVec *  d_vec1,
scalar  factor,
dVec *  d_ans,
int  n 
)

multiply every element of an array of dVecs by the same scalar

References idx, and n.

◆ gpu_dVec_plusEqual_dVec_kernel()

__global__ void gpu_dVec_plusEqual_dVec_kernel ( dVec *  d_vec1,
dVec *  d_vec2,
scalar  factor,
int  n 
)

References idx, and n.

◆ gpu_dVec_plusEqual_dVec()

bool gpu_dVec_plusEqual_dVec ( dVec *  d_vec1,
dVec *  d_vec2,
scalar  factor,
int  N,
int  maxBlockSize 
)

vec1 += a*vec2

References HANDLE_ERROR.

Referenced by energyMinimizerLoLBFGS::LoLBFGSStepGPU().

◆ gpu_dVec_times_scalar() [1/2]

bool gpu_dVec_times_scalar ( dVec *  d_vec1,
scalar  factor,
int  N 
)

(dVec) input *= factor

Parameters
d_vec1: dVec input array
factor: scalar multiplication factor
N: the length of the arrays
Postcondition
d_vec1 *= factor for every element

References HANDLE_ERROR.

Referenced by energyMinimizerLoLBFGS::LoLBFGSStepGPU().

◆ gpu_dVec_times_scalar() [2/2]

bool gpu_dVec_times_scalar ( dVec *  d_vec1,
scalar  factor,
dVec *  d_ans,
int  N 
)

(dVec) ans = input * factor

References HANDLE_ERROR.

◆ gpu_scalar_times_dVec_squared()

bool gpu_scalar_times_dVec_squared ( dVec *  d_vec1,
scalar *  d_scalars,
scalar  factor,
scalar *  d_ans,
int  N 
)

ans = a*b[i]*c[i]^2

References HANDLE_ERROR.

◆ gpu_dot_dVec_vectors()

bool gpu_dot_dVec_vectors ( dVec *  d_vec1,
dVec *  d_vec2,
scalar *  d_ans,
int  N 
)

(scalar) ans = (dVec) vec1 . vec2

Parameters
d_vec1: dVec input array
d_vec2: dVec input array
d_ans: scalar output array... d_ans[idx] = d_vec1[idx].d_vec2[idx]
N: the length of the arrays
Postcondition
d_ans = d_vec1.d_vec2

References HANDLE_ERROR.

◆ gpu_gpuarray_dVec_dot_products()

scalar gpu_gpuarray_dVec_dot_products ( GPUArray< dVec > &  input1,
GPUArray< dVec > &  input2,
GPUArray< scalar > &  intermediate,
GPUArray< scalar > &  intermediate2,
int  N,
int  block_size 
)

◆ gpu_dVec_dot_products() [1/2]

bool gpu_dVec_dot_products ( dVec *  input1,
dVec *  input2,
scalar *  intermediate,
scalar *  intermediate2,
scalar *  output,
int  helperIdx,
int  N,
int  block_size 
)

Take two vectors of dVecs and compute the sum of the dot products between them.

takes the dot product of every element of the two input arrays and performs a reduction on the sum

Parameters
input1: vector 1...wow!
input2: vector 2...wow!
intermediate: an array that input is dot producted to
intermediate2: an array that input is block-reduced to
output: the intermediate array will be sum reduced and stored in one of the components of output
helperIdx: the location in output to store the answer
N: the size of the input and intermediate arrays
block_size: the...block size. doxygen is annoying sometimes

References gpu_parallel_reduction(), and HANDLE_ERROR.

Referenced by gpu_gpuarray_dVec_dot_products(), and energyMinimizerNesterovAG::nesterovStepGPU().

◆ gpu_dVec_dot_products() [2/2]

bool gpu_dVec_dot_products ( dVec *  input1,
dVec *  input2,
scalar *  output,
int  helperIdx,
int  N 
)

Take two vectors of dVecs and compute the sum of the dot products between them using thrust.

References HANDLE_ERROR.

◆ gpu_parallel_reduction()

bool gpu_parallel_reduction ( scalar *  input,
scalar *  intermediate,
scalar *  output,
int  helperIdx,
int  N,
int  block_size 
)

A straightforward two-step parallel reduction algorithm with block_size declared.

a two-step parallel reduction algorithm that first does a partial sum reduction of input into the intermediate array, then launches a second kernel to sum reduce intermediate into output[helperIdx]

Parameters
input: the input array to sum
intermediate: an array that input is block-reduced to
output: the intermediate array will be sum reduced and stored in one of the components of output
helperIdx: the location in output to store the answer
N: the size of the input and intermediate arrays
block_size: the...block size. doxygen is annoying sometimes

References gpu_serial_reduction_kernel2(), HANDLE_ERROR, and scalar.

Referenced by gpu_dVec_dot_products().

◆ gpu_serial_reduction()

bool gpu_serial_reduction ( scalar *  array,
scalar *  output,
int  helperIdx,
int  N 
)

A trivial reduction of an array by one thread in serial. Think before you use this.

This serial reduction routine should probably never be called. It provides an interface to the gpu_serial_reduction_kernel above that may be useful for testing

References HANDLE_ERROR.

◆ gpu_set_array_kernel()

template<typename T >
__global__ void gpu_set_array_kernel ( T *  arr,
T  value,
int  N 
)

A function of convenience... set an array on the device

References idx.

◆ gpu_set_array()

template<typename T >
bool gpu_set_array ( T *  array,
T  value,
int  N,
int  maxBlockSize 
)

◆ gpu_copy_gpuarray_kernel()

template<typename T >
__global__ void gpu_copy_gpuarray_kernel ( T *  copyInto,
T *  copyFrom,
int  N 
)

References idx.

◆ gpu_copy_gpuarray()

template<typename T >
bool gpu_copy_gpuarray ( GPUArray< T > &  copyInto,
GPUArray< T > &  copyFrom,
int  maxBlockSize 
)

◆ host_dVec_dot_products()

scalar host_dVec_dot_products ( dVec *  input1,
dVec *  input2,
int  N 
)

Take two vectors of dVecs and compute the sum of the dot products between them on the host.

References scalar.

Referenced by energyMinimizerLoLBFGS::lineSearchCPU(), and energyMinimizerLoLBFGS::LoLBFGSStepCPU().

◆ host_dVec_plusEqual_dVec()

void host_dVec_plusEqual_dVec ( dVec *  d_vec1,
dVec *  d_vec2,
scalar  factor,
int  N 
)

vec1 += a*vec2... on the host!

Referenced by energyMinimizerLoLBFGS::LoLBFGSStepCPU().

◆ host_dVec_times_scalar()

void host_dVec_times_scalar ( dVec *  d_vec1,
scalar  factor,
dVec *  d_ans,
int  N 
)

(dVec) ans = input * factor... on the host

Referenced by energyMinimizerLoLBFGS::LoLBFGSStepCPU().

◆ gpuReduction< scalar >()

template scalar gpuReduction< scalar > ( int  n,
int  numThreads,
int  numBlocks,
int  maxThreads,
int  maxBlocks,
scalar *  d_idata,
scalar *  d_odata 
)

◆ gpuReduction< int >()

template int gpuReduction< int > ( int  n,
int  numThreads,
int  numBlocks,
int  maxThreads,
int  maxBlocks,
int *  d_idata,
int *  d_odata 
)

◆ reduce< int >()

template void reduce< int > ( int  size,
int  threads,
int  blocks,
int *  d_idata,
int *  d_odata 
)

◆ reduce< scalar >()

template void reduce< scalar > ( int  size,
int  threads,
int  blocks,
scalar *  d_idata,
scalar *  d_odata 
)

◆ gpu_copy_gpuarray< dVec >()

template bool gpu_copy_gpuarray< dVec > ( GPUArray< dVec > &  copyInto,
GPUArray< dVec > &  copyFrom,
int  maxBlockSize 
)

◆ gpu_copy_gpuarray< scalar >()

template bool gpu_copy_gpuarray< scalar > ( GPUArray< scalar > &  copyInto,
GPUArray< scalar > &  copyFrom,
int  maxBlockSize 
)

◆ gpu_set_array< int >()

template bool gpu_set_array< int > ( int *  ,
int  ,
int  ,
int   
)

◆ gpu_set_array< unsigned int >()

template bool gpu_set_array< unsigned int > ( unsigned int *  ,
unsigned int  ,
int  ,
int   
)

◆ gpu_set_array< int2 >()

template bool gpu_set_array< int2 > ( int2 *  ,
int2  ,
int  ,
int   
)

◆ gpu_set_array< scalar >()

template bool gpu_set_array< scalar > ( scalar *  ,
scalar  ,
int  ,
int   
)

◆ gpu_set_array< dVec >()

template bool gpu_set_array< dVec > ( dVec *  ,
dVec  ,
int  ,
int   
)

◆ gpu_set_array< cubicLatticeDerivativeVector >()

◆ nextPow2()

unsigned int nextPow2 ( unsigned int  x)
inline

Referenced by getNumBlocksAndThreads().

◆ getNumBlocksAndThreads()

void getNumBlocksAndThreads ( int  n,
int  maxBlocks,
int  maxThreads,
int &  blocks,
int &  threads 
)
inline