Open Qmin
0.8.0
GPU-accelerated Q-tensor-based liquid crystal simulations
|
CUDA kernels and callers for the utilities base. More...
Classes | |
struct | SharedMemory< T > |
struct | SharedMemory< double > |
Functions | |
__global__ void | gpu_compute_cell_list_kernel (dVec *d_pt, unsigned int *d_elementsPerCell, int *d_particleIndices, dVec *d_cellParticlePos, int Np, unsigned int Nmax, iVec gridCellsPerSide, dVec gridCellSizes, IndexDD cellIndexer, Index2D cellListIndexer, int *d_assist) |
bool | gpu_compute_cell_list (dVec *d_pt, unsigned int *d_cell_sizes, int *d_idx, dVec *d_cellParticlePos, int Np, int &Nmax, iVec gridCellsPerSide, dVec gridCellSizes, BoxPtr Box, IndexDD &ci, Index2D &cli, int *d_assist) |
Find the set of indices of points in every cell bucket in the grid. More... | |
__global__ void | gpu_compute_neighbor_list_TPP_kernel (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *cellParticlePos, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, int cellListNmax, scalar maxRange, int nmax, int Np, int threadsPerParticle) |
__global__ void | gpu_compute_neighbor_list_TPC_kernel (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *cellParticlePos, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, scalar maxRange2, int nmax, int Np) |
__global__ void | gpu_compute_neighbor_list_kernel (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, scalar maxRange2, int nmax, int Np) |
bool | gpu_compute_neighbor_list (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *cellParticlePos, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions &Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, int cellListNmax, scalar maxRange, int nmax, int Np, int maxBlockSize, bool threadPerCell) |
__global__ void | initialize_RNG_array_kernel (curandState *state, int N, int Timestep, int GlobalSeed) |
bool | gpu_initialize_RNG_array (curandState *states, int N, int Timestep, int GlobalSeed) |
Call the kernel to initialize a different RNG for each particle. More... | |
template<class T , unsigned int blockSize> | |
__global__ void | reduce6 (T *g_idata, T *g_odata, unsigned int n) |
template<class T > | |
void | reduce (int size, int threads, int blocks, T *d_idata, T *d_odata) |
access cuda sdk reduction6 More... | |
template<class T > | |
T | gpuReduction (int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, T *d_idata, T *d_odata) |
like benchmarkReduce, interfaces with reduce and returns result More... | |
__global__ void | gpu_serial_reduction_kernel (scalar *array, scalar *output, int helperIdx, int N) |
__global__ void | gpu_serial_reduction_kernel2 (scalar *array, scalar *output, int helperIdx, int N) |
__global__ void | gpu_parallel_block_reduction_kernel (scalar *input, scalar *output, int N) |
__global__ void | gpu_parallel_block_reduction2_kernel (scalar *input, scalar *output, int N) |
__global__ void | gpu_parallel_block_reduction3_kernel (scalar *input, scalar *output, int N) |
__global__ void | gpu_vec_dot_product_kernel (dVec *input1, dVec *input2, scalar *output, int N) |
__global__ void | gpu_vec_dot_product_unrolled_kernel (dVec *input1, dVec *input2, scalar *output, int N) |
__global__ void | gpu_dVec_dot_products_kernel (dVec *input1, dVec *input2, scalar *output, int N) |
__global__ void | gpu_unrolled_dVec_dot_products_kernel (dVec *input1, dVec *input2, scalar *output, int N) |
__global__ void | gpu_scalar_times_dVec_squared_kernel (dVec *d_vec1, scalar *d_scalars, scalar factor, scalar *d_ans, int n) |
__global__ void | gpu_dot_dVec_vectors_kernel (dVec *d_vec1, dVec *d_vec2, scalar *d_ans, int n) |
__global__ void | gpu_dVec_times_scalar_kernel (dVec *d_vec1, scalar factor, int n) |
__global__ void | gpu_dVec_times_scalar_kernel (dVec *d_vec1, scalar factor, dVec *d_ans, int n) |
__global__ void | gpu_dVec_plusEqual_dVec_kernel (dVec *d_vec1, dVec *d_vec2, scalar factor, int n) |
bool | gpu_dVec_plusEqual_dVec (dVec *d_vec1, dVec *d_vec2, scalar factor, int N, int maxBlockSize=512) |
vec1 += a*vec2 More... | |
bool | gpu_dVec_times_scalar (dVec *d_vec1, scalar factor, int N) |
(dVec) input *= factor More... | |
bool | gpu_dVec_times_scalar (dVec *d_vec1, scalar factor, dVec *d_ans, int N) |
(dVec) ans = input * factor More... | |
bool | gpu_scalar_times_dVec_squared (dVec *d_vec1, scalar *d_scalars, scalar factor, scalar *d_answer, int N) |
ans = a*b[i]*c[i]^2 More... | |
bool | gpu_dot_dVec_vectors (dVec *d_vec1, dVec *d_vec2, scalar *d_ans, int N) |
(scalar) ans = (dVec) vec1 . vec2 More... | |
scalar | gpu_gpuarray_dVec_dot_products (GPUArray< dVec > &input1, GPUArray< dVec > &input2, GPUArray< scalar > &intermediate, GPUArray< scalar > &intermediate2, int N=0, int maxBlockSize=512) |
A function of convenience: take the gpuarrays themselves and dot the data. More... | |
bool | gpu_dVec_dot_products (dVec *input1, dVec *input2, scalar *intermediate, scalar *intermediate2, scalar *output, int helperIdx, int N, int block_size) |
Take two vectors of dVecs and compute the sum of the dot products between them. More... | |
bool | gpu_dVec_dot_products (dVec *input1, dVec *input2, scalar *output, int helperIdx, int N) |
Take two vectors of dVecs and compute the sum of the dot products between them using thrust. More... | |
bool | gpu_parallel_reduction (scalar *input, scalar *intermediate, scalar *output, int helperIdx, int N, int block_size) |
A straightforward two-step parallel reduction algorithm with block_size declared. More... | |
bool | gpu_serial_reduction (scalar *array, scalar *output, int helperIdx, int N) |
A trivial reduction of an array by one thread in serial. Think before you use this. More... | |
template<typename T > | |
__global__ void | gpu_set_array_kernel (T *arr, T value, int N) |
template<typename T > | |
bool | gpu_set_array (T *arr, T value, int N, int maxBlockSize=512) |
set every element of an array to the specified value More... | |
template<typename T > | |
__global__ void | gpu_copy_gpuarray_kernel (T *copyInto, T *copyFrom, int N) |
template<typename T > | |
bool | gpu_copy_gpuarray (GPUArray< T > &copyInto, GPUArray< T > &copyFrom, int block_size=512) |
copy data into target on the device More... | |
scalar | host_dVec_dot_products (dVec *input1, dVec *input2, int N) |
Take two vectors of dVecs and compute the sum of the dot products between them on the host. More... | |
void | host_dVec_plusEqual_dVec (dVec *d_vec1, dVec *d_vec2, scalar factor, int N) |
vec1 += a*vec2... on the host! More... | |
void | host_dVec_times_scalar (dVec *d_vec1, scalar factor, dVec *d_ans, int N) |
(dVec) ans = input * factor... on the host More... | |
template scalar | gpuReduction< scalar > (int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, scalar *d_idata, scalar *d_odata) |
template int | gpuReduction< int > (int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, int *d_idata, int *d_odata) |
template void | reduce< int > (int size, int threads, int blocks, int *d_idata, int *d_odata) |
template void | reduce< scalar > (int size, int threads, int blocks, scalar *d_idata, scalar *d_odata) |
template bool | gpu_copy_gpuarray< dVec > (GPUArray< dVec > &copyInto, GPUArray< dVec > &copyFrom, int maxBlockSize) |
template bool | gpu_copy_gpuarray< scalar > (GPUArray< scalar > &copyInto, GPUArray< scalar > &copyFrom, int maxBlockSize) |
template bool | gpu_set_array< int > (int *, int, int, int) |
template bool | gpu_set_array< unsigned int > (unsigned int *, unsigned int, int, int) |
template bool | gpu_set_array< int2 > (int2 *, int2, int, int) |
template bool | gpu_set_array< scalar > (scalar *, scalar, int, int) |
template bool | gpu_set_array< dVec > (dVec *, dVec, int, int) |
template bool | gpu_set_array< cubicLatticeDerivativeVector > (cubicLatticeDerivativeVector *, cubicLatticeDerivativeVector, int, int) |
unsigned int | nextPow2 (unsigned int x) |
void | getNumBlocksAndThreads (int n, int maxBlocks, int maxThreads, int &blocks, int &threads) |
CUDA kernels and callers for the utilities base.
CUDA kernels and callers for generating rngs on the gpu.
CUDA kernels and callers.
__global__ void gpu_compute_cell_list_kernel | ( | dVec * | d_pt, |
unsigned int * | d_elementsPerCell, | ||
int * | d_particleIndices, | ||
dVec * | d_cellParticlePos, | ||
int | Np, | ||
unsigned int | Nmax, | ||
iVec | gridCellsPerSide, | ||
dVec | gridCellSizes, | ||
IndexDD | cellIndexer, | ||
Index2D | cellListIndexer, | ||
int * | d_assist | ||
) |
Assign particles to bins, keep track of the number of particles per bin, etc.
References Index2D::getNumElements(), idx, and iVec::x.
bool gpu_compute_cell_list | ( | dVec * | d_pt, |
unsigned int * | d_cell_sizes, | ||
int * | d_idx, | ||
dVec * | d_cellParticlePos, | ||
int | Np, | ||
int & | Nmax, | ||
iVec | gridCellsPerSide, | ||
dVec | gridCellSizes, | ||
BoxPtr | Box, | ||
IndexDD & | ci, | ||
Index2D & | cli, | ||
int * | d_assist | ||
) |
Find the set of indices of points in every cell bucket in the grid.
References HANDLE_ERROR.
Referenced by hyperrectangularCellList::computeGPU().
__global__ void gpu_compute_neighbor_list_TPP_kernel | ( | int * | d_idx, |
unsigned int * | d_npp, | ||
dVec * | d_vec, | ||
unsigned int * | particlesPerCell, | ||
int * | indices, | ||
dVec * | cellParticlePos, | ||
dVec * | d_pt, | ||
int * | d_assist, | ||
int * | d_adj, | ||
periodicBoundaryConditions | Box, | ||
Index2D | neighborIndexer, | ||
Index2D | cellListIndexer, | ||
IndexDD | cellIndexer, | ||
Index2D | adjacentCellIndexer, | ||
int | adjacentCellsPerCell, | ||
iVec | gridCellsPerSide, | ||
dVec | gridCellSizes, | ||
int | cellListNmax, | ||
scalar | maxRange, | ||
int | nmax, | ||
int | Np, | ||
int | threadsPerParticle | ||
) |
compute a neighbor list with some value of threads per particle (i.e., # threads = N_cells*adjacentCellsPerCell)
References dot(), indices, periodicBoundaryConditions::minDist(), and iVec::x.
__global__ void gpu_compute_neighbor_list_TPC_kernel | ( | int * | d_idx, |
unsigned int * | d_npp, | ||
dVec * | d_vec, | ||
unsigned int * | particlesPerCell, | ||
int * | indices, | ||
dVec * | cellParticlePos, | ||
dVec * | d_pt, | ||
int * | d_assist, | ||
int * | d_adj, | ||
periodicBoundaryConditions | Box, | ||
Index2D | neighborIndexer, | ||
Index2D | cellListIndexer, | ||
IndexDD | cellIndexer, | ||
Index2D | adjacentCellIndexer, | ||
int | adjacentCellsPerCell, | ||
iVec | gridCellsPerSide, | ||
dVec | gridCellSizes, | ||
scalar | maxRange2, | ||
int | nmax, | ||
int | Np | ||
) |
compute a neighbor list with one thread for each cell to scan for every particle (i.e., # threads = N_cells*adjacentCellsPerCell)
References dot(), indices, periodicBoundaryConditions::minDist(), and iVec::x.
__global__ void gpu_compute_neighbor_list_kernel | ( | int * | d_idx, |
unsigned int * | d_npp, | ||
dVec * | d_vec, | ||
unsigned int * | particlesPerCell, | ||
int * | indices, | ||
dVec * | d_pt, | ||
int * | d_assist, | ||
int * | d_adj, | ||
periodicBoundaryConditions | Box, | ||
Index2D | neighborIndexer, | ||
Index2D | cellListIndexer, | ||
IndexDD | cellIndexer, | ||
Index2D | adjacentCellIndexer, | ||
int | adjacentCellsPerCell, | ||
iVec | gridCellsPerSide, | ||
dVec | gridCellSizes, | ||
scalar | maxRange2, | ||
int | nmax, | ||
int | Np | ||
) |
compute a neighbor list with one thread per particle
References dot(), idx, indices, periodicBoundaryConditions::minDist(), and iVec::x.
bool gpu_compute_neighbor_list | ( | int * | d_idx, |
unsigned int * | d_npp, | ||
dVec * | d_vec, | ||
unsigned int * | particlesPerCell, | ||
int * | indices, | ||
dVec * | cellParticlePos, | ||
dVec * | d_pt, | ||
int * | d_assist, | ||
int * | d_adj, | ||
periodicBoundaryConditions & | Box, | ||
Index2D | neighborIndexer, | ||
Index2D | cellListIndexer, | ||
IndexDD | cellIndexer, | ||
Index2D | adjacentCellIndexer, | ||
int | adjacentCellsPerCell, | ||
iVec | gridCellsPerSide, | ||
dVec | gridCellSizes, | ||
int | cellListNmax, | ||
scalar | maxRange, | ||
int | nmax, | ||
int | Np, | ||
int | maxBlockSize, | ||
bool | threadPerCell | ||
) |
compute neighbor list, one particle per thread
References HANDLE_ERROR, and indices.
Referenced by neighborList::computeGPU().
__global__ void initialize_RNG_array_kernel | ( | curandState * | state, |
int | N, | ||
int | Timestep, | ||
int | GlobalSeed | ||
) |
Each thread – most likely corresponding to each cell – is initialized with a different sequence of the same seed of a cudaRNG
References idx.
bool gpu_initialize_RNG_array | ( | curandState * | states, |
int | N, | ||
int | Timestep, | ||
int | GlobalSeed | ||
) |
Call the kernel to initialize a different RNG for each particle.
Initialize the GPU's random number generator.
References HANDLE_ERROR.
Referenced by noiseSource::initializeGPURNGs().
__global__ void reduce6 | ( | T * | g_idata, |
T * | g_odata, | ||
unsigned int | n | ||
) |
References n.
void reduce | ( | int | size, |
int | threads, | ||
int | blocks, | ||
T * | d_idata, | ||
T * | d_odata | ||
) |
access cuda sdk reduction6
T gpuReduction | ( | int | n, |
int | numThreads, | ||
int | numBlocks, | ||
int | maxThreads, | ||
int | maxBlocks, | ||
T * | d_idata, | ||
T * | d_odata | ||
) |
like benchmarkReduce, interfaces with reduce and returns result
References getNumBlocksAndThreads(), HANDLE_ERROR, and n.
Referenced by landauDeGennesLC::computeEnergyGPU(), and gpu_gpuarray_dVec_dot_products().
__global__ void gpu_serial_reduction_kernel | ( | scalar * | array, |
scalar * | output, | ||
int | helperIdx, | ||
int | N | ||
) |
add the first N elements of array and put it in output[helperIdx]
References scalar.
__global__ void gpu_serial_reduction_kernel2 | ( | scalar * | array, |
scalar * | output, | ||
int | helperIdx, | ||
int | N | ||
) |
add the first N elements of array and put it in output[helperIdx]...use shared memory a bit
References scalar.
Referenced by gpu_parallel_reduction().
perform a block reduction, storing the partial sums of input into output
References scalar.
a slight optimization of the previous block reduction, c.f. M. Harris presentation
References scalar.
multiple loads and loop unrolling... a slight optimization of the previous block reduction, c.f. M. Harris presentation
References scalar.
__global__ void gpu_vec_dot_product_kernel | ( | dVec * | input1, |
dVec * | input2, | ||
scalar * | output, | ||
int | N | ||
) |
__global__ void gpu_vec_dot_product_unrolled_kernel | ( | dVec * | input1, |
dVec * | input2, | ||
scalar * | output, | ||
int | N | ||
) |
Store the dot product of two dVecs in a scalar vec, unrolled by dimension
References idx.
__global__ void gpu_dVec_dot_products_kernel | ( | dVec * | input1, |
dVec * | input2, | ||
scalar * | output, | ||
int | N | ||
) |
__global__ void gpu_unrolled_dVec_dot_products_kernel | ( | dVec * | input1, |
dVec * | input2, | ||
scalar * | output, | ||
int | N | ||
) |
This kernel basically performs the operation of the "reduction2" kernel, but the shared memory gets dot products
References scalar.
__global__ void gpu_dot_dVec_vectors_kernel | ( | dVec * | d_vec1, |
dVec * | d_vec2, | ||
scalar * | d_ans, | ||
int | n | ||
) |
__global__ void gpu_dVec_times_scalar_kernel | ( | dVec * | d_vec1, |
scalar | factor, | ||
int | n | ||
) |
__global__ void gpu_dVec_times_scalar_kernel | ( | dVec * | d_vec1, |
scalar | factor, | ||
dVec * | d_ans, | ||
int | n | ||
) |
__global__ void gpu_dVec_plusEqual_dVec_kernel | ( | dVec * | d_vec1, |
dVec * | d_vec2, | ||
scalar | factor, | ||
int | n | ||
) |
bool gpu_dVec_plusEqual_dVec | ( | dVec * | d_vec1, |
dVec * | d_vec2, | ||
scalar | factor, | ||
int | N, | ||
int | maxBlockSize | ||
) |
bool gpu_dVec_times_scalar | ( | dVec * | d_vec1, |
scalar | factor, | ||
int | N | ||
) |
(dVec) input *= factor
d_vec1 | dVec input array |
factor | scalar multiplication factor |
N | the length of the arrays |
References HANDLE_ERROR.
Referenced by energyMinimizerLoLBFGS::LoLBFGSStepGPU().
bool gpu_dVec_times_scalar | ( | dVec * | d_vec1, |
scalar | factor, | ||
dVec * | d_ans, | ||
int | N | ||
) |
(dVec) ans = input * factor
References HANDLE_ERROR.
bool gpu_scalar_times_dVec_squared | ( | dVec * | d_vec1, |
scalar * | d_scalars, | ||
scalar | factor, | ||
scalar * | d_ans, | ||
int | N | ||
) |
ans = a*b[i]*c[i]^2
References HANDLE_ERROR.
bool gpu_dot_dVec_vectors | ( | dVec * | d_vec1, |
dVec * | d_vec2, | ||
scalar * | d_ans, | ||
int | N | ||
) |
(scalar) ans = (dVec) vec1 . vec2
d_vec1 | dVec input array |
d_vec2 | dVec input array |
d_ans | scalar output array... d_ans[idx] = d_vec1[idx].d_vec2[idx] |
N | the length of the arrays |
References HANDLE_ERROR.
scalar gpu_gpuarray_dVec_dot_products | ( | GPUArray< dVec > & | input1, |
GPUArray< dVec > & | input2, | ||
GPUArray< scalar > & | intermediate, | ||
GPUArray< scalar > & | intermediate2, | ||
int | N, | ||
int | block_size | ||
) |
A function of convenience: take the gpuarrays themselves and dot the data.
References ArrayHandle< T >::data, access_location::device, getNumBlocksAndThreads(), GPUArray< T >::getNumElements(), gpu_dVec_dot_products(), gpuReduction(), HANDLE_ERROR, access_location::host, access_mode::overwrite, access_mode::read, GPUArray< T >::resize(), and scalar.
Referenced by energyMinimizerFIRE::fireStepGPU(), energyMinimizerGradientDescent::gradientDescentGPU(), and energyMinimizerLoLBFGS::LoLBFGSStepGPU().
bool gpu_dVec_dot_products | ( | dVec * | input1, |
dVec * | input2, | ||
scalar * | intermediate, | ||
scalar * | intermediate2, | ||
scalar * | output, | ||
int | helperIdx, | ||
int | N, | ||
int | block_size | ||
) |
Take two vectors of dVecs and compute the sum of the dot products between them.
takes the dot product of every element of the two input arrays and performs a reduction on the sum
input1 | vector 1...wow! |
input2 | vector 2...wow! |
intermediate | an array that receives the element-wise dot products of the two input arrays |
intermediate2 | an array that input is block-reduced to |
output | the intermediate array will be sum reduced and stored in one of the components of output |
helperIdx | the location in output to store the answer |
N | the size of the input and intermediate arrays |
block_size | the...block size. doxygen is annoying sometimes |
References gpu_parallel_reduction(), and HANDLE_ERROR.
Referenced by gpu_gpuarray_dVec_dot_products(), and energyMinimizerNesterovAG::nesterovStepGPU().
bool gpu_dVec_dot_products | ( | dVec * | input1, |
dVec * | input2, | ||
scalar * | output, | ||
int | helperIdx, | ||
int | N | ||
) |
Take two vectors of dVecs and compute the sum of the dot products between them using thrust.
References HANDLE_ERROR.
bool gpu_parallel_reduction | ( | scalar * | input, |
scalar * | intermediate, | ||
scalar * | output, | ||
int | helperIdx, | ||
int | N, | ||
int | block_size | ||
) |
A straightforward two-step parallel reduction algorithm with block_size declared.
a two-step parallel reduction algorithm that first does a partial sum reduction of input into the intermediate array, then launches a second kernel to sum reduce intermediate into output[helperIdx]
input | the input array to sum |
intermediate | an array that input is block-reduced to |
output | the intermediate array will be sum reduced and stored in one of the components of output |
helperIdx | the location in output to store the answer |
N | the size of the input and intermediate arrays |
block_size | the...block size. doxygen is annoying sometimes |
References gpu_serial_reduction_kernel2(), HANDLE_ERROR, and scalar.
Referenced by gpu_dVec_dot_products().
A trivial reduction of an array by one thread in serial. Think before you use this.
This serial reduction routine should probably never be called. It provides an interface to the gpu_serial_reduction_kernel above that may be useful for testing.
References HANDLE_ERROR.
__global__ void gpu_set_array_kernel | ( | T * | arr, |
T | value, | ||
int | N | ||
) |
A function of convenience... set an array on the device
References idx.
bool gpu_set_array | ( | T * | array, |
T | value, | ||
int | N, | ||
int | maxBlockSize | ||
) |
set every element of an array to the specified value
References HANDLE_ERROR.
Referenced by simpleModel::computeForces(), energyMinimizerFIRE::fireStepGPU(), hyperrectangularCellList::resetCellSizes(), and neighborList::resetNeighborsGPU().
__global__ void gpu_copy_gpuarray_kernel | ( | T * | copyInto, |
T * | copyFrom, | ||
int | N | ||
) |
References idx.
bool gpu_copy_gpuarray | ( | GPUArray< T > & | copyInto, |
GPUArray< T > & | copyFrom, | ||
int | maxBlockSize | ||
) |
copy data into target on the device
References ArrayHandle< T >::data, access_location::device, GPUArray< T >::getNumElements(), HANDLE_ERROR, access_mode::overwrite, access_mode::read, and GPUArray< T >::resize().
Referenced by energyMinimizerLoLBFGS::LoLBFGSStepGPU().
scalar host_dVec_dot_products | ( | dVec * | input1, |
dVec * | input2, | ||
int | N | ||
) |
Take two vectors of dVecs and compute the sum of the dot products between them on the host.
References scalar.
Referenced by energyMinimizerLoLBFGS::lineSearchCPU(), and energyMinimizerLoLBFGS::LoLBFGSStepCPU().
void host_dVec_plusEqual_dVec | ( | dVec * | d_vec1, |
dVec * | d_vec2, | ||
scalar | factor, | ||
int | N | ||
) |
vec1 += a*vec2... on the host!
Referenced by energyMinimizerLoLBFGS::LoLBFGSStepCPU().
void host_dVec_times_scalar | ( | dVec * | d_vec1, |
scalar | factor, | ||
dVec * | d_ans, | ||
int | N | ||
) |
(dVec) ans = input * factor... on the host
Referenced by energyMinimizerLoLBFGS::LoLBFGSStepCPU().
template scalar gpuReduction< scalar > | ( | int | n, |
int | numThreads, | ||
int | numBlocks, | ||
int | maxThreads, | ||
int | maxBlocks, | ||
scalar * | d_idata, | ||
scalar * | d_odata | ||
) |
template int gpuReduction< int > | ( | int | n, |
int | numThreads, | ||
int | numBlocks, | ||
int | maxThreads, | ||
int | maxBlocks, | ||
int * | d_idata, | ||
int * | d_odata | ||
) |
template void reduce< int > | ( | int | size, |
int | threads, | ||
int | blocks, | ||
int * | d_idata, | ||
int * | d_odata | ||
) |
template void reduce< scalar > | ( | int | size, |
int | threads, | ||
int | blocks, | ||
scalar * | d_idata, | ||
scalar * | d_odata | ||
) |
template bool gpu_copy_gpuarray< dVec > | ( | GPUArray< dVec > & | copyInto, |
GPUArray< dVec > & | copyFrom, | ||
int | maxBlockSize | ||
) |
template bool gpu_copy_gpuarray< scalar > | ( | GPUArray< scalar > & | copyInto, |
GPUArray< scalar > & | copyFrom, | ||
int | maxBlockSize | ||
) |
template bool gpu_set_array< int > | ( | int * | , |
int | , | ||
int | , | ||
int | |||
) |
template bool gpu_set_array< unsigned int > | ( | unsigned int * | , |
unsigned int | , | ||
int | , | ||
int | |||
) |
template bool gpu_set_array< int2 > | ( | int2 * | , |
int2 | , | ||
int | , | ||
int | |||
) |
template bool gpu_set_array< scalar > | ( | scalar * | , |
scalar | , | ||
int | , | ||
int | |||
) |
template bool gpu_set_array< dVec > | ( | dVec * | , |
dVec | , | ||
int | , | ||
int | |||
) |
template bool gpu_set_array< cubicLatticeDerivativeVector > | ( | cubicLatticeDerivativeVector * | , |
cubicLatticeDerivativeVector | , | ||
int | , | ||
int | |||
) |
|
inline |
Referenced by getNumBlocksAndThreads().
|
inline |
References n, and nextPow2().
Referenced by landauDeGennesLC::computeEnergyGPU(), gpu_gpuarray_dVec_dot_products(), and gpuReduction().