Open Qmin: utility Kernels

CUDA kernels and callers for the utilities base. More...

Classes
struct	SharedMemory< T >

struct	SharedMemory< double >

Functions
__global__ void	gpu_compute_cell_list_kernel (dVec *d_pt, unsigned int *d_elementsPerCell, int *d_particleIndices, dVec *d_cellParticlePos, int Np, unsigned int Nmax, iVec gridCellsPerSide, dVec gridCellSizes, IndexDD cellIndexer, Index2D cellListIndexer, int *d_assist)

bool	gpu_compute_cell_list (dVec *d_pt, unsigned int *d_cell_sizes, int *d_idx, dVec *d_cellParticlePos, int Np, int &Nmax, iVec gridCellsPerSide, dVec gridCellSizes, BoxPtr Box, IndexDD &ci, Index2D &cli, int *d_assist)
	Find the set of indices of points in every cell bucket in the grid. More...

__global__ void	gpu_compute_neighbor_list_TPP_kernel (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *cellParticlePos, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, int cellListNmax, scalar maxRange, int nmax, int Np, int threadsPerParticle)

__global__ void	gpu_compute_neighbor_list_TPC_kernel (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *cellParticlePos, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, scalar maxRange2, int nmax, int Np)

__global__ void	gpu_compute_neighbor_list_kernel (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, scalar maxRange2, int nmax, int Np)

bool	gpu_compute_neighbor_list (int *d_idx, unsigned int *d_npp, dVec *d_vec, unsigned int *particlesPerCell, int *indices, dVec *cellParticlePos, dVec *d_pt, int *d_assist, int *d_adj, periodicBoundaryConditions &Box, Index2D neighborIndexer, Index2D cellListIndexer, IndexDD cellIndexer, Index2D adjacentCellIndexer, int adjacentCellsPerCell, iVec gridCellsPerSide, dVec gridCellSizes, int cellListNmax, scalar maxRange, int nmax, int Np, int maxBlockSize, bool threadPerCell)

__global__ void	initialize_RNG_array_kernel (curandState *state, int N, int Timestep, int GlobalSeed)

bool	gpu_initialize_RNG_array (curandState *states, int N, int Timestep, int GlobalSeed)
	Call the kernel to initialize a different RNG for each particle. More...

template<class T , unsigned int blockSize>
__global__ void	reduce6 (T *g_idata, T *g_odata, unsigned int n)

template<class T >
void	reduce (int size, int threads, int blocks, T *d_idata, T *d_odata)
	access cuda sdk reduction6 More...

template<class T >
T	gpuReduction (int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, T *d_idata, T *d_odata)
	like benchmarkReduce, interfaces with reduce and returns result More...

__global__ void	gpu_serial_reduction_kernel (scalar *array, scalar *output, int helperIdx, int N)

__global__ void	gpu_serial_reduction_kernel2 (scalar *array, scalar *output, int helperIdx, int N)

__global__ void	gpu_parallel_block_reduction_kernel (scalar *input, scalar *output, int N)

__global__ void	gpu_parallel_block_reduction2_kernel (scalar *input, scalar *output, int N)

__global__ void	gpu_parallel_block_reduction3_kernel (scalar *input, scalar *output, int N)

__global__ void	gpu_vec_dot_product_kernel (dVec *input1, dVec *input2, scalar *output, int N)

__global__ void	gpu_vec_dot_product_unrolled_kernel (dVec *input1, dVec *input2, scalar *output, int N)

__global__ void	gpu_dVec_dot_products_kernel (dVec *input1, dVec *input2, scalar *output, int N)

__global__ void	gpu_unrolled_dVec_dot_products_kernel (dVec *input1, dVec *input2, scalar *output, int N)

__global__ void	gpu_scalar_times_dVec_squared_kernel (dVec *d_vec1, scalar *d_scalars, scalar factor, scalar *d_ans, int n)

__global__ void	gpu_dot_dVec_vectors_kernel (dVec *d_vec1, dVec *d_vec2, scalar *d_ans, int n)

__global__ void	gpu_dVec_times_scalar_kernel (dVec *d_vec1, scalar factor, int n)

__global__ void	gpu_dVec_times_scalar_kernel (dVec *d_vec1, scalar factor, dVec *d_ans, int n)

__global__ void	gpu_dVec_plusEqual_dVec_kernel (dVec *d_vec1, dVec *d_vec2, scalar factor, int n)

bool	gpu_dVec_plusEqual_dVec (dVec *d_vec1, dVec *d_vec2, scalar factor, int N, int maxBlockSize=512)
	vec1 += a*vec2 More...

bool	gpu_dVec_times_scalar (dVec *d_vec1, scalar factor, int N)
	(dVec) input *= factor More...

bool	gpu_dVec_times_scalar (dVec *d_vec1, scalar factor, dVec *d_ans, int N)
	(dVec) ans = input * factor More...

bool	gpu_scalar_times_dVec_squared (dVec *d_vec1, scalar *d_scalars, scalar factor, scalar *d_answer, int N)
	ans = a*b[i]*c[i]^2 More...

bool	gpu_dot_dVec_vectors (dVec *d_vec1, dVec *d_vec2, scalar *d_ans, int N)
	(scalar) ans = (dVec) vec1 . vec2 More...

scalar	gpu_gpuarray_dVec_dot_products (GPUArray< dVec > &input1, GPUArray< dVec > &input2, GPUArray< scalar > &intermediate, GPUArray< scalar > &intermediate2, int N=0, int maxBlockSize=512)
	A function of convenience: take the gpuarrays themselves and dot the data. More...

bool	gpu_dVec_dot_products (dVec *input1, dVec *input2, scalar *intermediate, scalar *intermediate2, scalar *output, int helperIdx, int N, int block_size)
	Take two vectors of dVecs and compute the sum of the dot products between them. More...

bool	gpu_dVec_dot_products (dVec *input1, dVec *input2, scalar *output, int helperIdx, int N)
	Take two vectors of dVecs and compute the sum of the dot products between them using thrust. More...

bool	gpu_parallel_reduction (scalar *input, scalar *intermediate, scalar *output, int helperIdx, int N, int block_size)
	A straightforward two-step parallel reduction algorithm with block_size declared. More...

bool	gpu_serial_reduction (scalar *array, scalar *output, int helperIdx, int N)
	A trivial reduction of an array by one thread in serial. Think before you use this. More...

template<typename T >
__global__ void	gpu_set_array_kernel (T *arr, T value, int N)

template<typename T >
bool	gpu_set_array (T *arr, T value, int N, int maxBlockSize=512)
	set every element of an array to the specified value More...

template<typename T >
__global__ void	gpu_copy_gpuarray_kernel (T *copyInto, T *copyFrom, int N)

template<typename T >
bool	gpu_copy_gpuarray (GPUArray< T > &copyInto, GPUArray< T > &copyFrom, int block_size=512)
	copy data into target on the device More...

scalar	host_dVec_dot_products (dVec *input1, dVec *input2, int N)
	Take two vectors of dVecs and compute the sum of the dot products between them on the host. More...

void	host_dVec_plusEqual_dVec (dVec *d_vec1, dVec *d_vec2, scalar factor, int N)
	vec1 += a*vec2... on the host! More...

void	host_dVec_times_scalar (dVec *d_vec1, scalar factor, dVec *d_ans, int N)
	(dVec) ans = input * factor... on the host More...

template scalar	gpuReduction< scalar > (int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, scalar *d_idata, scalar *d_odata)

template int	gpuReduction< int > (int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, int *d_idata, int *d_odata)

template void	reduce< int > (int size, int threads, int blocks, int *d_idata, int *d_odata)

template void	reduce< scalar > (int size, int threads, int blocks, scalar *d_idata, scalar *d_odata)

template bool	gpu_copy_gpuarray< dVec > (GPUArray< dVec > &copyInto, GPUArray< dVec > &copyFrom, int maxBlockSize)

template bool	gpu_copy_gpuarray< scalar > (GPUArray< scalar > &copyInto, GPUArray< scalar > &copyFrom, int maxBlockSize)

template bool	gpu_set_array< int > (int *, int, int, int)

template bool	gpu_set_array< unsigned int > (unsigned int *, unsigned int, int, int)

template bool	gpu_set_array< int2 > (int2 *, int2, int, int)

template bool	gpu_set_array< scalar > (scalar *, scalar, int, int)

template bool	gpu_set_array< dVec > (dVec *, dVec, int, int)

template bool	gpu_set_array< cubicLatticeDerivativeVector > (cubicLatticeDerivativeVector *, cubicLatticeDerivativeVector, int, int)

unsigned int	nextPow2 (unsigned int x)

void	getNumBlocksAndThreads (int n, int maxBlocks, int maxThreads, int &blocks, int &threads)

Detailed Description

CUDA kernels and callers for the utilities base.

CUDA kernels and callers for generating rngs on the gpu.

CUDA kernels and callers.

Function Documentation

◆ gpu_compute_cell_list_kernel()

__global__ void gpu_compute_cell_list_kernel	(	dVec *	d_pt,
		unsigned int *	d_elementsPerCell,
		int *	d_particleIndices,
		dVec *	d_cellParticlePos,
		int	Np,
		unsigned int	Nmax,
		iVec	gridCellsPerSide,
		dVec	gridCellSizes,
		IndexDD	cellIndexer,
		Index2D	cellListIndexer,
		int *	d_assist
	)

Assign particles to bins, keep track of the number of particles per bin, etc.

References Index2D::getNumElements(), idx, and iVec::x.

◆ gpu_compute_cell_list()

bool gpu_compute_cell_list	(	dVec *	d_pt,
		unsigned int *	d_cell_sizes,
		int *	d_idx,
		dVec *	d_cellParticlePos,
		int	Np,
		int &	Nmax,
		iVec	gridCellsPerSide,
		dVec	gridCellSizes,
		BoxPtr	Box,
		IndexDD &	ci,
		Index2D &	cli,
		int *	d_assist
	)

Find the set of indices of points in every cell bucket in the grid.

References HANDLE_ERROR.

Referenced by hyperrectangularCellList::computeGPU().

◆ gpu_compute_neighbor_list_TPP_kernel()

__global__ void gpu_compute_neighbor_list_TPP_kernel	(	int *	d_idx,
		unsigned int *	d_npp,
		dVec *	d_vec,
		unsigned int *	particlesPerCell,
		int *	indices,
		dVec *	cellParticlePos,
		dVec *	d_pt,
		int *	d_assist,
		int *	d_adj,
		periodicBoundaryConditions	Box,
		Index2D	neighborIndexer,
		Index2D	cellListIndexer,
		IndexDD	cellIndexer,
		Index2D	adjacentCellIndexer,
		int	adjacentCellsPerCell,
		iVec	gridCellsPerSide,
		dVec	gridCellSizes,
		int	cellListNmax,
		scalar	maxRange,
		int	nmax,
		int	Np,
		int	threadsPerParticle
	)

compute a neighbor list with some value of threads per particle (i.e., # threads = N_cells*adjacentCellsPerCell)

References dot(), indices, periodicBoundaryConditions::minDist(), and iVec::x.

◆ gpu_compute_neighbor_list_TPC_kernel()

__global__ void gpu_compute_neighbor_list_TPC_kernel	(	int *	d_idx,
		unsigned int *	d_npp,
		dVec *	d_vec,
		unsigned int *	particlesPerCell,
		int *	indices,
		dVec *	cellParticlePos,
		dVec *	d_pt,
		int *	d_assist,
		int *	d_adj,
		periodicBoundaryConditions	Box,
		Index2D	neighborIndexer,
		Index2D	cellListIndexer,
		IndexDD	cellIndexer,
		Index2D	adjacentCellIndexer,
		int	adjacentCellsPerCell,
		iVec	gridCellsPerSide,
		dVec	gridCellSizes,
		scalar	maxRange2,
		int	nmax,
		int	Np
	)

compute a neighbor list with one thread for each cell to scan for every particle (i.e., # threads = N_cells*adjacentCellsPerCell)

References dot(), indices, periodicBoundaryConditions::minDist(), and iVec::x.

◆ gpu_compute_neighbor_list_kernel()

__global__ void gpu_compute_neighbor_list_kernel	(	int *	d_idx,
		unsigned int *	d_npp,
		dVec *	d_vec,
		unsigned int *	particlesPerCell,
		int *	indices,
		dVec *	d_pt,
		int *	d_assist,
		int *	d_adj,
		periodicBoundaryConditions	Box,
		Index2D	neighborIndexer,
		Index2D	cellListIndexer,
		IndexDD	cellIndexer,
		Index2D	adjacentCellIndexer,
		int	adjacentCellsPerCell,
		iVec	gridCellsPerSide,
		dVec	gridCellSizes,
		scalar	maxRange2,
		int	nmax,
		int	Np
	)

compute a neighbor list with one thread per particle

References dot(), idx, indices, periodicBoundaryConditions::minDist(), and iVec::x.

◆ gpu_compute_neighbor_list()

bool gpu_compute_neighbor_list	(	int *	d_idx,
		unsigned int *	d_npp,
		dVec *	d_vec,
		unsigned int *	particlesPerCell,
		int *	indices,
		dVec *	cellParticlePos,
		dVec *	d_pt,
		int *	d_assist,
		int *	d_adj,
		periodicBoundaryConditions &	Box,
		Index2D	neighborIndexer,
		Index2D	cellListIndexer,
		IndexDD	cellIndexer,
		Index2D	adjacentCellIndexer,
		int	adjacentCellsPerCell,
		iVec	gridCellsPerSide,
		dVec	gridCellSizes,
		int	cellListNmax,
		scalar	maxRange,
		int	nmax,
		int	Np,
		int	maxBlockSize,
		bool	threadPerCell
	)

compute neighbor list, one particle per thread

References HANDLE_ERROR, and indices.

Referenced by neighborList::computeGPU().

◆ initialize_RNG_array_kernel()

__global__ void initialize_RNG_array_kernel	(	curandState *	state,
		int	N,
		int	Timestep,
		int	GlobalSeed
	)

Each thread – most likely corresponding to each cell – is initialized with a different sequence of the same seed of a cudaRNG

References idx.

◆ gpu_initialize_RNG_array()

bool gpu_initialize_RNG_array	(	curandState *	states,
		int	N,
		int	Timestep,
		int	GlobalSeed
	)

Call the kernel to initialize a different RNG for each particle.

Initialize the GPU's random number generator.

References HANDLE_ERROR.

Referenced by noiseSource::initializeGPURNGs().

◆ reduce6()

template<class T , unsigned int blockSize>

__global__ void reduce6	(	T *	g_idata,
		T *	g_odata,
		unsigned int	n
	)

References n.

◆ reduce()

template<class T >

void reduce	(	int	size,
		int	threads,
		int	blocks,
		T *	d_idata,
		T *	d_odata
	)

access cuda sdk reduction6

◆ gpuReduction()

template<class T >

T gpuReduction	(	int	n,
		int	numThreads,
		int	numBlocks,
		int	maxThreads,
		int	maxBlocks,
		T *	d_idata,
		T *	d_odata
	)

like benchmarkReduce, interfaces with reduce and returns result

References getNumBlocksAndThreads(), HANDLE_ERROR, and n.

Referenced by landauDeGennesLC::computeEnergyGPU(), and gpu_gpuarray_dVec_dot_products().

◆ gpu_serial_reduction_kernel()

__global__ void gpu_serial_reduction_kernel	(	scalar *	array,
		scalar *	output,
		int	helperIdx,
		int	N
	)

add the first N elements of array and put it in output[helperIdx]

References scalar.

◆ gpu_serial_reduction_kernel2()

__global__ void gpu_serial_reduction_kernel2	(	scalar *	array,
		scalar *	output,
		int	helperIdx,
		int	N
	)

add the first N elements of array and put it in output[helperIdx]...use shared memory a bit

References scalar.

Referenced by gpu_parallel_reduction().

◆ gpu_parallel_block_reduction_kernel()

__global__ void gpu_parallel_block_reduction_kernel	(	scalar *	input,
		scalar *	output,
		int	N
	)

perform a block reduction, storing the partial sums of input into output

References scalar.

◆ gpu_parallel_block_reduction2_kernel()

__global__ void gpu_parallel_block_reduction2_kernel	(	scalar *	input,
		scalar *	output,
		int	N
	)

a slight optimization of the previous block reduction, c.f. M. Harris presentation

References scalar.

◆ gpu_parallel_block_reduction3_kernel()

__global__ void gpu_parallel_block_reduction3_kernel	(	scalar *	input,
		scalar *	output,
		int	N
	)

multiple loads and loop unrolling... a slight optimization of the previous block reduction, c.f. M. Harris presentation

References scalar.

◆ gpu_vec_dot_product_kernel()

__global__ void gpu_vec_dot_product_kernel	(	dVec *	input1,
		dVec *	input2,
		scalar *	output,
		int	N
	)

Store the dot product of two dVecs in a scalar vec

References dot(), and idx.

◆ gpu_vec_dot_product_unrolled_kernel()

__global__ void gpu_vec_dot_product_unrolled_kernel	(	dVec *	input1,
		dVec *	input2,
		scalar *	output,
		int	N
	)

Store the dot product of two dVecs in a scalar vec, unrolled by dimension

References idx.

◆ gpu_dVec_dot_products_kernel()

__global__ void gpu_dVec_dot_products_kernel	(	dVec *	input1,
		dVec *	input2,
		scalar *	output,
		int	N
	)

This kernel basically performs the operation of the "reduction2" kernel, but the shared memory gets dot products...BROKEN

References dot(), and scalar.

◆ gpu_unrolled_dVec_dot_products_kernel()

__global__ void gpu_unrolled_dVec_dot_products_kernel	(	dVec *	input1,
		dVec *	input2,
		scalar *	output,
		int	N
	)

This kernel basically performs the operation of the "reduction2" kernel, but the shared memory gets dot products

References scalar.

◆ gpu_scalar_times_dVec_squared_kernel()

__global__ void gpu_scalar_times_dVec_squared_kernel	(	dVec *	d_vec1,
		scalar *	d_scalars,
		scalar	factor,
		scalar *	d_ans,
		int	n
	)

take a vector of dVecs, a vector of scalars, a factor, and return a vector where every entry is factor*scalar[i]*(dVec[i])^2

References dot(), idx, and n.

◆ gpu_dot_dVec_vectors_kernel()

__global__ void gpu_dot_dVec_vectors_kernel	(	dVec *	d_vec1,
		dVec *	d_vec2,
		scalar *	d_ans,
		int	n
	)

take two vectors of dVecs and return a vector of scalars, where each entry is vec1[i].vec2[i]

References dot(), idx, and n.

◆ gpu_dVec_times_scalar_kernel() [1/2]

__global__ void gpu_dVec_times_scalar_kernel	(	dVec *	d_vec1,
		scalar	factor,
		int	n
	)

multiply every element of an array of dVecs by the same scalar

References idx, and n.

◆ gpu_dVec_times_scalar_kernel() [2/2]

__global__ void gpu_dVec_times_scalar_kernel	(	dVec *	d_vec1,
		scalar	factor,
		dVec *	d_ans,
		int	n
	)

multiply every element of an array of dVecs by the same scalar

References idx, and n.

◆ gpu_dVec_plusEqual_dVec_kernel()

__global__ void gpu_dVec_plusEqual_dVec_kernel	(	dVec *	d_vec1,
		dVec *	d_vec2,
		scalar	factor,
		int	n
	)

References idx, and n.

◆ gpu_dVec_plusEqual_dVec()

bool gpu_dVec_plusEqual_dVec	(	dVec *	d_vec1,
		dVec *	d_vec2,
		scalar	factor,
		int	N,
		int	maxBlockSize
	)

vec1 += a*vec2

References HANDLE_ERROR.

Referenced by energyMinimizerLoLBFGS::LoLBFGSStepGPU().

◆ gpu_dVec_times_scalar() [1/2]

bool gpu_dVec_times_scalar	(	dVec *	d_vec1,
		scalar	factor,
		int	N
	)

(dVec) input *= factor

Parameters

d_vec1	dVec input array
factor	scalar multiplication factor
N	the length of the arrays

Postcondition: d_vec1 *= factor for every element

References HANDLE_ERROR.

Referenced by energyMinimizerLoLBFGS::LoLBFGSStepGPU().

◆ gpu_dVec_times_scalar() [2/2]

bool gpu_dVec_times_scalar	(	dVec *	d_vec1,
		scalar	factor,
		dVec *	d_ans,
		int	N
	)

(dVec) ans = input * factor

References HANDLE_ERROR.

◆ gpu_scalar_times_dVec_squared()

bool gpu_scalar_times_dVec_squared	(	dVec *	d_vec1,
		scalar *	d_scalars,
		scalar	factor,
		scalar *	d_ans,
		int	N
	)

ans = a*b[i]*c[i]^2

References HANDLE_ERROR.

◆ gpu_dot_dVec_vectors()

bool gpu_dot_dVec_vectors	(	dVec *	d_vec1,
		dVec *	d_vec2,
		scalar *	d_ans,
		int	N
	)

(scalar) ans = (dVec) vec1 . vec2

Parameters

d_vec1	dVec input array
d_vec2	dVec input array
d_ans	scalar output array... d_ans[idx] = d_vec1[idx].d_vec2[idx]
N	the length of the arrays

Postcondition: d_ans = d_vec1.d_vec2

References HANDLE_ERROR.

◆ gpu_gpuarray_dVec_dot_products()

scalar gpu_gpuarray_dVec_dot_products	(	GPUArray< dVec > &	input1,
		GPUArray< dVec > &	input2,
		GPUArray< scalar > &	intermediate,
		GPUArray< scalar > &	intermediate2,
		int	N,
		int	block_size
	)

A function of convenience: take the gpuarrays themselves and dot the data.

References ArrayHandle< T >::data, access_location::device, getNumBlocksAndThreads(), GPUArray< T >::getNumElements(), gpu_dVec_dot_products(), gpuReduction(), HANDLE_ERROR, access_location::host, access_mode::overwrite, access_mode::read, GPUArray< T >::resize(), and scalar.

Referenced by energyMinimizerFIRE::fireStepGPU(), energyMinimizerGradientDescent::gradientDescentGPU(), and energyMinimizerLoLBFGS::LoLBFGSStepGPU().

◆ gpu_dVec_dot_products() [1/2]

bool gpu_dVec_dot_products	(	dVec *	input1,
		dVec *	input2,
		scalar *	intermediate,
		scalar *	intermediate2,
		scalar *	output,
		int	helperIdx,
		int	N,
		int	block_size
	)

Take two vectors of dVecs and compute the sum of the dot products between them.

takes the dot product of every element of the two input arrays and performs a reduction on the sum

Parameters

input1	vector 1...wow!
input2	vector 2...wow!
intermediate	an array that receives the element-wise dot products of the two inputs
intermediate2	an array that input is block-reduced to
output	the intermediate array will be sum reduced and stored in one of the components of output
helperIdx	the location in output to store the answer
N	the size of the input and intermediate arrays
block_size	the...block size. doxygen is annoying sometimes

References gpu_parallel_reduction(), and HANDLE_ERROR.

Referenced by gpu_gpuarray_dVec_dot_products(), and energyMinimizerNesterovAG::nesterovStepGPU().

◆ gpu_dVec_dot_products() [2/2]

bool gpu_dVec_dot_products	(	dVec *	input1,
		dVec *	input2,
		scalar *	output,
		int	helperIdx,
		int	N
	)

Take two vectors of dVecs and compute the sum of the dot products between them using thrust.

References HANDLE_ERROR.

◆ gpu_parallel_reduction()

bool gpu_parallel_reduction	(	scalar *	input,
		scalar *	intermediate,
		scalar *	output,
		int	helperIdx,
		int	N,
		int	block_size
	)

A straightforward two-step parallel reduction algorithm with block_size declared.

a two-step parallel reduction algorithm that first does a partial sum reduction of input into the intermediate array, then launches a second kernel to sum reduce intermediate into output[helperIdx]

Parameters

input	the input array to sum
intermediate	an array that input is block-reduced to
output	the intermediate array will be sum reduced and stored in one of the components of output
helperIdx	the location in output to store the answer
N	the size of the input and intermediate arrays
block_size	the...block size. doxygen is annoying sometimes

References gpu_serial_reduction_kernel2(), HANDLE_ERROR, and scalar.

Referenced by gpu_dVec_dot_products().

◆ gpu_serial_reduction()

bool gpu_serial_reduction	(	scalar *	array,
		scalar *	output,
		int	helperIdx,
		int	N
	)

A trivial reduction of an array by one thread in serial. Think before you use this.

This serial reduction routine should probably never be called. It provides an interface to the gpu_serial_reduction_kernel above that may be useful for testing.

References HANDLE_ERROR.

◆ gpu_set_array_kernel()

template<typename T >

__global__ void gpu_set_array_kernel	(	T *	arr,
		T	value,
		int	N
	)

A function of convenience... set an array on the device

References idx.

◆ gpu_set_array()

template<typename T >

bool gpu_set_array	(	T *	array,
		T	value,
		int	N,
		int	maxBlockSize
	)

set every element of an array to the specified value

References HANDLE_ERROR.

Referenced by simpleModel::computeForces(), energyMinimizerFIRE::fireStepGPU(), hyperrectangularCellList::resetCellSizes(), and neighborList::resetNeighborsGPU().

◆ gpu_copy_gpuarray_kernel()

template<typename T >

__global__ void gpu_copy_gpuarray_kernel	(	T *	copyInto,
		T *	copyFrom,
		int	N
	)

References idx.

◆ gpu_copy_gpuarray()

template<typename T >

bool gpu_copy_gpuarray	(	GPUArray< T > &	copyInto,
		GPUArray< T > &	copyFrom,
		int	maxBlockSize
	)

copy data into target on the device

References ArrayHandle< T >::data, access_location::device, GPUArray< T >::getNumElements(), HANDLE_ERROR, access_mode::overwrite, access_mode::read, and GPUArray< T >::resize().

Referenced by energyMinimizerLoLBFGS::LoLBFGSStepGPU().

◆ host_dVec_dot_products()

scalar host_dVec_dot_products	(	dVec *	input1,
		dVec *	input2,
		int	N
	)

Take two vectors of dVecs and compute the sum of the dot products between them on the host.

References scalar.

Referenced by energyMinimizerLoLBFGS::lineSearchCPU(), and energyMinimizerLoLBFGS::LoLBFGSStepCPU().

◆ host_dVec_plusEqual_dVec()

void host_dVec_plusEqual_dVec	(	dVec *	d_vec1,
		dVec *	d_vec2,
		scalar	factor,
		int	N
	)

vec1 += a*vec2... on the host!

Referenced by energyMinimizerLoLBFGS::LoLBFGSStepCPU().

◆ host_dVec_times_scalar()

void host_dVec_times_scalar	(	dVec *	d_vec1,
		scalar	factor,
		dVec *	d_ans,
		int	N
	)

(dVec) ans = input * factor... on the host

Referenced by energyMinimizerLoLBFGS::LoLBFGSStepCPU().

◆ gpuReduction< scalar >()

template scalar gpuReduction< scalar >	(	int	n,
		int	numThreads,
		int	numBlocks,
		int	maxThreads,
		int	maxBlocks,
		scalar *	d_idata,
		scalar *	d_odata
	)

◆ gpuReduction< int >()

template int gpuReduction< int >	(	int	n,
		int	numThreads,
		int	numBlocks,
		int	maxThreads,
		int	maxBlocks,
		int *	d_idata,
		int *	d_odata
	)

◆ reduce< int >()

template void reduce< int >	(	int	size,
		int	threads,
		int	blocks,
		int *	d_idata,
		int *	d_odata
	)

◆ reduce< scalar >()

template void reduce< scalar >	(	int	size,
		int	threads,
		int	blocks,
		scalar *	d_idata,
		scalar *	d_odata
	)

◆ gpu_copy_gpuarray< dVec >()

template bool gpu_copy_gpuarray< dVec >	(	GPUArray< dVec > &	copyInto,
		GPUArray< dVec > &	copyFrom,
		int	maxBlockSize
	)

◆ gpu_copy_gpuarray< scalar >()

template bool gpu_copy_gpuarray< scalar >	(	GPUArray< scalar > &	copyInto,
		GPUArray< scalar > &	copyFrom,
		int	maxBlockSize
	)

◆ gpu_set_array< int >()

template bool gpu_set_array< int >	(	int *	,
		int	,
		int	,
		int
	)

◆ gpu_set_array< unsigned int >()

template bool gpu_set_array< unsigned int >	(	unsigned int *	,
		unsigned int	,
		int	,
		int
	)

◆ gpu_set_array< int2 >()

template bool gpu_set_array< int2 >	(	int2 *	,
		int2	,
		int	,
		int
	)

◆ gpu_set_array< scalar >()

template bool gpu_set_array< scalar >	(	scalar *	,
		scalar	,
		int	,
		int
	)

◆ gpu_set_array< dVec >()

template bool gpu_set_array< dVec >	(	dVec *	,
		dVec	,
		int	,
		int
	)

◆ gpu_set_array< cubicLatticeDerivativeVector >()

template bool gpu_set_array< cubicLatticeDerivativeVector >	(	cubicLatticeDerivativeVector *	,
		cubicLatticeDerivativeVector	,
		int	,
		int
	)

◆ nextPow2()

unsigned int nextPow2 ( unsigned int x )

inline

Referenced by getNumBlocksAndThreads().

◆ getNumBlocksAndThreads()

void getNumBlocksAndThreads	(	int	n,
		int	maxBlocks,
		int	maxThreads,
		int &	blocks,
		int &	threads
	)

inline

References n, and nextPow2().

Referenced by landauDeGennesLC::computeEnergyGPU(), gpu_gpuarray_dVec_dot_products(), and gpuReduction().

Classes

Functions

Detailed Description

Function Documentation

◆ gpu_compute_cell_list_kernel()

◆ gpu_compute_cell_list()

◆ gpu_compute_neighbor_list_TPP_kernel()

◆ gpu_compute_neighbor_list_TPC_kernel()

◆ gpu_compute_neighbor_list_kernel()

◆ gpu_compute_neighbor_list()

◆ initialize_RNG_array_kernel()

◆ gpu_initialize_RNG_array()

◆ reduce6()

◆ reduce()

◆ gpuReduction()

◆ gpu_serial_reduction_kernel()

◆ gpu_serial_reduction_kernel2()

◆ gpu_parallel_block_reduction_kernel()

◆ gpu_parallel_block_reduction2_kernel()

◆ gpu_parallel_block_reduction3_kernel()

◆ gpu_vec_dot_product_kernel()

◆ gpu_vec_dot_product_unrolled_kernel()

◆ gpu_dVec_dot_products_kernel()

◆ gpu_unrolled_dVec_dot_products_kernel()

◆ gpu_scalar_times_dVec_squared_kernel()

◆ gpu_dot_dVec_vectors_kernel()

◆ gpu_dVec_times_scalar_kernel() [1/2]

◆ gpu_dVec_times_scalar_kernel() [2/2]

◆ gpu_dVec_plusEqual_dVec_kernel()

◆ gpu_dVec_plusEqual_dVec()

◆ gpu_dVec_times_scalar() [1/2]

◆ gpu_dVec_times_scalar() [2/2]

◆ gpu_scalar_times_dVec_squared()

◆ gpu_dot_dVec_vectors()

◆ gpu_gpuarray_dVec_dot_products()

◆ gpu_dVec_dot_products() [1/2]

◆ gpu_dVec_dot_products() [2/2]

◆ gpu_parallel_reduction()

◆ gpu_serial_reduction()

◆ gpu_set_array_kernel()

◆ gpu_set_array()

◆ gpu_copy_gpuarray_kernel()

◆ gpu_copy_gpuarray()

◆ host_dVec_dot_products()

◆ host_dVec_plusEqual_dVec()

◆ host_dVec_times_scalar()

◆ gpuReduction< scalar >()

◆ gpuReduction< int >()

◆ reduce< int >()

◆ reduce< scalar >()

◆ gpu_copy_gpuarray< dVec >()

◆ gpu_copy_gpuarray< scalar >()

◆ gpu_set_array< int >()

◆ gpu_set_array< unsigned int >()

◆ gpu_set_array< int2 >()

◆ gpu_set_array< scalar >()

◆ gpu_set_array< dVec >()

◆ gpu_set_array< cubicLatticeDerivativeVector >()

◆ nextPow2()

◆ getNumBlocksAndThreads()