CellGPU
0.8.0
GPU-accelerated simulations of cells
|
CUDA kernels and callers for the utilities base. More...
Functions | |
bool | gpu_initialize_RNG_array (curandState *states, int N, int Timestep, int GlobalSeed) |
Initialize the GPU's random number generator. More... | |
bool | gpu_dot_Dscalar2_vectors (Dscalar2 *d_vec1, Dscalar2 *d_vec2, Dscalar *d_ans, int N) |
(Dscalar) ans = (Dscalar2) vec1 . vec2 More... | |
bool | gpu_serial_reduction (Dscalar *array, Dscalar *output, int helperIdx, int N) |
A trivial reduction of an array by one thread in serial. Think before you use this. More... | |
bool | gpu_parallel_reduction (Dscalar *input, Dscalar *intermediate, Dscalar *output, int helperIdx, int N) |
A straightforward two-step parallel reduction algorithm. More... | |
bool | gpu_parallel_reduction (Dscalar2 *input, Dscalar2 *intermediate, Dscalar2 *output, int helperIdx, int N) |
A straightforward two-step parallel reduction algorithm for Dscalar2 arrays. More... | |
bool | gpu_dot_Dscalar_Dscalar2_vectors (Dscalar *d_vec1, Dscalar2 *d_vec2, Dscalar2 *d_ans, int N) |
(Dscalar2) ans = (Dscalar) vec1 * (Dscalar2) vec2 More... | |
__global__ void | initialize_RNG_array_kernel (curandState *state, int N, int Timestep, int GlobalSeed) |
__global__ void | gpu_dot_Dscalar_Dscalar2_vectors_kernel (Dscalar *d_vec1, Dscalar2 *d_vec2, Dscalar2 *d_ans, int n) |
__global__ void | gpu_dot_Dscalar2_vectors_kernel (Dscalar2 *d_vec1, Dscalar2 *d_vec2, Dscalar *d_ans, int n) |
__global__ void | gpu_serial_reduction_kernel (Dscalar *array, Dscalar *output, int helperIdx, int N) |
__global__ void | gpu_serial_reduction_kernel (Dscalar2 *array, Dscalar2 *output, int helperIdx, int N) |
__global__ void | gpu_parallel_block_reduction_kernel (Dscalar *input, Dscalar *output, int N) |
__global__ void | gpu_parallel_block_reduction2_kernel (Dscalar *input, Dscalar *output, int N) |
__global__ void | gpu_parallel_block_reduction2_kernel (Dscalar2 *input, Dscalar2 *output, int N) |
CUDA kernels and callers for the utilities base.
CUDA kernels and callers for generating RNGs on the GPU.
bool gpu_initialize_RNG_array | ( | curandState * | states, |
int | N, | ||
int | Timestep, | ||
int | GlobalSeed | ||
) |
Initialize the GPU's random number generator.
Call the kernel to initialize a different RNG for each particle.
Referenced by noiseSource::initializeGPURNGs().
bool gpu_dot_Dscalar2_vectors | ( | Dscalar2 * | d_vec1, |
Dscalar2 * | d_vec2, | ||
Dscalar * | d_ans, | ||
int | N | ||
) |
(Dscalar) ans = (Dscalar2) vec1 . vec2
d_vec1 | Dscalar2 input array |
d_vec2 | Dscalar2 input array |
d_ans | Dscalar output array... d_ans[idx] = d_vec1[idx].d_vec2[idx] |
N | the length of the arrays |
Referenced by EnergyMinimizerFIRE::fireStepGPU().
bool gpu_serial_reduction | ( | Dscalar * | array, |
Dscalar * | output, | ||
int | helperIdx, | ||
int | N | ||
) |
A trivial reduction of an array by one thread in serial. Think before you use this.
This serial reduction routine should probably never be called. It provides an interface to gpu_serial_reduction_kernel that may be useful for testing.
bool gpu_parallel_reduction | ( | Dscalar * | input, |
Dscalar * | intermediate, | ||
Dscalar * | output, | ||
int | helperIdx, | ||
int | N | ||
) |
A straightforward two-step parallel reduction algorithm.
A two-step parallel reduction algorithm: the first kernel performs a partial (per-block) sum reduction of input into the intermediate array, and a second kernel then sum-reduces intermediate into output[helperIdx].
input | the input array to sum |
intermediate | an array that input is block-reduced to |
output | the intermediate array will be sum reduced and stored in one of the components of output |
helperIdx | the location in output to store the answer |
N | the size of the input and intermediate arrays |
Referenced by NoseHooverChainNVT::calculateKineticEnergyGPU(), EnergyMinimizerFIRE::fireStepGPU(), and setTotalLinearMomentum::setLinearMomentumGPU().
bool gpu_parallel_reduction | ( | Dscalar2 * | input, |
Dscalar2 * | intermediate, | ||
Dscalar2 * | output, | ||
int | helperIdx, | ||
int | N | ||
) |
A straightforward two-step parallel reduction algorithm for Dscalar2 arrays.
A two-step parallel reduction algorithm for Dscalar2 arrays: the first kernel performs a partial (per-block) sum reduction of input into the intermediate array, and a second kernel then sum-reduces intermediate into output[helperIdx].
input | the input array to sum |
intermediate | an array that input is block-reduced to |
output | the intermediate array will be sum reduced and stored in one of the components of output |
helperIdx | the location in output to store the answer |
N | the size of the input and intermediate arrays |
bool gpu_dot_Dscalar_Dscalar2_vectors | ( | Dscalar * | d_vec1, |
Dscalar2 * | d_vec2, | ||
Dscalar2 * | d_ans, | ||
int | N | ||
) |
(Dscalar2) ans = (Dscalar) vec1 * (Dscalar2) vec2
d_vec1 | Dscalar input array |
d_vec2 | Dscalar2 input array |
d_ans | Dscalar2 output array... d_ans[idx] = d_vec1[idx] * d_vec2[idx] |
N | the length of the arrays |
Referenced by setTotalLinearMomentum::setLinearMomentumGPU().
__global__ void initialize_RNG_array_kernel | ( | curandState * | state, |
int | N, | ||
int | Timestep, | ||
int | GlobalSeed | ||
) |
Each thread – most likely corresponding to one cell – is initialized with the same seed but a different sequence of the cuRAND random number generator.
__global__ void gpu_dot_Dscalar_Dscalar2_vectors_kernel | ( | Dscalar * | d_vec1, |
Dscalar2 * | d_vec2, | ||
Dscalar2 * | d_ans, | ||
int | n | ||
) |
Take a vector of Dscalars and a vector of Dscalar2s and return a vector of Dscalar2s, where each entry is the scalar-vector product vec1[i] * vec2[i].
__global__ void gpu_dot_Dscalar2_vectors_kernel | ( | Dscalar2 * | d_vec1, |
Dscalar2 * | d_vec2, | ||
Dscalar * | d_ans, | ||
int | n | ||
) |
take two vectors of Dscalar2 and return a vector of Dscalars, where each entry is vec1[i].vec2[i]
__global__ void gpu_serial_reduction_kernel | ( | Dscalar * | array, |
Dscalar * | output, | ||
int | helperIdx, | ||
int | N | ||
) |
Add the first N elements of array and store the sum in output[helperIdx].
__global__ void gpu_serial_reduction_kernel | ( | Dscalar2 * | array, |
Dscalar2 * | output, | ||
int | helperIdx, | ||
int | N | ||
) |
Add the first N elements of a Dscalar2 array and store the sum in output[helperIdx].
__global__ void gpu_parallel_block_reduction_kernel | ( | Dscalar * | input, |
Dscalar * | output, | ||
int | N | ||
) |
perform a block reduction, storing the partial sums of input into output
__global__ void gpu_parallel_block_reduction2_kernel | ( | Dscalar * | input, |
Dscalar * | output, | ||
int | N | ||
) |
a slight optimization of the previous block reduction, c.f. M. Harris presentation
__global__ void gpu_parallel_block_reduction2_kernel | ( | Dscalar2 * | input, |
Dscalar2 * | output, | ||
int | N | ||
) |
block reduction for Dscalar2 arrays, c.f. M. Harris presentation