copy constant memory to all devices - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
git clone git://src.adamsgaard.dk/sphere
---
commit b620d26c6032c277e46b2e68fc566449bcdc015b
parent 6de1181f3e695a80fe56a139e3384cba82034404
Author: Anders Damsgaard <anders.damsgaard@geo.au.dk>
Date: Mon, 30 Jun 2014 10:53:33 +0200
copy constant memory to all devices
Diffstat:
M src/contactsearch.cuh | 2 --
M src/device.cu | 95 +++++++++++++++----------------
M src/sphere.h | 3 +++
3 files changed, 49 insertions(+), 51 deletions(-)
---
diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
@@ -485,8 +485,6 @@ __global__ void interact(
mempos = (unsigned int)(idx_a_orig * devC_nc + i);
__syncthreads();
idx_b_orig = dev_contacts[mempos];
- //radius_b = distmod.w;
-
if (idx_b_orig != (unsigned int)devC_np) {
diff --git a/src/device.cu b/src/device.cu
@@ -67,6 +67,7 @@ __host__ void DEM::initializeGPU(void)
// Register number of devices
cudaGetDeviceCount(&deviceCount);
+ ndevices = deviceCount; // store in DEM class
if (deviceCount == 0) {
std::cerr << "\nERROR: No CUDA-enabled devices availible. Bye."
@@ -84,7 +85,7 @@ __host__ void DEM::initializeGPU(void)
// Loop through GPU's and choose the one with the most CUDA cores
int ncudacores;
int max_ncudacores = 0;
- for (int d=0; d<deviceCount; d++) {
+ for (int d=0; d<ndevices; d++) {
cudaGetDeviceProperties(&prop, d);
cudaDriverGetVersion(&cudaDriverVersion);
cudaRuntimeGetVersion(&cudaRuntimeVersion);
@@ -107,6 +108,7 @@ __host__ void DEM::initializeGPU(void)
}
}
+ device = cudadevice; // store in DEM class
cout << " Using CUDA device ID " << cudadevice << " with "
<< max_ncudacores << " cores." << std::endl;
@@ -140,7 +142,6 @@ __global__ void checkConstantValues(int* dev_equal,
Grid* dev_grid,
Params* dev_params)
{
-
// Values ok (0)
*dev_equal = 0;
@@ -158,7 +159,6 @@ __global__ void checkConstantValues(int* dev_equal,
dev_grid->periodic != devC_grid.periodic)
*dev_equal = 1; // Not ok
-
else if (dev_params->g[0] != devC_params.g[0] ||
dev_params->g[1] != devC_params.g[1] ||
dev_params->g[2] != devC_params.g[2] ||
@@ -188,8 +188,6 @@ __global__ void checkConstantValues(int* dev_equal,
// values in constant memory.
__host__ void DEM::checkConstantMemory()
{
-
-
// Allocate space in global device memory
Grid* dev_grid;
Params* dev_params;
@@ -217,7 +215,6 @@ __host__ void DEM::checkConstantMemory()
cudaFree(dev_params);
cudaFree(dev_equal);
-
// Are the values equal?
if (*equal != 0) {
std::cerr << "Error! The values in constant memory do not "
@@ -238,25 +235,24 @@ __host__ void DEM::transferToConstantDeviceMemory()
if (verbose == 1)
cout << " Transfering data to constant device memory: ";
- /*// Reference by string deprecated in cuda 5.0
- cudaMemcpyToSymbol("devC_nd", &nd, sizeof(nd));
- cudaMemcpyToSymbol("devC_np", &np, sizeof(np));
- cudaMemcpyToSymbol("devC_nw", &walls.nw, sizeof(unsigned int));
- cudaMemcpyToSymbol("devC_nc", &NC, sizeof(int));
- cudaMemcpyToSymbol("devC_dt", &time.dt, sizeof(Float));*/
- cudaMemcpyToSymbol(devC_nd, &nd, sizeof(nd));
- cudaMemcpyToSymbol(devC_np, &np, sizeof(np));
- cudaMemcpyToSymbol(devC_nw, &walls.nw, sizeof(unsigned int));
- cudaMemcpyToSymbol(devC_nc, &NC, sizeof(int));
- cudaMemcpyToSymbol(devC_dt, &time.dt, sizeof(Float));
- cudaMemcpyToSymbol(devC_grid, &grid, sizeof(Grid));
- cudaMemcpyToSymbol(devC_params, &params, sizeof(Params));
+ for (int d=0; d<ndevices; d++) {
+ cudaSetDevice(d);
+ cudaMemcpyToSymbol(devC_nd, &nd, sizeof(nd));
+ cudaMemcpyToSymbol(devC_np, &np, sizeof(np));
+ cudaMemcpyToSymbol(devC_nw, &walls.nw, sizeof(unsigned int));
+ cudaMemcpyToSymbol(devC_nc, &NC, sizeof(int));
+ cudaMemcpyToSymbol(devC_dt, &time.dt, sizeof(Float));
+ cudaMemcpyToSymbol(devC_grid, &grid, sizeof(Grid));
+ cudaMemcpyToSymbol(devC_params, &params, sizeof(Params));
+ }
+ cudaSetDevice(device);
checkForCudaErrors("After transferring to device constant memory");
if (verbose == 1)
cout << "Done\n";
+ // only for device with most CUDA cores
checkConstantMemory();
}
@@ -295,7 +291,7 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
// Particle contact bookkeeping arrays
cudaMalloc((void**)&dev_contacts,
- sizeof(unsigned int)*np*NC); // Max NC contacts per particle
+ sizeof(unsigned int)*np*NC);
cudaMalloc((void**)&dev_distmod, memSizeF4*NC);
cudaMalloc((void**)&dev_delta_t, memSizeF4*NC);
cudaMalloc((void**)&dev_bonds, sizeof(uint2)*params.nb0);
@@ -317,10 +313,10 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
// Cell-related arrays
cudaMalloc((void**)&dev_gridParticleCellID, sizeof(unsigned int)*np);
cudaMalloc((void**)&dev_gridParticleIndex, sizeof(unsigned int)*np);
- cudaMalloc((void**)&dev_cellStart,
- sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
- cudaMalloc((void**)&dev_cellEnd,
- sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
+ cudaMalloc((void**)&dev_cellStart, sizeof(unsigned int)
+ *grid.num[0]*grid.num[1]*grid.num[2]);
+ cudaMalloc((void**)&dev_cellEnd, sizeof(unsigned int)
+ *grid.num[0]*grid.num[1]*grid.num[2]);
// Host contact bookkeeping arrays
k.contacts = new unsigned int[np*NC];
@@ -348,6 +344,7 @@ __host__ void DEM::freeGlobalDeviceMemory()
{
if (verbose == 1)
printf("\nFreeing device memory: ");
+
// Particle arrays
cudaFree(dev_x);
cudaFree(dev_xyzsum);
@@ -416,59 +413,59 @@ __host__ void DEM::transferToGlobalDeviceMemory(int statusmsg)
// Kinematic particle values
cudaMemcpy( dev_x, k.x,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_xyzsum, k.xyzsum,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_vel, k.vel,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_vel0, k.vel,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_acc, k.acc,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_force, k.force,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_angpos, k.angpos,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_angvel, k.angvel,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_angvel0, k.angvel,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_angacc, k.angacc,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_torque, k.torque,
- memSizeF4, cudaMemcpyHostToDevice);
+ memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy( dev_contacts, k.contacts,
- sizeof(unsigned int)*np*NC, cudaMemcpyHostToDevice);
+ sizeof(unsigned int)*np*NC, cudaMemcpyHostToDevice);
cudaMemcpy( dev_distmod, k.distmod,
- memSizeF4*NC, cudaMemcpyHostToDevice);
+ memSizeF4*NC, cudaMemcpyHostToDevice);
cudaMemcpy( dev_delta_t, k.delta_t,
- memSizeF4*NC, cudaMemcpyHostToDevice);
+ memSizeF4*NC, cudaMemcpyHostToDevice);
cudaMemcpy( dev_bonds, k.bonds,
- sizeof(uint2)*params.nb0, cudaMemcpyHostToDevice);
+ sizeof(uint2)*params.nb0, cudaMemcpyHostToDevice);
cudaMemcpy( dev_bonds_delta, k.bonds_delta,
- sizeof(Float4)*params.nb0, cudaMemcpyHostToDevice);
+ sizeof(Float4)*params.nb0, cudaMemcpyHostToDevice);
cudaMemcpy( dev_bonds_omega, k.bonds_omega,
- sizeof(Float4)*params.nb0, cudaMemcpyHostToDevice);
+ sizeof(Float4)*params.nb0, cudaMemcpyHostToDevice);
// Individual particle energy values
cudaMemcpy( dev_es_dot, e.es_dot,
- memSizeF, cudaMemcpyHostToDevice);
+ memSizeF, cudaMemcpyHostToDevice);
cudaMemcpy( dev_es, e.es,
- memSizeF, cudaMemcpyHostToDevice);
+ memSizeF, cudaMemcpyHostToDevice);
cudaMemcpy( dev_ev_dot, e.ev_dot,
- memSizeF, cudaMemcpyHostToDevice);
+ memSizeF, cudaMemcpyHostToDevice);
cudaMemcpy( dev_ev, e.ev,
- memSizeF, cudaMemcpyHostToDevice);
+ memSizeF, cudaMemcpyHostToDevice);
cudaMemcpy( dev_p, e.p,
- memSizeF, cudaMemcpyHostToDevice);
+ memSizeF, cudaMemcpyHostToDevice);
// Wall parameters
cudaMemcpy( dev_walls_wmode, walls.wmode,
- sizeof(int)*walls.nw, cudaMemcpyHostToDevice);
+ sizeof(int)*walls.nw, cudaMemcpyHostToDevice);
cudaMemcpy( dev_walls_nx, walls.nx,
- sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
+ sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
cudaMemcpy( dev_walls_mvfd, walls.mvfd,
- sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
+ sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
// Fluid arrays
if (navierstokes == 1) {
diff --git a/src/sphere.h b/src/sphere.h
@@ -53,6 +53,9 @@ class DEM {
unsigned int width;
unsigned int height;
+ int ndevices; // number of CUDA GPUs
+ int device; // primary GPU
+
// DEVICE ARRAYS
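---
Note: __constant__ symbols live in per-device memory, so a value written with
cudaMemcpyToSymbol() is only visible on the GPU that is current at the time of
the call. This is why the commit makes DEM::transferToConstantDeviceMemory()
loop over all GPUs, select each one with cudaSetDevice(), repeat the symbol
copies, and finally switch back to the primary device. Below is a minimal
standalone sketch of that pattern; it is not part of the sphere sources, and
the names devC_dt and checkConst are illustrative only.

#include <cstdio>
#include <cuda_runtime.h>

__constant__ double devC_dt;      // per-device constant: must be set on every GPU

__global__ void checkConst(double* out)
{
    *out = devC_dt;               // read the constant back for verification
}

int main()
{
    int ndevices = 0;
    cudaGetDeviceCount(&ndevices);

    const double dt = 1.0e-6;

    // Broadcast the value to the constant memory of every device,
    // analogous to the loop added in transferToConstantDeviceMemory().
    for (int d = 0; d < ndevices; ++d) {
        cudaSetDevice(d);
        cudaMemcpyToSymbol(devC_dt, &dt, sizeof(dt));
    }

    // Verify on each device that its copy of the symbol was written.
    for (int d = 0; d < ndevices; ++d) {
        cudaSetDevice(d);
        double* dev_out;
        double out = 0.0;
        cudaMalloc((void**)&dev_out, sizeof(double));
        checkConst<<<1,1>>>(dev_out);
        cudaMemcpy(&out, dev_out, sizeof(double), cudaMemcpyDeviceToHost);
        cudaFree(dev_out);
        printf("device %d: devC_dt = %g\n", d, out);
    }

    cudaSetDevice(0);             // return to the primary device
    return 0;
}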