Further work on implementing OO - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
git clone git://src.adamsgaard.dk/sphere
---
commit 287f0082450f6709173703f7a43f708676d4f29e
parent bd8883f07cd1e54218c7cde17e6376d52918727c
Author: Anders Damsgaard Christensen <adc@geo.au.dk>
Date: Fri, 26 Oct 2012 23:05:19 +0200
Further work on implementing OO
Diffstat:
M src/constants.cuh | 1 +
M src/contactsearch.cuh | 20 +++++++++-----------
M src/datatypes.h | 31 ++++++++++++++++++++++++-------
M src/device.cu | 512 ++++++++++++++-----------------
M src/integration.cuh | 4 ++--
M src/sorting.cuh | 19 ++++++++-----------
M src/sphere.h | 31 ++++++++++++++++---------------
7 files changed, 297 insertions(+), 321 deletions(-)
---
diff --git a/src/constants.cuh b/src/constants.cuh
@@ -10,6 +10,7 @@
__constant__ unsigned int devC_nd; // Number of dimensions
__constant__ unsigned int devC_np; // Number of particles
__constant__ int devC_nc; // Max. number of contacts a particle can have
+__constant__ Float devC_dt; // Computational time step length
// Device constant memory structures
__constant__ Params devC_params;
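
The new devC_dt constant exposes the time step to every kernel without a
devC_params dereference. A minimal sketch of how a kernel can consume it
(the kernel itself is hypothetical, not part of this commit):

// Hypothetical sketch: advance positions one step using the time step
// broadcast from constant memory (devC_dt) and the particle count
// (devC_np) declared above.
__global__ void advancePositions(Float4* dev_x, const Float4* dev_vel)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < devC_np) {
        Float4 x = dev_x[idx];
        const Float4 v = dev_vel[idx];
        x.x += v.x * devC_dt;   // constant memory reads are cached
        x.y += v.y * devC_dt;
        x.z += v.z * devC_dt;
        dev_x[idx] = x;         // x.w (the radius) is left untouched
    }
}
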
diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
@@ -87,7 +87,6 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
Float* es_dot, Float* ev_dot,
Float* p,
Float4* dev_x_sorted,
- Float* dev_radius_sorted,
Float4* dev_vel_sorted,
Float4* dev_angvel_sorted,
unsigned int* dev_cellStart,
@@ -125,7 +124,7 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
// Fetch position and velocity of particle B.
Float4 x_b = dev_x_sorted[idx_b];
- Float radius_b = dev_radius_sorted[idx_b];
+ Float radius_b = x_b.w;
Float kappa = devC_params.kappa;
// Distance between particle centers (Float4 -> Float3)
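
With the radius riding in w, the center-to-center vector needs only the
x, y, z components of the two position records. A sketch of the
conversion the comment above refers to (MAKE_FLOAT3 is assumed to be a
constructor macro analogous to the MAKE_FLOAT4 used elsewhere in sphere):

// Sketch: inter-particle vector from two position+radius records.
// The w components (the radii) drop out of the difference.
Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
                          x_a.y - x_b.y,
                          x_a.z - x_b.z);
Float dist = sqrt(x_ab.x*x_ab.x + x_ab.y*x_ab.y + x_ab.z*x_ab.z);
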
@@ -183,7 +182,6 @@ __device__ void findContactsInCell(int3 targetCell,
unsigned int idx_a,
Float4 x_a, Float radius_a,
Float4* dev_x_sorted,
- Float* dev_radius_sorted,
unsigned int* dev_cellStart,
unsigned int* dev_cellEnd,
unsigned int* dev_gridParticleIndex,
@@ -226,7 +224,7 @@ __device__ void findContactsInCell(int3 targetCell,
// Fetch position and radius of particle B.
Float4 x_b = dev_x_sorted[idx_b];
- Float radius_b = dev_radius_sorted[idx_b];
+ Float radius_b = x_b.w;
// Read the original index of particle B
unsigned int idx_b_orig = dev_gridParticleIndex[idx_b];
@@ -316,7 +314,7 @@ __device__ void findContactsInCell(int3 targetCell,
__global__ void topology(unsigned int* dev_cellStart,
unsigned int* dev_cellEnd, // Input: Particles in cell
unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
- Float4* dev_x_sorted, Float* dev_radius_sorted,
+ Float4* dev_x_sorted,
unsigned int* dev_contacts,
Float4* dev_distmod)
{
@@ -325,7 +323,7 @@ __global__ void topology(unsigned int* dev_cellStart,
if (idx_a < devC_params.np) {
// Fetch particle data in global read
Float4 x_a = dev_x_sorted[idx_a];
- Float radius_a = dev_radius_sorted[idx_a];
+ Float radius_a = x_a.w;
// Count the number of contacts in this time step
int nc = 0;
@@ -346,7 +344,7 @@ __global__ void topology(unsigned int* dev_cellStart,
for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
findContactsInCell(targetPos, idx_a, x_a, radius_a,
- dev_x_sorted, dev_radius_sorted,
+ dev_x_sorted,
dev_cellStart, dev_cellEnd,
dev_gridParticleIndex,
&nc, dev_contacts, dev_distmod);
@@ -371,8 +369,8 @@ __global__ void topology(unsigned int* dev_cellStart,
__global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
unsigned int* dev_cellStart,
unsigned int* dev_cellEnd,
- Float4* dev_x, Float* dev_radius,
- Float4* dev_x_sorted, Float* dev_radius_sorted,
+ Float4* dev_x,
+ Float4* dev_x_sorted,
Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
Float4* dev_vel, Float4* dev_angvel,
Float4* dev_force, Float4* dev_torque,
@@ -395,7 +393,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
// Fetch particle data in global read
unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
Float4 x_a = dev_x_sorted[idx_a];
- Float radius_a = dev_radius_sorted[idx_a];
+ Float radius_a = x_a.w;
// Fetch wall data in global read
Float4 w_up_nx = dev_w_nx[0];
@@ -521,7 +519,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
findAndProcessContactsInCell(targetPos, idx_a, x_a, radius_a,
&F, &T, &es_dot, &ev_dot, &p,
- dev_x_sorted, dev_radius_sorted,
+ dev_x_sorted,
dev_vel_sorted, dev_angvel_sorted,
dev_cellStart, dev_cellEnd,
dev_w_nx, dev_w_mvfd);
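
Every call site in this file now follows the same pattern: one coalesced
Float4 read supplies both position and radius, where the old code issued
an extra Float read per neighbor. In sketch form, the fetch and overlap
test become (dist as computed in the distance sketch above):

// Sketch: single-read fetch plus sphere overlap test.
Float4 x_b = dev_x_sorted[idx_b];   // position (x,y,z) and radius (w)
Float radius_b = x_b.w;             // radius travels with the position
Float delta = dist - (radius_a + radius_b);
if (delta < 0.0) {
    // negative overlap means the spheres are in contact
}
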
diff --git a/src/datatypes.h b/src/datatypes.h
@@ -15,13 +15,18 @@
// Structure containing kinematic particle values
struct Kinematics {
- Float4 *x; // Positions + radii (w)
- Float2 *xysum; // Horizontal distance traveled
- Float4 *vel; // Translational velocities + fixvels (w)
- Float4 *force; // Sums of forces
- Float4 *angpos; // Angular positions
- Float4 *angvel; // Angular velocities
- Float4 *torque; // Sums of torques
+ Float4 *x; // Positions + radii (w)
+ Float2 *xysum; // Horizontal distance traveled
+ Float4 *vel; // Translational velocities + fixvels (w)
+ Float4 *acc; // Translational accelerations
+ Float4 *force; // Sums of forces
+ Float4 *angpos; // Angular positions
+ Float4 *angvel; // Angular velocities
+ Float4 *angacc; // Angular accelerations
+ Float4 *torque; // Sums of torques
+ unsigned int *contacts; // List of contacts per particle
+ Float4 *distmod; // Distance modifiers for contacts across periodic boundaries
+ Float4 *delta_t; // Accumulated shear distance of contacts
};
// Structure containing individual physical particle parameters
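
Each pointer in Kinematics now exists in two instances: a host struct (k)
whose members point at heap arrays, and a device mirror (dev_k) whose
members point at cudaMalloc'ed arrays. A minimal sketch of the pairing,
assuming np particles (compare allocateGlobalDeviceMemory() below):

// Sketch: one struct definition, two mirrored allocations. Both struct
// instances live in host memory; only the arrays that dev_k's members
// point to reside on the device.
Kinematics k, dev_k;
k.x = new Float4[np];                              // host array
cudaMalloc((void**)&dev_k.x, sizeof(Float4)*np);   // device array
cudaMemcpy(dev_k.x, k.x, sizeof(Float4)*np,
           cudaMemcpyHostToDevice);                // member-wise copy
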
@@ -42,6 +47,17 @@ struct Grid {
int periodic; // Behavior of boundaries at 1st and 2nd world edge
};
+struct Sorting {
+ Float4 *x_sorted; // Positions + radii (w) (sorted)
+ Float4 *vel_sorted; // Translational velocities + fixvels (w) (sorted)
+ Float4 *angvel_sorted; // Angular velocities (sorted)
+ unsigned int *gridParticleCellID; // Hash key (cell index) from position in grid
+ unsigned int *gridParticleIndex; // Original indexes of particles
+ unsigned int *cellStart; // First index of sorted particle data in each cell
+ unsigned int *cellEnd; // Last index of sorted particle data in each cell
+};
+
+
// Structure containing time parameters
struct Time {
Float dt; // Computational time step length
@@ -76,6 +92,7 @@ struct Walls {
int wmode[MAXWALLS]; // Wall modes
Float4* nx; // Wall normal and position
Float4* mvfd; // Wall mass, velocity, force and dev. stress
+ Float* force; // Resulting forces on walls per particle
Float gamma_wn; // Wall normal viscosity
Float gamma_wt; // Wall tangential viscosity
Float gamma_wr; // Wall rolling viscosity
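
The Sorting members implement a standard uniform-grid neighbor search:
after the radix sort, each grid cell's particles occupy a contiguous run
of the sorted arrays, delimited by cellStart and cellEnd. A sketch of the
lookup (hash is the flat cell index; 0xffffffff marks an empty cell,
matching the cudaMemset in device.cu below; cellEnd is assumed exclusive,
one past the last index, per the common CUDA uniform-grid convention):

// Sketch: visit every particle sorted into one grid cell.
unsigned int startIdx = dev_cellStart[hash];
if (startIdx != 0xffffffff) {              // cell contains particles
    unsigned int endIdx = dev_cellEnd[hash];
    for (unsigned int i = startIdx; i < endIdx; ++i) {
        Float4 x_b = dev_x_sorted[i];      // neighbor candidate
        // ... narrow-phase contact test ...
    }
}
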
diff --git a/src/device.cu b/src/device.cu
@@ -197,6 +197,7 @@ __host__ void DEM::transferToConstantDeviceMemory()
cudaMemcpyToSymbol("devC_nd", &nd, sizeof(nd));
cudaMemcpyToSymbol("devC_np", &np, sizeof(np));
cudaMemcpyToSymbol("devC_nc", &NC, sizeof(int));
+ cudaMemcpyToSymbol("devC_dt", &time.dt, sizeof(Float));
cudaMemcpyToSymbol(devC_grid, grid, sizeof(Grid));
cudaMemcpyToSymbol(devC_params, params, sizeof(Params));
@@ -205,149 +206,178 @@ __host__ void DEM::transferToConstantDeviceMemory()
if (verbose == 1)
cout << "Done\n";
- checkConstantMemory();
+ //checkConstantMemory();
}
-//extern "C"
-__host__ void DEM::startTime()
+// Allocate device memory for particle variables,
+// tied to previously declared pointers in structures
+__host__ void DEM::allocateGlobalDeviceMemory(void)
{
+ // Particle memory size
+ unsigned int memSizeF = sizeof(Float) * np;
+ unsigned int memSizeF4 = sizeof(Float4) * np;
- using std::cout; // Namespace directive
+ if (verbose == 1)
+ std::cout << " Allocating device memory: ";
- // Copy data to constant global device memory
- transferToConstantDeviceMemory();
+ // Particle arrays
+ cudaMalloc((void**)&dev_k.x, memSizeF4);
+ cudaMalloc((void**)&dev_sort.x_sorted, memSizeF4);
+ cudaMalloc((void**)&dev_k.vel, memSizeF4);
+ cudaMalloc((void**)&dev_sort.vel_sorted, memSizeF4);
+ cudaMalloc((void**)&dev_k.angvel, memSizeF4);
+ cudaMalloc((void**)&dev_sort.angvel_sorted, memSizeF4);
+ cudaMalloc((void**)&dev_k.acc, memSizeF4);
+ k.acc = new Float4[np];
+ cudaMalloc((void**)&dev_k.angacc, memSizeF4);
+ k.angacc = new Float4[np];
+ cudaMalloc((void**)&dev_k.force, memSizeF4);
+ cudaMalloc((void**)&dev_k.torque, memSizeF4);
+ cudaMalloc((void**)&dev_k.angpos, memSizeF4);
+ cudaMalloc((void**)&dev_e.es_dot, memSizeF);
+ cudaMalloc((void**)&dev_e.ev_dot, memSizeF);
+ cudaMalloc((void**)&dev_e.es, memSizeF);
+ cudaMalloc((void**)&dev_e.ev, memSizeF);
+ cudaMalloc((void**)&dev_e.p, memSizeF);
- Float* dev_w_force; // Resulting force on wall per particle
- Float* dev_w_force_partial; // Partial sum from block of threads
-
- // Memory for sorted particle data
- Float4* dev_x_sorted;
- Float4* dev_vel_sorted;
- Float4* dev_angvel_sorted;
- Float* dev_radius_sorted;
-
- // Grid-particle array pointers
- unsigned int* dev_gridParticleCellID;
- unsigned int* dev_gridParticleIndex;
- unsigned int* dev_cellStart;
- unsigned int* dev_cellEnd;
-
- // Particle contact bookkeeping
- unsigned int* dev_contacts;
- unsigned int* host_contacts = new unsigned int[p.np*NC];
- // Particle pair distance correction across periodic boundaries
- Float4* dev_distmod;
- Float4* host_distmod = new Float4[p.np*NC];
- // x,y,z contains the interparticle vector, corrected if contact
- // is across a periodic boundary.
- Float4* dev_delta_t; // Accumulated shear distance of contact
- Float4* host_delta_t = new Float4[p.np*NC];
+ // Cell-related arrays
+ cudaMalloc((void**)&dev_sort.gridParticleCellID, sizeof(unsigned int)*np);
+ cudaMalloc((void**)&dev_sort.gridParticleIndex, sizeof(unsigned int)*np);
+ cudaMalloc((void**)&dev_sort.cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
+ cudaMalloc((void**)&dev_sort.cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
- // Particle memory size
- unsigned int memSizeF = sizeof(Float) * p.np;
- unsigned int memSizeF4 = sizeof(Float4) * p.np;
+ // Particle contact bookkeeping arrays
+ cudaMalloc((void**)&dev_k.contacts, sizeof(unsigned int)*np*NC); // Max NC contacts per particle
+ cudaMalloc((void**)&dev_k.distmod, sizeof(Float4)*np*NC);
+ cudaMalloc((void**)&dev_k.delta_t, sizeof(Float4)*np*NC);
+
+ // Host contact bookkeeping arrays
+ k.contacts = new unsigned int[np*NC];
+ // Initialize contact lists to np, which marks an unused contact slot
+ for (unsigned int i=0; i<(np*NC); ++i)
+ k.contacts[i] = np;
+ k.distmod = new Float4[np*NC];
+ k.delta_t = new Float4[np*NC];
- // Allocate device memory for particle variables,
- // tied to previously declared pointers in structures
- allocateGlobalDeviceMemory();
- cout << " Allocating device memory: ";
+ // Wall arrays
+ cudaMalloc((void**)&dev_walls.nx, sizeof(Float4)*walls.nw);
+ cudaMalloc((void**)&dev_walls.mvfd, sizeof(Float4)*walls.nw);
+ cudaMalloc((void**)&dev_walls.force, sizeof(Float)*walls.nw*np);
+ // dev_w_force_partial allocated later
+ checkForCudaErrors("End of allocateGlobalDeviceMemory");
+ if (verbose == 1)
+ std::cout << "Done\n";
+}
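
Note that dev_k, dev_e, dev_sort and dev_walls are host-resident structs
holding device addresses, so kernels receive the members by value rather
than a pointer to the struct. A sketch of the idiom with a hypothetical
helper kernel:

// Sketch: pass struct members (device pointers) to kernels by value.
__global__ void clearForces(Float4* dev_force)
{
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < devC_np)
        dev_force[idx] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
}
// launch site: clearForces<<<dimGrid, dimBlock>>>(dev_k.force);
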
+
+__host__ void DEM::freeGlobalDeviceMemory()
+{
+ if (verbose == 1)
+ printf("\nLiberating device memory: ");
// Particle arrays
- cudaMalloc((void**)&dev_x, memSizeF4);
- cudaMalloc((void**)&dev_x_sorted, memSizeF4);
- cudaMalloc((void**)&dev_vel, memSizeF4);
- cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
- cudaMalloc((void**)&dev_angvel, memSizeF4);
- cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
- cudaMalloc((void**)&dev_acc, memSizeF4);
- cudaMalloc((void**)&dev_angacc, memSizeF4);
- cudaMalloc((void**)&dev_force, memSizeF4);
- cudaMalloc((void**)&dev_torque, memSizeF4);
- cudaMalloc((void**)&dev_angpos, memSizeF4);
- cudaMalloc((void**)&dev_radius, memSizeF);
- cudaMalloc((void**)&dev_radius_sorted, memSizeF);
- cudaMalloc((void**)&dev_es_dot, memSizeF);
- cudaMalloc((void**)&dev_ev_dot, memSizeF);
- cudaMalloc((void**)&dev_es, memSizeF);
- cudaMalloc((void**)&dev_ev, memSizeF);
- cudaMalloc((void**)&dev_p, memSizeF);
- //cudaMalloc((void**)&dev_bonds, sizeof(uint4) * p.np);
- //cudaMalloc((void**)&dev_bonds_sorted, sizeof(uint4) * p.np);
+ cudaFree(dev_k.x);
+ cudaFree(dev_sort.x_sorted);
+ cudaFree(dev_k.vel);
+ cudaFree(dev_sort.vel_sorted);
+ cudaFree(dev_k.angvel);
+ cudaFree(dev_sort.angvel_sorted);
+ cudaFree(dev_k.acc);
+ cudaFree(dev_k.angacc);
+ cudaFree(dev_k.force);
+ cudaFree(dev_k.torque);
+ cudaFree(dev_k.angpos);
+ cudaFree(dev_e.es_dot);
+ cudaFree(dev_e.ev_dot);
+ cudaFree(dev_e.es);
+ cudaFree(dev_e.ev);
+ cudaFree(dev_e.p);
+ cudaFree(dev_k.contacts);
+ cudaFree(dev_k.distmod);
+ cudaFree(dev_k.delta_t);
// Cell-related arrays
- cudaMalloc((void**)&dev_gridParticleCellID, sizeof(unsigned int)*p.np);
- cudaMalloc((void**)&dev_gridParticleIndex, sizeof(unsigned int)*p.np);
- cudaMalloc((void**)&dev_cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
- cudaMalloc((void**)&dev_cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
-
- // Particle contact bookkeeping arrays
- cudaMalloc((void**)&dev_contacts, sizeof(unsigned int)*p.np*NC); // Max NC contacts per particle
- cudaMalloc((void**)&dev_distmod, sizeof(Float4)*p.np*NC);
- cudaMalloc((void**)&dev_delta_t, sizeof(Float4)*p.np*NC);
+ cudaFree(dev_sort.gridParticleIndex);
+ cudaFree(dev_sort.cellStart);
+ cudaFree(dev_sort.cellEnd);
// Wall arrays
- cudaMalloc((void**)&dev_w_nx, sizeof(Float)*params.nw*4);
- cudaMalloc((void**)&dev_w_mvfd, sizeof(Float)*params.nw*4);
- cudaMalloc((void**)&dev_w_force, sizeof(Float)*params.nw*p.np);
+ cudaFree(dev_walls.nx);
+ cudaFree(dev_walls.mvfd);
+ cudaFree(dev_walls.force);
+ cudaFree(dev_w_force_partial);
+
+ if (verbose == 1)
+ printf("Done\n");
+}
- checkForCudaErrors("Post device memory allocation");
- cout << "Done\n";
- // Transfer data from host to gpu device memory
+__host__ void DEM::transferToGlobalDeviceMemory()
+{
+ if (verbose == 1)
+ std::cout << " Transferring data to the device: ";
+
+ // Copy structure data from host to global device memory
+ /*cudaMemcpy(dev_k, k, sizeof(Kinematics), cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_e, e, sizeof(Energies), cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_time, time, sizeof(Time), cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_walls, walls, sizeof(Walls), cudaMemcpyHostToDevice);*/
+ cudaMemcpy(dev_k, k, sizeof(k), cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_e, e, sizeof(e), cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_time, time, sizeof(time), cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_walls, walls, sizeof(walls), cudaMemcpyHostToDevice);
+
+ checkForCudaErrors("End of transferToGlobalDeviceMemory");
+ if (verbose == 1)
+ std::cout << "Done\n";
+}
+
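
A caveat on the struct-level copies above: cudaMemcpy on the structs
moves only the pointer members themselves, not the arrays they reference
(and sizeof(k) is the size of the pointer struct, not of the particle
data). The member-wise form this refactoring is presumably heading
toward would look like:

// Sketch: member-wise host -> device transfer; each call copies array
// contents rather than pointer values.
unsigned int memSizeF4 = sizeof(Float4) * np;
cudaMemcpy(dev_k.x,      k.x,      memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy(dev_k.vel,    k.vel,    memSizeF4, cudaMemcpyHostToDevice);
cudaMemcpy(dev_k.angvel, k.angvel, memSizeF4, cudaMemcpyHostToDevice);
// ... remaining members follow the same pattern.
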
+__host__ void DEM::transferFromGlobalDeviceMemory()
+{
cout << " Transfering data to the device: ";
- // Particle data
- cudaMemcpy(dev_x, host_x, memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_vel, host_vel, memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_acc, host_acc, memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_angvel, host_angvel, memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_angacc, host_angacc, memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_force, host_force, memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_torque, host_torque, memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_angpos, host_angpos, memSizeF4, cudaMemcpyHostToDevice);
- //cudaMemcpy(dev_bonds, host_bonds, sizeof(uint4) * p.np, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_radius, p.radius, memSizeF, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_es_dot, p.es_dot, memSizeF, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_ev_dot, p.ev_dot, memSizeF, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_es, p.es, memSizeF, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_ev, p.ev, memSizeF, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_p, p.p, memSizeF, cudaMemcpyHostToDevice);
-
- // Wall data (wall mass and number in constant memory)
- cudaMemcpy(dev_w_nx, host_w_nx, sizeof(Float)*params.nw*4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(Float)*params.nw*4, cudaMemcpyHostToDevice);
-
- // Initialize contacts lists to p.np
- unsigned int* npu = new unsigned int[p.np*NC];
- for (unsigned int i=0; i<(p.np*NC); ++i)
- npu[i] = p.np;
- cudaMemcpy(dev_contacts, npu, sizeof(unsigned int)*p.np*NC, cudaMemcpyHostToDevice);
- delete[] npu;
-
- // Create array of 0.0 values on the host and transfer these to the distance
- // modifier and shear displacement arrays
- Float4* zerosF4 = new Float4[p.np*NC];
- for (unsigned int i=0; i<(p.np*NC); ++i)
- zerosF4[i] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
- cudaMemcpy(dev_distmod, zerosF4, sizeof(Float4)*p.np*NC, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_delta_t, zerosF4, sizeof(Float4)*p.np*NC, cudaMemcpyHostToDevice);
- delete[] zerosF4;
-
- checkForCudaErrors("Post memcopy");
- cout << "Done\n";
+ // Copy structure data from global device memory to the host
+ cudaMemcpy(k, dev_k, sizeof(k), cudaMemcpyDeviceToHost);
+ cudaMemcpy(e, dev_e, sizeof(e), cudaMemcpyDeviceToHost);
+ cudaMemcpy(time, dev_time, sizeof(time), cudaMemcpyDeviceToHost);
+ cudaMemcpy(walls, dev_walls, sizeof(walls), cudaMemcpyDeviceToHost);
+
+ checkForCudaErrors("End of transferFromGlobalDeviceMemory");
+ if (verbose == 1)
+ std::cout << "Done\n";
+}
+
+
+// Iterate through time by explicit time integration
+__host__ void DEM::startTime()
+{
+
+ using std::cout; // Namespace directive
+ char file[200]; // Output filename
+ FILE *fp;
+
+ // Copy data to constant global device memory
+ transferToConstantDeviceMemory();
+
+
+ // Particle memory size
+ unsigned int memSizeF = sizeof(Float) * np;
+ unsigned int memSizeF4 = sizeof(Float4) * np;
+
+ // Allocate device memory for particle variables,
+ // tied to previously declared pointers in structures
+ allocateGlobalDeviceMemory();
+
+ // Transfer data from host to gpu device memory
+ transferToGlobalDeviceMemory();
// Synchronization point
cudaThreadSynchronize();
- checkForCudaErrors("Start of mainLoop()");
+ checkForCudaErrors("Start of startTime()");
// Model world variables
float tic, toc, filetimeclock, time_spent, dev_time_spent;
- // File output
- FILE* fp;
- char file[1000]; // Complete path+filename variable
-
// Start CPU clock
tic = clock();
@@ -360,22 +390,25 @@ __host__ void DEM::startTime()
// Shared memory per block
unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
+ Float* dev_w_force_partial;
cudaMalloc((void**)&dev_w_force_partial, sizeof(Float)*dimGrid.x);
// Report to stdout
- cout << "\n Device memory allocation and transfer complete.\n"
- << " - Blocks per grid: "
- << dimGrid.x << "*" << dimGrid.y << "*" << dimGrid.z << "\n"
- << " - Threads per block: "
- << dimBlock.x << "*" << dimBlock.y << "*" << dimBlock.z << "\n"
- << " - Shared memory required per block: " << smemSize << " bytes\n";
+ if (verbose == 1) {
+ cout << "\n Device memory allocation and transfer complete.\n"
+ << " - Blocks per grid: "
+ << dimGrid.x << "*" << dimGrid.y << "*" << dimGrid.z << "\n"
+ << " - Threads per block: "
+ << dimBlock.x << "*" << dimBlock.y << "*" << dimBlock.z << "\n"
+ << " - Shared memory required per block: " << smemSize << " bytes\n";
+ }
// Initialize counter variable values
filetimeclock = 0.0;
long iter = 0;
// Create first status.dat
- sprintf(file,"%s/output/%s.status.dat", cwd, inputbin);
+ sprintf(file,"output/%s.status.dat", inputbin);
fp = fopen(file, "w");
fprintf(fp,"%2.4e %2.4e %d\n",
time.current,
@@ -384,23 +417,17 @@ __host__ void DEM::startTime()
fclose(fp);
// Write first output data file: output0.bin, thus testing writing of bin files
- sprintf(file,"%s/output/%s.output0.bin", cwd, inputbin);
- if (fwritebin(file, &p, host_x, host_vel,
- host_angvel, host_force,
- host_torque, host_angpos,
- host_bonds,
- &grid, &time, ¶ms,
- host_w_nx, host_w_mvfd) != 0) {
- std::cerr << "\n Problem during fwritebin \n";
- exit(EXIT_FAILURE);
+ sprintf(file,"output/%s.output0.bin", inputbin);
+ writebin(file);
+
+ if (verbose == 1) {
+ cout << "\n Entering the main calculation time loop...\n\n"
+ << " IMPORTANT: Do not close this terminal, doing so will \n"
+ << " terminate this SPHERE process. Follow the \n"
+ << " progress by executing:\n"
+ << " $ ./sphere_status " << inputbin << "\n\n";
}
- cout << "\n Entering the main calculation time loop...\n\n"
- << " IMPORTANT: Do not close this terminal, doing so will \n"
- << " terminate this SPHERE process. Follow the \n"
- << " progress by executing:\n"
- << " $ ./sphere_status " << inputbin << "\n\n";
-
// Start GPU clock
cudaEvent_t dev_tic, dev_toc;
@@ -447,9 +474,9 @@ __host__ void DEM::startTime()
// in the fine, uniform and homogeneous grid.
if (PROFILING == 1)
startTimer(&kernel_tic);
- calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
- dev_gridParticleIndex,
- dev_x);
+ calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort.gridParticleCellID,
+ dev_sort.gridParticleIndex,
+ dev_k.x);
// Synchronization point
cudaThreadSynchronize();
@@ -461,19 +488,19 @@ __host__ void DEM::startTime()
// Sort particle (key, particle ID) pairs by hash key with Thrust radix sort
if (PROFILING == 1)
startTimer(&kernel_tic);
- thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID),
- thrust::device_ptr<uint>(dev_gridParticleCellID + p.np),
- thrust::device_ptr<uint>(dev_gridParticleIndex));
+ thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort.gridParticleCellID),
+ thrust::device_ptr<uint>(dev_sort.gridParticleCellID + np),
+ thrust::device_ptr<uint>(dev_sort.gridParticleIndex));
cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implicitly?
if (PROFILING == 1)
stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
- checkForCudaErrors("Post calcParticleCellID");
+ checkForCudaErrors("Post thrust::sort_by_key");
// Zero cell array values by setting cellStart to its highest possible value,
// specified with pointer value 0xffffffff, which for a 32 bit unsigned int
// is 4294967295.
- cudaMemset(dev_cellStart, 0xffffffff,
+ cudaMemset(dev_sort.cellStart, 0xffffffff,
grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
cudaThreadSynchronize();
checkForCudaErrors("Post cudaMemset");
@@ -482,18 +509,15 @@ __host__ void DEM::startTime()
// coherent memory access. Save ordered configurations in new arrays (*_sorted).
if (PROFILING == 1)
startTimer(&kernel_tic);
- reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
- dev_cellEnd,
- dev_gridParticleCellID,
- dev_gridParticleIndex,
- dev_x, dev_vel,
- dev_angvel, dev_radius,
- //dev_bonds,
- dev_x_sorted,
- dev_vel_sorted,
- dev_angvel_sorted,
- dev_radius_sorted);
- //dev_bonds_sorted);
+ reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_sort.cellStart,
+ dev_sort.cellEnd,
+ dev_sort.gridParticleCellID,
+ dev_sort.gridParticleIndex,
+ dev_k.x, dev_k.vel,
+ dev_k.angvel,
+ dev_sort.x_sorted,
+ dev_sort.vel_sorted,
+ dev_sort.angvel_sorted);
// Synchronization point
cudaThreadSynchronize();
@@ -504,17 +528,16 @@ __host__ void DEM::startTime()
// The contact search in topology() is only necessary for determining
// the accumulated shear distance needed in the linear elastic
// and nonlinear contact force model
- if (params.shearmodel == 2 || params.shearmodel == 3) {
+ if (params.contactmodel == 2 || params.contactmodel == 3) {
// For each particle: Search contacts in neighbor cells
if (PROFILING == 1)
startTimer(&kernel_tic);
- topology<<<dimGrid, dimBlock>>>(dev_cellStart,
- dev_cellEnd,
- dev_gridParticleIndex,
- dev_x_sorted,
- dev_radius_sorted,
- dev_contacts,
- dev_distmod);
+ topology<<<dimGrid, dimBlock>>>(dev_sort.cellStart,
+ dev_sort.cellEnd,
+ dev_sort.gridParticleIndex,
+ dev_sort.x_sorted,
+ dev_k.contacts,
+ dev_k.distmod);
// Synchronization point
@@ -528,21 +551,26 @@ __host__ void DEM::startTime()
// For each particle: Process collisions and compute resulting forces.
if (PROFILING == 1)
startTimer(&kernel_tic);
- interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
- dev_cellStart,
- dev_cellEnd,
- dev_x, dev_radius,
- dev_x_sorted, dev_radius_sorted,
- dev_vel_sorted, dev_angvel_sorted,
- dev_vel, dev_angvel,
- dev_force, dev_torque,
- dev_es_dot, dev_ev_dot,
- dev_es, dev_ev, dev_p,
- dev_w_nx, dev_w_mvfd, dev_w_force,
- //dev_bonds_sorted,
- dev_contacts,
- dev_distmod,
- dev_delta_t);
+ interact<<<dimGrid, dimBlock>>>(dev_sort.gridParticleIndex,
+ dev_sort.cellStart,
+ dev_sort.cellEnd,
+ dev_k.x,
+ dev_sort.x_sorted,
+ dev_sort.vel_sorted,
+ dev_sort.angvel_sorted,
+ dev_k.vel,
+ dev_k.angvel,
+ dev_k.force,
+ dev_k.torque,
+ dev_e.es_dot,
+ dev_e.ev_dot,
+ dev_e.es, dev_e.ev, dev_e.p,
+ dev_walls.nx,
+ dev_walls.mvfd,
+ dev_walls.force,
+ dev_k.contacts,
+ dev_k.distmod,
+ dev_k.delta_t);
// Synchronization point
@@ -554,11 +582,16 @@ __host__ void DEM::startTime()
// Update particle kinematics
if (PROFILING == 1)
startTimer(&kernel_tic);
- integrate<<<dimGrid, dimBlock>>>(dev_x_sorted, dev_vel_sorted,
- dev_angvel_sorted, dev_radius_sorted,
- dev_x, dev_vel, dev_angvel,
- dev_force, dev_torque, dev_angpos,
- dev_gridParticleIndex);
+ integrate<<<dimGrid, dimBlock>>>(dev_sort.x_sorted,
+ dev_sort.vel_sorted,
+ dev_sort.angvel_sorted,
+ dev_k.x,
+ dev_k.vel,
+ dev_k.angvel,
+ dev_k.force,
+ dev_k.torque,
+ dev_k.angpos,
+ dev_sort.gridParticleIndex);
cudaThreadSynchronize();
if (PROFILING == 1)
@@ -567,7 +600,7 @@ __host__ void DEM::startTime()
// Summation of forces on wall
if (PROFILING == 1)
startTimer(&kernel_tic);
- summation<<<dimGrid, dimBlock>>>(dev_w_force, dev_w_force_partial);
+ summation<<<dimGrid, dimBlock>>>(dev_walls.force, dev_w_force_partial);
// Synchronization point
cudaThreadSynchronize();
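
summation reduces the per-particle wall forces (walls.nw * np values) to
one partial sum per thread block; integrateWalls then folds the dimGrid.x
partials together. A sketch of a block-wise shared-memory reduction of
the kind summation() presumably performs (the kernel body is
hypothetical; it assumes blockDim.x == 256 and an input padded to a
multiple of the block size):

// Sketch: tree reduction in shared memory; each block emits one partial.
__global__ void blockSum(Float* dev_in, Float* dev_out)
{
    __shared__ Float sdata[256];       // assumes 256 threads per block
    unsigned int tid = threadIdx.x;
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = dev_in[idx];          // stage one value per thread
    __syncthreads();
    // Halve the number of active threads each pass
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        __syncthreads();
    }
    if (tid == 0)
        dev_out[blockIdx.x] = sdata[0];    // one partial sum per block
}
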
@@ -578,10 +611,10 @@ __host__ void DEM::startTime()
// Update wall kinematics
if (PROFILING == 1)
startTimer(&kernel_tic);
- integrateWalls<<< 1, params.nw>>>(dev_w_nx,
- dev_w_mvfd,
- dev_w_force_partial,
- blocksPerGrid);
+ integrateWalls<<< 1, walls.nw>>>(dev_walls.nx,
+ dev_walls.mvfd,
+ dev_w_force_partial,
+ blocksPerGrid);
// Synchronization point
cudaThreadSynchronize();
@@ -591,12 +624,14 @@ __host__ void DEM::startTime()
// Update timers and counters
- time.current += time.dt;
- filetimeclock += time.dt;
+ time.current += time.dt;
+ filetimeclock += time.dt;
// Report time to console
- cout << "\r Current simulation time: "
- << time.current << " s. ";// << std::flush;
+ if (verbose == 1) {
+ cout << "\r Current simulation time: "
+ << time.current << " s. ";// << std::flush;
+ }
// Produce output binary if the time interval
@@ -608,52 +643,16 @@ __host__ void DEM::startTime()
checkForCudaErrors("Beginning of file output section");
//// Copy device data to host memory
-
- // Particle data
- cudaMemcpy(host_x, dev_x, memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_vel, dev_vel, memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_acc, dev_acc, memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_angvel, dev_angvel, memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_force, dev_force, memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_torque, dev_torque, memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_angpos, dev_angpos, memSizeF4, cudaMemcpyDeviceToHost);
- //cudaMemcpy(host_bonds, dev_bonds, sizeof(uint4) * p.np, cudaMemcpyDeviceToHost);
- cudaMemcpy(p.es_dot, dev_es_dot, memSizeF, cudaMemcpyDeviceToHost);
- cudaMemcpy(p.ev_dot, dev_ev_dot, memSizeF, cudaMemcpyDeviceToHost);
- cudaMemcpy(p.es, dev_es, memSizeF, cudaMemcpyDeviceToHost);
- cudaMemcpy(p.ev, dev_ev, memSizeF, cudaMemcpyDeviceToHost);
- cudaMemcpy(p.p, dev_p, memSizeF, cudaMemcpyDeviceToHost);
-
- // Wall data
- cudaMemcpy(host_w_nx, dev_w_nx,
- sizeof(Float)*params.nw*4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_w_mvfd, dev_w_mvfd,
- sizeof(Float)*params.nw*4, cudaMemcpyDeviceToHost);
-
-
- // Contact information
- if (CONTACTINFO == 1) {
- cudaMemcpy(host_contacts, dev_contacts, sizeof(unsigned int)*p.np*NC, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_delta_t, dev_delta_t, memSizeF4*NC, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_distmod, dev_distmod, memSizeF4*NC, cudaMemcpyDeviceToHost);
- }
+ transferFromGlobalDeviceMemory();
// Pause the CPU thread until all CUDA calls previously issued are completed
cudaThreadSynchronize();
// Write binary output file
time.step_count += 1;
- sprintf(file,"%s/output/%s.output%d.bin", cwd, inputbin, time.step_count);
-
- if (fwritebin(file, &p, host_x, host_vel,
- host_angvel, host_force,
- host_torque, host_angpos,
- host_bonds,
- &grid, &time, ¶ms,
- host_w_nx, host_w_mvfd) != 0) {
- cout << "\n Error during fwritebin() in main loop\n";
- exit(EXIT_FAILURE);
- }
+ sprintf(file,"output/%s.output%d.bin", inputbin, time.step_count);
+ writebin(file);
+
if (CONTACTINFO == 1) {
// Write contact information to stdout
@@ -686,7 +685,7 @@ __host__ void DEM::startTime()
}
// Update status.dat at the interval of filetime
- sprintf(file,"%s/output/%s.status.dat", cwd, inputbin);
+ sprintf(file,"output/%s.status.dat", inputbin);
fp = fopen(file, "w");
fprintf(fp,"%2.4e %2.4e %d\n",
time.current,
@@ -747,48 +746,11 @@ __host__ void DEM::startTime()
// Free GPU device memory
- printf("\nLiberating device memory: ");
-
- // Particle arrays
- cudaFree(dev_x);
- cudaFree(dev_x_sorted);
- cudaFree(dev_vel);
- cudaFree(dev_vel_sorted);
- cudaFree(dev_angvel);
- cudaFree(dev_angvel_sorted);
- cudaFree(dev_acc);
- cudaFree(dev_angacc);
- cudaFree(dev_force);
- cudaFree(dev_torque);
- cudaFree(dev_angpos);
- cudaFree(dev_radius);
- cudaFree(dev_radius_sorted);
- cudaFree(dev_es_dot);
- cudaFree(dev_ev_dot);
- cudaFree(dev_es);
- cudaFree(dev_ev);
- cudaFree(dev_p);
- //cudaFree(dev_bonds);
- //cudaFree(dev_bonds_sorted);
- cudaFree(dev_contacts);
- cudaFree(dev_distmod);
- cudaFree(dev_delta_t);
-
- // Cell-related arrays
- cudaFree(dev_gridParticleIndex);
- cudaFree(dev_cellStart);
- cudaFree(dev_cellEnd);
-
- // Wall arrays
- cudaFree(dev_w_nx);
- cudaFree(dev_w_mvfd);
- cudaFree(dev_w_force);
- cudaFree(dev_w_force_partial);
+ freeGlobalDeviceMemory();
// Contact info arrays
delete[] host_contacts;
delete[] host_distmod;
delete[] host_delta_t;
- printf("Done\n");
} /* EOF */
diff --git a/src/integration.cuh b/src/integration.cuh
@@ -8,7 +8,7 @@
// Second order integration scheme based on Taylor expansion of particle kinematics.
// Kernel executed on device, and callable from host only.
__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
- Float4* dev_angvel_sorted, Float* dev_radius_sorted, // Input
+ Float4* dev_angvel_sorted,
Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output
Float4* dev_force, Float4* dev_torque, Float4* dev_angpos, // Input
unsigned int* dev_gridParticleIndex) // Input: Sorted-Unsorted key
@@ -32,7 +32,7 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
Float4 x = dev_x_sorted[idx];
Float4 vel = dev_vel_sorted[idx];
Float4 angvel = dev_angvel_sorted[idx];
- Float radius = dev_radius_sorted[idx];
+ Float radius = x.w;
// Coherent read from constant memory to registers
Float dt = devC_params.dt;
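
For reference, a second-order Taylor update of the kind the header
comment describes looks like the following sketch. The angular terms are
elided; m (particle mass) and orig_idx (the unsorted index read from
dev_gridParticleIndex) are assumptions, and MAKE_FLOAT3 is assumed
analogous to the MAKE_FLOAT4 constructor used elsewhere:

// Sketch: second-order (Taylor) kinematic update. The radius stays
// packed in x.w and passes through the integration untouched.
Float4 force = dev_force[orig_idx];
Float3 acc = MAKE_FLOAT3(force.x / m, force.y / m, force.z / m);
x.x += vel.x * dt + 0.5 * acc.x * dt * dt;
x.y += vel.y * dt + 0.5 * acc.y * dt * dt;
x.z += vel.z * dt + 0.5 * acc.z * dt * dt;
vel.x += acc.x * dt;
vel.y += acc.y * dt;
vel.z += acc.z * dt;
dev_x[orig_idx] = x;        // write back in unsorted order
dev_vel[orig_idx] = vel;
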
diff --git a/src/sorting.cuh b/src/sorting.cuh
@@ -43,15 +43,16 @@ __global__ void calcParticleCellID(unsigned int* dev_gridParticleCellID,
// Reorder particle data into sorted order, and find the start and end particle indexes
// of each cell in the sorted hash array.
// Kernel executed on device, and callable from host only.
-__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_cellEnd,
+__global__ void reorderArrays(unsigned int* dev_cellStart,
+ unsigned int* dev_cellEnd,
unsigned int* dev_gridParticleCellID,
unsigned int* dev_gridParticleIndex,
- Float4* dev_x, Float4* dev_vel,
- Float4* dev_angvel, Float* dev_radius,
- //uint4* dev_bonds,
- Float4* dev_x_sorted, Float4* dev_vel_sorted,
- Float4* dev_angvel_sorted, Float* dev_radius_sorted)
- //uint4* dev_bonds_sorted)
+ Float4* dev_x,
+ Float4* dev_vel,
+ Float4* dev_angvel,
+ Float4* dev_x_sorted,
+ Float4* dev_vel_sorted,
+ Float4* dev_angvel_sorted)
{
// Create hash array in shared on-chip memory. The size of the array
@@ -107,16 +108,12 @@ __global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_cel
Float4 x = dev_x[sortedIndex];
Float4 vel = dev_vel[sortedIndex];
Float4 angvel = dev_angvel[sortedIndex];
- Float radius = dev_radius[sortedIndex];
- //uint4 bonds = dev_bonds[sortedIndex];
__syncthreads();
// Write sorted data to global memory
dev_x_sorted[idx] = x;
dev_vel_sorted[idx] = vel;
dev_angvel_sorted[idx] = angvel;
- dev_radius_sorted[idx] = radius;
- //dev_bonds_sorted[idx] = bonds;
}
} // End of reorderArrays(...)
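
calcParticleCellID, referenced above, assigns the sort key: a flat cell
index derived from each particle's position. The row-major form that
cellStart and cellEnd are later indexed by is sketched below (helper name
hypothetical; devC_grid.num[] holds the cell counts per axis, as used in
the allocation code above):

// Sketch: row-major flat cell index from integer grid coordinates.
__device__ unsigned int flatCellID(int3 gridPos)
{
    return (unsigned int)gridPos.z * devC_grid.num[1] * devC_grid.num[0]
         + (unsigned int)gridPos.y * devC_grid.num[0]
         + (unsigned int)gridPos.x;
}
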
diff --git a/src/sphere.h b/src/sphere.h
@@ -20,31 +20,29 @@ class DEM {
unsigned int np;
// Structure containing individual particle kinematics
- Kinematics k;
- Kinematics dev_k;
+ Kinematics k; // host
+ Kinematics dev_k; // device
// Structure containing energy values
- Energies e;
- Energies dev_e;
+ Energies e; // host
+ Energies dev_e; // device
// Structure of global parameters
- Params params;
- Params dev_params;
+ Params params; // host
// Structure containing spatial parameters
- Grid grid;
- Grid dev_grid;
+ Grid grid; // host
+
+ // Structure containing sorting arrays
+ Sorting dev_sort; // device
// Structure of temporal parameters
- Time time;
- Time dev_time;
+ Time time; // host
+ Time *dev_time; // device
// Structure of wall parameters
- Walls walls;
- Walls dev_walls;
-
- // Wall force arrays
- Float* dev_w_force;
+ Walls walls; // host
+ Walls dev_walls; // device
// GPU initialization, must be called before startTime()
void initializeGPU(void);
@@ -58,6 +56,9 @@ class DEM {
// Allocate global device memory to hold data
void allocateGlobalDeviceMemory(void);
+ // Free dynamically allocated global device memory
+ void freeGlobalDeviceMemory(void);
+
// Copy non-constant data to global GPU memory
void transferToGlobalDeviceMemory(void);
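
Taken together with the device.cu changes above, startTime() now drives a
symmetric device-memory lifecycle. In sketch form (call order as it
appears in this commit's device.cu):

// Sketch: call order inside DEM::startTime() after this refactoring.
transferToConstantDeviceMemory();   // np, nc, dt, grid, params
allocateGlobalDeviceMemory();       // cudaMalloc the dev_k/dev_e/dev_sort/dev_walls arrays
transferToGlobalDeviceMemory();     // host -> device arrays
// ... main loop: sort, reorder, topology, interact, integrate ...
transferFromGlobalDeviceMemory();   // device -> host at each output interval
freeGlobalDeviceMemory();           // release device memory after the loop
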