Further work on implementing OO - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
git clone git://src.adamsgaard.dk/sphere
       ---
        commit 287f0082450f6709173703f7a43f708676d4f29e
        parent bd8883f07cd1e54218c7cde17e6376d52918727c
        Author: Anders Damsgaard Christensen <adc@geo.au.dk>
       Date:   Fri, 26 Oct 2012 23:05:19 +0200
       
       Further work on implementing OO
       
       Diffstat:
         M src/constants.cuh                   |       1 +
         M src/contactsearch.cuh               |      20 +++++++++-----------
         M src/datatypes.h                     |      31 ++++++++++++++++++++++++-------
         M src/device.cu                       |     512 ++++++++++++++-----------------
         M src/integration.cuh                 |       4 ++--
         M src/sorting.cuh                     |      19 ++++++++-----------
         M src/sphere.h                        |      31 ++++++++++++++++---------------
       
       7 files changed, 297 insertions(+), 321 deletions(-)
       ---
        diff --git a/src/constants.cuh b/src/constants.cuh
        @@ -10,6 +10,7 @@
        __constant__ unsigned int devC_nd;   // Number of dimensions
        __constant__ unsigned int devC_np;   // Number of particles
        __constant__ int          devC_nc;   // Max. number of contacts a particle can have
       +__constant__ Float          devC_dt;   // Computational time step length
        
        // Device constant memory structures
        __constant__ Params devC_params;
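
The computational time step is now mirrored in constant memory next to the other scalars. A minimal sketch of the pattern (the advance() kernel below is hypothetical; only devC_dt, devC_np and the Float/Float4 typedefs come from this codebase): the host copies the scalar once, and every kernel thread then reads it as a broadcast constant.

    // Host side, once before the main loop (see transferToConstantDeviceMemory()):
    //   cudaMemcpyToSymbol("devC_dt", &time.dt, sizeof(Float));

    // Hypothetical device-side use: advance positions by one step length
    __global__ void advance(Float4* dev_x, const Float4* dev_vel)
    {
      unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
      if (idx < devC_np) {                 // devC_np: number of particles
        Float4 x = dev_x[idx];
        const Float4 v = dev_vel[idx];
        x.x += v.x * devC_dt;              // devC_dt read from constant memory
        x.y += v.y * devC_dt;
        x.z += v.z * devC_dt;
        dev_x[idx] = x;
      }
    }
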
        diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
        @@ -87,7 +87,6 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
                                                     Float* es_dot, Float* ev_dot,
                                                     Float* p,
                                                     Float4* dev_x_sorted, 
       -                                             Float* dev_radius_sorted,
                                                     Float4* dev_vel_sorted, 
                                                     Float4* dev_angvel_sorted,
                                                     unsigned int* dev_cellStart, 
        @@ -125,7 +124,7 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
        
                // Fetch position and velocity of particle B.
                Float4 x_b      = dev_x_sorted[idx_b];
       -        Float  radius_b = dev_radius_sorted[idx_b];
       +        Float  radius_b = x_b.w;
                Float  kappa         = devC_params.kappa;
        
                // Distance between particle centers (Float4 -> Float3)
        @@ -183,7 +182,6 @@ __device__ void findContactsInCell(int3 targetCell,
                                               unsigned int idx_a, 
                                           Float4 x_a, Float radius_a,
                                           Float4* dev_x_sorted, 
       -                                   Float* dev_radius_sorted,
                                           unsigned int* dev_cellStart, 
                                           unsigned int* dev_cellEnd,
                                           unsigned int* dev_gridParticleIndex,
        @@ -226,7 +224,7 @@ __device__ void findContactsInCell(int3 targetCell,
        
                // Fetch position and radius of particle B.
                Float4 x_b      = dev_x_sorted[idx_b];
       -        Float  radius_b = dev_radius_sorted[idx_b];
       +        Float  radius_b = x_b.w;
        
                // Read the original index of particle B
                unsigned int idx_b_orig = dev_gridParticleIndex[idx_b];
        @@ -316,7 +314,7 @@ __device__ void findContactsInCell(int3 targetCell,
        __global__ void topology(unsigned int* dev_cellStart, 
                                     unsigned int* dev_cellEnd, // Input: Particles in cell 
                                 unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
       -                         Float4* dev_x_sorted, Float* dev_radius_sorted, 
       +                         Float4* dev_x_sorted, 
                                 unsigned int* dev_contacts,
                                 Float4* dev_distmod)
        {
        @@ -325,7 +323,7 @@ __global__ void topology(unsigned int* dev_cellStart,
          if (idx_a < devC_params.np) {
            // Fetch particle data in global read
            Float4 x_a      = dev_x_sorted[idx_a];
       -    Float  radius_a = dev_radius_sorted[idx_a];
       +    Float  radius_a = x_a.w;
        
            // Count the number of contacts in this time step
            int nc = 0;
        @@ -346,7 +344,7 @@ __global__ void topology(unsigned int* dev_cellStart,
                for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
                  targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
                  findContactsInCell(targetPos, idx_a, x_a, radius_a,
       -                                    dev_x_sorted, dev_radius_sorted,
       +                                    dev_x_sorted, 
                                     dev_cellStart, dev_cellEnd,
                                     dev_gridParticleIndex,
                                         &nc, dev_contacts, dev_distmod);
        @@ -371,8 +369,8 @@ __global__ void topology(unsigned int* dev_cellStart,
        __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
                                 unsigned int* dev_cellStart,
                                 unsigned int* dev_cellEnd,
       -                         Float4* dev_x, Float* dev_radius,
       -                             Float4* dev_x_sorted, Float* dev_radius_sorted, 
       +                         Float4* dev_x,
       +                             Float4* dev_x_sorted,
                                 Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
                                 Float4* dev_vel, Float4* dev_angvel,
                                 Float4* dev_force, Float4* dev_torque,
        @@ -395,7 +393,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
            // Fetch particle data in global read
            unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
            Float4 x_a      = dev_x_sorted[idx_a];
       -    Float  radius_a = dev_radius_sorted[idx_a];
       +    Float  radius_a = x_a.w;
        
            // Fetch wall data in global read
            Float4 w_up_nx   = dev_w_nx[0];
        @@ -521,7 +519,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
                    targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
                    findAndProcessContactsInCell(targetPos, idx_a, x_a, radius_a,
                                                 &F, &T, &es_dot, &ev_dot, &p,
       -                                         dev_x_sorted, dev_radius_sorted, 
       +                                         dev_x_sorted,
                                                 dev_vel_sorted, dev_angvel_sorted,
                                                 dev_cellStart, dev_cellEnd,
                                                 dev_w_nx, dev_w_mvfd);
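
The dev_radius and dev_radius_sorted parameters disappear throughout this file because the particle radius is now carried in the w component of the Float4 position. A minimal sketch of the convention (px, py, pz and radius_a are placeholder values; MAKE_FLOAT4 is the codebase's Float4 constructor macro):

    // Packing at initialization: (x, y, z, radius)
    Float4 x_a = MAKE_FLOAT4(px, py, pz, radius_a);

    // Device side: a single coalesced 16-byte read yields position and radius,
    // so the separate dev_radius_sorted fetch is no longer needed
    Float4 x_b      = dev_x_sorted[idx_b];
    Float  radius_b = x_b.w;
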
        diff --git a/src/datatypes.h b/src/datatypes.h
        @@ -15,13 +15,18 @@
        
        // Structure containing kinematic particle values
        struct Kinematics {
       -  Float4 *x;                // Positions + radii (w)
       -  Float2 *xysum;        // Horizontal distance traveled
       -  Float4 *vel;                // Translational velocities + fixvels (w)
       -  Float4 *force;        // Sums of forces
       -  Float4 *angpos;        // Angular positions
       -  Float4 *angvel;        // Angular velocities
       -  Float4 *torque;        // Sums of torques
       +  Float4 *x;                  // Positions + radii (w)
       +  Float2 *xysum;          // Horizontal distance traveled
       +  Float4 *vel;                  // Translational velocities + fixvels (w)
       +  Float4 *acc;                  // Translational accelerations
       +  Float4 *force;          // Sums of forces
       +  Float4 *angpos;          // Angular positions
       +  Float4 *angvel;          // Angular velocities
       +  Float4 *angacc;          // Angular accelerations
       +  Float4 *torque;          // Sums of torques
       +  unsigned int *contacts; // List of contacts per particle
       +  Float4 *distmod;          // Distance modifiers for contacts across periodic boundaries
       +  Float4 *delta_t;          // Accumulated shear distance of contacts
        };
        
        // Structure containing individual physical particle parameters
        @@ -42,6 +47,17 @@ struct Grid {
          int periodic;                // Behavior of boundaries at 1st and 2nd world edge
        };
        
       +struct Sorting {
       +  Float4 *x_sorted;          // Positions + radii (w) (sorted)
       +  Float4 *vel_sorted;          // Translational velocities + fixvels (w) (sorted)
       +  Float4 *angvel_sorted;  // Angular velocities (sorted)
       +  unsigned int *gridParticleCellID; // Hash key (cell index) from position in grid
       +  unsigned int *gridParticleIndex;  // Original indexes of particles
       +  unsigned int *cellStart;            // First index of sorted idx'es in cells
       +  unsigned int *cellEnd;            // Last index of sorted idx'es in cells
       +};
       + 
       +
        // Structure containing time parameters
        struct Time {
          Float dt;                // Computational time step length
        @@ -76,6 +92,7 @@ struct Walls {
          int wmode[MAXWALLS];        // Wall modes
          Float4* nx;                // Wall normal and position
          Float4* mvfd;                // Wall mass, velocity, force and dev. stress
       +  Float* force;                // Resulting forces on walls per particle
          Float gamma_wn;        // Wall normal viscosity
          Float gamma_wt;        // Wall tangential viscosity
          Float gamma_wr;        // Wall rolling viscosity
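
The new Sorting struct gathers the scratch arrays of the cell-based neighbor search in one place. A minimal sketch of how cellStart/cellEnd are consumed when scanning a cell (cellHash stands for the linear cell index computed elsewhere, e.g. by calcParticleCellID(); whether cellEnd holds the last sorted index or one past it follows reorderArrays(), the sketch assumes one past):

    unsigned int startIdx = dev_sort.cellStart[cellHash];
    if (startIdx != 0xffffffff) {                   // 0xffffffff marks an empty cell
      unsigned int endIdx = dev_sort.cellEnd[cellHash];
      for (unsigned int idx_b = startIdx; idx_b < endIdx; ++idx_b) {
        Float4 x_b      = dev_sort.x_sorted[idx_b]; // candidate neighbor, radius in w
        Float  radius_b = x_b.w;
        // ... overlap test against particle A goes here ...
      }
    }
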
        diff --git a/src/device.cu b/src/device.cu
        @@ -197,6 +197,7 @@ __host__ void DEM::transferToConstantDeviceMemory()
          cudaMemcpyToSymbol("devC_nd", &nd, sizeof(nd));
          cudaMemcpyToSymbol("devC_np", &np, sizeof(np));
          cudaMemcpyToSymbol("devC_nc", &NC, sizeof(int));
       +  cudaMemcpyToSymbol("devC_dt", &time.dt, sizeof(Float));
          cudaMemcpyToSymbol(devC_grid, grid, sizeof(Grid));
          cudaMemcpyToSymbol(devC_params, params, sizeof(Params));
          
        @@ -205,149 +206,178 @@ __host__ void DEM::transferToConstantDeviceMemory()
          if (verbose == 1)
            cout << "Done\n";
        
       -  checkConstantMemory();
       +  //checkConstantMemory();
        }
        
        
       -//extern "C"
       -__host__ void DEM::startTime()
       +// Allocate device memory for particle variables,
       +// tied to previously declared pointers in structures
       +__host__ void DEM::allocateGlobalDeviceMemory(void)
        {
       +  // Particle memory size
       +  unsigned int memSizeF  = sizeof(Float) * np;
       +  unsigned int memSizeF4 = sizeof(Float4) * np;
        
       -  using std::cout;        // Namespace directive
       +  if (verbose == 1)
       +    std::cout << "  Allocating device memory:                       ";
        
       -  // Copy data to constant global device memory
       -  transferToConstantDeviceMemory();
       +  // Particle arrays
       +  cudaMalloc((void**)&dev_k.x, memSizeF4);
       +  cudaMalloc((void**)&dev_sort.x_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_k.vel, memSizeF4);
       +  cudaMalloc((void**)&dev_sort.vel_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_k.angvel, memSizeF4);
       +  cudaMalloc((void**)&dev_sort.angvel_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_k.acc, memSizeF4);
       +  k.acc = new Float4[np];
       +  cudaMalloc((void**)&dev_k.angacc, memSizeF4);
       +  k.angacc = new Float4[np];
       +  cudaMalloc((void**)&dev_k.force, memSizeF4);
       +  cudaMalloc((void**)&dev_k.torque, memSizeF4);
       +  cudaMalloc((void**)&dev_k.angpos, memSizeF4);
       +  cudaMalloc((void**)&dev_e.es_dot, memSizeF);
       +  cudaMalloc((void**)&dev_e.ev_dot, memSizeF);
       +  cudaMalloc((void**)&dev_e.es, memSizeF);
       +  cudaMalloc((void**)&dev_e.ev, memSizeF);
       +  cudaMalloc((void**)&dev_e.p, memSizeF);
        
       -  Float*  dev_w_force;               // Resulting force on wall per particle
       -  Float*  dev_w_force_partial; // Partial sum from block of threads
       -
       -  // Memory for sorted particle data
       -  Float4* dev_x_sorted;
       -  Float4* dev_vel_sorted;
       -  Float4* dev_angvel_sorted;
       -  Float*  dev_radius_sorted; 
       -
       -  // Grid-particle array pointers
       -  unsigned int* dev_gridParticleCellID;
       -  unsigned int* dev_gridParticleIndex;
       -  unsigned int* dev_cellStart;
       -  unsigned int* dev_cellEnd;
       -
       -  // Particle contact bookkeeping
       -  unsigned int* dev_contacts;
       -  unsigned int* host_contacts = new unsigned int[p.np*NC];
       -  // Particle pair distance correction across periodic boundaries
       -  Float4* dev_distmod;
       -  Float4* host_distmod = new Float4[p.np*NC];
       -  // x,y,z contains the interparticle vector, corrected if contact 
       -  // is across a periodic boundary. 
       -  Float4* dev_delta_t; // Accumulated shear distance of contact
       -  Float4* host_delta_t = new Float4[p.np*NC];
       +  // Cell-related arrays
       +  cudaMalloc((void**)&dev_sort.gridParticleCellID, sizeof(unsigned int)*np);
       +  cudaMalloc((void**)&dev_sort.gridParticleIndex, sizeof(unsigned int)*np);
       +  cudaMalloc((void**)&dev_sort.cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       +  cudaMalloc((void**)&dev_sort.cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
        
       -  // Particle memory size
       -  unsigned int memSizeF  = sizeof(Float) * p.np;
       -  unsigned int memSizeF4 = sizeof(Float4) * p.np;
       +  // Particle contact bookkeeping arrays
       +  cudaMalloc((void**)&dev_k.contacts, sizeof(unsigned int)*np*NC); // Max NC contacts per particle
       +  cudaMalloc((void**)&dev_k.distmod, sizeof(Float4)*np*NC);
       +  cudaMalloc((void**)&dev_k.delta_t, sizeof(Float4)*np*NC);
       +
       +  // Host contact bookkeeping arrays
       +  k.contacts = new unsigned int[np*NC];
       +  // Initialize contacts lists to np
       +  for (unsigned int i=0; i<(np*NC); ++i)
       +    k.contacts[i] = np;
       +  k.distmod = new Float4[np*NC];
       +  k.delta_t = new Float4[np*NC];
        
       -  // Allocate device memory for particle variables,
       -  // tied to previously declared pointers in structures
       -  allocateGlobalDeviceMemory();
       -  cout << "  Allocating device memory:                       ";
       +  // Wall arrays
       +  cudaMalloc((void**)&dev_walls.nx, sizeof(Float4)*walls.nw);
       +  cudaMalloc((void**)&dev_walls.mvfd, sizeof(Float4)*walls.nw);
       +  cudaMalloc((void**)&dev_walls.force, sizeof(Float)*walls.nw*np);
       +  // dev_w_force_partial allocated later
        
       +  checkForCudaErrors("End of allocateGlobalDeviceMemory");
       +  if (verbose == 1)
       +    std::cout << "Done\n";
       +}
       +
       +__host__ void DEM::freeGlobalDeviceMemory()
       +{
       +  if (verbose == 1)
       +    printf("\nLiberating device memory:                        ");
          // Particle arrays
       -  cudaMalloc((void**)&dev_x, memSizeF4);
       -  cudaMalloc((void**)&dev_x_sorted, memSizeF4);
       -  cudaMalloc((void**)&dev_vel, memSizeF4);
       -  cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
       -  cudaMalloc((void**)&dev_angvel, memSizeF4);
       -  cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
       -  cudaMalloc((void**)&dev_acc, memSizeF4);
       -  cudaMalloc((void**)&dev_angacc, memSizeF4);
       -  cudaMalloc((void**)&dev_force, memSizeF4);
       -  cudaMalloc((void**)&dev_torque, memSizeF4);
       -  cudaMalloc((void**)&dev_angpos, memSizeF4);
       -  cudaMalloc((void**)&dev_radius, memSizeF);
       -  cudaMalloc((void**)&dev_radius_sorted, memSizeF);
       -  cudaMalloc((void**)&dev_es_dot, memSizeF);
       -  cudaMalloc((void**)&dev_ev_dot, memSizeF);
       -  cudaMalloc((void**)&dev_es, memSizeF);
       -  cudaMalloc((void**)&dev_ev, memSizeF);
       -  cudaMalloc((void**)&dev_p, memSizeF);
       -  //cudaMalloc((void**)&dev_bonds, sizeof(uint4) * p.np);
       -  //cudaMalloc((void**)&dev_bonds_sorted, sizeof(uint4) * p.np);
       +  cudaFree(dev_k.x);
       +  cudaFree(dev_sort.x_sorted);
       +  cudaFree(dev_k.vel);
       +  cudaFree(dev_sort.vel_sorted);
       +  cudaFree(dev_k.angvel);
       +  cudaFree(dev_sort.angvel_sorted);
       +  cudaFree(dev_k.acc);
       +  cudaFree(dev_k.angacc);
       +  cudaFree(dev_k.force);
       +  cudaFree(dev_k.torque);
       +  cudaFree(dev_k.angpos);
       +  cudaFree(dev_e.es_dot);
       +  cudaFree(dev_e.ev_dot);
       +  cudaFree(dev_e.es);
       +  cudaFree(dev_e.ev);
       +  cudaFree(dev_e.p);
       +  cudaFree(dev_k.contacts);
       +  cudaFree(dev_k.distmod);
       +  cudaFree(dev_k.delta_t);
        
          // Cell-related arrays
       -  cudaMalloc((void**)&dev_gridParticleCellID, sizeof(unsigned int)*p.np);
       -  cudaMalloc((void**)&dev_gridParticleIndex, sizeof(unsigned int)*p.np);
       -  cudaMalloc((void**)&dev_cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       -  cudaMalloc((void**)&dev_cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       -
       -  // Particle contact bookkeeping arrays
       -  cudaMalloc((void**)&dev_contacts, sizeof(unsigned int)*p.np*NC); // Max NC contacts per particle
       -  cudaMalloc((void**)&dev_distmod, sizeof(Float4)*p.np*NC);
       -  cudaMalloc((void**)&dev_delta_t, sizeof(Float4)*p.np*NC);
       +  cudaFree(dev_sort.gridParticleIndex);
       +  cudaFree(dev_sort.cellStart);
       +  cudaFree(dev_sort.cellEnd);
        
          // Wall arrays
       -  cudaMalloc((void**)&dev_w_nx, sizeof(Float)*params.nw*4);
       -  cudaMalloc((void**)&dev_w_mvfd, sizeof(Float)*params.nw*4);
       -  cudaMalloc((void**)&dev_w_force, sizeof(Float)*params.nw*p.np);
       +  cudaFree(dev_walls.nx);
       +  cudaFree(dev_walls.mvfd);
       +  cudaFree(dev_walls.force);
       +  cudaFree(dev_w_force_partial);
       +
       +  if (verbose == 1)
       +    printf("Done\n");
       +}
        
       -  checkForCudaErrors("Post device memory allocation");
       -  cout << "Done\n";
        
       -  // Transfer data from host to gpu device memory
       +__host__ void DEM::transferToGlobalDeviceMemory()
       +{
       +  if (verbose == 1)
       +    std::cout << "  Transfering data to the device:                 ";
       +
       +  // Copy structure data from host to global device memory
       +  /*cudaMemcpy(dev_k, k, sizeof(Kinematics), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_e, e, sizeof(Energies), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_time, time, sizeof(Time), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_walls, walls, sizeof(Walls), cudaMemcpyHostToDevice);*/
       +  cudaMemcpy(dev_k, k, sizeof(k), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_e, e, sizeof(e), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_time, time, sizeof(time), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_walls, walls, sizeof(walls), cudaMemcpyHostToDevice);
       +
       +  checkForCudaErrors("End of transferToGlobalDeviceMemory");
       +  if (verbose == 1)
       +    std::cout << "Done\n";
       +}
       +
        +__host__ void DEM::transferFromGlobalDeviceMemory()
       +{
          cout << "  Transfering data to the device:                 ";
        
       -  // Particle data
       -  cudaMemcpy(dev_x, host_x, memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_vel, host_vel, memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_acc, host_acc, memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_angvel, host_angvel, memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_angacc, host_angacc, memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_force, host_force, memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_torque, host_torque, memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_angpos, host_angpos, memSizeF4, cudaMemcpyHostToDevice);
       -  //cudaMemcpy(dev_bonds, host_bonds, sizeof(uint4) * p.np, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_radius, p.radius, memSizeF, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_es_dot, p.es_dot, memSizeF, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_ev_dot, p.ev_dot, memSizeF, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_es, p.es, memSizeF, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_ev, p.ev, memSizeF, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_p, p.p, memSizeF, cudaMemcpyHostToDevice);
       -
       -  // Wall data (wall mass and number in constant memory)
       -  cudaMemcpy(dev_w_nx, host_w_nx, sizeof(Float)*params.nw*4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(Float)*params.nw*4, cudaMemcpyHostToDevice);
       -  
       -  // Initialize contacts lists to p.np
       -  unsigned int* npu = new unsigned int[p.np*NC];
       -  for (unsigned int i=0; i<(p.np*NC); ++i)
       -    npu[i] = p.np;
       -  cudaMemcpy(dev_contacts, npu, sizeof(unsigned int)*p.np*NC, cudaMemcpyHostToDevice);
       -  delete[] npu;
       -
       -  // Create array of 0.0 values on the host and transfer these to the distance 
       -  // modifier and shear displacement arrays
       -  Float4* zerosF4 = new Float4[p.np*NC];
       -  for (unsigned int i=0; i<(p.np*NC); ++i)
       -    zerosF4[i] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
       -  cudaMemcpy(dev_distmod, zerosF4, sizeof(Float4)*p.np*NC, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_delta_t, zerosF4, sizeof(Float4)*p.np*NC, cudaMemcpyHostToDevice);
       -  delete[] zerosF4;
       -
       -  checkForCudaErrors("Post memcopy");
       -  cout << "Done\n";
        +  // Copy structure data from global device memory to host memory
       +  cudaMemcpy(k, dev_k, sizeof(k), cudaMemcpyDeviceToHost);
       +  cudaMemcpy(e, dev_e, sizeof(e), cudaMemcpyDeviceToHost);
       +  cudaMemcpy(time, dev_time, sizeof(time), cudaMemcpyDeviceToHost);
       +  cudaMemcpy(walls, dev_walls, sizeof(walls), cudaMemcpyDeviceToHost);
       +
       +  checkForCudaErrors("End of transferFromGlobalDeviceMemory");
       +  if (verbose == 1)
       +    std::cout << "Done\n";
       +}
       +
       +
       +// Iterate through time by explicit time integration
       +__host__ void DEM::startTime()
       +{
       +
       +  using std::cout; // Namespace directive
       +  char file[200];  // Output filename
       +  FILE *fp;
       +
       +  // Copy data to constant global device memory
       +  transferToConstantDeviceMemory();
       +
       +
       +  // Particle memory size
       +  unsigned int memSizeF  = sizeof(Float) * np;
       +  unsigned int memSizeF4 = sizeof(Float4) * np;
       +
       +  // Allocate device memory for particle variables,
       +  // tied to previously declared pointers in structures
       +  allocateGlobalDeviceMemory();
       +
       +  // Transfer data from host to gpu device memory
       +  transferToGlobalDeviceMemory();
        
          // Synchronization point
          cudaThreadSynchronize();
       -  checkForCudaErrors("Start of mainLoop()");
       +  checkForCudaErrors("Start of startTime()");
        
          // Model world variables
          float tic, toc, filetimeclock, time_spent, dev_time_spent;
        
       -  // File output
       -  FILE* fp;
       -  char file[1000];  // Complete path+filename variable
       -
          // Start CPU clock
          tic = clock();
        
        @@ -360,22 +390,25 @@ __host__ void DEM::startTime()
          // Shared memory per block
          unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
        
       +  Float* dev_w_force_partial;
          cudaMalloc((void**)&dev_w_force_partial, sizeof(Float)*dimGrid.x);
        
          // Report to stdout
       -  cout << "\n  Device memory allocation and transfer complete.\n"
       -       << "  - Blocks per grid: "
       -       << dimGrid.x << "*" << dimGrid.y << "*" << dimGrid.z << "\n"
       -       << "  - Threads per block: "
       -       << dimBlock.x << "*" << dimBlock.y << "*" << dimBlock.z << "\n"
       -       << "  - Shared memory required per block: " << smemSize << " bytes\n";
       +  if (verbose == 1) {
       +    cout << "\n  Device memory allocation and transfer complete.\n"
       +      << "  - Blocks per grid: "
       +      << dimGrid.x << "*" << dimGrid.y << "*" << dimGrid.z << "\n"
       +      << "  - Threads per block: "
       +      << dimBlock.x << "*" << dimBlock.y << "*" << dimBlock.z << "\n"
       +      << "  - Shared memory required per block: " << smemSize << " bytes\n";
       +  }
        
          // Initialize counter variable values
          filetimeclock = 0.0;
          long iter = 0;
        
          // Create first status.dat
       -  sprintf(file,"%s/output/%s.status.dat", cwd, inputbin);
       +  sprintf(file,"output/%s.status.dat", inputbin);
          fp = fopen(file, "w");
          fprintf(fp,"%2.4e %2.4e %d\n", 
                        time.current, 
        @@ -384,23 +417,17 @@ __host__ void DEM::startTime()
          fclose(fp);
        
          // Write first output data file: output0.bin, thus testing writing of bin files
       -  sprintf(file,"%s/output/%s.output0.bin", cwd, inputbin);
       -  if (fwritebin(file, &p, host_x, host_vel, 
       -                host_angvel, host_force, 
       -                host_torque, host_angpos,
       -                host_bonds,
       -                &grid, &time, &params,
       -                host_w_nx, host_w_mvfd) != 0)  {
       -    std::cerr << "\n Problem during fwritebin \n";
       -    exit(EXIT_FAILURE);
       +  sprintf(file,"output/%s.output0.bin", inputbin);
       +  writebin(file);
       +
       +  if (verbose == 1) {
       +    cout << "\n  Entering the main calculation time loop...\n\n"
       +      << "  IMPORTANT: Do not close this terminal, doing so will \n"
       +      << "             terminate this SPHERE process. Follow the \n"
       +      << "             progress by executing:\n"
       +      << "                $ ./sphere_status " << inputbin << "\n\n";
          }
        
       -  cout << "\n  Entering the main calculation time loop...\n\n"
       -       << "  IMPORTANT: Do not close this terminal, doing so will \n"
       -       << "             terminate this SPHERE process. Follow the \n"
       -       << "             progress by executing:\n"
       -       << "                $ ./sphere_status " << inputbin << "\n\n";
       -
        
          // Start GPU clock
          cudaEvent_t dev_tic, dev_toc;
        @@ -447,9 +474,9 @@ __host__ void DEM::startTime()
            // in the fine, uniform and homogenous grid.
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID, 
       -                                              dev_gridParticleIndex, 
       -                                              dev_x);
       +    calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort.gridParticleCellID, 
       +                                              dev_sort.gridParticleIndex, 
       +                                              dev_k.x);
        
            // Synchronization point
            cudaThreadSynchronize();
        @@ -461,19 +488,19 @@ __host__ void DEM::startTime()
            // Sort particle (key, particle ID) pairs by hash key with Thrust radix sort
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID),
       -                        thrust::device_ptr<uint>(dev_gridParticleCellID + p.np),
       -                        thrust::device_ptr<uint>(dev_gridParticleIndex));
       +    thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort.gridParticleCellID),
       +                        thrust::device_ptr<uint>(dev_sort.gridParticleCellID + np),
       +                        thrust::device_ptr<uint>(dev_sort.gridParticleIndex));
            cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implicitly?
            if (PROFILING == 1)
              stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
       -    checkForCudaErrors("Post calcParticleCellID");
       +    checkForCudaErrors("Post thrust::sort_by_key");
        
        
            // Zero cell array values by setting cellStart to its highest possible value,
            // specified with pointer value 0xffffffff, which for a 32 bit unsigned int
            // is 4294967295.
       -    cudaMemset(dev_cellStart, 0xffffffff, 
       +    cudaMemset(dev_sort.cellStart, 0xffffffff, 
                       grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
            cudaThreadSynchronize();
            checkForCudaErrors("Post cudaMemset");
        @@ -482,18 +509,15 @@ __host__ void DEM::startTime()
            // coherent memory access. Save ordered configurations in new arrays (*_sorted).
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart, 
       -                                                   dev_cellEnd,
       -                                                   dev_gridParticleCellID, 
       -                                                   dev_gridParticleIndex,
       -                                                   dev_x, dev_vel, 
       -                                                   dev_angvel, dev_radius, 
       -                                                   //dev_bonds,
       -                                                   dev_x_sorted, 
       -                                                   dev_vel_sorted, 
       -                                                   dev_angvel_sorted, 
       -                                                   dev_radius_sorted);
       -                                                   //dev_bonds_sorted);
       +    reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_sort.cellStart, 
       +                                                   dev_sort.cellEnd,
       +                                                   dev_sort.gridParticleCellID, 
       +                                                   dev_sort.gridParticleIndex,
       +                                                   dev_k.x, dev_k.vel, 
       +                                                   dev_k.angvel,
       +                                                   dev_sort.x_sorted, 
       +                                                   dev_sort.vel_sorted, 
       +                                                   dev_sort.angvel_sorted);
        
            // Synchronization point
            cudaThreadSynchronize();
        @@ -504,17 +528,16 @@ __host__ void DEM::startTime()
            // The contact search in topology() is only necessary for determining
            // the accumulated shear distance needed in the linear elastic
            // and nonlinear contact force model
       -    if (params.shearmodel == 2 || params.shearmodel == 3) {
       +    if (params.contactmodel == 2 || params.contactmodel == 3) {
              // For each particle: Search contacts in neighbor cells
              if (PROFILING == 1)
                startTimer(&kernel_tic);
       -      topology<<<dimGrid, dimBlock>>>(dev_cellStart, 
       -                                      dev_cellEnd,
       -                                      dev_gridParticleIndex,
       -                                      dev_x_sorted, 
       -                                      dev_radius_sorted, 
       -                                      dev_contacts,
       -                                      dev_distmod);
       +      topology<<<dimGrid, dimBlock>>>(dev_sort.cellStart, 
       +                                      dev_sort.cellEnd,
       +                                      dev_sort.gridParticleIndex,
       +                                      dev_sort.x_sorted, 
       +                                      dev_k.contacts,
       +                                      dev_k.distmod);
        
        
              // Synchronization point
        @@ -528,21 +551,26 @@ __host__ void DEM::startTime()
            // For each particle: Process collisions and compute resulting forces.
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
       -                                    dev_cellStart,
       -                                    dev_cellEnd,
       -                                    dev_x, dev_radius,
       -                                    dev_x_sorted, dev_radius_sorted,
       -                                    dev_vel_sorted, dev_angvel_sorted,
       -                                    dev_vel, dev_angvel,
       -                                    dev_force, dev_torque,
       -                                    dev_es_dot, dev_ev_dot, 
       -                                    dev_es, dev_ev, dev_p,
       -                                    dev_w_nx, dev_w_mvfd, dev_w_force,
       -                                    //dev_bonds_sorted,
       -                                    dev_contacts,
       -                                    dev_distmod,
       -                                    dev_delta_t);
       +    interact<<<dimGrid, dimBlock>>>(dev_sort.gridParticleIndex,
       +                                    dev_sort.cellStart,
       +                                    dev_sort.cellEnd,
       +                                    dev_k.x,
       +                                    dev_sort.x_sorted,
       +                                    dev_sort.vel_sorted,
       +                                    dev_sort.angvel_sorted,
       +                                    dev_k.vel,
       +                                    dev_k.angvel,
       +                                    dev_k.force, 
       +                                    dev_k.torque,
       +                                    dev_e.es_dot,
       +                                    dev_e.ev_dot, 
       +                                    dev_e.es, dev_e.ev, dev_e.p,
       +                                    dev_walls.nx,
       +                                    dev_walls.mvfd,
       +                                    dev_walls.force,
       +                                    dev_k.contacts,
       +                                    dev_k.distmod,
       +                                    dev_k.delta_t);
        
        
            // Synchronization point
        @@ -554,11 +582,16 @@ __host__ void DEM::startTime()
            // Update particle kinematics
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    integrate<<<dimGrid, dimBlock>>>(dev_x_sorted, dev_vel_sorted, 
       -                                     dev_angvel_sorted, dev_radius_sorted,
       -                                     dev_x, dev_vel, dev_angvel,
       -                                     dev_force, dev_torque, dev_angpos,
       -                                     dev_gridParticleIndex);
       +    integrate<<<dimGrid, dimBlock>>>(dev_sort.x_sorted, 
       +                                     dev_sort.vel_sorted, 
       +                                     dev_sort.angvel_sorted,
       +                                     dev_k.x, 
       +                                     dev_k.vel, 
       +                                     dev_k.angvel,
       +                                     dev_k.force,
       +                                     dev_k.torque, 
       +                                     dev_k.angpos,
       +                                     dev_sort.gridParticleIndex);
        
            cudaThreadSynchronize();
            if (PROFILING == 1)
        @@ -567,7 +600,7 @@ __host__ void DEM::startTime()
            // Summation of forces on wall
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    summation<<<dimGrid, dimBlock>>>(dev_w_force, dev_w_force_partial);
       +    summation<<<dimGrid, dimBlock>>>(dev_walls.force, dev_w_force_partial);
        
            // Synchronization point
            cudaThreadSynchronize();
        @@ -578,10 +611,10 @@ __host__ void DEM::startTime()
            // Update wall kinematics
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    integrateWalls<<< 1, params.nw>>>(dev_w_nx, 
       -                                       dev_w_mvfd,
       -                                       dev_w_force_partial,
       -                                       blocksPerGrid);
       +    integrateWalls<<< 1, walls.nw>>>(dev_walls.nx, 
       +                                     dev_walls.mvfd,
       +                                     dev_w_force_partial,
       +                                     blocksPerGrid);
        
            // Synchronization point
            cudaThreadSynchronize();
        @@ -591,12 +624,14 @@ __host__ void DEM::startTime()
        
        
            // Update timers and counters
       -    time.current    += time.dt;
       -    filetimeclock    += time.dt;
       +    time.current  += time.dt;
       +    filetimeclock += time.dt;
        
            // Report time to console
       -    cout << "\r  Current simulation time: " 
       -         << time.current << " s.        ";// << std::flush;
       +    if (verbose == 1) {
       +      cout << "\r  Current simulation time: " 
       +        << time.current << " s.        ";// << std::flush;
       +    }
        
        
            // Produce output binary if the time interval 
        @@ -608,52 +643,16 @@ __host__ void DEM::startTime()
              checkForCudaErrors("Beginning of file output section");
        
              //// Copy device data to host memory
       -
       -      // Particle data
       -      cudaMemcpy(host_x, dev_x, memSizeF4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_vel, dev_vel, memSizeF4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_acc, dev_acc, memSizeF4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_angvel, dev_angvel, memSizeF4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_force, dev_force, memSizeF4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_torque, dev_torque, memSizeF4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_angpos, dev_angpos, memSizeF4, cudaMemcpyDeviceToHost);
       -      //cudaMemcpy(host_bonds, dev_bonds, sizeof(uint4) * p.np, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(p.es_dot, dev_es_dot, memSizeF, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(p.ev_dot, dev_ev_dot, memSizeF, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(p.es, dev_es, memSizeF, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(p.ev, dev_ev, memSizeF, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(p.p, dev_p, memSizeF, cudaMemcpyDeviceToHost);
       -
       -      // Wall data
       -      cudaMemcpy(host_w_nx, dev_w_nx, 
       -                   sizeof(Float)*params.nw*4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_w_mvfd, dev_w_mvfd, 
       -                   sizeof(Float)*params.nw*4, cudaMemcpyDeviceToHost);
       -
       -  
       -      // Contact information
       -      if (CONTACTINFO == 1) {
       -        cudaMemcpy(host_contacts, dev_contacts, sizeof(unsigned int)*p.np*NC, cudaMemcpyDeviceToHost);
       -        cudaMemcpy(host_delta_t, dev_delta_t, memSizeF4*NC, cudaMemcpyDeviceToHost);
       -        cudaMemcpy(host_distmod, dev_distmod, memSizeF4*NC, cudaMemcpyDeviceToHost);
       -      }
       +      transferFromGlobalDeviceMemory();
        
              // Pause the CPU thread until all CUDA calls previously issued are completed
              cudaThreadSynchronize();
        
              // Write binary output file
              time.step_count += 1;
       -      sprintf(file,"%s/output/%s.output%d.bin", cwd, inputbin, time.step_count);
       -
       -      if (fwritebin(file, &p, host_x, host_vel, 
       -                        host_angvel, host_force, 
       -                    host_torque, host_angpos,
       -                    host_bonds,
       -                    &grid, &time, &params,
       -                        host_w_nx, host_w_mvfd) != 0) {
       -        cout << "\n Error during fwritebin() in main loop\n";
       -        exit(EXIT_FAILURE);
       -      }
       +      sprintf(file,"output/%s.output%d.bin", inputbin, time.step_count);
       +      writebin(file);
       +
        
              if (CONTACTINFO == 1) {
                // Write contact information to stdout
        @@ -686,7 +685,7 @@ __host__ void DEM::startTime()
              }
        
              // Update status.dat at the interval of filetime 
       -      sprintf(file,"%s/output/%s.status.dat", cwd, inputbin);
       +      sprintf(file,"output/%s.status.dat", inputbin);
              fp = fopen(file, "w");
              fprintf(fp,"%2.4e %2.4e %d\n", 
                      time.current, 
        @@ -747,48 +746,11 @@ __host__ void DEM::startTime()
        
        
          // Free GPU device memory  
       -  printf("\nLiberating device memory:                        ");
       -
       -  // Particle arrays
       -  cudaFree(dev_x);
       -  cudaFree(dev_x_sorted);
       -  cudaFree(dev_vel);
       -  cudaFree(dev_vel_sorted);
       -  cudaFree(dev_angvel);
       -  cudaFree(dev_angvel_sorted);
       -  cudaFree(dev_acc);
       -  cudaFree(dev_angacc);
       -  cudaFree(dev_force);
       -  cudaFree(dev_torque);
       -  cudaFree(dev_angpos);
       -  cudaFree(dev_radius);
       -  cudaFree(dev_radius_sorted);
       -  cudaFree(dev_es_dot);
       -  cudaFree(dev_ev_dot);
       -  cudaFree(dev_es);
       -  cudaFree(dev_ev);
       -  cudaFree(dev_p);
       -  //cudaFree(dev_bonds);
       -  //cudaFree(dev_bonds_sorted);
       -  cudaFree(dev_contacts);
       -  cudaFree(dev_distmod);
       -  cudaFree(dev_delta_t);
       -
       -  // Cell-related arrays
       -  cudaFree(dev_gridParticleIndex);
       -  cudaFree(dev_cellStart);
       -  cudaFree(dev_cellEnd);
       -
       -  // Wall arrays
       -  cudaFree(dev_w_nx);
       -  cudaFree(dev_w_mvfd);
       -  cudaFree(dev_w_force);
       -  cudaFree(dev_w_force_partial);
       +  freeGlobalDeviceMemory();
        
          // Contact info arrays
          delete[] host_contacts;
          delete[] host_distmod;
          delete[] host_delta_t;
        
       -  printf("Done\n");
        } /* EOF */
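
Condensed, the restructured main loop in startTime() chains the kernels entirely through the struct members; one time step looks roughly like this (launch configuration, synchronization, profiling and file output omitted, long argument lists abbreviated):

    calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort.gridParticleCellID,
                                              dev_sort.gridParticleIndex, dev_k.x);
    thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort.gridParticleCellID),
                        thrust::device_ptr<uint>(dev_sort.gridParticleCellID + np),
                        thrust::device_ptr<uint>(dev_sort.gridParticleIndex));
    cudaMemset(dev_sort.cellStart, 0xffffffff,
               grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
    reorderArrays<<<dimGrid, dimBlock, smemSize>>>(/* sorted copies of x, vel, angvel */);
    if (params.contactmodel == 2 || params.contactmodel == 3)
      topology<<<dimGrid, dimBlock>>>(/* contact list and distance modifiers */);
    interact<<<dimGrid, dimBlock>>>(/* forces, torques, energy rates, wall forces */);
    integrate<<<dimGrid, dimBlock>>>(/* particle kinematics update */);
    summation<<<dimGrid, dimBlock>>>(dev_walls.force, dev_w_force_partial);
    integrateWalls<<<1, walls.nw>>>(/* wall kinematics update */);
    time.current  += time.dt;
    filetimeclock += time.dt;
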
        diff --git a/src/integration.cuh b/src/integration.cuh
        @@ -8,7 +8,7 @@
        // Second order integration scheme based on Taylor expansion of particle kinematics. 
        // Kernel executed on device, and callable from host only.
        __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
       -                          Float4* dev_angvel_sorted, Float* dev_radius_sorted, // Input
       +                          Float4* dev_angvel_sorted,
                                  Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output
                                  Float4* dev_force, Float4* dev_torque, Float4* dev_angpos, // Input
                                  unsigned int* dev_gridParticleIndex) // Input: Sorted-Unsorted key
        @@ -32,7 +32,7 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
            Float4 x      = dev_x_sorted[idx];
            Float4 vel    = dev_vel_sorted[idx];
            Float4 angvel = dev_angvel_sorted[idx];
       -    Float  radius = dev_radius_sorted[idx];
       +    Float  radius = x.w;
        
            // Coherent read from constant memory to registers
            Float  dt    = devC_params.dt;
        diff --git a/src/sorting.cuh b/src/sorting.cuh
        @@ -43,15 +43,16 @@ __global__ void calcParticleCellID(unsigned int* dev_gridParticleCellID,
        // Reorder particle data into sorted order, and find the start and end particle indexes
        // of each cell in the sorted hash array.
        // Kernel executed on device, and callable from host only.
       -__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_cellEnd,
       +__global__ void reorderArrays(unsigned int* dev_cellStart, 
       +                                  unsigned int* dev_cellEnd,
                                      unsigned int* dev_gridParticleCellID, 
                                      unsigned int* dev_gridParticleIndex,
       -                              Float4* dev_x, Float4* dev_vel, 
       -                              Float4* dev_angvel, Float* dev_radius,
       -                              //uint4* dev_bonds,
       -                              Float4* dev_x_sorted, Float4* dev_vel_sorted,
       -                              Float4* dev_angvel_sorted, Float* dev_radius_sorted)
       -                              //uint4* dev_bonds_sorted)
       +                              Float4* dev_x, 
       +                              Float4* dev_vel, 
       +                              Float4* dev_angvel,
       +                              Float4* dev_x_sorted, 
       +                              Float4* dev_vel_sorted,
       +                              Float4* dev_angvel_sorted)
        { 
        
          // Create hash array in shared on-chip memory. The size of the array 
        @@ -107,16 +108,12 @@ __global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_cel
            Float4 x      = dev_x[sortedIndex];
            Float4 vel    = dev_vel[sortedIndex];
            Float4 angvel = dev_angvel[sortedIndex];
       -    Float  radius = dev_radius[sortedIndex];
       -    //uint4  bonds  = dev_bonds[sortedIndex];
        
            __syncthreads();
            // Write sorted data to global memory
            dev_x_sorted[idx]      = x;
            dev_vel_sorted[idx]    = vel;
            dev_angvel_sorted[idx] = angvel;
       -    dev_radius_sorted[idx] = radius;
       -    //dev_bonds_sorted[idx]  = bonds;
          }
        } // End of reorderArrays(...)
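
The cell boundary detection that reorderArrays() performs with the shared hash array follows the usual pattern, sketched approximately below (variable names are assumptions; sharedHash holds the cell hashes of the block's particles plus the one preceding the block, which is why smemSize is sized for threadsPerBlock+1 entries):

    unsigned int hash     = sharedHash[threadIdx.x + 1]; // this particle's cell
    unsigned int prevHash = sharedHash[threadIdx.x];     // previous particle's cell
    if (idx == 0 || hash != prevHash) {
      dev_cellStart[hash] = idx;            // first sorted index in this cell
      if (idx > 0)
        dev_cellEnd[prevHash] = idx;        // one past the last index in the previous cell
    }
    if (idx == devC_np - 1)
      dev_cellEnd[hash] = idx + 1;
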
        
        diff --git a/src/sphere.h b/src/sphere.h
        @@ -20,31 +20,29 @@ class DEM {
            unsigned int np;
        
            // Structure containing individual particle kinematics
       -    Kinematics k;
       -    Kinematics dev_k;
       +    Kinematics k;        // host
       +    Kinematics dev_k;        // device
        
            // Structure containing energy values
       -    Energies e;
       -    Energies dev_e;
       +    Energies e;                // host
       +    Energies dev_e;        // device
        
            // Structure of global parameters
       -    Params params;
       -    Params dev_params;
       +    Params params;        // host
        
            // Structure containing spatial parameters
       -    Grid grid;
       -    Grid dev_grid;
       +    Grid grid;                // host
       +
       +    // Structure containing sorting arrays
       +    Sorting dev_sort;        // device
        
            // Structure of temporal parameters
       -    Time time;
       -    Time dev_time;
       +    Time time;                // host
       +    Time *dev_time;        // device
        
            // Structure of wall parameters
       -    Walls walls;
       -    Walls dev_walls;
       -
       -    // Wall force arrays
       -    Float* dev_w_force; 
       +    Walls walls;        // host
       +    Walls dev_walls;        // device
        
            // GPU initialization, must be called before startTime()
            void initializeGPU(void);
        @@ -58,6 +56,9 @@ class DEM {
            // Allocate global device memory to hold data
            void allocateGlobalDeviceMemory(void);
        
       +    // Free dynamically allocated global device memory
       +    void freeGlobalDeviceMemory(void);
       +
            // Copy non-constant data to global GPU memory
            void transferToGlobalDeviceMemory(void);
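
Putting the new members together, the intended host-side flow is roughly the following (object construction and input reading are assumptions, not shown in this commit):

    DEM dem;                  // hypothetical: read input binary, fill host structs
    dem.initializeGPU();      // GPU initialization, must be called before startTime()
    dem.startTime();          // transferToConstantDeviceMemory(),
                              // allocateGlobalDeviceMemory(),
                              // transferToGlobalDeviceMemory(),
                              // main loop with transferFromGlobalDeviceMemory() and
                              // writebin() at every file output interval,
                              // freeGlobalDeviceMemory() at the end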