tHalf-step leapfrog Verlet integration scheme added - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit a8f9bb4ed81e686acb8e1a51b4ba60fedcf03dae
 (DIR) parent 68987a0bcf02ed1b43919109acc000099694f787
 (HTM) Author: Anders Damsgaard Christensen <adc@geo.au.dk>
       Date:   Tue, 13 Nov 2012 20:36:03 +0100
       
       Half-step leapfrog Verlet integration scheme added
       
       Diffstat:
         M src/device.cu                       |      10 ++++++++++
         M src/integration.cuh                 |     154 +++++++++++++++++++------------
         M src/sphere.h                        |       2 ++
       
       3 files changed, 109 insertions(+), 57 deletions(-)
       ---
 (DIR) diff --git a/src/device.cu b/src/device.cu
       t@@ -237,10 +237,12 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
          cudaMalloc((void**)&dev_x, memSizeF4);
          cudaMalloc((void**)&dev_xysum, memSizeF4);
          cudaMalloc((void**)&dev_vel, memSizeF4);
       +  cudaMalloc((void**)&dev_vel0, memSizeF4);
          cudaMalloc((void**)&dev_acc, memSizeF4);
          cudaMalloc((void**)&dev_force, memSizeF4);
          cudaMalloc((void**)&dev_angpos, memSizeF4);
          cudaMalloc((void**)&dev_angvel, memSizeF4);
       +  cudaMalloc((void**)&dev_angvel0, memSizeF4);
          cudaMalloc((void**)&dev_angacc, memSizeF4);
          cudaMalloc((void**)&dev_torque, memSizeF4);
        
       t@@ -296,10 +298,12 @@ __host__ void DEM::freeGlobalDeviceMemory()
          cudaFree(dev_x);
          cudaFree(dev_xysum);
          cudaFree(dev_vel);
       +  cudaFree(dev_vel0);
          cudaFree(dev_acc);
          cudaFree(dev_force);
          cudaFree(dev_angpos);
          cudaFree(dev_angvel);
       +  cudaFree(dev_angvel0);
          cudaFree(dev_angacc);
          cudaFree(dev_torque);
        
       t@@ -352,6 +356,8 @@ __host__ void DEM::transferToGlobalDeviceMemory()
              sizeof(Float2)*np, cudaMemcpyHostToDevice);
          cudaMemcpy( dev_vel,      k.vel,
              memSizeF4, cudaMemcpyHostToDevice);
       +  cudaMemcpy( dev_vel0,     k.vel,
       +      memSizeF4, cudaMemcpyHostToDevice);
          cudaMemcpy( dev_acc,      k.acc, 
              memSizeF4, cudaMemcpyHostToDevice);
          cudaMemcpy( dev_force,    k.force,
       t@@ -360,6 +366,8 @@ __host__ void DEM::transferToGlobalDeviceMemory()
              memSizeF4, cudaMemcpyHostToDevice);
          cudaMemcpy( dev_angvel,   k.angvel,
              memSizeF4, cudaMemcpyHostToDevice);
       +  cudaMemcpy( dev_angvel0,  k.angvel,
       +      memSizeF4, cudaMemcpyHostToDevice);
          cudaMemcpy( dev_angacc,   k.angacc,
              memSizeF4, cudaMemcpyHostToDevice);
          cudaMemcpy( dev_torque,   k.torque,
       t@@ -695,6 +703,8 @@ __host__ void DEM::startTime()
                                             dev_angpos,
                                             dev_acc,
                                             dev_angacc,
       +                                     dev_vel0,
       +                                     dev_angvel0,
                                             dev_xysum,
                                             dev_gridParticleIndex);
            cudaThreadSynchronize();
 (DIR) diff --git a/src/integration.cuh b/src/integration.cuh
       t@@ -11,6 +11,7 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
                                  Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output
                                  Float4* dev_force, Float4* dev_torque, Float4* dev_angpos, // Input
                                  Float4* dev_acc, Float4* dev_angacc,
       +                          Float4* dev_vel0, Float4* dev_angvel0,
                                  Float2* dev_xysum,
                                  unsigned int* dev_gridParticleIndex) // Input: Sorted-Unsorted key
        {
       t@@ -21,15 +22,17 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
            // Copy data to temporary arrays to avoid any potential read-after-write, 
            // write-after-read, or write-after-write hazards. 
            unsigned int orig_idx = dev_gridParticleIndex[idx];
       -    Float4 force  = dev_force[orig_idx];
       -    Float4 torque = dev_torque[orig_idx];
       -    Float4 angpos = dev_angpos[orig_idx];
       -    Float4 acc    = dev_acc[orig_idx];
       -    Float4 angacc = dev_angacc[orig_idx];
       -    Float4 x      = dev_x_sorted[idx];
       -    Float4 vel    = dev_vel_sorted[idx];
       -    Float4 angvel = dev_angvel_sorted[idx];
       -    Float  radius = x.w;
       +    Float4 force   = dev_force[orig_idx];
       +    Float4 torque  = dev_torque[orig_idx];
       +    Float4 angpos  = dev_angpos[orig_idx];
       +    Float4 acc     = dev_acc[orig_idx];
       +    Float4 angacc  = dev_angacc[orig_idx];
       +    Float4 vel0    = dev_vel0[orig_idx];
       +    Float4 angvel0 = dev_angvel0[orig_idx];
       +    Float4 x       = dev_x_sorted[idx];
       +    Float4 vel     = dev_vel_sorted[idx];
       +    Float4 angvel  = dev_angvel_sorted[idx];
       +    Float  radius  = x.w;
        
            Float2 xysum  = MAKE_FLOAT2(0.0f, 0.0f);
        
       t@@ -42,47 +45,6 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
            // Particle mass
            Float m = 4.0/3.0 * PI * radius*radius*radius * rho;
        
       -#if 0
       -    //// First-order Euler integration scheme ///
       -    // Update angular position
       -    angpos.x += angvel.x * dt;
       -    angpos.y += angvel.y * dt;
       -    angpos.z += angvel.z * dt;
       -
       -    // Update position
       -    x.x += vel.x * dt;
       -    x.y += vel.y * dt;
       -    x.z += vel.z * dt;
       -#else 
       -    
       -    /// Second-order scheme based on Taylor expansion ///
       -    // Update angular position
       -    angpos.x += angvel.x * dt + angacc.x * dt*dt * 0.5;
       -    angpos.y += angvel.y * dt + angacc.y * dt*dt * 0.5;
       -    angpos.z += angvel.z * dt + angacc.z * dt*dt * 0.5;
       -
       -    // Update position
       -    x.x += vel.x * dt + acc.x * dt*dt * 0.5;
       -    x.y += vel.y * dt + acc.y * dt*dt * 0.5;
       -    x.z += vel.z * dt + acc.z * dt*dt * 0.5;
       -#endif
       -
       -    // Update angular velocity
       -    angvel.x += angacc.x * dt;
       -    angvel.y += angacc.y * dt;
       -    angvel.z += angacc.z * dt;
       -
       -    // Update linear velocity
       -    vel.x += acc.x * dt;
       -    vel.y += acc.y * dt;
       -    vel.z += acc.z * dt;
       -
       -    // Add x-displacement for this time step to 
       -    // sum of x-displacements
       -    //x.w += vel.x * dt + (acc.x * dt*dt)/2.0f;
       -    xysum.x += vel.x * dt;
       -    xysum.y += vel.y * dt;// + (acc.y * dt*dt * 0.5f;
       -
            // Update linear acceleration of particle
            acc.x = force.x / m;
            acc.y = force.y / m;
       t@@ -132,17 +94,95 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
                x.x -= L.x;
            }
        
       +
       +    //// Half-step leapfrog Verlet integration scheme ////
       +    // Update half-step linear velocities
       +    vel0.x += acc.x * dt;
       +    vel0.y += acc.y * dt;
       +    vel0.z += acc.z * dt;
       +
       +    // Update half-step angular velocities
       +    angvel0.x += angacc.x * dt;
       +    angvel0.y += angacc.y * dt;
       +    angvel0.z += angacc.z * dt;
       +
       +    // Update positions
       +    x.x += vel0.x * dt;
       +    x.y += vel0.y * dt;
       +    x.z += vel0.z * dt;
       +
       +    // Update angular positions
       +    angpos.x += angvel0.x * dt;
       +    angpos.y += angvel0.y * dt;
       +    angpos.z += angvel0.z * dt;
       +
       +    // Update full-step linear velocity
       +    vel.x = vel0.x + 0.5 * acc.x * dt;
       +    vel.y = vel0.y + 0.5 * acc.x * dt;
       +    vel.z = vel0.z + 0.5 * acc.x * dt;
       +
       +    // Update full-step angular velocity
       +    angvel.x = angvel0.x + 0.5 * angacc.x * dt;
       +    angvel.y = angvel0.y + 0.5 * angacc.x * dt;
       +    angvel.z = angvel0.z + 0.5 * angacc.x * dt;
       +
       +    /*
       +    //// First-order Euler integration scheme ///
       +    // Update angular position
       +    angpos.x += angvel.x * dt;
       +    angpos.y += angvel.y * dt;
       +    angpos.z += angvel.z * dt;
       +
       +    // Update position
       +    x.x += vel.x * dt;
       +    x.y += vel.y * dt;
       +    x.z += vel.z * dt;
       +    */
       +
       +    /*
       +    /// Second-order scheme based on Taylor expansion ///
       +    // Update angular position
       +    angpos.x += angvel.x * dt + angacc.x * dt*dt * 0.5;
       +    angpos.y += angvel.y * dt + angacc.y * dt*dt * 0.5;
       +    angpos.z += angvel.z * dt + angacc.z * dt*dt * 0.5;
       +
       +    // Update position
       +    x.x += vel.x * dt + acc.x * dt*dt * 0.5;
       +    x.y += vel.y * dt + acc.y * dt*dt * 0.5;
       +    x.z += vel.z * dt + acc.z * dt*dt * 0.5;
       +    */
       +
       +    /*
       +    // Update angular velocity
       +    angvel.x += angacc.x * dt;
       +    angvel.y += angacc.y * dt;
       +    angvel.z += angacc.z * dt;
       +
       +    // Update linear velocity
       +    vel.x += acc.x * dt;
       +    vel.y += acc.y * dt;
       +    vel.z += acc.z * dt;
       +    */
       +
       +    // Add x-displacement for this time step to 
       +    // sum of x-displacements
       +    //x.w += vel.x * dt + (acc.x * dt*dt)/2.0f;
       +    xysum.x += vel.x * dt;
       +    xysum.y += vel.y * dt;// + (acc.y * dt*dt * 0.5f;
       +
            // Hold threads for coalesced write
            __syncthreads();
        
            // Store data in global memory at original, pre-sort positions
       -    dev_xysum[orig_idx] += xysum;
       -    dev_acc[orig_idx]    = acc;
       -    dev_angacc[orig_idx] = angacc;
       -    dev_angvel[orig_idx] = angvel;
       -    dev_vel[orig_idx]    = vel;
       -    dev_angpos[orig_idx] = angpos;
       -    dev_x[orig_idx]      = x;
       +    dev_xysum[orig_idx]  += xysum;
       +    dev_acc[orig_idx]     = acc;
       +    dev_angacc[orig_idx]  = angacc;
       +    dev_angvel[orig_idx]  = angvel;
       +    dev_angvel0[orig_idx] = angvel0;
       +    dev_vel[orig_idx]     = vel;
       +    dev_vel0[orig_idx]    = vel0;
       +    dev_angpos[orig_idx]  = angpos;
       +    dev_x[orig_idx]       = x;
          } 
        } // End of integrate(...)
        
 (DIR) diff --git a/src/sphere.h b/src/sphere.h
       t@@ -54,10 +54,12 @@ class DEM {
            Float4 *dev_x;
            Float2 *dev_xysum;
            Float4 *dev_vel;
       +    Float4 *dev_vel0;
            Float4 *dev_acc;
            Float4 *dev_force;
            Float4 *dev_angpos;
            Float4 *dev_angvel;
       +    Float4 *dev_angvel0;
            Float4 *dev_angacc;
            Float4 *dev_torque;
            unsigned int *dev_contacts;