tIt compiles without errors. Still need to update sphere.py to new file format - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit c7e1dd22dc1892e430ab40a74fcb8df9de98c9d7
 (DIR) parent ad86704c71d456826b62ba4d8b1e23dc9f420467
 (HTM) Author: Anders Damsgaard <adc@geo.au.dk>
       Date:   Sat, 27 Oct 2012 00:37:45 +0200
       
       It compiles without errors. Still need to update sphere.py to new file format
       
       Diffstat:
         M src/constants.cuh                   |       1 +
         M src/contactmodels.cuh               |      16 ++++++++--------
         M src/contactsearch.cuh               |      54 +++++++++++++++++--------------
         D src/datatypes.cuh                   |      18 ------------------
         M src/device.cu                       |     220 +++++++++++++++----------------
         M src/integration.cuh                 |      27 +++++++++++++--------------
         M src/sorting.cuh                     |       8 ++++----
         M src/sphere.cpp                      |      16 +++++++++++++++-
         M src/sphere.h                        |       5 ++++-
       
       9 files changed, 181 insertions(+), 184 deletions(-)
       ---
 (DIR) diff --git a/src/constants.cuh b/src/constants.cuh
       t@@ -9,6 +9,7 @@
        // Constant memory size: 64 kb
        __constant__ unsigned int devC_nd;   // Number of dimensions
        __constant__ unsigned int devC_np;   // Number of particles
       +__constant__ unsigned int devC_nw;   // Number of dynamic walls
        __constant__ int          devC_nc;   // Max. number of contacts a particle can have
        __constant__ Float          devC_dt;   // Computational time step length
        
 (DIR) diff --git a/src/contactmodels.cuh b/src/contactmodels.cuh
       t@@ -44,7 +44,7 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
          //Float3 f_n = -devC_params.k_n * delta * n;
        
          // Normal force component: Elastic - viscous damping
       -  Float3 f_n = (-devC_params.k_n * delta - devC_params.gamma_wn * vel_n) * n;
       +  Float3 f_n = (-devC_params.k_n * delta - devC_params.gamma_n * vel_n) * n;
        
          // Make sure the viscous damping doesn't exceed the elastic component,
          // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
       t@@ -61,7 +61,7 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
          // divide by zero (producing a NaN)
          if (vel_t_length > 0.f) {
        
       -    Float f_t_visc  = devC_params.gamma_wt * vel_t_length; // Tangential force by viscous model
       +    Float f_t_visc  = devC_params.gamma_t * vel_t_length; // Tangential force by viscous model
            Float f_t_limit = devC_params.mu_s * f_n_length;      // Max. friction
        
            // If the shear force component exceeds the friction,
       t@@ -382,15 +382,15 @@ __device__ void contactLinear(Float3* F, Float3* T,
              // Shear friction heat production rate: 
              // The energy lost from the tangential spring is dissipated as heat
              //*es_dot += -dot(vel_t_ab, f_t);
       -      *es_dot += length(delta_t0 - delta_t) * devC_params.k_t / devC_params.dt; // Seen in EsyS-Particle
       -      //*es_dot += fabs(dot(delta_t0 - delta_t, f_t)) / devC_params.dt; 
       +      *es_dot += length(delta_t0 - delta_t) * devC_params.k_t / devC_dt; // Seen in EsyS-Particle
       +      //*es_dot += fabs(dot(delta_t0 - delta_t, f_t)) / devC_dt; 
        
            } else { // Static case
        
              // No correction of f_t is required
        
              // Add tangential displacement to total tangential displacement
       -      delta_t = delta_t0 + vel_t_ab * devC_params.dt;
       +      delta_t = delta_t0 + vel_t_ab * devC_dt;
            }
          }
        
       t@@ -555,15 +555,15 @@ __device__ void contactHertz(Float3* F, Float3* T,
              // Shear friction heat production rate: 
              // The energy lost from the tangential spring is dissipated as heat
              //*es_dot += -dot(vel_t_ab, f_t);
       -      *es_dot += length(delta_t0 - delta_t) * devC_params.k_t / devC_params.dt; // Seen in EsyS-Particle
       -      //*es_dot += fabs(dot(delta_t0 - delta_t, f_t)) / devC_params.dt; 
       +      *es_dot += length(delta_t0 - delta_t) * devC_params.k_t / devC_dt; // Seen in EsyS-Particle
       +      //*es_dot += fabs(dot(delta_t0 - delta_t, f_t)) / devC_dt; 
        
            } else { // Static case
        
              // No correction of f_t is required
        
              // Add tangential displacement to total tangential displacement
       -      delta_t = delta_t0 + vel_t_ab * devC_params.dt;
       +      delta_t = delta_t0 + vel_t_ab * devC_dt;
            }
          }
        
 (DIR) diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
       t@@ -14,7 +14,7 @@ __device__ int findDistMod(int3* targetCell, Float3* distmod)
          // Check whether x- and y boundaries are to be treated as periodic
          // 1: x- and y boundaries periodic
          // 2: x boundaries periodic
       -  if (devC_params.periodic == 1) {
       +  if (devC_grid.periodic == 1) {
        
            // Periodic x-boundary
            if (targetCell->x < 0) {
       t@@ -38,7 +38,7 @@ __device__ int findDistMod(int3* targetCell, Float3* distmod)
        
        
          // Only x-boundaries are periodic
       -  } else if (devC_params.periodic == 2) {
       +  } else if (devC_grid.periodic == 2) {
        
            // Periodic x-boundary
            if (targetCell->x < 0) {
       t@@ -77,7 +77,7 @@ __device__ int findDistMod(int3* targetCell, Float3* distmod)
        
        // Find overlaps between particle no. 'idx' and particles in cell 'gridpos'.
        // Contacts are processed as soon as they are identified.
       -// Used for shearmodel=1, where contact history is not needed.
       +// Used for contactmodel=1, where contact history is not needed.
        // Kernel executed on device, and callable from device only.
        // Function is called from interact().
        __device__ void findAndProcessContactsInCell(int3 targetCell, 
       t@@ -175,7 +175,7 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
        
        // Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
        // Write the indexes of the overlaps in array contacts[].
       -// Used for shearmodel=2, where bookkeeping of contact history is necessary.
       +// Used for contactmodel=2, where bookkeeping of contact history is necessary.
        // Kernel executed on device, and callable from device only.
        // Function is called from topology().
        __device__ void findContactsInCell(int3 targetCell, 
       t@@ -257,7 +257,7 @@ __device__ void findContactsInCell(int3 targetCell,
                  for (int i=0; i < devC_nc; ++i) {
                    __syncthreads();
                    cidx = dev_contacts[(unsigned int)(idx_a_orig*devC_nc+i)];
       -            if (cidx == devC_params.np) // Write to position of now-deleted contact
       +            if (cidx == devC_np) // Write to position of now-deleted contact
                      cpos = i;
                    else if (cidx == idx_b_orig) { // Write to position of same contact
                      cpos = i;
       t@@ -320,7 +320,7 @@ __global__ void topology(unsigned int* dev_cellStart,
        {
          // Thread index equals index of particle A
          unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
       -  if (idx_a < devC_params.np) {
       +  if (idx_a < devC_np) {
            // Fetch particle data in global read
            Float4 x_a      = dev_x_sorted[idx_a];
            Float  radius_a = x_a.w;
       t@@ -356,13 +356,13 @@ __global__ void topology(unsigned int* dev_cellStart,
        
        
        // For a single particle:
       -// If shearmodel=1:
       +// If contactmodel=1:
        //   Search for neighbors to particle 'idx' inside the 27 closest cells, 
        //   and compute the resulting force and torque on it.
       -// If shearmodel=2:
       +// If contactmodel=2:
        //   Process contacts saved in dev_contacts by topology(), and compute
        //   the resulting force and torque on it.
       -// For all shearmodels:
       +// For all contactmodels:
        //   Collide with top- and bottom walls, save resulting force on upper wall.
        // Kernel is executed on device, and is callable from host only.
        // Function is called from mainGPU loop.
       t@@ -371,15 +371,19 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
                                 unsigned int* dev_cellEnd,
                                 Float4* dev_x,
                                     Float4* dev_x_sorted,
       -                         Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
       -                         Float4* dev_vel, Float4* dev_angvel,
       -                         Float4* dev_force, Float4* dev_torque,
       +                         Float4* dev_vel_sorted, 
       +                         Float4* dev_angvel_sorted,
       +                         Float4* dev_vel, 
       +                         Float4* dev_angvel,
       +                         Float4* dev_force, 
       +                         Float4* dev_torque,
                                 Float* dev_es_dot, 
                                 Float* dev_ev_dot, 
                                 Float* dev_es, 
                                 Float* dev_ev, 
                                 Float* dev_p,
       -                         Float4* dev_w_nx, Float4* dev_w_mvfd, 
       +                         Float4* dev_w_nx, 
       +                         Float4* dev_w_mvfd, 
                                 Float* dev_w_force, //uint4* dev_bonds_sorted,
                                 unsigned int* dev_contacts, 
                                 Float4* dev_distmod,
       t@@ -388,7 +392,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
          // Thread index equals index of particle A
          unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
        
       -  if (idx_a < devC_params.np) {
       +  if (idx_a < devC_np) {
        
            // Fetch particle data in global read
            unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
       t@@ -424,7 +428,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
            Float3 T = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
        
            // Apply linear elastic, frictional contact model to registered contacts
       -    if (devC_params.shearmodel == 2 || devC_params.shearmodel == 3) {
       +    if (devC_params.contactmodel == 2 || devC_params.contactmodel == 3) {
              unsigned int idx_b_orig, mempos;
              Float delta_n, x_ab_length, radius_b;
              Float3 x_ab;
       t@@ -452,11 +456,11 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
                delta_n = x_ab_length - (radius_a + radius_b);
        
        
       -        if (idx_b_orig != (unsigned int)devC_params.np) {
       +        if (idx_b_orig != (unsigned int)devC_np) {
        
                  // Process collision if the particles are overlapping
                  if (delta_n < 0.0f) {
       -            if (devC_params.shearmodel == 2) {
       +            if (devC_params.contactmodel == 2) {
                      contactLinear(&F, &T, &es_dot, &ev_dot, &p, 
                                    idx_a_orig,
                                    idx_b_orig,
       t@@ -468,7 +472,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
                                    x_ab, x_ab_length,
                                    delta_n, dev_delta_t, 
                                    mempos);
       -            } else if (devC_params.shearmodel == 3) {
       +            } else if (devC_params.contactmodel == 3) {
                      contactHertz(&F, &T, &es_dot, &ev_dot, &p, 
                                   idx_a_orig,
                                   idx_b_orig,
       t@@ -484,7 +488,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
                  } else {
                    __syncthreads();
                    // Remove this contact (there is no particle with index=np)
       -            dev_contacts[mempos] = devC_params.np;
       +            dev_contacts[mempos] = devC_np;
                    // Zero sum of shear displacement in this position
                    dev_delta_t[mempos] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
                  }
       t@@ -498,8 +502,8 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
        
        
            // Find contacts and process collisions immidiately for
       -    // shearmodel 1 (visco-frictional).
       -    } else if (devC_params.shearmodel == 1) {
       +    // contactmodel 1 (visco-frictional).
       +    } else if (devC_params.contactmodel == 1) {
        
              int3 gridPos;
              int3 targetPos;
       t@@ -553,7 +557,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
            }
        
        
       -    if (devC_params.periodic == 0) {
       +    if (devC_grid.periodic == 0) {
        
              // Left wall
              delta_w = x_a.x - radius_a - origo.x;
       t@@ -591,7 +595,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
                                         w_n, delta_w, 0.0f);
              }
        
       -    } else if (devC_params.periodic == 2) {
       +    } else if (devC_grid.periodic == 2) {
        
              // Front wall
              delta_w = x_a.y - radius_a - origo.y;
       t@@ -622,8 +626,8 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
            dev_torque[orig_idx]  = MAKE_FLOAT4(T.x, T.y, T.z, 0.0f);
            dev_es_dot[orig_idx]  = es_dot;
            dev_ev_dot[orig_idx]  = ev_dot;
       -    dev_es[orig_idx]     += es_dot * devC_params.dt;
       -    dev_ev[orig_idx]     += ev_dot * devC_params.dt;
       +    dev_es[orig_idx]     += es_dot * devC_dt;
       +    dev_ev[orig_idx]     += ev_dot * devC_dt;
            dev_p[orig_idx]       = p;
            dev_w_force[orig_idx] = w_force;
          }
 (DIR) diff --git a/src/datatypes.cuh b/src/datatypes.cuh
       t@@ -1,18 +0,0 @@
       -// datatypes.cuh -- Device structure templates and function prototypes
       -
       -// Avoiding multiple inclusions of header file
       -#ifndef DATATYPES_CUH_
       -#define DATATYPES_CUH_
       -
       -#include "datatypes.h"
       -#include "vector_functions.h"
       -
       -unsigned int iDivUp(unsigned int a, unsigned int b);
       -void checkForCudaErrors(const char* checkpoint_description);
       -void checkForCudaErrors(const char* checkpoint_description, const unsigned int iteration);
       -
       -// Device constant memory structures
       -__constant__ Params devC_params;
       -__constant__ Grid   devC_grid;
       -  
       -#endif
 (DIR) diff --git a/src/device.cu b/src/device.cu
       t@@ -97,8 +97,7 @@ __global__ void checkConstantValues(int* dev_equal,
        
          // Compare values between global- and constant
          // memory structures
       -  if (dev_grid->nd != devC_grid.nd ||
       -      dev_grid->origo[0] != devC_grid.origo[0] ||
       +  if (dev_grid->origo[0] != devC_grid.origo[0] ||
              dev_grid->origo[1] != devC_grid.origo[1] ||
              dev_grid->origo[2] != devC_grid.origo[2] ||
              dev_grid->L[0] != devC_grid.L[0] ||
       t@@ -106,35 +105,28 @@ __global__ void checkConstantValues(int* dev_equal,
              dev_grid->L[2] != devC_grid.L[2] ||
              dev_grid->num[0] != devC_grid.num[0] ||
              dev_grid->num[1] != devC_grid.num[1] ||
       -      dev_grid->num[2] != devC_grid.num[2])
       +      dev_grid->num[2] != devC_grid.num[2] ||
       +      dev_grid->periodic != devC_grid.periodic)
            *dev_equal = 1; // Not ok
        
       -  else if (dev_params->global != devC_params.global ||
       -      dev_params->g[0] != devC_params.g[0] ||
       +  
       +  else if (dev_params->g[0] != devC_params.g[0] ||
              dev_params->g[1] != devC_params.g[1] ||
              dev_params->g[2] != devC_params.g[2] ||
       -      dev_params->dt != devC_params.dt ||
       -      dev_params->np != devC_params.np ||
       -      dev_params->nw != devC_params.nw ||
       -      dev_params->wmode[0] != devC_params.wmode[0] ||
              dev_params->k_n != devC_params.k_n ||
              dev_params->k_t != devC_params.k_t ||
              dev_params->k_r != devC_params.k_r ||
              dev_params->gamma_n != devC_params.gamma_n ||
              dev_params->gamma_t != devC_params.gamma_t ||
              dev_params->gamma_r != devC_params.gamma_r ||
       -      dev_params->gamma_wn != devC_params.gamma_wn ||
       -      dev_params->gamma_wt != devC_params.gamma_wt ||
       -      dev_params->gamma_wr != devC_params.gamma_wr ||
              dev_params->mu_s != devC_params.mu_s ||
              dev_params->mu_d != devC_params.mu_d ||
              dev_params->mu_r != devC_params.mu_r ||
              dev_params->rho != devC_params.rho ||
       +      dev_params->contactmodel != devC_params.contactmodel ||
              dev_params->kappa != devC_params.kappa ||
              dev_params->db != devC_params.db ||
       -      dev_params->V_b != devC_params.V_b ||
       -      dev_params->periodic != devC_params.periodic ||
       -      dev_params->shearmodel != devC_params.shearmodel)
       +      dev_params->V_b != devC_params.V_b)
            *dev_equal = 2; // Not ok
        
        }
       t@@ -155,8 +147,8 @@ __host__ void DEM::checkConstantMemory()
          cudaMalloc((void**)&dev_params, sizeof(Params));
        
          // Copy structure data from host to global device memory
       -  cudaMemcpy(dev_grid, grid, sizeof(Grid), cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_params, params, sizeof(Params), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_grid, &grid, sizeof(Grid), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_params, &params, sizeof(Params), cudaMemcpyHostToDevice);
        
          // Compare values between global and constant memory
          // structures on the device.
       t@@ -197,17 +189,18 @@ __host__ void DEM::transferToConstantDeviceMemory()
        
          cudaMemcpyToSymbol("devC_nd", &nd, sizeof(nd));
          cudaMemcpyToSymbol("devC_np", &np, sizeof(np));
       +  cudaMemcpyToSymbol("devC_nw", &walls.nw, sizeof(unsigned int));
          cudaMemcpyToSymbol("devC_nc", &NC, sizeof(int));
          cudaMemcpyToSymbol("devC_dt", &time.dt, sizeof(Float));
       -  cudaMemcpyToSymbol(devC_grid, grid, sizeof(Grid));
       -  cudaMemcpyToSymbol(devC_params, params, sizeof(Params));
       +  cudaMemcpyToSymbol(devC_grid, &grid, sizeof(Grid));
       +  cudaMemcpyToSymbol(devC_params, &params, sizeof(Params));
          
          checkForCudaErrors("After transferring to device constant memory");
          
          if (verbose == 1)
            cout << "Done\n";
        
       -  //checkConstantMemory();
       +  checkConstantMemory();
        }
        
        
       t@@ -223,35 +216,35 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
            std::cout << "  Allocating device memory:                       ";
        
          // Particle arrays
       -  cudaMalloc((void**)&dev_k.x, memSizeF4);
       -  cudaMalloc((void**)&dev_sort.x_sorted, memSizeF4);
       -  cudaMalloc((void**)&dev_k.vel, memSizeF4);
       -  cudaMalloc((void**)&dev_sort.vel_sorted, memSizeF4);
       -  cudaMalloc((void**)&dev_k.angvel, memSizeF4);
       -  cudaMalloc((void**)&dev_sort.angvel_sorted, memSizeF4);
       -  cudaMalloc((void**)&dev_k.acc, memSizeF4);
       +  cudaMalloc((void**)&dev_k->x, memSizeF4);
       +  cudaMalloc((void**)&dev_sort->x_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_k->vel, memSizeF4);
       +  cudaMalloc((void**)&dev_sort->vel_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_k->angvel, memSizeF4);
       +  cudaMalloc((void**)&dev_sort->angvel_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_k->acc, memSizeF4);
          k.acc = new Float4[np];
       -  cudaMalloc((void**)&dev_k.angacc, memSizeF4);
       +  cudaMalloc((void**)&dev_k->angacc, memSizeF4);
          k.angacc = new Float4[np];
       -  cudaMalloc((void**)&dev_k.force, memSizeF4);
       -  cudaMalloc((void**)&dev_k.torque, memSizeF4);
       -  cudaMalloc((void**)&dev_k.angpos, memSizeF4);
       -  cudaMalloc((void**)&dev_e.es_dot, memSizeF);
       -  cudaMalloc((void**)&dev_e.ev_dot, memSizeF);
       -  cudaMalloc((void**)&dev_e.es, memSizeF);
       -  cudaMalloc((void**)&dev_e.ev, memSizeF);
       -  cudaMalloc((void**)&dev_e.p, memSizeF);
       +  cudaMalloc((void**)&dev_k->force, memSizeF4);
       +  cudaMalloc((void**)&dev_k->torque, memSizeF4);
       +  cudaMalloc((void**)&dev_k->angpos, memSizeF4);
       +  cudaMalloc((void**)&dev_e->es_dot, memSizeF);
       +  cudaMalloc((void**)&dev_e->ev_dot, memSizeF);
       +  cudaMalloc((void**)&dev_e->es, memSizeF);
       +  cudaMalloc((void**)&dev_e->ev, memSizeF);
       +  cudaMalloc((void**)&dev_e->p, memSizeF);
        
          // Cell-related arrays
       -  cudaMalloc((void**)&dev_sort.gridParticleCellID, sizeof(unsigned int)*np);
       -  cudaMalloc((void**)&dev_sort.gridParticleIndex, sizeof(unsigned int)*np);
       -  cudaMalloc((void**)&dev_sort.cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       -  cudaMalloc((void**)&dev_sort.cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       +  cudaMalloc((void**)&dev_sort->gridParticleCellID, sizeof(unsigned int)*np);
       +  cudaMalloc((void**)&dev_sort->gridParticleIndex, sizeof(unsigned int)*np);
       +  cudaMalloc((void**)&dev_sort->cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       +  cudaMalloc((void**)&dev_sort->cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
        
          // Particle contact bookkeeping arrays
       -  cudaMalloc((void**)&dev_k.contacts, sizeof(unsigned int)*np*NC); // Max NC contacts per particle
       -  cudaMalloc((void**)&dev_k.distmod, sizeof(Float4)*np*NC);
       -  cudaMalloc((void**)&dev_k.delta_t, sizeof(Float4)*np*NC);
       +  cudaMalloc((void**)&dev_k->contacts, sizeof(unsigned int)*np*NC); // Max NC contacts per particle
       +  cudaMalloc((void**)&dev_k->distmod, sizeof(Float4)*np*NC);
       +  cudaMalloc((void**)&dev_k->delta_t, sizeof(Float4)*np*NC);
        
          // Host contact bookkeeping arrays
          k.contacts = new unsigned int[np*NC];
       t@@ -262,9 +255,9 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
          k.delta_t = new Float4[np*NC];
        
          // Wall arrays
       -  cudaMalloc((void**)&dev_walls.nx, sizeof(Float4)*walls.nw);
       -  cudaMalloc((void**)&dev_walls.mvfd, sizeof(Float4)*walls.nw);
       -  cudaMalloc((void**)&dev_walls.force, sizeof(Float)*walls.nw*np);
       +  cudaMalloc((void**)&dev_walls->nx, sizeof(Float4)*walls.nw);
       +  cudaMalloc((void**)&dev_walls->mvfd, sizeof(Float4)*walls.nw);
       +  cudaMalloc((void**)&dev_walls->force, sizeof(Float)*walls.nw*np);
          // dev_w_force_partial allocated later
        
          checkForCudaErrors("End of allocateGlobalDeviceMemory");
       t@@ -306,7 +299,7 @@ __host__ void DEM::freeGlobalDeviceMemory()
          cudaFree(dev_walls->nx);
          cudaFree(dev_walls->mvfd);
          cudaFree(dev_walls->force);
       -  cudaFree(dev_w_force_partial);
       +  //cudaFree(dev_w_force_partial);
        
          if (verbose == 1)
            printf("Done\n");
       t@@ -323,19 +316,19 @@ __host__ void DEM::transferToGlobalDeviceMemory()
          cudaMemcpy(dev_e, e, sizeof(Energies), cudaMemcpyHostToDevice);
          cudaMemcpy(dev_time, time, sizeof(Time), cudaMemcpyHostToDevice);
          cudaMemcpy(dev_walls, walls, sizeof(Walls), cudaMemcpyHostToDevice);*/
       -  cudaMemcpy(dev_k, k, sizeof(k), cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_e, e, sizeof(e), cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_time, time, sizeof(time), cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_walls, walls, sizeof(walls), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_k, &k, sizeof(k), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_e, &e, sizeof(e), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_time, &time, sizeof(time), cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_walls, &walls, sizeof(walls), cudaMemcpyHostToDevice);
        
          checkForCudaErrors("End of transferToGlobalDeviceMemory");
          if (verbose == 1)
            std::cout << "Done\n";
        }
        
       -__host__ void DEM::transferToGlobalDeviceMemory()
       +__host__ void DEM::transferFromGlobalDeviceMemory()
        {
       -  cout << "  Transfering data to the device:                 ";
       +  std::cout << "  Transfering data to the device:                 ";
        
          // Copy structure data from host to global device memory
          cudaMemcpy(&k, dev_k, sizeof(k), cudaMemcpyDeviceToHost);
       t@@ -409,7 +402,7 @@ __host__ void DEM::startTime()
          long iter = 0;
        
          // Create first status.dat
       -  sprintf(file,"output/%s.status.dat", inputbin);
       +  sprintf(file,"output/%s.status.dat", sid);
          fp = fopen(file, "w");
          fprintf(fp,"%2.4e %2.4e %d\n", 
                        time.current, 
       t@@ -418,7 +411,7 @@ __host__ void DEM::startTime()
          fclose(fp);
        
          // Write first output data file: output0.bin, thus testing writing of bin files
       -  sprintf(file,"output/%s.output0.bin", inputbin);
       +  sprintf(file,"output/%s.output0.bin", sid);
          writebin(file);
        
          if (verbose == 1) {
       t@@ -426,7 +419,7 @@ __host__ void DEM::startTime()
              << "  IMPORTANT: Do not close this terminal, doing so will \n"
              << "             terminate this SPHERE process. Follow the \n"
              << "             progress by executing:\n"
       -      << "                $ ./sphere_status " << inputbin << "\n\n";
       +      << "                $ ./sphere_status " << sid << "\n\n";
          }
        
        
       t@@ -475,9 +468,9 @@ __host__ void DEM::startTime()
            // in the fine, uniform and homogenous grid.
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort.gridParticleCellID, 
       -                                              dev_sort.gridParticleIndex, 
       -                                              dev_k.x);
       +    calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort->gridParticleCellID, 
       +                                              dev_sort->gridParticleIndex, 
       +                                              dev_k->x);
        
            // Synchronization point
            cudaThreadSynchronize();
       t@@ -489,9 +482,9 @@ __host__ void DEM::startTime()
            // Sort particle (key, particle ID) pairs by hash key with Thrust radix sort
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort.gridParticleCellID),
       -                        thrust::device_ptr<uint>(dev_sort.gridParticleCellID + np),
       -                        thrust::device_ptr<uint>(dev_sort.gridParticleIndex));
       +    thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort->gridParticleCellID),
       +                        thrust::device_ptr<uint>(dev_sort->gridParticleCellID + np),
       +                        thrust::device_ptr<uint>(dev_sort->gridParticleIndex));
            cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implicitly?
            if (PROFILING == 1)
              stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
       t@@ -501,7 +494,7 @@ __host__ void DEM::startTime()
            // Zero cell array values by setting cellStart to its highest possible value,
            // specified with pointer value 0xffffffff, which for a 32 bit unsigned int
            // is 4294967295.
       -    cudaMemset(dev_sort.cellStart, 0xffffffff, 
       +    cudaMemset(dev_sort->cellStart, 0xffffffff, 
                       grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
            cudaThreadSynchronize();
            checkForCudaErrors("Post cudaMemset");
       t@@ -510,15 +503,15 @@ __host__ void DEM::startTime()
            // coherent memory access. Save ordered configurations in new arrays (*_sorted).
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_sort.cellStart, 
       -                                                   dev_sort.cellEnd,
       -                                                   dev_sort.gridParticleCellID, 
       -                                                   dev_sort.gridParticleIndex,
       -                                                   dev_k.x, dev_k.vel, 
       -                                                   dev_k.angvel,
       -                                                   dev_sort.x_sorted, 
       -                                                   dev_sort.vel_sorted, 
       -                                                   dev_sort.angvel_sorted);
       +    reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_sort->cellStart, 
       +                                                   dev_sort->cellEnd,
       +                                                   dev_sort->gridParticleCellID, 
       +                                                   dev_sort->gridParticleIndex,
       +                                                   dev_k->x, dev_k->vel, 
       +                                                   dev_k->angvel,
       +                                                   dev_sort->x_sorted, 
       +                                                   dev_sort->vel_sorted, 
       +                                                   dev_sort->angvel_sorted);
        
            // Synchronization point
            cudaThreadSynchronize();
       t@@ -533,12 +526,12 @@ __host__ void DEM::startTime()
              // For each particle: Search contacts in neighbor cells
              if (PROFILING == 1)
                startTimer(&kernel_tic);
       -      topology<<<dimGrid, dimBlock>>>(dev_sort.cellStart, 
       -                                      dev_sort.cellEnd,
       -                                      dev_sort.gridParticleIndex,
       -                                      dev_sort.x_sorted, 
       -                                      dev_k.contacts,
       -                                      dev_k.distmod);
       +      topology<<<dimGrid, dimBlock>>>(dev_sort->cellStart, 
       +                                      dev_sort->cellEnd,
       +                                      dev_sort->gridParticleIndex,
       +                                      dev_sort->x_sorted, 
       +                                      dev_k->contacts,
       +                                      dev_k->distmod);
        
        
              // Synchronization point
       t@@ -552,26 +545,28 @@ __host__ void DEM::startTime()
            // For each particle: Process collisions and compute resulting forces.
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    interact<<<dimGrid, dimBlock>>>(dev_sort.gridParticleIndex,
       -                                    dev_sort.cellStart,
       -                                    dev_sort.cellEnd,
       -                                    dev_k.x,
       -                                    dev_sort.x_sorted,
       -                                    dev_sort.vel_sorted,
       -                                    dev_sort.angvel_sorted,
       -                                    dev_k.vel,
       -                                    dev_k.angvel,
       -                                    dev_k.force, 
       -                                    dev_k.torque,
       -                                    dev_e.es_dot,
       -                                    dev_e.ev_dot, 
       -                                    dev_e.es, dev_e.ev, dev_e.p,
       -                                    dev_walls.nx,
       -                                    dev_walls.mvfd,
       -                                    dev_walls.force,
       -                                    dev_k.contacts,
       -                                    dev_k.distmod,
       -                                    dev_k.delta_t);
       +    interact<<<dimGrid, dimBlock>>>(dev_sort->gridParticleIndex,
       +                                    dev_sort->cellStart,
       +                                    dev_sort->cellEnd,
       +                                    dev_k->x,
       +                                    dev_sort->x_sorted,
       +                                    dev_sort->vel_sorted,
       +                                    dev_sort->angvel_sorted,
       +                                    dev_k->vel,
       +                                    dev_k->angvel,
       +                                    dev_k->force, 
       +                                    dev_k->torque, 
       +                                    dev_e->es_dot,
       +                                    dev_e->ev_dot, 
       +                                    dev_e->es,
       +                                    dev_e->ev,
       +                                    dev_e->p,
       +                                    dev_walls->nx,
       +                                    dev_walls->mvfd,
       +                                    dev_walls->force,
       +                                    dev_k->contacts,
       +                                    dev_k->distmod,
       +                                    dev_k->delta_t);
        
        
            // Synchronization point
       t@@ -583,16 +578,16 @@ __host__ void DEM::startTime()
            // Update particle kinematics
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    integrate<<<dimGrid, dimBlock>>>(dev_sort.x_sorted, 
       -                                     dev_sort.vel_sorted, 
       -                                     dev_sort.angvel_sorted,
       -                                     dev_k.x, 
       -                                     dev_k.vel, 
       -                                     dev_k.angvel,
       -                                     dev_k.force,
       -                                     dev_k.torque, 
       -                                     dev_k.angpos,
       -                                     dev_sort.gridParticleIndex);
       +    integrate<<<dimGrid, dimBlock>>>(dev_sort->x_sorted, 
       +                                     dev_sort->vel_sorted, 
       +                                     dev_sort->angvel_sorted,
       +                                     dev_k->x, 
       +                                     dev_k->vel, 
       +                                     dev_k->angvel,
       +                                     dev_k->force,
       +                                     dev_k->torque, 
       +                                     dev_k->angpos,
       +                                     dev_sort->gridParticleIndex);
        
            cudaThreadSynchronize();
            if (PROFILING == 1)
       t@@ -601,7 +596,7 @@ __host__ void DEM::startTime()
            // Summation of forces on wall
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    summation<<<dimGrid, dimBlock>>>(dev_walls.force, dev_w_force_partial);
       +    summation<<<dimGrid, dimBlock>>>(dev_walls->force, dev_w_force_partial);
        
            // Synchronization point
            cudaThreadSynchronize();
       t@@ -612,8 +607,7 @@ __host__ void DEM::startTime()
            // Update wall kinematics
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    integrateWalls<<< 1, walls.nw>>>(dev_walls.nx, 
       -                                     dev_walls.mvfd,
       +    integrateWalls<<< 1, walls.nw>>>(dev_walls,
                                             dev_w_force_partial,
                                             blocksPerGrid);
        
       t@@ -651,7 +645,7 @@ __host__ void DEM::startTime()
        
              // Write binary output file
              time.step_count += 1;
       -      sprintf(file,"output/%s.output%d.bin", inputbin, time.step_count);
       +      sprintf(file,"output/%s.output%d.bin", sid, time.step_count);
              writebin(file);
        
        
       t@@ -686,7 +680,7 @@ __host__ void DEM::startTime()
              }
        
              // Update status.dat at the interval of filetime 
       -      sprintf(file,"output/%s.status.dat", inputbin);
       +      sprintf(file,"output/%s.status.dat", sid);
              fp = fopen(file, "w");
              fprintf(fp,"%2.4e %2.4e %d\n", 
                      time.current, 
 (DIR) diff --git a/src/integration.cuh b/src/integration.cuh
       t@@ -15,7 +15,7 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
        {
          unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
        
       -  if (idx < devC_params.np) { // Condition prevents block size error
       +  if (idx < devC_np) { // Condition prevents block size error
        
            // Copy data to temporary arrays to avoid any potential read-after-write, 
            // write-after-read, or write-after-write hazards. 
       t@@ -35,7 +35,7 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
            Float  radius = x.w;
        
            // Coherent read from constant memory to registers
       -    Float  dt    = devC_params.dt;
       +    Float  dt    = devC_dt;
            Float3 origo = MAKE_FLOAT3(devC_grid.origo[0], devC_grid.origo[1], devC_grid.origo[2]); 
            Float3 L     = MAKE_FLOAT3(devC_grid.L[0], devC_grid.L[1], devC_grid.L[2]);
            Float  rho   = devC_params.rho;
       t@@ -106,7 +106,7 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
            x.w += vel.x * dt + (acc.x * dt*dt)/2.0f;
        
            // Move particle across boundary if it is periodic
       -    if (devC_params.periodic == 1) {
       +    if (devC_grid.periodic == 1) {
              if (x.x < origo.x)
                x.x += L.x;
              if (x.x > L.x)
       t@@ -115,7 +115,7 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
                x.y += L.y;
              if (x.y > L.y)
                x.y -= L.y;
       -    } else if (devC_params.periodic == 2) {
       +    } else if (devC_grid.periodic == 2) {
              if (x.x < origo.x)
                x.x += L.x;
              if (x.x > L.x)
       t@@ -142,7 +142,7 @@ __global__ void summation(Float* in, Float *out)
          unsigned int cacheIdx = threadIdx.x;
        
          Float temp = 0.0f;
       -  while (idx < devC_params.np) {
       +  while (idx < devC_np) {
            temp += in[idx];
            idx += blockDim.x * gridDim.x;
          }
       t@@ -168,20 +168,19 @@ __global__ void summation(Float* in, Float *out)
        }
        
        // Update wall positions
       -__global__ void integrateWalls(Float4* dev_w_nx, 
       -                                   Float4* dev_w_mvfd,
       +__global__ void integrateWalls(Walls* dev_walls, 
                                       Float* dev_w_force_partial,
                                       unsigned int blocksPerGrid)
        {
          unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
        
       -  if (idx < devC_params.nw) { // Condition prevents block size error
       +  if (idx < devC_nw) { // Condition prevents block size error
        
            // Copy data to temporary arrays to avoid any potential read-after-write, 
            // write-after-read, or write-after-write hazards. 
       -    Float4 w_nx   = dev_w_nx[idx];
       -    Float4 w_mvfd = dev_w_mvfd[idx];
       -    int wmode = devC_params.wmode[idx];  // Wall BC, 0: fixed, 1: devs, 2: vel
       +    Float4 w_nx   = dev_walls->nx[idx];
       +    Float4 w_mvfd = dev_walls->mvfd[idx];
       +    int wmode = dev_walls->wmode[idx];  // Wall BC, 0: fixed, 1: devs, 2: vel
            Float acc;
        
            if (wmode == 0) // Wall fixed: do nothing
       t@@ -193,7 +192,7 @@ __global__ void integrateWalls(Float4* dev_w_nx,
              w_mvfd.z += dev_w_force_partial[i];
            }
        
       -    Float dt = devC_params.dt;
       +    Float dt = devC_dt;
        
            // Normal load = Deviatoric stress times wall surface area,
            // directed downwards.
       t@@ -216,8 +215,8 @@ __global__ void integrateWalls(Float4* dev_w_nx,
            w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;
        
            // Store data in global memory
       -    dev_w_nx[idx]   = w_nx;
       -    dev_w_mvfd[idx] = w_mvfd;
       +    dev_walls->nx[idx]   = w_nx;
       +    dev_walls->mvfd[idx] = w_mvfd;
          }
        } // End of integrateWalls(...)
        
 (DIR) diff --git a/src/sorting.cuh b/src/sorting.cuh
       t@@ -25,7 +25,7 @@ __global__ void calcParticleCellID(unsigned int* dev_gridParticleCellID,
          //unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
          unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
        
       -  if (idx < devC_params.np) { // Condition prevents block size error
       +  if (idx < devC_np) { // Condition prevents block size error
        
            //volatile Float4 x = dev_x[idx]; // Ensure coalesced read
            Float4 x = dev_x[idx]; // Ensure coalesced read
       t@@ -69,7 +69,7 @@ __global__ void reorderArrays(unsigned int* dev_cellStart,
          unsigned int cellID;
        
          // Read cellID data and store it in shared memory (shared_data)
       -  if (idx < devC_params.np) { // Condition prevents block size error
       +  if (idx < devC_np) { // Condition prevents block size error
            cellID = dev_gridParticleCellID[idx];
        
            // Load hash data into shared memory, allowing access to neighbor particle cellID values
       t@@ -86,7 +86,7 @@ __global__ void reorderArrays(unsigned int* dev_cellStart,
          __syncthreads();
        
          // Find lowest and highest particle index in each cell
       -  if (idx < devC_params.np) { // Condition prevents block size error
       +  if (idx < devC_np) { // Condition prevents block size error
            // If this particle has a different cell index to the previous particle, it's the first
            // particle in the cell -> Store the index of this particle in the cell.
            // The previous particle must be the last particle in the previous cell.
       t@@ -97,7 +97,7 @@ __global__ void reorderArrays(unsigned int* dev_cellStart,
            }
        
            // Check wether the thread is the last one
       -    if (idx == (devC_params.np - 1)) 
       +    if (idx == (devC_np - 1)) 
              dev_cellEnd[cellID] = idx + 1;
        
        
 (DIR) diff --git a/src/sphere.cpp b/src/sphere.cpp
       t@@ -1,6 +1,7 @@
        #include <iostream>
        #include <cstdio>
        #include <cstdlib>
       +#include <cstring>
        
        #include "typedefs.h"
        #include "datatypes.h"
       t@@ -9,7 +10,7 @@
        
        // Constructor: Reads an input binary, and optionally checks
        // and reports the values
       -DEM::DEM(const char *inputbin, 
       +DEM::DEM(char *inputbin, 
            const int verbosity,
            const int checkVals)
        : verbose(verbosity)
       t@@ -17,6 +18,19 @@ DEM::DEM(const char *inputbin,
          using std::cout;
          using std::cerr;
        
       +  // Get basename from input file
       +  char *name = inputbin;
       +  char *sid = name;
       +  while (*name) {
       +    if (*name++ == '/') {
       +      sid = name;
       +    }
       +  } // sid is now everything after the last '/' char
       +  char *lastdot = strrchr(sid, '.');
       +  if (lastdot != NULL)
       +    *lastdot = '\0'; // Put in string-end char. at last dot
       +
       +
          // Initialize CUDA
          initializeGPU();
        
 (DIR) diff --git a/src/sphere.h b/src/sphere.h
       t@@ -10,6 +10,9 @@ class DEM {
          // Values and functions only accessible from the class internally
          private:
        
       +    // Simulation ID
       +    char *sid;
       +
            // Output level
            int verbose;
        
       t@@ -70,7 +73,7 @@ class DEM {
          public:
        
            // Constructor, some parameters with default values
       -    DEM(const char *inputbin, 
       +    DEM(char *inputbin, 
                const int verbosity = 1,
                const int checkVals = 1);