tIO now works in Python, C++ and CUDA, see python/tests.py. Now working on implementing raytracer - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit 472d81e85b29f2ad92df2031ff56a9ac2cdf8699
 (DIR) parent c494a43f245cf27c2c498ce27636347fbbb1a909
 (HTM) Author: Anders Damsgaard <adc@geo.au.dk>
       Date:   Fri,  2 Nov 2012 09:29:51 +0100
       
       IO now works in Python, C++ and CUDA, see python/tests.py. Now working on implementing raytracer
       
       Diffstat:
         M python/sphere.py                    |      77 +++++++++++++++----------------
         M python/tests.py                     |      20 +++++++++++---------
         M src/Makefile                        |       4 ++--
         M src/contactsearch.cuh               |      29 +++++++++++++++++++----------
         M src/datatypes.h                     |       1 -
         M src/device.cu                       |     346 ++++++++++++++++---------------
         M src/file_io.cpp                     |       1 -
         M src/integration.cuh                 |      18 ++++++++++--------
         M src/sphere.cpp                      |      22 ++++++++++++----------
         M src/sphere.h                        |      51 +++++++++++++++++++++++++------
         M src/utility.cu                      |       4 ++--
       
       11 files changed, 315 insertions(+), 258 deletions(-)
       ---
 (DIR) diff --git a/python/sphere.py b/python/sphere.py
       t@@ -51,17 +51,21 @@ class Spherebin:
            self.ev         = numpy.zeros(self.np, dtype=numpy.float64)
            self.p         = numpy.zeros(self.np, dtype=numpy.float64)
        
       -    self.g            = numpy.array([0.0, 0.0, 0.0], dtype=numpy.float64)
       -    self.k_n     = numpy.ones(1, dtype=numpy.float64) * 1.16e9
       -    self.k_t     = numpy.ones(1, dtype=numpy.float64) * 1.16e9
       -    self.k_r         = numpy.zeros(1, dtype=numpy.float64)
       -    self.gamma_n = numpy.zeros(1, dtype=numpy.float64)
       -    self.gamma_t = numpy.zeros(1, dtype=numpy.float64)
       -    self.gamma_r = numpy.zeros(1, dtype=numpy.float64)
       -    self.mu_s    = numpy.ones(1, dtype=numpy.float64)
       -    self.mu_d    = numpy.ones(1, dtype=numpy.float64)
       -    self.mu_r    = numpy.zeros(1, dtype=numpy.float64)
       -    self.rho     = numpy.ones(1, dtype=numpy.float64) * 2600.0
       +    self.g        = numpy.array([0.0, 0.0, 0.0], dtype=numpy.float64)
       +    self.k_n      = numpy.ones(1, dtype=numpy.float64) * 1.16e9
       +    self.k_t      = numpy.ones(1, dtype=numpy.float64) * 1.16e9
       +    self.k_r          = numpy.zeros(1, dtype=numpy.float64)
       +    self.gamma_n  = numpy.zeros(1, dtype=numpy.float64)
       +    self.gamma_t  = numpy.zeros(1, dtype=numpy.float64)
       +    self.gamma_r  = numpy.zeros(1, dtype=numpy.float64)
       +    self.mu_s     = numpy.ones(1, dtype=numpy.float64)
       +    self.mu_d     = numpy.ones(1, dtype=numpy.float64)
       +    self.mu_r     = numpy.zeros(1, dtype=numpy.float64)
       +    self.gamma_wn = numpy.ones(1, dtype=numpy.float64) * 1.0e3
       +    self.gamma_wt = numpy.ones(1, dtype=numpy.float64) * 1.0e3
       +    self.mu_ws    = numpy.ones(1, dtype=numpy.float64)
       +    self.mu_wd    = numpy.ones(1, dtype=numpy.float64)
       +    self.rho      = numpy.ones(1, dtype=numpy.float64) * 2600.0
            self.contactmodel   = numpy.ones(1, dtype=numpy.uint32) * 2    # contactLinear default
            self.kappa        = numpy.zeros(1, dtype=numpy.float64)
            self.db           = numpy.zeros(1, dtype=numpy.float64)
       t@@ -70,19 +74,16 @@ class Spherebin:
            # Wall data
            self.nw          = numpy.ones(1, dtype=numpy.uint32) * nw
            self.wmode   = numpy.zeros(self.nw, dtype=numpy.int32)
       +
            self.w_n     = numpy.zeros(self.nw*self.nd, dtype=numpy.float64).reshape(self.nw,self.nd)
       -    self.w_n[nw-1,nd-1] = -1.0
       +    if (self.nw > 0):
       +      self.w_n[0,2] = -1.0
            self.w_x     = numpy.ones(self.nw, dtype=numpy.float64)
            self.w_m     = numpy.zeros(self.nw, dtype=numpy.float64)
            self.w_vel   = numpy.zeros(self.nw, dtype=numpy.float64)
            self.w_force = numpy.zeros(self.nw, dtype=numpy.float64)
            self.w_devs  = numpy.zeros(self.nw, dtype=numpy.float64)
            
       -    # x- and y-boundary behavior
       -    self.gamma_wn = numpy.ones(1, dtype=numpy.float64) * 1.0e3
       -    self.gamma_wt = numpy.ones(1, dtype=numpy.float64) * 1.0e3
       -    self.gamma_wr = numpy.ones(1, dtype=numpy.float64) * 1.0e3
       -
          # Compare the values of two Spherebin objects, and check
          # whether the values are identical
          def __cmp__(self, other):
       t@@ -136,8 +137,7 @@ class Spherebin:
                (self.w_force == other.w_force).all() and\
                (self.w_devs == other.w_devs).all() and\
                self.gamma_wn == other.gamma_wn and\
       -        self.gamma_wt == other.gamma_wt and\
       -        self.gamma_wr == other.gamma_wr\
       +        self.gamma_wt == other.gamma_wt\
                ).all() == True):
              return 0 # All equal
            else:
       t@@ -224,6 +224,10 @@ class Spherebin:
              self.mu_s         = numpy.fromfile(fh, dtype=numpy.float64, count=1) 
              self.mu_d         = numpy.fromfile(fh, dtype=numpy.float64, count=1) 
              self.mu_r         = numpy.fromfile(fh, dtype=numpy.float64, count=1)
       +      self.gamma_wn     = numpy.fromfile(fh, dtype=numpy.float64, count=1)
       +      self.gamma_wt     = numpy.fromfile(fh, dtype=numpy.float64, count=1)
       +      self.mu_ws        = numpy.fromfile(fh, dtype=numpy.float64, count=1) 
       +      self.mu_wd        = numpy.fromfile(fh, dtype=numpy.float64, count=1) 
              self.rho          = numpy.fromfile(fh, dtype=numpy.float64, count=1)
              self.contactmodel = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
              self.kappa        = numpy.fromfile(fh, dtype=numpy.float64, count=1)
       t@@ -250,10 +254,6 @@ class Spherebin:
                self.w_force = numpy.fromfile(fh, dtype=numpy.float64, count=1)
                self.w_devs  = numpy.fromfile(fh, dtype=numpy.float64, count=1)
            
       -      # x- and y-boundary behavior
       -      self.gamma_wn = numpy.fromfile(fh, dtype=numpy.float64, count=1)
       -      self.gamma_wt = numpy.fromfile(fh, dtype=numpy.float64, count=1)
       -      self.gamma_wr = numpy.fromfile(fh, dtype=numpy.float64, count=1)
        
              fh.close()
              
       t@@ -325,6 +325,10 @@ class Spherebin:
              fh.write(self.mu_s.astype(numpy.float64))
              fh.write(self.mu_d.astype(numpy.float64))
              fh.write(self.mu_r.astype(numpy.float64))
       +      fh.write(self.gamma_wn.astype(numpy.float64))
       +      fh.write(self.gamma_wt.astype(numpy.float64))
       +      fh.write(self.mu_ws.astype(numpy.float64))
       +      fh.write(self.mu_wd.astype(numpy.float64))
              fh.write(self.rho.astype(numpy.float64))
              fh.write(self.contactmodel.astype(numpy.uint32))
              fh.write(self.kappa.astype(numpy.float64))
       t@@ -343,11 +347,6 @@ class Spherebin:
                fh.write(self.w_force[i].astype(numpy.float64))
                fh.write(self.w_devs[i].astype(numpy.float64))
            
       -      # x- and y-boundary behavior
       -      fh.write(self.gamma_wn.astype(numpy.float64))
       -      fh.write(self.gamma_wt.astype(numpy.float64))
       -      fh.write(self.gamma_wr.astype(numpy.float64))
       -
              fh.close()
              
            finally:
       t@@ -633,14 +632,15 @@ class Spherebin:
            self.L = self.num * cellsize
        
            # Initialize upper wall
       -    self.wmode[0] = 0
       -    self.w_n[0,2] = -1.0
       -    self.w_x[0] = self.L[2]
       -    self.w_m[0] = self.rho[0] * self.np * math.pi * r_max**3
       -    self.w_vel[0] = 0.0
       -    self.w_force[0] = 0.0
       -    self.w_devs[0] = 0.0
       -    self.nw = numpy.ones(1, dtype=numpy.uint32) * 1
       +    if (self.nw > 0):
       +      self.wmode[0] = 0
       +      self.w_n[0,2] = -1.0
       +      self.w_x[0] = self.L[2]
       +      self.w_m[0] = self.rho[0] * self.np * math.pi * r_max**3
       +      self.w_vel[0] = 0.0
       +      self.w_force[0] = 0.0
       +      self.w_devs[0] = 0.0
       +      self.nw = numpy.ones(1, dtype=numpy.uint32) * 1
        
          # Adjust grid and upper wall for consolidation under deviatoric stress
          def consolidate(self, deviatoric_stress = 10e3, 
       t@@ -755,8 +755,6 @@ class Spherebin:
            # Set wall viscosities to zero
            self.gamma_wn[0] = 0.0
            self.gamma_wt[0] = 0.0
       -    self.gamma_wr[0] = 0.0
       -
        
         
          def initTemporal(self, total,
       t@@ -792,7 +790,6 @@ class Spherebin:
                                  gamma_r = 0,
                                  gamma_wn = 1e3,
                                  gamma_wt = 1e3,
       -                          gamma_wr = 2e3,
                                  capillaryCohesion = 0):
            """ Initialize particle parameters to default values.
                Radii must be set prior to calling this function.
       t@@ -837,8 +834,6 @@ class Spherebin:
            # Wall viscosities
            self.gamma_wn[0] = gamma_wn # normal
            self.gamma_wt[0] = gamma_wt # sliding
       -    self.gamma_wr[0] = gamma_wr # rolling
       -
        
            ### Parameters related to capillary bonds
        
 (DIR) diff --git a/python/tests.py b/python/tests.py
       t@@ -12,17 +12,13 @@ def compare(first, second, string):
        print("### Input/output tests ###")
        
        # Generate data in python
       -orig = Spherebin(100)
       +orig = Spherebin(np = 100, nw = 0)
        orig.generateRadii()
        orig.defaultParams()
       -orig.initRandomGridPos()
       -orig.initTemporal(total = 0.0, current = 0.0)
       -orig.xysum = numpy.ones(orig.np*2, dtype=numpy.float64).reshape(orig.np, 2) * 1
       -orig.vel = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 2
       -orig.force = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 3
       -orig.angpos = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 4
       -orig.angvel = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 5
       -orig.torque = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 6
       +orig.initRandomGridPos(g = numpy.zeros(orig.nd))
       +orig.initTemporal(current = 0.0, total = 0.0)
       +orig.time_total = 2.0*orig.time_dt;
       +orig.time_file_dt = orig.time_dt;
        orig.writebin("orig.bin", verbose=False)
        
        # Test Python IO routines
       t@@ -36,4 +32,10 @@ cpp = Spherebin()
        cpp.readbin("../output/orig.output0.bin", verbose=False)
        compare(orig, cpp, "C++ IO:   ")
        
       +# Test CUDA IO routines
       +cuda = Spherebin()
       +cuda.readbin("../output/orig.output1.bin", verbose=False)
       +cuda.time_current = orig.time_current
       +cuda.time_step_count = orig.time_step_count
       +compare(orig, cuda, "CUDA IO:  ")
        
 (DIR) diff --git a/src/Makefile b/src/Makefile
       t@@ -27,8 +27,8 @@ NVCCFLAGS=--use_fast_math -O3 -m64 -gencode=arch=compute_20,code=\"sm_20,compute
        
        # Debugable code? Beware that enabling this option will 
        # considerably slow down the execution.
       -CCFLAGS=-g -O0 -Wall
       -NVCCFLAGS=-g -G -O0 -m64 -gencode=arch=compute_20,code=\"sm_20,compute_20\" -Xcompiler "-O0 -g"
       +#CCFLAGS=-g -O0 -Wall
       +#NVCCFLAGS=-g -G -O0 -m64 -gencode=arch=compute_20,code=\"sm_20,compute_20\" -Xcompiler "-O0 -g"
        
        DATE=`date +'%Y.%m.%d-%H:%M:%S'`
                BACKUPNAME=sphere.$(DATE).tar.gz
 (DIR) diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
       t@@ -91,8 +91,8 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
                                                     Float4* dev_angvel_sorted,
                                                     unsigned int* dev_cellStart, 
                                                     unsigned int* dev_cellEnd,
       -                                             Float4* dev_w_nx, 
       -                                             Float4* dev_w_mvfd)
       +                                             Float4* dev_walls_nx, 
       +                                             Float4* dev_walls_mvfd)
        //uint4 bonds)
        {
        
       t@@ -382,9 +382,9 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
                                 Float* dev_es, 
                                 Float* dev_ev, 
                                 Float* dev_p,
       -                         Float4* dev_w_nx, 
       -                         Float4* dev_w_mvfd, 
       -                         Float* dev_w_force, //uint4* dev_bonds_sorted,
       +                         Float4* dev_walls_nx, 
       +                         Float4* dev_walls_mvfd, 
       +                         Float* dev_walls_force_pp, //uint4* dev_bonds_sorted,
                                 unsigned int* dev_contacts, 
                                 Float4* dev_distmod,
                                 Float4* dev_delta_t)
       t@@ -399,9 +399,6 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
            Float4 x_a      = dev_x_sorted[idx_a];
            Float  radius_a = x_a.w;
        
       -    // Fetch wall data in global read
       -    Float4 w_up_nx   = dev_w_nx[0];
       -    Float4 w_up_mvfd = dev_w_mvfd[0];
        
            // Fetch world dimensions in constant memory read
            Float3 origo = MAKE_FLOAT3(devC_grid.origo[0], 
       t@@ -411,6 +408,17 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
                                   devC_grid.L[1], 
                                   devC_grid.L[2]);
        
       +    // Fetch wall data in global read
       +    Float4 w_up_nx;
       +    Float4 w_up_mvfd;
       +    if (devC_nw > 0) {
       +      w_up_nx   = dev_walls_nx[0];
       +      w_up_mvfd = dev_walls_mvfd[0];
       +    } else {
       +      w_up_nx   = MAKE_FLOAT4(0.0f, 0.0f, -1.0f, L.z);
       +      w_up_mvfd = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
       +    }
       +
            // Index of particle which is bonded to particle A.
            // The index is equal to the particle no (p.np)
            // if particle A is bond-less.
       t@@ -526,7 +534,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
                                                 dev_x_sorted,
                                                 dev_vel_sorted, dev_angvel_sorted,
                                                 dev_cellStart, dev_cellEnd,
       -                                         dev_w_nx, dev_w_mvfd);
       +                                         dev_walls_nx, dev_walls_mvfd);
                  }
                }
              }
       t@@ -629,7 +637,8 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
            dev_es[orig_idx]     += es_dot * devC_dt;
            dev_ev[orig_idx]     += ev_dot * devC_dt;
            dev_p[orig_idx]       = p;
       -    dev_w_force[orig_idx] = w_force;
       +    if (devC_nw > 0)
       +      dev_walls_force_pp[orig_idx] = w_force;
          }
        } // End of interact(...)
        
 (DIR) diff --git a/src/datatypes.h b/src/datatypes.h
       t@@ -95,7 +95,6 @@ struct Walls {
          int wmode[MAXWALLS];        // Wall modes
          Float4* nx;                // Wall normal and position
          Float4* mvfd;                // Wall mass, velocity, force and dev. stress
       -  Float* force;                // Resulting forces on walls per particle
        };
        
        #endif
 (DIR) diff --git a/src/device.cu b/src/device.cu
       t@@ -220,38 +220,44 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
          if (verbose == 1)
            std::cout << "  Allocating global device memory:                ";
        
       -  // Particle arrays
       -  cudaMalloc((void**)&dev_k->x, memSizeF4);
       -  cudaMalloc((void**)&dev_sort->x_sorted, memSizeF4);
       -  cudaMalloc((void**)&dev_k->vel, memSizeF4);
       -  cudaMalloc((void**)&dev_sort->vel_sorted, memSizeF4);
       -  cudaMalloc((void**)&dev_k->angvel, memSizeF4);
       -  cudaMalloc((void**)&dev_sort->angvel_sorted, memSizeF4);
       -  cudaMalloc((void**)&dev_k->acc, memSizeF4);
          k.acc = new Float4[np];
       -  cudaMalloc((void**)&dev_k->angacc, memSizeF4);
          k.angacc = new Float4[np];
       -  cudaMalloc((void**)&dev_k->force, memSizeF4);
       -  cudaMalloc((void**)&dev_k->torque, memSizeF4);
       -  cudaMalloc((void**)&dev_k->angpos, memSizeF4);
       -  cudaMalloc((void**)&dev_e->es_dot, memSizeF);
       -  cudaMalloc((void**)&dev_e->ev_dot, memSizeF);
       -  cudaMalloc((void**)&dev_e->es, memSizeF);
       -  cudaMalloc((void**)&dev_e->ev, memSizeF);
       -  cudaMalloc((void**)&dev_e->p, memSizeF);
        
       -  // Cell-related arrays
       -  cudaMalloc((void**)&dev_sort->gridParticleCellID, sizeof(unsigned int)*np);
       -  cudaMalloc((void**)&dev_sort->gridParticleIndex, sizeof(unsigned int)*np);
       -  cudaMalloc((void**)&dev_sort->cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       -  cudaMalloc((void**)&dev_sort->cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       +  // Kinematics arrays
       +  cudaMalloc((void**)&dev_x, memSizeF4);
       +  cudaMalloc((void**)&dev_xysum, memSizeF4);
       +  cudaMalloc((void**)&dev_vel, memSizeF4);
       +  cudaMalloc((void**)&dev_acc, memSizeF4);
       +  cudaMalloc((void**)&dev_force, memSizeF4);
       +  cudaMalloc((void**)&dev_angpos, memSizeF4);
       +  cudaMalloc((void**)&dev_angvel, memSizeF4);
       +  cudaMalloc((void**)&dev_angacc, memSizeF4);
       +  cudaMalloc((void**)&dev_torque, memSizeF4);
        
          // Particle contact bookkeeping arrays
       -  cudaMalloc((void**)&dev_k->contacts, sizeof(unsigned int)*np*NC); // Max NC contacts per particle
       -  cudaMalloc((void**)&dev_k->distmod, sizeof(Float4)*np*NC);
       -  cudaMalloc((void**)&dev_k->delta_t, sizeof(Float4)*np*NC);
       +  cudaMalloc((void**)&dev_contacts, sizeof(unsigned int)*np*NC); // Max NC contacts per particle
       +  cudaMalloc((void**)&dev_distmod, memSizeF4*NC);
       +  cudaMalloc((void**)&dev_delta_t, memSizeF4*NC);
       +
       +  // Sorted arrays
       +  cudaMalloc((void**)&dev_x_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
       +
       +  // Energy arrays
       +  cudaMalloc((void**)&dev_es_dot, memSizeF);
       +  cudaMalloc((void**)&dev_ev_dot, memSizeF);
       +  cudaMalloc((void**)&dev_es, memSizeF);
       +  cudaMalloc((void**)&dev_ev, memSizeF);
       +  cudaMalloc((void**)&dev_p, memSizeF);
       +
       +  // Cell-related arrays
       +  cudaMalloc((void**)&dev_gridParticleCellID, sizeof(unsigned int)*np);
       +  cudaMalloc((void**)&dev_gridParticleIndex, sizeof(unsigned int)*np);
       +  cudaMalloc((void**)&dev_cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
       +  cudaMalloc((void**)&dev_cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
        
       -  // Host contact bookkeeping arrays
       +    // Host contact bookkeeping arrays
          k.contacts = new unsigned int[np*NC];
          // Initialize contacts lists to np
          for (unsigned int i=0; i<(np*NC); ++i)
       t@@ -260,10 +266,11 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
          k.delta_t = new Float4[np*NC];
        
          // Wall arrays
       -  cudaMalloc((void**)&dev_walls->nx, sizeof(Float4)*walls.nw);
       -  cudaMalloc((void**)&dev_walls->mvfd, sizeof(Float4)*walls.nw);
       -  cudaMalloc((void**)&dev_walls->force, sizeof(Float)*walls.nw*np);
       -  // dev_w_force_partial allocated later
       +  cudaMalloc((void**)&dev_walls_wmode, sizeof(int)*walls.nw);
       +  cudaMalloc((void**)&dev_walls_nx, sizeof(Float4)*walls.nw);
       +  cudaMalloc((void**)&dev_walls_mvfd, sizeof(Float4)*walls.nw);
       +  cudaMalloc((void**)&dev_walls_force_pp, sizeof(Float)*walls.nw*np);
       +  // dev_walls_force_partial allocated later
        
          checkForCudaErrors("End of allocateGlobalDeviceMemory");
          if (verbose == 1)
       t@@ -275,36 +282,40 @@ __host__ void DEM::freeGlobalDeviceMemory()
          if (verbose == 1)
            printf("\nLiberating device memory:                        ");
          // Particle arrays
       -  cudaFree(dev_k->x);
       -  cudaFree(dev_sort->x_sorted);
       -  cudaFree(dev_k->vel);
       -  cudaFree(dev_sort->vel_sorted);
       -  cudaFree(dev_k->angvel);
       -  cudaFree(dev_sort->angvel_sorted);
       -  cudaFree(dev_k->acc);
       -  cudaFree(dev_k->angacc);
       -  cudaFree(dev_k->force);
       -  cudaFree(dev_k->torque);
       -  cudaFree(dev_k->angpos);
       -  cudaFree(dev_e->es_dot);
       -  cudaFree(dev_e->ev_dot);
       -  cudaFree(dev_e->es);
       -  cudaFree(dev_e->ev);
       -  cudaFree(dev_e->p);
       -  cudaFree(dev_k->contacts);
       -  cudaFree(dev_k->distmod);
       -  cudaFree(dev_k->delta_t);
       +  cudaFree(dev_x);
       +  cudaFree(dev_xysum);
       +  cudaFree(dev_vel);
       +  cudaFree(dev_acc);
       +  cudaFree(dev_force);
       +  cudaFree(dev_angpos);
       +  cudaFree(dev_angvel);
       +  cudaFree(dev_angacc);
       +  cudaFree(dev_torque);
       +
       +  cudaFree(dev_contacts);
       +  cudaFree(dev_distmod);
       +  cudaFree(dev_delta_t);
       +
       +  cudaFree(dev_es_dot);
       +  cudaFree(dev_es);
       +  cudaFree(dev_ev_dot);
       +  cudaFree(dev_ev);
       +  cudaFree(dev_p);
       +
       +  cudaFree(dev_x_sorted);
       +  cudaFree(dev_vel_sorted);
       +  cudaFree(dev_angvel_sorted);
        
          // Cell-related arrays
       -  cudaFree(dev_sort->gridParticleIndex);
       -  cudaFree(dev_sort->cellStart);
       -  cudaFree(dev_sort->cellEnd);
       +  cudaFree(dev_gridParticleIndex);
       +  cudaFree(dev_cellStart);
       +  cudaFree(dev_cellEnd);
        
          // Wall arrays
       -  cudaFree(dev_walls->nx);
       -  cudaFree(dev_walls->mvfd);
       -  cudaFree(dev_walls->force);
       -  //cudaFree(dev_w_force_partial);
       +  cudaFree(dev_walls_nx);
       +  cudaFree(dev_walls_mvfd);
       +  cudaFree(dev_walls_force_partial);
       +  cudaFree(dev_walls_force_pp);
        
          if (verbose == 1)
            printf("Done\n");
       t@@ -321,55 +332,53 @@ __host__ void DEM::transferToGlobalDeviceMemory()
          unsigned int memSizeF4 = sizeof(Float4) * np;
        
          // Copy static-size structure data from host to global device memory
       -  cudaMemcpy(dev_time, &time, sizeof(Time), cudaMemcpyHostToDevice);
       +  //cudaMemcpy(dev_time, &time, sizeof(Time), cudaMemcpyHostToDevice);
        
          // Kinematic particle values
       -  cudaMemcpy( dev_k->x,               k.x,           
       +  cudaMemcpy( dev_x,               k.x,           
              memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->xysum,    k.xysum,
       +  cudaMemcpy( dev_xysum,    k.xysum,
              sizeof(Float2)*np, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->vel,      k.vel,
       +  cudaMemcpy( dev_vel,      k.vel,
              memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->acc,      k.acc, 
       +  cudaMemcpy( dev_acc,      k.acc, 
              memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->force,    k.force,
       +  cudaMemcpy( dev_force,    k.force,
              memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->angpos,   k.angpos,
       +  cudaMemcpy( dev_angpos,   k.angpos,
              memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->angvel,   k.angvel,
       +  cudaMemcpy( dev_angvel,   k.angvel,
              memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->angacc,   k.angacc,
       +  cudaMemcpy( dev_angacc,   k.angacc,
              memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->torque,   k.torque,
       +  cudaMemcpy( dev_torque,   k.torque,
              memSizeF4, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->contacts, k.contacts,
       +  cudaMemcpy( dev_contacts, k.contacts,
              sizeof(unsigned int)*np*NC, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->distmod, k.distmod,
       +  cudaMemcpy( dev_distmod, k.distmod,
              memSizeF4*NC, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_k->delta_t, k.delta_t,
       +  cudaMemcpy( dev_delta_t, k.delta_t,
              memSizeF4*NC, cudaMemcpyHostToDevice);
        
          // Individual particle energy values
       -  cudaMemcpy( dev_e->es_dot, e.es_dot,
       +  cudaMemcpy( dev_es_dot, e.es_dot,
              memSizeF, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_e->es,     e.es,
       +  cudaMemcpy( dev_es,     e.es,
              memSizeF, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_e->ev_dot, e.ev_dot,
       +  cudaMemcpy( dev_ev_dot, e.ev_dot,
              memSizeF, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_e->ev,     e.ev,
       +  cudaMemcpy( dev_ev,     e.ev,
              memSizeF, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_e->p, e.p,
       +  cudaMemcpy( dev_p, e.p,
              memSizeF, cudaMemcpyHostToDevice);
        
          // Wall parameters
       -  cudaMemcpy( dev_walls->wmode, walls.wmode,
       +  cudaMemcpy( dev_walls_wmode, walls.wmode,
              sizeof(int)*walls.nw, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_walls->nx,    walls.nx,
       +  cudaMemcpy( dev_walls_nx,    walls.nx,
              sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_walls->mvfd,  walls.mvfd,
       +  cudaMemcpy( dev_walls_mvfd,  walls.mvfd,
              sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
       -  cudaMemcpy( dev_walls->force, walls.force,
       -      memSizeF*walls.nw, cudaMemcpyHostToDevice);
        
          checkForCudaErrors("End of transferToGlobalDeviceMemory");
          if (verbose == 1)
       t@@ -385,55 +394,53 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
          unsigned int memSizeF4 = sizeof(Float4) * np;
        
          // Copy static-size structure data from host to global device memory
       -  cudaMemcpy(&time, dev_time, sizeof(Time), cudaMemcpyDeviceToHost);
       +  //cudaMemcpy(&time, dev_time, sizeof(Time), cudaMemcpyDeviceToHost);
        
          // Kinematic particle values
       -  cudaMemcpy( k.x, dev_k->x,
       +  cudaMemcpy( k.x, dev_x,
              memSizeF4, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.xysum, dev_k->xysum,
       +  cudaMemcpy( k.xysum, dev_xysum,
              sizeof(Float2)*np, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.vel, dev_k->vel,
       +  cudaMemcpy( k.vel, dev_vel,
              memSizeF4, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.acc, dev_k->acc,
       +  cudaMemcpy( k.acc, dev_acc,
              memSizeF4, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.force, dev_k->force,
       +  cudaMemcpy( k.force, dev_force,
              memSizeF4, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.angpos, dev_k->angpos,
       +  cudaMemcpy( k.angpos, dev_angpos,
              memSizeF4, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.angvel, dev_k->angvel,
       +  cudaMemcpy( k.angvel, dev_angvel,
              memSizeF4, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.angacc, dev_k->angacc,
       +  cudaMemcpy( k.angacc, dev_angacc,
              memSizeF4, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.torque, dev_k->torque,
       +  cudaMemcpy( k.torque, dev_torque,
              memSizeF4, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.contacts, dev_k->contacts,
       +  cudaMemcpy( k.contacts, dev_contacts,
              sizeof(unsigned int)*np*NC, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.distmod, dev_k->distmod,
       +  cudaMemcpy( k.distmod, dev_distmod,
              memSizeF4*NC, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( k.delta_t, dev_k->delta_t,
       +  cudaMemcpy( k.delta_t, dev_delta_t,
              memSizeF4*NC, cudaMemcpyDeviceToHost);
        
          // Individual particle energy values
       -  cudaMemcpy( e.es_dot, dev_e->es_dot,
       +  cudaMemcpy( e.es_dot, dev_es_dot,
              memSizeF, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( e.es, dev_e->es,
       +  cudaMemcpy( e.es, dev_es,
              memSizeF, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( e.ev_dot, dev_e->ev_dot,
       +  cudaMemcpy( e.ev_dot, dev_ev_dot,
              memSizeF, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( e.ev, dev_e->ev,
       +  cudaMemcpy( e.ev, dev_ev,
              memSizeF, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( e.p, dev_e->p,
       +  cudaMemcpy( e.p, dev_p,
              memSizeF, cudaMemcpyDeviceToHost);
        
          // Wall parameters
       -  cudaMemcpy( walls.wmode, dev_walls->wmode,
       +  cudaMemcpy( walls.wmode, dev_walls_wmode,
              sizeof(int)*walls.nw, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( walls.nx, dev_walls->nx,
       +  cudaMemcpy( walls.nx, dev_walls_nx,
              sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( walls.mvfd, dev_walls->mvfd,
       +  cudaMemcpy( walls.mvfd, dev_walls_mvfd,
              sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
       -  cudaMemcpy( walls.force, dev_walls->force,
       -      memSizeF*walls.nw, cudaMemcpyDeviceToHost);
        
          checkForCudaErrors("End of transferFromGlobalDeviceMemory");
        }
       t@@ -476,8 +483,8 @@ __host__ void DEM::startTime()
          // Shared memory per block
          unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
        
       -  Float* dev_w_force_partial;
       -  cudaMalloc((void**)&dev_w_force_partial, sizeof(Float)*dimGrid.x);
       +  // Pre-sum of force per wall
       +  cudaMalloc((void**)&dev_walls_force_partial, sizeof(Float)*dimGrid.x);
        
          // Report to stdout
          if (verbose == 1) {
       t@@ -562,9 +569,9 @@ __host__ void DEM::startTime()
            // in the fine, uniform and homogenous grid.
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort->gridParticleCellID, 
       -                                              dev_sort->gridParticleIndex, 
       -                                              dev_k->x);
       +    calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID, 
       +                                              dev_gridParticleIndex, 
       +                                              dev_x);
        
            // Synchronization point
            cudaThreadSynchronize();
       t@@ -576,9 +583,9 @@ __host__ void DEM::startTime()
            // Sort particle (key, particle ID) pairs by hash key with Thrust radix sort
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort->gridParticleCellID),
       -                        thrust::device_ptr<uint>(dev_sort->gridParticleCellID + np),
       -                        thrust::device_ptr<uint>(dev_sort->gridParticleIndex));
       +    thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID),
       +                        thrust::device_ptr<uint>(dev_gridParticleCellID + np),
       +                        thrust::device_ptr<uint>(dev_gridParticleIndex));
            cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implicitly?
            if (PROFILING == 1)
              stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
       t@@ -588,7 +595,7 @@ __host__ void DEM::startTime()
            // Zero cell array values by setting cellStart to its highest possible value,
            // specified with pointer value 0xffffffff, which for a 32 bit unsigned int
            // is 4294967295.
       -    cudaMemset(dev_sort->cellStart, 0xffffffff, 
       +    cudaMemset(dev_cellStart, 0xffffffff, 
                       grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
            cudaThreadSynchronize();
            checkForCudaErrors("Post cudaMemset");
       t@@ -597,15 +604,15 @@ __host__ void DEM::startTime()
            // coherent memory access. Save ordered configurations in new arrays (*_sorted).
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_sort->cellStart, 
       -                                                   dev_sort->cellEnd,
       -                                                   dev_sort->gridParticleCellID, 
       -                                                   dev_sort->gridParticleIndex,
       -                                                   dev_k->x, dev_k->vel, 
       -                                                   dev_k->angvel,
       -                                                   dev_sort->x_sorted, 
       -                                                   dev_sort->vel_sorted, 
       -                                                   dev_sort->angvel_sorted);
       +    reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart, 
       +                                                   dev_cellEnd,
       +                                                   dev_gridParticleCellID, 
       +                                                   dev_gridParticleIndex,
       +                                                   dev_x, dev_vel, 
       +                                                   dev_angvel,
       +                                                   dev_x_sorted, 
       +                                                   dev_vel_sorted, 
       +                                                   dev_angvel_sorted);
        
            // Synchronization point
            cudaThreadSynchronize();
       t@@ -620,12 +627,12 @@ __host__ void DEM::startTime()
              // For each particle: Search contacts in neighbor cells
              if (PROFILING == 1)
                startTimer(&kernel_tic);
       -      topology<<<dimGrid, dimBlock>>>(dev_sort->cellStart, 
       -                                      dev_sort->cellEnd,
       -                                      dev_sort->gridParticleIndex,
       -                                      dev_sort->x_sorted, 
       -                                      dev_k->contacts,
       -                                      dev_k->distmod);
       +      topology<<<dimGrid, dimBlock>>>(dev_cellStart, 
       +                                      dev_cellEnd,
       +                                      dev_gridParticleIndex,
       +                                      dev_x_sorted, 
       +                                      dev_contacts,
       +                                      dev_distmod);
        
        
              // Synchronization point
       t@@ -639,28 +646,28 @@ __host__ void DEM::startTime()
            // For each particle: Process collisions and compute resulting forces.
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    interact<<<dimGrid, dimBlock>>>(dev_sort->gridParticleIndex,
       -                                    dev_sort->cellStart,
       -                                    dev_sort->cellEnd,
       -                                    dev_k->x,
       -                                    dev_sort->x_sorted,
       -                                    dev_sort->vel_sorted,
       -                                    dev_sort->angvel_sorted,
       -                                    dev_k->vel,
       -                                    dev_k->angvel,
       -                                    dev_k->force, 
       -                                    dev_k->torque, 
       -                                    dev_e->es_dot,
       -                                    dev_e->ev_dot, 
       -                                    dev_e->es,
       -                                    dev_e->ev,
       -                                    dev_e->p,
       -                                    dev_walls->nx,
       -                                    dev_walls->mvfd,
       -                                    dev_walls->force,
       -                                    dev_k->contacts,
       -                                    dev_k->distmod,
       -                                    dev_k->delta_t);
       +    interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
       +                                    dev_cellStart,
       +                                    dev_cellEnd,
       +                                    dev_x,
       +                                    dev_x_sorted,
       +                                    dev_vel_sorted,
       +                                    dev_angvel_sorted,
       +                                    dev_vel,
       +                                    dev_angvel,
       +                                    dev_force, 
       +                                    dev_torque, 
       +                                    dev_es_dot,
       +                                    dev_ev_dot, 
       +                                    dev_es,
       +                                    dev_ev,
       +                                    dev_p,
       +                                    dev_walls_nx,
       +                                    dev_walls_mvfd,
       +                                    dev_walls_force_pp,
       +                                    dev_contacts,
       +                                    dev_distmod,
       +                                    dev_delta_t);
        
        
            // Synchronization point
       t@@ -672,17 +679,20 @@ __host__ void DEM::startTime()
            // Update particle kinematics
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    integrate<<<dimGrid, dimBlock>>>(dev_sort->x_sorted, 
       -                                     dev_sort->vel_sorted, 
       -                                     dev_sort->angvel_sorted,
       -                                     dev_k->x, 
       -                                     dev_k->vel, 
       -                                     dev_k->angvel,
       -                                     dev_k->force,
       -                                     dev_k->torque, 
       -                                     dev_k->angpos,
       -                                     dev_k->xysum,
       -                                     dev_sort->gridParticleIndex);
       +    integrate<<<dimGrid, dimBlock>>>(dev_x_sorted, 
       +                                     dev_vel_sorted, 
       +                                     dev_angvel_sorted,
       +                                     dev_x, 
       +                                     dev_vel, 
       +                                     dev_angvel,
       +                                     dev_force,
       +                                     dev_torque, 
       +                                     dev_angpos,
       +                                     dev_xysum,
       +                                     dev_gridParticleIndex);
       +    cudaThreadSynchronize();
       +    checkForCudaErrors("Post integrate");
       + 
        
            cudaThreadSynchronize();
            if (PROFILING == 1)
       t@@ -691,20 +701,26 @@ __host__ void DEM::startTime()
            // Summation of forces on wall
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    summation<<<dimGrid, dimBlock>>>(dev_walls->force, dev_w_force_partial);
       -
       +    if (walls.nw > 0) {
       +      summation<<<dimGrid, dimBlock>>>(dev_walls_force_pp,
       +                                       dev_walls_force_partial);
       +    }
            // Synchronization point
            cudaThreadSynchronize();
            if (PROFILING == 1)
              stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_summation);
       -    checkForCudaErrors("Post integrate & wall force summation");
       +    checkForCudaErrors("Post wall force summation");
        
            // Update wall kinematics
            if (PROFILING == 1)
              startTimer(&kernel_tic);
       -    integrateWalls<<< 1, walls.nw>>>(dev_walls,
       -                                     dev_w_force_partial,
       -                                     blocksPerGrid);
       +    if (walls.nw > 0) {
       +      integrateWalls<<< 1, walls.nw>>>(dev_walls_nx,
       +                                       dev_walls_mvfd,
       +                                       dev_walls_wmode,
       +                                       dev_walls_force_partial,
       +                                       blocksPerGrid);
       +    }
        
            // Synchronization point
            cudaThreadSynchronize();
 (DIR) diff --git a/src/file_io.cpp b/src/file_io.cpp
       t@@ -189,7 +189,6 @@ void DEM::readbin(const char *target)
          // Wall mass (x), velocity (y), force (z), and deviatoric stress (w)
          walls.nx    = new Float4[walls.nw];
          walls.mvfd  = new Float4[walls.nw];
       -  walls.force = new Float[walls.nw*np];
        
          ifs.read(as_bytes(walls.wmode), sizeof(walls.wmode));
          for (i = 0; i<walls.nw; ++i) {
 (DIR) diff --git a/src/integration.cuh b/src/integration.cuh
       t@@ -175,8 +175,10 @@ __global__ void summation(Float* in, Float *out)
        }
        
        // Update wall positions
       -__global__ void integrateWalls(Walls* dev_walls, 
       -                               Float* dev_w_force_partial,
       +__global__ void integrateWalls(Float4* dev_walls_nx,
       +                                   Float4* dev_walls_mvfd,
       +                               int* dev_walls_wmode,
       +                               Float* dev_walls_force_partial,
                                       unsigned int blocksPerGrid)
        {
          unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
       t@@ -185,9 +187,9 @@ __global__ void integrateWalls(Walls* dev_walls,
        
            // Copy data to temporary arrays to avoid any potential read-after-write, 
            // write-after-read, or write-after-write hazards. 
       -    Float4 w_nx   = dev_walls->nx[idx];
       -    Float4 w_mvfd = dev_walls->mvfd[idx];
       -    int wmode = dev_walls->wmode[idx];  // Wall BC, 0: fixed, 1: devs, 2: vel
       +    Float4 w_nx   = dev_walls_nx[idx];
       +    Float4 w_mvfd = dev_walls_mvfd[idx];
       +    int wmode = dev_walls_wmode[idx];  // Wall BC, 0: fixed, 1: devs, 2: vel
            Float acc;
        
            if (wmode == 0) // Wall fixed: do nothing
       t@@ -196,7 +198,7 @@ __global__ void integrateWalls(Walls* dev_walls,
            // Find the final sum of forces on wall
            w_mvfd.z = 0.0f;
            for (int i=0; i<blocksPerGrid; ++i) {
       -      w_mvfd.z += dev_w_force_partial[i];
       +      w_mvfd.z += dev_walls_force_partial[i];
            }
        
            Float dt = devC_dt;
       t@@ -222,8 +224,8 @@ __global__ void integrateWalls(Walls* dev_walls,
            w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;
        
            // Store data in global memory
       -    dev_walls->nx[idx]   = w_nx;
       -    dev_walls->mvfd[idx] = w_mvfd;
       +    dev_walls_nx[idx]   = w_nx;
       +    dev_walls_mvfd[idx] = w_mvfd;
          }
        } // End of integrateWalls(...)
        
 (DIR) diff --git a/src/sphere.cpp b/src/sphere.cpp
       t@@ -207,16 +207,18 @@ void DEM::reportValues()
          else
            cout << "  - 1st and 2nd dim. boundaries: Visco-frictional walls\n";
        
       -  cout << "  - Top BC: ";
       -  if (walls.wmode[0] == 0)
       -    cout << "Fixed\n";
       -  else if (walls.wmode[0] == 1)
       -    cout << "Deviatoric stress\n";
       -  else if (walls.wmode[0] == 2)
       -    cout << "Velocity\n";
       -  else {
       -    cerr << "Top BC not recognized!\n";
       -    exit(1);
       +  if (walls.nw > 0) {
       +    cout << "  - Top BC: ";
       +    if (walls.wmode[0] == 0)
       +      cout << "Fixed\n";
       +    else if (walls.wmode[0] == 1)
       +      cout << "Deviatoric stress\n";
       +    else if (walls.wmode[0] == 2)
       +      cout << "Velocity\n";
       +    else {
       +      cerr << "Top BC not recognized!\n";
       +      exit(1);
       +    }
          }
        
          cout << "  - Grid: ";
 (DIR) diff --git a/src/sphere.h b/src/sphere.h
       t@@ -25,13 +25,14 @@ class DEM {
            // Number of particles
            unsigned int np;
        
       +    // HOST STRUCTURES
            // Structure containing individual particle kinematics
            Kinematics k;        // host
       -    Kinematics *dev_k;        // device
       +    //Kinematics *dev_k;        // device
        
            // Structure containing energy values
            Energies e;                // host
       -    Energies *dev_e;        // device
       +    //Energies *dev_e;        // device
        
            // Structure of global parameters
            Params params;        // host
       t@@ -40,15 +41,46 @@ class DEM {
            Grid grid;                // host
        
            // Structure containing sorting arrays
       -    Sorting *dev_sort;        // device
       +    //Sorting *dev_sort;        // device
        
            // Structure of temporal parameters
            Time time;                // host
       -    Time *dev_time;        // device
       +    //Time *dev_time;        // device
        
            // Structure of wall parameters
            Walls walls;        // host
       -    Walls *dev_walls;        // device
       +    //Walls *dev_walls;        // device
       +
       +    // DEVICE ARRAYS
       +    Float4 *dev_x;
       +    Float2 *dev_xysum;
       +    Float4 *dev_vel;
       +    Float4 *dev_acc;
       +    Float4 *dev_force;
       +    Float4 *dev_angpos;
       +    Float4 *dev_angvel;
       +    Float4 *dev_angacc;
       +    Float4 *dev_torque;
       +    unsigned int *dev_contacts;
       +    Float4 *dev_distmod;
       +    Float4 *dev_delta_t;
       +    Float *dev_es_dot;
       +    Float *dev_es;
       +    Float *dev_ev_dot;
       +    Float *dev_ev;
       +    Float *dev_p;
       +    Float4 *dev_x_sorted;
       +    Float4 *dev_vel_sorted;
       +    Float4 *dev_angvel_sorted;
       +    unsigned int *dev_gridParticleCellID;
       +    unsigned int *dev_gridParticleIndex;
       +    unsigned int *dev_cellStart;
       +    unsigned int *dev_cellEnd;
       +    int *dev_walls_wmode;
       +    Float4 *dev_walls_nx; // normal, pos.
       +    Float4 *dev_walls_mvfd; // Mass, velocity, force, dev. stress
       +    Float *dev_walls_force_partial; // Pre-sum per wall
       +    Float *dev_walls_force_pp; // Force per particle per wall
        
            // GPU initialization, must be called before startTime()
            void initializeGPU(void);
       t@@ -101,10 +133,11 @@ class DEM {
            void startTime(void);
        
            // Render particles using raytracing
       -    // render(const char *target,
       -    //        Float3 lookat,
       -    //        Float3 eye,
       -    //        Float  focalLength);
       +    void render(const char *target,
       +        const Float3 lookat,
       +        const Float3 eye,
       +        const Float focalLength = 1.0,
       +        const int method = 1);
        
        };
        
 (DIR) diff --git a/src/utility.cu b/src/utility.cu
       t@@ -11,7 +11,7 @@ void checkForCudaErrors(const char* checkpoint_description)
          cudaError_t err = cudaGetLastError();
          if (err != cudaSuccess) {
            std::cerr << "\nCuda error detected, checkpoint: " << checkpoint_description
       -      << "\nError string: " << cudaGetErrorString(err) << "\n";
       +      << "\nError string: " << cudaGetErrorString(err) << std::endl;
            exit(EXIT_FAILURE);
          }
        }
       t@@ -22,7 +22,7 @@ void checkForCudaErrors(const char* checkpoint_description, const unsigned int i
          if (err != cudaSuccess) {
            std::cerr << "\nCuda error detected, checkpoint: " << checkpoint_description
              << "\nduring iteration " << iteration
       -      << "\nError string: " << cudaGetErrorString(err) << "\n";
       +      << "\nError string: " << cudaGetErrorString(err) << std::endl;
            exit(EXIT_FAILURE);
          }
        }