tadd const and __restrict__ keywords to facilitate constant memory cache usage - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit 21ed15ef593a831b2124a48bbc662b9a227f85ac
 (DIR) parent 7626d19392ecba63c4b2e5f4d419e8423d7a90a0
 (HTM) Author: Anders Damsgaard <anders.damsgaard@geo.au.dk>
       Date:   Mon, 11 Aug 2014 14:37:07 +0200
       
       add const and __restrict__ keywords to facilitate constant memory cache usage
       
       Diffstat:
         M src/cohesion.cuh                    |     143 ++++++++-----------------------
         M src/contactmodels.cuh               |     164 ++++++++++++++++++-------------
         M src/contactsearch.cuh               |     246 ++++++++++++++++---------------
         M src/integration.cuh                 |      70 ++++++++++++++++++-------------
         M src/navierstokes.cuh                |    1213 ++++++++++++++++---------------
         M src/raytracer.cuh                   |     173 +++++++++++++++----------------
         M tests/fluid_particle_interaction.py |       3 +--
       
       7 files changed, 996 insertions(+), 1016 deletions(-)
       ---
 (DIR) diff --git a/src/cohesion.cuh b/src/cohesion.cuh
       t@@ -6,14 +6,14 @@
        
        // Check bond pair list, apply linear contact model to pairs
        __global__ void bondsLinear(
       -        uint2*  dev_bonds,
       -        Float4* dev_bonds_delta, // Contact displacement
       -        Float4* dev_bonds_omega, // Contact rotational displacement
       -        Float4* dev_x,
       -        Float4* dev_vel,
       -        Float4* dev_angvel,
       -        Float4* dev_force,
       -        Float4* dev_torque)
       +    uint2* __restrict__ dev_bonds,
       +    Float4* __restrict__ dev_bonds_delta, // Contact displacement
       +    Float4* __restrict__ dev_bonds_omega, // Contact rotational displacement
       +    const Float4* __restrict__ dev_x,
       +    const Float4* __restrict__ dev_vel,
       +    const Float4* __restrict__ dev_angvel,
       +    Float4* __restrict__ dev_force,
       +    Float4* __restrict__ dev_torque)
        {
            // Find thread index
            unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
       t@@ -35,16 +35,16 @@ __global__ void bondsLinear(
                    // Convert tangential vectors to Float3's
                    // Uncorrected tangential component of displacement
                    Float3 delta0_t = MAKE_FLOAT3(
       -                    delta0_4.x,
       -                    delta0_4.y,
       -                    delta0_4.z);
       +                delta0_4.x,
       +                delta0_4.y,
       +                delta0_4.z);
                    const Float delta0_n = delta0_4.w;
        
                    // Uncorrected tangential component of rotation
                    Float3 omega0_t = MAKE_FLOAT3(
       -                    omega0_4.x,
       -                    omega0_4.y,
       -                    omega0_4.z);
       +                omega0_4.x,
       +                omega0_4.y,
       +                omega0_4.z);
                    const Float omega0_n = omega0_4.w;
        
                    // Read particle data
       t@@ -76,9 +76,9 @@ __global__ void bondsLinear(
        
                    // Inter-particle vector
                    const Float3 x = MAKE_FLOAT3(
       -                    x_i.x - x_j.x,
       -                    x_i.y - x_j.y,
       -                    x_i.z - x_j.z);
       +                x_i.x - x_j.x,
       +                x_i.y - x_j.y,
       +                x_i.z - x_j.z);
                    const Float x_length = length(x);
        
                    // Find overlap (negative value if overlapping)
       t@@ -96,13 +96,13 @@ __global__ void bondsLinear(
        
                    // Contact displacement, Luding 2008 eq. 10
                    const Float3 ddelta = (
       -                    MAKE_FLOAT3(
       -                        vel_i.x - vel_j.x,
       -                        vel_i.y - vel_j.y,
       -                        vel_i.z - vel_j.z) 
       -                    + (x_i.w + overlap/2.0) * cross(n, angvel_i)
       -                    + (x_j.w + overlap/2.0) * cross(n, angvel_j)
       -                    ) * devC_dt;
       +                MAKE_FLOAT3(
       +                    vel_i.x - vel_j.x,
       +                    vel_i.y - vel_j.y,
       +                    vel_i.z - vel_j.z) 
       +                + (x_i.w + overlap/2.0) * cross(n, angvel_i)
       +                + (x_j.w + overlap/2.0) * cross(n, angvel_j)
       +                ) * devC_dt;
        
                    // Normal component of the displacement increment
                    //const Float ddelta_n = dot(ddelta, n);
       t@@ -141,9 +141,9 @@ __global__ void bondsLinear(
        
                    // Contact rotational velocity
                    Float3 domega = MAKE_FLOAT3(
       -                    angvel_j.x - angvel_i.x,
       -                    angvel_j.y - angvel_i.y,
       -                    angvel_j.z - angvel_i.z) * devC_dt;
       +                angvel_j.x - angvel_i.x,
       +                angvel_j.y - angvel_i.y,
       +                angvel_j.z - angvel_i.z) * devC_dt;
                    /*const Float3 domega = MAKE_FLOAT3(
                      angvel_i.x - angvel_j.x,
                      angvel_i.y - angvel_j.y,
       t@@ -232,89 +232,16 @@ __global__ void bondsLinear(
            }
        }
        
       -// Linear-elastic bond: Attractive force with normal- and shear components
       -// acting upon particle A in a bonded particle pair
       -__device__ void bondLinear_old(Float3* N, Float3* T, Float* es_dot, Float* p,
       -        unsigned int idx_a, unsigned int idx_b, 
       -        Float4* dev_x_sorted, Float4* dev_vel_sorted, 
       -        Float4* dev_angvel_sorted,
       -        Float radius_a, Float radius_b, 
       -        Float3 x_ab, Float x_ab_length, 
       -        Float delta_ab) 
       -{
       -
       -    // If particles are not overlapping, apply bond force
       -    if (delta_ab > 0.0f) {
       -
       -        // Allocate variables and fetch missing time=t values for particle A and B
       -        Float4 vel_a     = dev_vel_sorted[idx_a];
       -        Float4 vel_b     = dev_vel_sorted[idx_b];
       -        Float4 angvel4_a = dev_angvel_sorted[idx_a];
       -        Float4 angvel4_b = dev_angvel_sorted[idx_b];
       -
       -        // Convert to Float3's
       -        Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
       -        Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
       -
       -        // Normal vector of contact
       -        Float3 n_ab = x_ab/x_ab_length;
       -
       -        // Relative contact interface velocity, w/o rolling
       -        Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x, 
       -                vel_a.y - vel_b.y, 
       -                vel_a.z - vel_b.z);
       -
       -        // Relative contact interface velocity of particle surfaces at
       -        // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
       -        Float3 vel_ab = vel_ab_linear
       -            + radius_a * cross(n_ab, angvel_a)
       -            + radius_b * cross(n_ab, angvel_b);
       -
       -        // Relative contact interface rolling velocity
       -        //Float3 angvel_ab = angvel_a - angvel_b;
       -        //Float  angvel_ab_length = length(angvel_ab);
       -
       -        // Normal component of the relative contact interface velocity
       -        //Float vel_n_ab = dot(vel_ab_linear, n_ab);
       -
       -        // Tangential component of the relative contact interface velocity
       -        // Hinrichsen and Wolf 2004, eq. 13.9
       -        Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
       -        //Float  vel_t_ab_length = length(vel_t_ab);
       -
       -        Float3 f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       -        Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
       -
       -        // Mean radius
       -        Float R_bar = (radius_a + radius_b)/2.0f;
       -
       -        // Normal force component: Elastic
       -        f_n = devC_params.k_n * delta_ab * n_ab;
       -
       -        if (length(vel_t_ab) > 0.f) {
       -            // Shear force component: Viscous
       -            f_t = -1.0f * devC_params.gamma_t * vel_t_ab;
       -
       -            // Shear friction production rate [W]
       -            //*es_dot += -dot(vel_t_ab, f_t);
       -        }
       -
       -        // Add force components from this bond to total force for particle
       -        *N += f_n + f_t;
       -        *T += -R_bar * cross(n_ab, f_t);
       -
       -        // Pressure excerted onto the particle from this bond
       -        *p += length(f_n) / (4.0f * PI * radius_a*radius_a);
       -
       -    }
       -} // End of bondLinear()
       -
        
        // Capillary cohesion after Richefeu et al. (2006)
       -__device__ void capillaryCohesion_exp(Float3* N, Float radius_a, 
       -        Float radius_b, Float delta_ab,
       -        Float3 x_ab, Float x_ab_length, 
       -        Float kappa)
       +__device__ void capillaryCohesion_exp(
       +    Float3* N,
       +    const Float radius_a, 
       +    const Float radius_b,
       +    const Float delta_ab,
       +    const Float3 x_ab,
       +    const Float x_ab_length, 
       +    const Float kappa)
        {
        
            // Normal vector 
 (DIR) diff --git a/src/contactmodels.cuh b/src/contactmodels.cuh
       t@@ -7,11 +7,19 @@
        
        // Linear viscoelastic contact model for particle-wall interactions
        // with tangential friction and rolling resistance
       -__device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
       -        Float* ev_dot, Float* p,
       -        unsigned int idx_a, Float radius_a,
       -        Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
       -        Float3 n, Float delta, Float wvel)
       +__device__ Float contactLinear_wall(
       +    Float3* F,
       +    Float3* T,
       +    Float* es_dot,
       +    Float* ev_dot,
       +    Float* p,
       +    const unsigned int idx_a,
       +    const Float radius_a,
       +    const Float4* __restrict__ dev_vel_sorted,
       +    const Float4* __restrict__ dev_angvel_sorted,
       +    const Float3 n,
       +    const Float delta,
       +    const Float wvel)
        {
            // Fetch particle velocities from global memory
            Float4 vel_tmp = dev_vel_sorted[idx_a];
       t@@ -19,13 +27,13 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
        
            // Convert velocities to three-component vectors
            Float3 vel_linear = MAKE_FLOAT3(
       -            vel_tmp.x,
       -            vel_tmp.y,
       -            vel_tmp.z);
       +        vel_tmp.x,
       +        vel_tmp.y,
       +        vel_tmp.z);
            Float3 angvel = MAKE_FLOAT3(
       -            angvel_tmp.x,
       -            angvel_tmp.y,
       -            angvel_tmp.z);
       +        angvel_tmp.x,
       +        angvel_tmp.y,
       +        angvel_tmp.z);
        
            // Store the length of the angular velocity for later use
            Float angvel_length = length(angvel);
       t@@ -47,7 +55,7 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
        
            // Normal force component: Elastic - viscous damping
            Float3 f_n = fmax(0.0, -devC_params.k_n*delta
       -                     - devC_params.gamma_wn*vel_n) * n;
       +                      - devC_params.gamma_wn*vel_n) * n;
            const Float f_n_length = length(f_n); // Save length for later use
        
            // Store the energy lost by viscous damping. See derivation in
       t@@ -103,14 +111,22 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
        
        // Linear vicoelastic contact model for particle-particle interactions
        // with tangential friction and rolling resistance
       -__device__ void contactLinearViscous(Float3* F, Float3* T, 
       -        Float* es_dot, Float* ev_dot, Float* p,
       -        unsigned int idx_a, unsigned int idx_b, 
       -        Float4* dev_vel_sorted, 
       -        Float4* dev_angvel_sorted,
       -        Float radius_a, Float radius_b, 
       -        Float3 x_ab, Float x_ab_length, 
       -        Float delta_ab, Float kappa) 
       +__device__ void contactLinearViscous(
       +    Float3* F,
       +    Float3* T, 
       +    Float* es_dot,
       +    Float* ev_dot,
       +    Float* p,
       +    const unsigned int idx_a,
       +    const unsigned int idx_b, 
       +    const Float4* __restrict__ dev_vel_sorted, 
       +    const Float4* __restrict__ dev_angvel_sorted,
       +    const Float radius_a,
       +    const Float radius_b, 
       +    const Float3 x_ab,
       +    const Float x_ab_length, 
       +    const Float delta_ab,
       +    const Float kappa) 
        {
        
            // Allocate variables and fetch missing time=t values for particle A and B
       t@@ -131,8 +147,8 @@ __device__ void contactLinearViscous(Float3* F, Float3* T,
        
            // Relative contact interface velocity, w/o rolling
            Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x, 
       -            vel_a.y - vel_b.y, 
       -            vel_a.z - vel_b.z);
       +                                       vel_a.y - vel_b.y, 
       +                                       vel_a.z - vel_b.z);
        
            // Relative contact interface velocity of particle surfaces at
            // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
       t@@ -209,18 +225,25 @@ __device__ void contactLinearViscous(Float3* F, Float3* T,
        
        
        // Linear elastic contact model for particle-particle interactions
       -__device__ void contactLinear(Float3* F, Float3* T, 
       -        Float* es_dot, Float* ev_dot, Float* p,
       -        unsigned int idx_a_orig,
       -        unsigned int idx_b_orig, 
       -        Float4  vel_a, 
       -        Float4* dev_vel,
       -        Float3  angvel_a,
       -        Float4* dev_angvel,
       -        Float radius_a, Float radius_b, 
       -        Float3 x, Float x_length, 
       -        Float delta, Float4* dev_delta_t,
       -        unsigned int mempos) 
       +__device__ void contactLinear(
       +    Float3* F,
       +    Float3* T, 
       +    Float* es_dot,
       +    Float* ev_dot,
       +    Float* p,
       +    const unsigned int idx_a_orig,
       +    const unsigned int idx_b_orig, 
       +    const Float4  vel_a, 
       +    const Float4* __restrict__ dev_vel,
       +    const Float3  angvel_a,
       +    const Float4* __restrict__ dev_angvel,
       +    const Float radius_a,
       +    const Float radius_b, 
       +    const Float3 x,
       +    const Float x_length, 
       +    const Float delta,
       +    Float4* __restrict__ dev_delta_t,
       +    const unsigned int mempos) 
        {
        
            // Allocate variables and fetch missing time=t values for particle A and B
       t@@ -231,15 +254,15 @@ __device__ void contactLinear(Float3* F, Float3* T,
            Float4 delta_t0_4 = dev_delta_t[mempos];
        
            Float3 delta_t0_uncor = MAKE_FLOAT3(
       -            delta_t0_4.x,
       -            delta_t0_4.y,
       -            delta_t0_4.z);
       +        delta_t0_4.x,
       +        delta_t0_4.y,
       +        delta_t0_4.z);
        
            // Convert to Float3
            Float3 angvel_b = MAKE_FLOAT3(
       -            angvel4_b.x,
       -            angvel4_b.y,
       -            angvel4_b.z);
       +        angvel4_b.x,
       +        angvel4_b.y,
       +        angvel4_b.z);
        
            // Force between grain pair decomposed into normal- and tangential part
            Float3 f_n, f_t, f_c;
       t@@ -249,9 +272,9 @@ __device__ void contactLinear(Float3* F, Float3* T,
        
            // Relative contact interface velocity, w/o rolling
            Float3 vel_linear = MAKE_FLOAT3(
       -            vel_a.x - vel_b.x, 
       -            vel_a.y - vel_b.y, 
       -            vel_a.z - vel_b.z);
       +        vel_a.x - vel_b.x, 
       +        vel_a.y - vel_b.y, 
       +        vel_a.z - vel_b.z);
        
            // Relative contact interface velocity of particle surfaces at
            // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10,
       t@@ -335,7 +358,7 @@ __device__ void contactLinear(Float3* F, Float3* T,
                    // 2008)
                    delta_t = -1.0/devC_params.k_t
                        * (devC_params.mu_d * length(f_n-f_c) * t
       -                        + devC_params.gamma_t * vel_t);
       +                   + devC_params.gamma_t * vel_t);
        
                    // Shear friction heat production rate: 
                    // The energy lost from the tangential spring is dissipated as heat
       t@@ -357,10 +380,10 @@ __device__ void contactLinear(Float3* F, Float3* T,
        
            // Store sum of tangential displacements
            dev_delta_t[mempos] = MAKE_FLOAT4(
       -            delta_t.x,
       -            delta_t.y,
       -            delta_t.z,
       -            0.0f);
       +        delta_t.x,
       +        delta_t.y,
       +        delta_t.z,
       +        0.0f);
        
        } // End of contactLinear()
        
       t@@ -368,18 +391,25 @@ __device__ void contactLinear(Float3* F, Float3* T,
        // Non-linear contact model for particle-particle interactions
        // Based on Hertzian and Mindlin contact theories (e.g. Hertz, 1882, Mindlin and 
        // Deresiewicz, 1953, Johnson, 1985). See Yohannes et al 2012 for example.
       -__device__ void contactHertz(Float3* F, Float3* T, 
       -        Float* es_dot, Float* ev_dot, Float* p,
       -        unsigned int idx_a_orig,
       -        unsigned int idx_b_orig, 
       -        Float4  vel_a, 
       -        Float4* dev_vel,
       -        Float3  angvel_a,
       -        Float4* dev_angvel,
       -        Float radius_a, Float radius_b, 
       -        Float3 x_ab, Float x_ab_length, 
       -        Float delta_ab, Float4* dev_delta_t,
       -        unsigned int mempos) 
       +__device__ void contactHertz(
       +    Float3* F,
       +    Float3* T, 
       +    Float* es_dot,
       +    Float* ev_dot,
       +    Float* p,
       +    const unsigned int idx_a_orig,
       +    const unsigned int idx_b_orig, 
       +    const Float4  vel_a, 
       +    const Float4* __restrict__ dev_vel,
       +    const Float3  angvel_a,
       +    const Float4* __restrict__ dev_angvel,
       +    const Float radius_a,
       +    const Float radius_b, 
       +    const Float3 x_ab,
       +    const Float x_ab_length, 
       +    const Float delta_ab,
       +    Float4* __restrict__ dev_delta_t,
       +    const unsigned int mempos) 
        {
        
            // Allocate variables and fetch missing time=t values for particle A and B
       t@@ -390,8 +420,8 @@ __device__ void contactHertz(Float3* F, Float3* T,
            Float4 delta_t0_4 = dev_delta_t[mempos];
        
            Float3 delta_t0_uncor = MAKE_FLOAT3(delta_t0_4.x,
       -            delta_t0_4.y,
       -            delta_t0_4.z);
       +                                        delta_t0_4.y,
       +                                        delta_t0_4.z);
        
            // Convert to Float3
            Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
       t@@ -404,8 +434,8 @@ __device__ void contactHertz(Float3* F, Float3* T,
        
            // Relative contact interface velocity, w/o rolling
            Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x, 
       -            vel_a.y - vel_b.y, 
       -            vel_a.z - vel_b.z);
       +                                       vel_a.y - vel_b.y, 
       +                                       vel_a.z - vel_b.z);
        
            // Relative contact interface velocity of particle surfaces at
            // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
       t@@ -435,7 +465,7 @@ __device__ void contactHertz(Float3* F, Float3* T,
        
            // Normal force component
            f_n = (-devC_params.k_n * powf(delta_ab, 3.0f/2.0f)  
       -            -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab)
       +           -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab)
                * n_ab;
        
            // Store energy dissipated in normal viscous component
       t@@ -517,9 +547,9 @@ __device__ void contactHertz(Float3* F, Float3* T,
                // New rolling resistance model
                /*T_res = -1.0f * fmin(devC_params.gamma_r * R_bar * angvel_ab_length,
                  devC_params.mu_r * R_bar * f_n_length)
       -         * angvel_ab/angvel_ab_length;*/
       +          * angvel_ab/angvel_ab_length;*/
                T_res = -1.0f * fmin(devC_params.gamma_r * radius_a * angvel_ab_length,
       -                devC_params.mu_r * radius_a * f_n_length)
       +                             devC_params.mu_r * radius_a * f_n_length)
                    * angvel_ab/angvel_ab_length;
            }
        
 (DIR) diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
       t@@ -85,19 +85,23 @@ __device__ int findDistMod(int3* targetCell, Float3* distmod)
        // Used for contactmodel=1, where contact history is not needed.
        // Kernel executed on device, and callable from device only.
        // Function is called from interact().
       -__device__ void findAndProcessContactsInCell(int3 targetCell, 
       -        unsigned int idx_a, 
       -        Float4 x_a, Float radius_a,
       -        Float3* F, Float3* T, 
       -        Float* es_dot, Float* ev_dot,
       -        Float* p,
       -        Float4* dev_x_sorted, 
       -        Float4* dev_vel_sorted, 
       -        Float4* dev_angvel_sorted,
       -        unsigned int* dev_cellStart, 
       -        unsigned int* dev_cellEnd,
       -        Float4* dev_walls_nx, 
       -        Float4* dev_walls_mvfd)
       +__device__ void findAndProcessContactsInCell(
       +    int3 targetCell, 
       +    const unsigned int idx_a, 
       +    const Float4 x_a,
       +    const Float radius_a,
       +    Float3* F,
       +    Float3* T, 
       +    Float* es_dot,
       +    Float* ev_dot,
       +    Float* p,
       +    const Float4* __restrict__ dev_x_sorted, 
       +    const Float4* __restrict__ dev_vel_sorted, 
       +    const Float4* __restrict__ dev_angvel_sorted,
       +    const unsigned int* __restrict__ dev_cellStart, 
       +    const unsigned int* __restrict__ dev_cellEnd,
       +    const Float4* __restrict__ dev_walls_nx, 
       +    Float4* __restrict__ dev_walls_mvfd)
        //uint4 bonds)
        {
        
       t@@ -133,8 +137,8 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
        
                            // Distance between particle centers (Float4 -> Float3)
                            Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x, 
       -                            x_a.y - x_b.y, 
       -                            x_a.z - x_b.z);
       +                                              x_a.y - x_b.y, 
       +                                              x_a.z - x_b.z);
        
                            // Adjust interparticle vector if periodic boundary/boundaries
                            // are crossed
       t@@ -147,16 +151,16 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
                            // Check for particle overlap
                            if (delta_ab < 0.0f) {
                                contactLinearViscous(F, T, es_dot, ev_dot, p, 
       -                                idx_a, idx_b,
       -                                dev_vel_sorted, 
       -                                dev_angvel_sorted,
       -                                radius_a, radius_b, 
       -                                x_ab, x_ab_length,
       -                                delta_ab, kappa);
       +                                             idx_a, idx_b,
       +                                             dev_vel_sorted, 
       +                                             dev_angvel_sorted,
       +                                             radius_a, radius_b, 
       +                                             x_ab, x_ab_length,
       +                                             delta_ab, kappa);
                            } else if (delta_ab < devC_params.db) { 
                                // Check wether particle distance satisfies the capillary bond distance
                                capillaryCohesion_exp(F, radius_a, radius_b, delta_ab, 
       -                                x_ab, x_ab_length, kappa);
       +                                              x_ab, x_ab_length, kappa);
                            }
        
                            // Check wether particles are bonded together
       t@@ -183,16 +187,18 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
        // Used for contactmodel=2, where bookkeeping of contact history is necessary.
        // Kernel executed on device, and callable from device only.
        // Function is called from topology().
       -__device__ void findContactsInCell(int3 targetCell, 
       -        unsigned int idx_a, 
       -        Float4 x_a, Float radius_a,
       -        Float4* dev_x_sorted, 
       -        unsigned int* dev_cellStart, 
       -        unsigned int* dev_cellEnd,
       -        unsigned int* dev_gridParticleIndex,
       -        int* nc,
       -        unsigned int* dev_contacts,
       -        Float4* dev_distmod)
       +__device__ void findContactsInCell(
       +    int3 targetCell, 
       +    const unsigned int idx_a, 
       +    const Float4 x_a,
       +    const Float radius_a,
       +    const Float4* __restrict__ dev_x_sorted, 
       +    const unsigned int* __restrict__ dev_cellStart, 
       +    const unsigned int* __restrict__ dev_cellEnd,
       +    const unsigned int* __restrict__ dev_gridParticleIndex,
       +    int* nc,
       +    unsigned int* __restrict__ dev_contacts,
       +    Float4* __restrict__ dev_distmod)
        {
            // Get distance modifier for interparticle
            // vector, if it crosses a periodic boundary
       t@@ -237,8 +243,8 @@ __device__ void findContactsInCell(int3 targetCell,
        
                            // Distance between particle centers (Float4 -> Float3)
                            Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x, 
       -                            x_a.y - x_b.y, 
       -                            x_a.z - x_b.z);
       +                                              x_a.y - x_b.y, 
       +                                              x_a.z - x_b.z);
        
                            // Adjust interparticle vector if periodic boundary/boundaries
                            // are crossed
       t@@ -316,12 +322,13 @@ __device__ void findContactsInCell(int3 targetCell,
        // Search for neighbors to particle 'idx' inside the 27 closest cells, 
        // and save the contact pairs in global memory.
        // Function is called from mainGPU loop.
       -__global__ void topology(unsigned int* dev_cellStart, 
       -        unsigned int* dev_cellEnd, // Input: Particles in cell 
       -        unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
       -        Float4* dev_x_sorted, 
       -        unsigned int* dev_contacts,
       -        Float4* dev_distmod)
       +__global__ void topology(
       +    const unsigned int* __restrict__ dev_cellStart, 
       +    const unsigned int* __restrict__ dev_cellEnd,
       +    const unsigned int* __restrict__ dev_gridParticleIndex,
       +    const Float4* __restrict__ dev_x_sorted,
       +    unsigned int* __restrict__ dev_contacts,
       +    Float4* __restrict__ dev_distmod)
        {
            // Thread index equals index of particle A
            unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
       t@@ -349,10 +356,10 @@ __global__ void topology(unsigned int* dev_cellStart,
                        for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
                            targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
                            findContactsInCell(targetPos, idx_a, x_a, radius_a,
       -                            dev_x_sorted, 
       -                            dev_cellStart, dev_cellEnd,
       -                            dev_gridParticleIndex,
       -                            &nc, dev_contacts, dev_distmod);
       +                                       dev_x_sorted, 
       +                                       dev_cellStart, dev_cellEnd,
       +                                       dev_gridParticleIndex,
       +                                       &nc, dev_contacts, dev_distmod);
                        }
                    }
                }
       t@@ -372,28 +379,28 @@ __global__ void topology(unsigned int* dev_cellStart,
        // Kernel is executed on device, and is callable from host only.
        // Function is called from mainGPU loop.
        __global__ void interact(
       -        unsigned int* dev_gridParticleIndex, // in
       -        unsigned int* dev_cellStart,         // in
       -        unsigned int* dev_cellEnd,           // in
       -        Float4* dev_x,                       // in
       -        Float4* dev_x_sorted,                // in
       -        Float4* dev_vel_sorted,              // in
       -        Float4* dev_angvel_sorted,           // in
       -        Float4* dev_vel,                     // in
       -        Float4* dev_angvel,                  // in
       -        Float4* dev_force,          // out
       -        Float4* dev_torque,         // out
       -        Float* dev_es_dot,          // out
       -        Float* dev_ev_dot,          // out
       -        Float* dev_es,              // out
       -        Float* dev_ev,              // out
       -        Float* dev_p,               // out
       -        Float4* dev_walls_nx,                // in
       -        Float4* dev_walls_mvfd,              // in
       -        Float* dev_walls_force_pp,  // out
       -        unsigned int* dev_contacts, // out
       -        Float4* dev_distmod,                 // in
       -        Float4* dev_delta_t)        // out
       +    const unsigned int* __restrict__ dev_gridParticleIndex, // in
       +    const unsigned int* __restrict__ dev_cellStart,         // in
       +    const unsigned int* __restrict__ dev_cellEnd,           // in
       +    const Float4* __restrict__ dev_x,                       // in
       +    const Float4* __restrict__ dev_x_sorted,                // in
       +    const Float4* __restrict__ dev_vel_sorted,              // in
       +    const Float4* __restrict__ dev_angvel_sorted,           // in
       +    const Float4* __restrict__ dev_vel,                     // in
       +    const Float4* __restrict__ dev_angvel,                  // in
       +    Float4* __restrict__ dev_force,                         // out
       +    Float4* __restrict__ dev_torque,                        // out
       +    Float*  __restrict__ dev_es_dot,                        // out
       +    Float*  __restrict__ dev_ev_dot,                        // out
       +    Float*  __restrict__ dev_es,                            // out
       +    Float*  __restrict__ dev_ev,                            // out
       +    Float*  __restrict__ dev_p,                             // out
       +    const Float4* __restrict__ dev_walls_nx,                // in
       +    Float4* __restrict__ dev_walls_mvfd,                    // in
       +    Float* __restrict__ dev_walls_force_pp,                 // out
       +    unsigned int* __restrict__ dev_contacts,                // out
       +    const Float4* __restrict__ dev_distmod,                 // in
       +    Float4* __restrict__ dev_delta_t)                       // out
        {
            // Thread index equals index of particle A
            unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
       t@@ -407,11 +414,11 @@ __global__ void interact(
        
                // Fetch world dimensions in constant memory read
                Float3 origo = MAKE_FLOAT3(devC_grid.origo[0], 
       -                devC_grid.origo[1], 
       -                devC_grid.origo[2]); 
       +                                   devC_grid.origo[1], 
       +                                   devC_grid.origo[2]); 
                Float3 L = MAKE_FLOAT3(devC_grid.L[0], 
       -                devC_grid.L[1], 
       -                devC_grid.L[2]);
       +                               devC_grid.L[1], 
       +                               devC_grid.L[2]);
        
                // Fetch wall data in global read
                Float4 w_0_nx, w_1_nx, w_2_nx, w_3_nx, w_4_nx;
       t@@ -497,8 +504,8 @@ __global__ void interact(
        
                            // Inter-particle vector, corrected for periodic boundaries
                            x_ab = MAKE_FLOAT3(x_a.x - x_b.x + distmod.x,
       -                            x_a.y - x_b.y + distmod.y,
       -                            x_a.z - x_b.z + distmod.z);
       +                                       x_a.y - x_b.y + distmod.y,
       +                                       x_a.z - x_b.z + distmod.z);
        
                            x_ab_length = length(x_ab);
                            delta_n = x_ab_length - (radius_a + radius_b);
       t@@ -507,28 +514,28 @@ __global__ void interact(
                            if (delta_n < 0.0) {
                                if (devC_params.contactmodel == 2) {
                                    contactLinear(&F, &T, &es_dot, &ev_dot, &p, 
       -                                    idx_a_orig,
       -                                    idx_b_orig,
       -                                    vel_a,
       -                                    dev_vel,
       -                                    angvel_a,
       -                                    dev_angvel,
       -                                    radius_a, radius_b, 
       -                                    x_ab, x_ab_length,
       -                                    delta_n, dev_delta_t, 
       -                                    mempos);
       +                                          idx_a_orig,
       +                                          idx_b_orig,
       +                                          vel_a,
       +                                          dev_vel,
       +                                          angvel_a,
       +                                          dev_angvel,
       +                                          radius_a, radius_b, 
       +                                          x_ab, x_ab_length,
       +                                          delta_n, dev_delta_t, 
       +                                          mempos);
                                } else if (devC_params.contactmodel == 3) {
                                    contactHertz(&F, &T, &es_dot, &ev_dot, &p, 
       -                                    idx_a_orig,
       -                                    idx_b_orig,
       -                                    vel_a,
       -                                    dev_vel,
       -                                    angvel_a,
       -                                    dev_angvel,
       -                                    radius_a, radius_b, 
       -                                    x_ab, x_ab_length,
       -                                    delta_n, dev_delta_t, 
       -                                    mempos);
       +                                         idx_a_orig,
       +                                         idx_b_orig,
       +                                         vel_a,
       +                                         dev_vel,
       +                                         angvel_a,
       +                                         dev_angvel,
       +                                         radius_a, radius_b, 
       +                                         x_ab, x_ab_length,
       +                                         delta_n, dev_delta_t, 
       +                                         mempos);
                                }
                            } else {
                                __syncthreads();
       t@@ -556,11 +563,11 @@ __global__ void interact(
        
                    // Calculate address in grid from position
                    gridPos.x = floor((x_a.x - devC_grid.origo[0])
       -                    / (devC_grid.L[0]/devC_grid.num[0]));
       +                              / (devC_grid.L[0]/devC_grid.num[0]));
                    gridPos.y = floor((x_a.y - devC_grid.origo[1])
       -                    / (devC_grid.L[1]/devC_grid.num[1]));
       +                              / (devC_grid.L[1]/devC_grid.num[1]));
                    gridPos.z = floor((x_a.z - devC_grid.origo[2])
       -                    / (devC_grid.L[2]/devC_grid.num[2]));
       +                              / (devC_grid.L[2]/devC_grid.num[2]));
        
                    // Find overlaps between particle no. idx and all particles
                    // from its own cell + 26 neighbor cells.
       t@@ -570,12 +577,17 @@ __global__ void interact(
                        for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
                            for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
                                targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
       -                        findAndProcessContactsInCell(targetPos, idx_a, x_a, radius_a,
       -                                &F, &T, &es_dot, &ev_dot, &p,
       -                                dev_x_sorted,
       -                                dev_vel_sorted, dev_angvel_sorted,
       -                                dev_cellStart, dev_cellEnd,
       -                                dev_walls_nx, dev_walls_mvfd);
       +                        findAndProcessContactsInCell(targetPos, idx_a,
       +                                                     x_a, radius_a,
       +                                                     &F, &T, &es_dot,
       +                                                     &ev_dot, &p,
       +                                                     dev_x_sorted,
       +                                                     dev_vel_sorted,
       +                                                     dev_angvel_sorted,
       +                                                     dev_cellStart,
       +                                                     dev_cellEnd,
       +                                                     dev_walls_nx,
       +                                                     dev_walls_mvfd);
                            }
                        }
                    }
       t@@ -596,8 +608,8 @@ __global__ void interact(
                w_n = MAKE_FLOAT3(w_0_nx.x, w_0_nx.y, w_0_nx.z);
                if (delta_w < 0.0f) {
                    w_0_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a,
       -                    radius_a, dev_vel_sorted, dev_angvel_sorted, w_n, delta_w,
       -                    w_0_mvfd.y);
       +                                           radius_a, dev_vel_sorted, dev_angvel_sorted, w_n, delta_w,
       +                                           w_0_mvfd.y);
                }
        
                // Lower wall (force on wall not stored)
       t@@ -605,8 +617,8 @@ __global__ void interact(
                w_n = MAKE_FLOAT3(0.0f, 0.0f, 1.0f);
                if (delta_w < 0.0f) {
                    (void)contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a,
       -                    radius_a, dev_vel_sorted, dev_angvel_sorted,
       -                    w_n, delta_w, 0.0f);
       +                                     radius_a, dev_vel_sorted, dev_angvel_sorted,
       +                                     w_n, delta_w, 0.0f);
                }
        
        
       t@@ -617,8 +629,8 @@ __global__ void interact(
                    w_n = MAKE_FLOAT3(w_1_nx.x, w_1_nx.y, w_1_nx.z);
                    if (delta_w < 0.0f) {
                        w_1_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
       -                        idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       -                        delta_w, w_1_mvfd.y);
       +                                               idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       +                                               delta_w, w_1_mvfd.y);
                    }
        
                    // Left wall (idx 2)
       t@@ -626,8 +638,8 @@ __global__ void interact(
                    w_n = MAKE_FLOAT3(w_2_nx.x, w_2_nx.y, w_2_nx.z);
                    if (delta_w < 0.0f) {
                        w_2_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
       -                        idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       -                        delta_w, w_2_mvfd.y);
       +                                               idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       +                                               delta_w, w_2_mvfd.y);
                    }
        
                    // Back wall (idx 3)
       t@@ -635,8 +647,8 @@ __global__ void interact(
                    w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z);
                    if (delta_w < 0.0f) {
                        w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
       -                        idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       -                        delta_w, w_3_mvfd.y);
       +                                               idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       +                                               delta_w, w_3_mvfd.y);
                    }
        
                    // Front wall (idx 4)
       t@@ -644,8 +656,8 @@ __global__ void interact(
                    w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z);
                    if (delta_w < 0.0f) {
                        w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
       -                        idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       -                        delta_w, w_4_mvfd.y);
       +                                               idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       +                                               delta_w, w_4_mvfd.y);
                    }
        
                } else if (devC_grid.periodic == 2) {   // right and left walls periodic
       t@@ -655,8 +667,8 @@ __global__ void interact(
                    w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z);
                    if (delta_w < 0.0f) {
                        w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
       -                        idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       -                        delta_w, w_3_mvfd.y);
       +                                               idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       +                                               delta_w, w_3_mvfd.y);
                    }
        
                    // Front wall (idx 4)
       t@@ -664,8 +676,8 @@ __global__ void interact(
                    w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z);
                    if (delta_w < 0.0f) {
                        w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
       -                        idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       -                        delta_w, w_4_mvfd.y);
       +                                               idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
       +                                               delta_w, w_4_mvfd.y);
                    }
                }
        
 (DIR) diff --git a/src/integration.cuh b/src/integration.cuh
       t@@ -11,15 +11,23 @@
        
        // Second order integration scheme based on Taylor expansion of particle kinematics. 
        // Kernel executed on device, and callable from host only.
       -__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
       -        Float4* dev_angvel_sorted,
       -        Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output
       -        Float4* dev_force, Float4* dev_torque, Float4* dev_angpos, // Input
       -        Float4* dev_acc, Float4* dev_angacc,
       -        Float4* dev_vel0, Float4* dev_angvel0,
       -        Float4* dev_xyzsum,
       -        unsigned int* dev_gridParticleIndex, // Input: Sorted-Unsorted key
       -        unsigned int iter)
       +__global__ void integrate(
       +    const Float4* __restrict__ dev_x_sorted,
       +    const Float4* __restrict__ dev_vel_sorted,
       +    const Float4* __restrict__ dev_angvel_sorted,
       +    Float4* __restrict__ dev_x,
       +    Float4* __restrict__ dev_vel,
       +    Float4* __restrict__ dev_angvel,
       +    const Float4* __restrict__ dev_force,
       +    const Float4* __restrict__ dev_torque,
       +    Float4* __restrict__ dev_angpos,
       +    Float4* __restrict__ dev_acc,
       +    Float4* __restrict__ dev_angacc,
       +    Float4* __restrict__ dev_vel0,
       +    Float4* __restrict__ dev_angvel0,
       +    Float4* __restrict__ dev_xyzsum,
       +    const unsigned int* __restrict__ dev_gridParticleIndex,
       +    const unsigned int iter)
        {
            unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
        
       t@@ -59,13 +67,13 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
                // Coherent read from constant memory to registers
                const Float dt = devC_dt;
                const Float3 origo = MAKE_FLOAT3(
       -                devC_grid.origo[0],
       -                devC_grid.origo[1],
       -                devC_grid.origo[2]); 
       +            devC_grid.origo[0],
       +            devC_grid.origo[1],
       +            devC_grid.origo[2]); 
                const Float3 L = MAKE_FLOAT3(
       -                devC_grid.L[0],
       -                devC_grid.L[1],
       -                devC_grid.L[2]);
       +            devC_grid.L[0],
       +            devC_grid.L[1],
       +            devC_grid.L[2]);
        
                // Particle mass
                Float m = 4.0/3.0 * PI * radius*radius*radius * devC_params.rho;
       t@@ -179,14 +187,14 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
                // Truncation error O(dt^4) for positions, O(dt^3) for velocities
                // Approximate acceleration change by backwards difference:
                const Float3 dacc_dt = MAKE_FLOAT3(
       -                (acc.x - acc0.x)/dt,
       -                (acc.y - acc0.y)/dt,
       -                (acc.z - acc0.z)/dt);
       +            (acc.x - acc0.x)/dt,
       +            (acc.y - acc0.y)/dt,
       +            (acc.z - acc0.z)/dt);
        
                const Float3 dangacc_dt = MAKE_FLOAT3(
       -                (angacc.x - angacc0.x)/dt,
       -                (angacc.y - angacc0.y)/dt,
       -                (angacc.z - angacc0.z)/dt);
       +            (angacc.x - angacc0.x)/dt,
       +            (angacc.y - angacc0.y)/dt,
       +            (angacc.z - angacc0.z)/dt);
        
                x_new.x = x.x + vel.x*dt + 0.5*acc.x*dt*dt + 1.0/6.0*dacc_dt.x*dt*dt*dt;
                x_new.y = x.y + vel.y*dt + 0.5*acc.y*dt*dt + 1.0/6.0*dacc_dt.y*dt*dt*dt;
       t@@ -253,7 +261,9 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
        
        
        // Reduce wall force contributions from particles to a single value per wall
       -__global__ void summation(Float* in, Float *out)
       +__global__ void summation(
       +    const Float* __restrict__ in,
       +    Float* __restrict__ out)
        {
            __shared__ Float cache[256];
            unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
       t@@ -287,14 +297,14 @@ __global__ void summation(Float* in, Float *out)
        
        // Update wall positions
        __global__ void integrateWalls(
       -        Float4* dev_walls_nx,
       -        Float4* dev_walls_mvfd,
       -        int* dev_walls_wmode,
       -        Float* dev_walls_force_partial,
       -        Float* dev_walls_acc,
       -        unsigned int blocksPerGrid,
       -        Float t_current,
       -        unsigned int iter)
       +    Float4* __restrict__ dev_walls_nx,
       +    Float4* __restrict__ dev_walls_mvfd,
       +    const int* __restrict__ dev_walls_wmode,
       +    const Float* __restrict__ dev_walls_force_partial,
       +    Float* __restrict__ dev_walls_acc,
       +    const unsigned int blocksPerGrid,
       +    const Float t_current,
       +    const unsigned int iter)
        {
            unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
        
 (DIR) diff --git a/src/navierstokes.cuh b/src/navierstokes.cuh
       t@@ -25,34 +25,34 @@ __inline__ __device__ Float hmean(Float a, Float b) {
        
        // Helper functions for checking whether a value is NaN or Inf
        __device__ int checkFiniteFloat(
       -        const char* desc,
       -        const unsigned int x,
       -        const unsigned int y,
       -        const unsigned int z,
       -        const Float s)
       +    const char* desc,
       +    const unsigned int x,
       +    const unsigned int y,
       +    const unsigned int z,
       +    const Float s)
        {
       -        __syncthreads();
       -        if (!isfinite(s)) {
       -            printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s);
       -            return 1;
       -        }
       -        return 0;
       +    __syncthreads();
       +    if (!isfinite(s)) {
       +        printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s);
       +        return 1;
       +    }
       +    return 0;
        }
        
        __device__ int checkFiniteFloat3(
       -        const char* desc,
       -        const unsigned int x,
       -        const unsigned int y,
       -        const unsigned int z,
       -        const Float3 v)
       +    const char* desc,
       +    const unsigned int x,
       +    const unsigned int y,
       +    const unsigned int z,
       +    const Float3 v)
        {
       -        __syncthreads();
       -        if (!isfinite(v.x) || !isfinite(v.y)  || !isfinite(v.z)) {
       -            printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n",
       -                    x, y, z, desc, v.x, v.y, v.z);
       -            return 1;
       -        }
       -        return 0;
       +    __syncthreads();
       +    if (!isfinite(v.x) || !isfinite(v.y)  || !isfinite(v.z)) {
       +        printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n",
       +               x, y, z, desc, v.x, v.y, v.z);
       +        return 1;
       +    }
       +    return 0;
        }
        
        // Initialize memory
       t@@ -143,10 +143,10 @@ void DEM::freeNSmemDev()
        void DEM::transferNStoGlobalDeviceMemory(int statusmsg)
        {
            checkForCudaErrors("Before attempting cudaMemcpy in "
       -            "transferNStoGlobalDeviceMemory");
       +                       "transferNStoGlobalDeviceMemory");
        
            //if (verbose == 1 && statusmsg == 1)
       -        //std::cout << "  Transfering fluid data to the device:           ";
       +    //std::cout << "  Transfering fluid data to the device:           ";
        
            // memory size for a scalar field
            unsigned int memSizeF  = sizeof(Float)*NScells();
       t@@ -162,7 +162,7 @@ void DEM::transferNStoGlobalDeviceMemory(int statusmsg)
        
            checkForCudaErrors("End of transferNStoGlobalDeviceMemory");
            //if (verbose == 1 && statusmsg == 1)
       -        //std::cout << "Done" << std::endl;
       +    //std::cout << "Done" << std::endl;
        }
        
        // Transfer from device
       t@@ -194,7 +194,7 @@ void DEM::transferNSfromGlobalDeviceMemory(int statusmsg)
        void DEM::transferNSnormFromGlobalDeviceMemory()
        {
            cudaMemcpy(ns.norm, dev_ns_norm, sizeof(Float)*NScells(),
       -            cudaMemcpyDeviceToHost);
       +               cudaMemcpyDeviceToHost);
            checkForCudaErrors("End of transferNSnormFromGlobalDeviceMemory");
        }
        
       t@@ -202,7 +202,7 @@ void DEM::transferNSnormFromGlobalDeviceMemory()
        void DEM::transferNSepsilonFromGlobalDeviceMemory()
        {
            cudaMemcpy(ns.epsilon, dev_ns_epsilon, sizeof(Float)*NScells(),
       -            cudaMemcpyDeviceToHost);
       +               cudaMemcpyDeviceToHost);
            checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
        }
        
       t@@ -210,13 +210,13 @@ void DEM::transferNSepsilonFromGlobalDeviceMemory()
        void DEM::transferNSepsilonNewFromGlobalDeviceMemory()
        {
            cudaMemcpy(ns.epsilon_new, dev_ns_epsilon_new, sizeof(Float)*NScells(),
       -            cudaMemcpyDeviceToHost);
       +               cudaMemcpyDeviceToHost);
            checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
        }
        
        // Get linear index from 3D grid position
        __inline__ __device__ unsigned int idx(
       -        const int x, const int y, const int z)
       +    const int x, const int y, const int z)
        {
            // without ghost nodes
            //return x + dev_grid.num[0]*y + dev_grid.num[0]*dev_grid.num[1]*z;
       t@@ -229,11 +229,11 @@ __inline__ __device__ unsigned int idx(
        
        // Get linear index of velocity node from 3D grid position in staggered grid
        __inline__ __device__ unsigned int vidx(
       -        const int x, const int y, const int z)
       +    const int x, const int y, const int z)
        {
            // without ghost nodes
            //return x + (devC_grid.num[0]+1)*y
       -        //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z;
       +    //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z;
        
            // with ghost nodes
            // the ghost nodes are placed at x,y,z = -1 and WIDTH+1
       t@@ -246,10 +246,10 @@ __inline__ __device__ unsigned int vidx(
        // dev_ns_v or dev_ns_v_p array. This function does not set the averaged
        // velocity values in the ghost node cells.
        __global__ void findNSavgVel(
       -        Float3* dev_ns_v,    // out
       -        Float*  dev_ns_v_x,  // in
       -        Float*  dev_ns_v_y,  // in
       -        Float*  dev_ns_v_z)  // in
       +    Float3* __restrict__ dev_ns_v,    // out
       +    const Float* __restrict__ dev_ns_v_x,  // in
       +    const Float* __restrict__ dev_ns_v_y,  // in
       +    const Float* __restrict__ dev_ns_v_z)  // in
        {
        
            // 3D thread index
       t@@ -272,9 +272,9 @@ __global__ void findNSavgVel(
        
                // Find average velocity using arithmetic means
                const Float3 v_bar = MAKE_FLOAT3(
       -                amean(v_xn, v_xp),
       -                amean(v_yn, v_yp),
       -                amean(v_zn, v_zp));
       +            amean(v_xn, v_xp),
       +            amean(v_yn, v_yp),
       +            amean(v_zn, v_zp));
        
                // Save value
                __syncthreads();
       t@@ -287,10 +287,10 @@ __global__ void findNSavgVel(
        // or dev_ns_v_p array. Make sure that the averaged velocity ghost nodes are set
        // beforehand.
        __global__ void findNScellFaceVel(
       -        Float3* dev_ns_v,    // in
       -        Float*  dev_ns_v_x,  // out
       -        Float*  dev_ns_v_y,  // out
       -        Float*  dev_ns_v_z)  // out
       +    const Float3* __restrict__ dev_ns_v,    // in
       +    Float* __restrict__ dev_ns_v_x,  // out
       +    Float* __restrict__ dev_ns_v_y,  // out
       +    Float* __restrict__ dev_ns_v_z)  // out
        {
        
            // 3D thread index
       t@@ -340,7 +340,9 @@ __global__ void findNScellFaceVel(
        
        
        // Set the initial guess of the values of epsilon.
       -__global__ void setNSepsilonInterior(Float* dev_ns_epsilon, Float value)
       +__global__ void setNSepsilonInterior(
       +    Float* __restrict__ dev_ns_epsilon,
       +    const Float value)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -349,7 +351,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon, Float value)
        
            // check that we are not outside the fluid grid
            if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
       -            z > 0 && z < devC_grid.num[2]-1) {
       +        z > 0 && z < devC_grid.num[2]-1) {
                __syncthreads();
                const unsigned int cellidx = idx(x,y,z);
                dev_ns_epsilon[cellidx] = value;
       t@@ -358,7 +360,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon, Float value)
        
        // The normalized residuals are given an initial value of 0, since the values at
        // the Dirichlet boundaries aren't written during the iterations.
       -__global__ void setNSnormZero(Float* dev_ns_norm)
       +__global__ void setNSnormZero(Float* __restrict__ dev_ns_norm)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -380,9 +382,9 @@ __global__ void setNSnormZero(Float* dev_ns_norm)
        // the Dirichlet boundary condition: the new value should be identical to the
        // old value, i.e. the temporal gradient is 0
        __global__ void setNSepsilonBottom(
       -        Float* dev_ns_epsilon,
       -        Float* dev_ns_epsilon_new,
       -        const Float value)
       +    Float* __restrict__ dev_ns_epsilon,
       +    Float* __restrict__ dev_ns_epsilon_new,
       +    const Float value)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -408,9 +410,9 @@ __global__ void setNSepsilonBottom(
        // the Dirichlet boundary condition: the new value should be identical to the
        // old value, i.e. the temporal gradient is 0
        __global__ void setNSepsilonTop(
       -        Float* dev_ns_epsilon,
       -        Float* dev_ns_epsilon_new,
       -        const Float value)
       +    Float* __restrict__ dev_ns_epsilon,
       +    Float* __restrict__ dev_ns_epsilon_new,
       +    const Float value)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -419,7 +421,7 @@ __global__ void setNSepsilonTop(
        
            // check that we are not outside the fluid grid, and at the upper z boundary
            if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
       -            z == devC_grid.num[2]-1) {
       +        z == devC_grid.num[2]-1) {
        
                __syncthreads();
                const unsigned int cellidx = idx(x,y,z);
       t@@ -428,11 +430,14 @@ __global__ void setNSepsilonTop(
            }
        }
        __device__ void copyNSvalsDev(
       -        unsigned int read, unsigned int write,
       -        Float* dev_ns_p,
       -        Float3* dev_ns_v, Float3* dev_ns_v_p,
       -        Float* dev_ns_phi, Float* dev_ns_dphi,
       -        Float* dev_ns_epsilon)
       +    const unsigned int read,
       +    const unsigned int write,
       +    Float*  __restrict__ dev_ns_p,
       +    Float3* __restrict__ dev_ns_v,
       +    Float3* __restrict__ dev_ns_v_p,
       +    Float*  __restrict__ dev_ns_phi,
       +    Float*  __restrict__ dev_ns_dphi,
       +    Float*  __restrict__ dev_ns_epsilon)
        {
            // Coalesced read
            const Float  p       = dev_ns_p[read];
       t@@ -457,10 +462,12 @@ __device__ void copyNSvalsDev(
        // are not written since they are not read. Launch this kernel for all cells in
        // the grid
        __global__ void setNSghostNodesDev(
       -        Float* dev_ns_p,
       -        Float3* dev_ns_v, Float3* dev_ns_v_p,
       -        Float* dev_ns_phi, Float* dev_ns_dphi,
       -        Float* dev_ns_epsilon)
       +    Float*  __restrict__ dev_ns_p,
       +    Float3* __restrict__ dev_ns_v,
       +    Float3* __restrict__ dev_ns_v_p,
       +    Float*  __restrict__ dev_ns_phi,
       +    Float*  __restrict__ dev_ns_dphi,
       +    Float*  __restrict__ dev_ns_epsilon)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -484,72 +491,72 @@ __global__ void setNSghostNodesDev(
                if (x == 0) {
                    writeidx = idx(nx,y,z);
                    copyNSvalsDev(cellidx, writeidx,
       -                    dev_ns_p,
       -                    dev_ns_v, dev_ns_v_p,
       -                    dev_ns_phi, dev_ns_dphi,
       -                    dev_ns_epsilon);
       +                          dev_ns_p,
       +                          dev_ns_v, dev_ns_v_p,
       +                          dev_ns_phi, dev_ns_dphi,
       +                          dev_ns_epsilon);
                }
                if (x == nx-1) {
                    writeidx = idx(-1,y,z);
                    copyNSvalsDev(cellidx, writeidx,
       -                    dev_ns_p,
       -                    dev_ns_v, dev_ns_v_p,
       -                    dev_ns_phi, dev_ns_dphi,
       -                    dev_ns_epsilon);
       +                          dev_ns_p,
       +                          dev_ns_v, dev_ns_v_p,
       +                          dev_ns_phi, dev_ns_dphi,
       +                          dev_ns_epsilon);
                }
        
                if (y == 0) {
                    writeidx = idx(x,ny,z);
                    copyNSvalsDev(cellidx, writeidx,
       -                    dev_ns_p,
       -                    dev_ns_v, dev_ns_v_p,
       -                    dev_ns_phi, dev_ns_dphi,
       -                    dev_ns_epsilon);
       +                          dev_ns_p,
       +                          dev_ns_v, dev_ns_v_p,
       +                          dev_ns_phi, dev_ns_dphi,
       +                          dev_ns_epsilon);
                }
                if (y == ny-1) {
                    writeidx = idx(x,-1,z);
                    copyNSvalsDev(cellidx, writeidx,
       -                    dev_ns_p,
       -                    dev_ns_v, dev_ns_v_p,
       -                    dev_ns_phi, dev_ns_dphi,
       -                    dev_ns_epsilon);
       +                          dev_ns_p,
       +                          dev_ns_v, dev_ns_v_p,
       +                          dev_ns_phi, dev_ns_dphi,
       +                          dev_ns_epsilon);
                }
        
                // Z boundaries fixed
                if (z == 0) {
                    writeidx = idx(x,y,-1);
                    copyNSvalsDev(cellidx, writeidx,
       -                    dev_ns_p,
       -                    dev_ns_v, dev_ns_v_p,
       -                    dev_ns_phi, dev_ns_dphi,
       -                    dev_ns_epsilon);
       +                          dev_ns_p,
       +                          dev_ns_v, dev_ns_v_p,
       +                          dev_ns_phi, dev_ns_dphi,
       +                          dev_ns_epsilon);
                }
                if (z == nz-1) {
                    writeidx = idx(x,y,nz);
                    copyNSvalsDev(cellidx, writeidx,
       -                    dev_ns_p,
       -                    dev_ns_v, dev_ns_v_p,
       -                    dev_ns_phi, dev_ns_dphi,
       -                    dev_ns_epsilon);
       +                          dev_ns_p,
       +                          dev_ns_v, dev_ns_v_p,
       +                          dev_ns_phi, dev_ns_dphi,
       +                          dev_ns_epsilon);
                }
        
                // Z boundaries periodic
                /*if (z == 0) {
       -            writeidx = idx(x,y,nz);
       -            copyNSvalsDev(cellidx, writeidx,
       -                    dev_ns_p,
       -                    dev_ns_v, dev_ns_v_p,
       -                    dev_ns_phi, dev_ns_dphi,
       -                    dev_ns_epsilon);
       -        }
       -        if (z == nz-1) {
       -            writeidx = idx(x,y,-1);
       -            copyNSvalsDev(cellidx, writeidx,
       -                    dev_ns_p,
       -                    dev_ns_v, dev_ns_v_p,
       -                    dev_ns_phi, dev_ns_dphi,
       -                    dev_ns_epsilon);
       -        }*/
       +          writeidx = idx(x,y,nz);
       +          copyNSvalsDev(cellidx, writeidx,
       +          dev_ns_p,
       +          dev_ns_v, dev_ns_v_p,
       +          dev_ns_phi, dev_ns_dphi,
       +          dev_ns_epsilon);
       +          }
       +          if (z == nz-1) {
       +          writeidx = idx(x,y,-1);
       +          copyNSvalsDev(cellidx, writeidx,
       +          dev_ns_p,
       +          dev_ns_v, dev_ns_v_p,
       +          dev_ns_phi, dev_ns_dphi,
       +          dev_ns_epsilon);
       +          }*/
            }
        }
        
       t@@ -557,7 +564,7 @@ __global__ void setNSghostNodesDev(
        // (diagonal) cells are not written since they are not read. Launch this kernel
        // for all cells in the grid usind setNSghostNodes<datatype><<<.. , ..>>>( .. );
        template<typename T>
       -__global__ void setNSghostNodes(T* dev_scalarfield)
       +__global__ void setNSghostNodes(T* __restrict__ dev_scalarfield)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -586,10 +593,10 @@ __global__ void setNSghostNodes(T* dev_scalarfield)
        
                if (z == 0)
                    dev_scalarfield[idx(x,y,-1)] = val;     // Dirichlet
       -            //dev_scalarfield[idx(x,y,nz)] = val;    // Periodic -z
       +        //dev_scalarfield[idx(x,y,nz)] = val;    // Periodic -z
                if (z == nz-1)
                    dev_scalarfield[idx(x,y,nz)] = val;     // Dirichlet
       -            //dev_scalarfield[idx(x,y,-1)] = val;    // Periodic +z
       +        //dev_scalarfield[idx(x,y,-1)] = val;    // Periodic +z
            }
        }
        
       t@@ -597,9 +604,9 @@ __global__ void setNSghostNodes(T* dev_scalarfield)
        // (diagonal) cells are not written since they are not read.
        template<typename T>
        __global__ void setNSghostNodes(
       -        T* dev_scalarfield,
       -        int bc_bot,
       -        int bc_top)
       +    T* __restrict__ dev_scalarfield,
       +    const int bc_bot,
       +    const int bc_top)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -651,11 +658,11 @@ __global__ void setNSghostNodes(
        // According to Griebel et al. 1998 "Numerical Simulation in Fluid Dynamics"
        template<typename T>
        __global__ void setNSghostNodesFace(
       -        T* dev_scalarfield_x,
       -        T* dev_scalarfield_y,
       -        T* dev_scalarfield_z,
       -        int bc_bot,
       -        int bc_top)
       +    T* __restrict__ dev_scalarfield_x,
       +    T* __restrict__ dev_scalarfield_y,
       +    T* __restrict__ dev_scalarfield_z,
       +    const int bc_bot,
       +    const int bc_top)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -849,9 +856,9 @@ __global__ void setNSghostNodesFace(
        // The edge (diagonal) cells are not written since they are not read. Launch
        // this kernel for all cells in the grid.
        __global__ void setNSghostNodes_tau(
       -        Float* dev_ns_tau,
       -        int bc_bot,
       -        int bc_top)
       +    Float* __restrict__ dev_ns_tau,
       +    const int bc_bot,
       +    const int bc_top)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -981,108 +988,108 @@ __global__ void setNSghostNodes_tau(
        // The edge (diagonal) cells are not written since they are not read. Launch
        // this kernel for all cells in the grid.
        /*
       -__global__ void setNSghostNodesForcing(
       -        Float*  dev_ns_f1,
       -        Float3* dev_ns_f2,
       -        Float*  dev_ns_f,
       -        unsigned int nijac)
       -
       -{
       -    // 3D thread index
       -    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       -    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       -    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       -
       -    // Grid dimensions
       -    const unsigned int nx = devC_grid.num[0];
       -    const unsigned int ny = devC_grid.num[1];
       -    const unsigned int nz = devC_grid.num[2];
       -
       -    // 1D thread index
       -    unsigned int cellidx = idx(x,y,z);
       -
       -    // check that we are not outside the fluid grid
       -    if (x < nx && y < ny && z < nz) {
       -
       -        __syncthreads();
       -        const Float f  = dev_ns_f[cellidx];
       -        Float  f1;
       -        Float3 f2;
       -
       -        if (nijac == 0) {
       -            __syncthreads();
       -            f1 = dev_ns_f1[cellidx];
       -            f2 = dev_ns_f2[cellidx];
       -        }
       -
       -        if (x == 0) {
       -            cellidx = idx(nx,y,z);
       -            dev_ns_f[cellidx] = f;
       -            if (nijac == 0) {
       -                dev_ns_f1[cellidx] = f1;
       -                dev_ns_f2[cellidx] = f2;
       -            }
       -        }
       -        if (x == nx-1) {
       -            cellidx = idx(-1,y,z);
       -            dev_ns_f[cellidx] = f;
       -            if (nijac == 0) {
       -                dev_ns_f1[cellidx] = f1;
       -                dev_ns_f2[cellidx] = f2;
       -            }
       -        }
       -
       -        if (y == 0) {
       -            cellidx = idx(x,ny,z);
       -            dev_ns_f[cellidx] = f;
       -            if (nijac == 0) {
       -                dev_ns_f1[cellidx] = f1;
       -                dev_ns_f2[cellidx] = f2;
       -            }
       -        }
       -        if (y == ny-1) {
       -            cellidx = idx(x,-1,z);
       -            dev_ns_f[cellidx] = f;
       -            if (nijac == 0) {
       -                dev_ns_f1[cellidx] = f1;
       -                dev_ns_f2[cellidx] = f2;
       -            }
       -        }
       -
       -        if (z == 0) {
       -            cellidx = idx(x,y,nz);
       -            dev_ns_f[cellidx] = f;
       -            if (nijac == 0) {
       -                dev_ns_f1[cellidx] = f1;
       -                dev_ns_f2[cellidx] = f2;
       -            }
       -        }
       -        if (z == nz-1) {
       -            cellidx = idx(x,y,-1);
       -            dev_ns_f[cellidx] = f;
       -            if (nijac == 0) {
       -                dev_ns_f1[cellidx] = f1;
       -                dev_ns_f2[cellidx] = f2;
       -            }
       -        }
       -    }
       -}
       +  __global__ void setNSghostNodesForcing(
       +  Float*  dev_ns_f1,
       +  Float3* dev_ns_f2,
       +  Float*  dev_ns_f,
       +  unsigned int nijac)
       +
       +  {
       +  // 3D thread index
       +  const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +  const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +  const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +  // Grid dimensions
       +  const unsigned int nx = devC_grid.num[0];
       +  const unsigned int ny = devC_grid.num[1];
       +  const unsigned int nz = devC_grid.num[2];
       +
       +  // 1D thread index
       +  unsigned int cellidx = idx(x,y,z);
       +
       +  // check that we are not outside the fluid grid
       +  if (x < nx && y < ny && z < nz) {
       +
       +  __syncthreads();
       +  const Float f  = dev_ns_f[cellidx];
       +  Float  f1;
       +  Float3 f2;
       +
       +  if (nijac == 0) {
       +  __syncthreads();
       +  f1 = dev_ns_f1[cellidx];
       +  f2 = dev_ns_f2[cellidx];
       +  }
       +
       +  if (x == 0) {
       +  cellidx = idx(nx,y,z);
       +  dev_ns_f[cellidx] = f;
       +  if (nijac == 0) {
       +  dev_ns_f1[cellidx] = f1;
       +  dev_ns_f2[cellidx] = f2;
       +  }
       +  }
       +  if (x == nx-1) {
       +  cellidx = idx(-1,y,z);
       +  dev_ns_f[cellidx] = f;
       +  if (nijac == 0) {
       +  dev_ns_f1[cellidx] = f1;
       +  dev_ns_f2[cellidx] = f2;
       +  }
       +  }
       +
       +  if (y == 0) {
       +  cellidx = idx(x,ny,z);
       +  dev_ns_f[cellidx] = f;
       +  if (nijac == 0) {
       +  dev_ns_f1[cellidx] = f1;
       +  dev_ns_f2[cellidx] = f2;
       +  }
       +  }
       +  if (y == ny-1) {
       +  cellidx = idx(x,-1,z);
       +  dev_ns_f[cellidx] = f;
       +  if (nijac == 0) {
       +  dev_ns_f1[cellidx] = f1;
       +  dev_ns_f2[cellidx] = f2;
       +  }
       +  }
       +
       +  if (z == 0) {
       +  cellidx = idx(x,y,nz);
       +  dev_ns_f[cellidx] = f;
       +  if (nijac == 0) {
       +  dev_ns_f1[cellidx] = f1;
       +  dev_ns_f2[cellidx] = f2;
       +  }
       +  }
       +  if (z == nz-1) {
       +  cellidx = idx(x,y,-1);
       +  dev_ns_f[cellidx] = f;
       +  if (nijac == 0) {
       +  dev_ns_f1[cellidx] = f1;
       +  dev_ns_f2[cellidx] = f2;
       +  }
       +  }
       +  }
       +  }
        */
        
        // Find the porosity in each cell on the base of a sphere, centered at the cell
        // center. 
        __global__ void findPorositiesVelocitiesDiametersSpherical(
       -        const unsigned int* dev_cellStart,
       -        const unsigned int* dev_cellEnd,
       -        const Float4* dev_x_sorted,
       -        const Float4* dev_vel_sorted,
       -        Float*  dev_ns_phi,
       -        Float*  dev_ns_dphi,
       -        Float3* dev_ns_vp_avg,
       -        Float*  dev_ns_d_avg,
       -        const unsigned int iteration,
       -        const unsigned int np,
       -        const Float c_phi)
       +    const unsigned int* __restrict__ dev_cellStart,
       +    const unsigned int* __restrict__ dev_cellEnd,
       +    const Float4* __restrict__ dev_x_sorted,
       +    const Float4* __restrict__ dev_vel_sorted,
       +    Float*  __restrict__ dev_ns_phi,
       +    Float*  __restrict__ dev_ns_dphi,
       +    Float3* __restrict__ dev_ns_vp_avg,
       +    Float*  __restrict__ dev_ns_d_avg,
       +    const unsigned int iteration,
       +    const unsigned int np,
       +    const Float c_phi)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -1114,9 +1121,9 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
        
                    // Cell sphere center position
                    const Float3 X = MAKE_FLOAT3(
       -                    x*dx + 0.5*dx,
       -                    y*dy + 0.5*dy,
       -                    z*dz + 0.5*dz);
       +                x*dx + 0.5*dx,
       +                y*dy + 0.5*dy,
       +                z*dz + 0.5*dz);
        
                    Float d, r;
                    Float phi = 1.00;
       t@@ -1143,12 +1150,12 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
        
                    // Iterate over 27 neighbor cells, R = cell width
                    /*for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
       -                for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
       -                    for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/
       +              for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
       +              for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/
        
                    // Iterate over 27 neighbor cells, R = 2*cell width
                    for (int z_dim=-2; z_dim<3; ++z_dim) { // z-axis
       -            //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
       +                //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
                        for (int y_dim=-2; y_dim<3; ++y_dim) { // y-axis
                            for (int x_dim=-2; x_dim<3; ++x_dim) { // x-axis
        
       t@@ -1186,9 +1193,9 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
        
                                            // Find center distance
                                            dist = MAKE_FLOAT3(
       -                                            X.x - xr.x, 
       -                                            X.y - xr.y,
       -                                            X.z - xr.z);
       +                                        X.x - xr.x, 
       +                                        X.y - xr.y,
       +                                        X.z - xr.z);
                                            dist += distmod;
                                            d = length(dist);
        
       t@@ -1196,10 +1203,10 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
                                            if ((R - r) < d && d < (R + r)) {
                                                void_volume -=
                                                    1.0/(12.0*d) * (
       -                                                    M_PI*(R + r - d)*(R + r - d)
       -                                                    *(d*d + 2.0*d*r - 3.0*r*r
       -                                                        + 2.0*d*R + 6.0*r*R
       -                                                        - 3.0*R*R) );
       +                                                M_PI*(R + r - d)*(R + r - d)
       +                                                *(d*d + 2.0*d*r - 3.0*r*r
       +                                                  + 2.0*d*R + 6.0*r*R
       +                                                  - 3.0*R*R) );
                                                v_avg += MAKE_FLOAT3(v.x, v.y, v.z);
                                                d_avg += 2.0*r;
                                                n++;
       t@@ -1259,8 +1266,8 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
                    //Float phi = 0.5;
                    //Float dphi = 0.0;
                    //if (iteration == 20 && x == nx/2 && y == ny/2 && z == nz/2) {
       -                //phi = 0.4;
       -                //dphi = 0.1;
       +            //phi = 0.4;
       +            //dphi = 0.1;
                    //}
                    //dev_ns_phi[cellidx]  = phi;
                    //dev_ns_dphi[cellidx] = dphi;
       t@@ -1276,17 +1283,17 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
        // Find the porosity in each cell on the base of a sphere, centered at the cell
        // center. 
        __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
       -        const unsigned int* dev_cellStart,
       -        const unsigned int* dev_cellEnd,
       -        const Float4* dev_x_sorted,
       -        const Float4* dev_vel_sorted,
       -        Float*  dev_ns_phi,
       -        Float*  dev_ns_dphi,
       -        Float3* dev_ns_vp_avg,
       -        Float*  dev_ns_d_avg,
       -        const unsigned int iteration,
       -        const unsigned int ndem,
       -        const unsigned int np)
       +    const unsigned int* __restrict__ dev_cellStart,
       +    const unsigned int* __restrict__ dev_cellEnd,
       +    const Float4* __restrict__ dev_x_sorted,
       +    const Float4* __restrict__ dev_vel_sorted,
       +    Float*  __restrict__ dev_ns_phi,
       +    Float*  __restrict__ dev_ns_dphi,
       +    Float3* __restrict__ dev_ns_vp_avg,
       +    Float*  __restrict__ dev_ns_d_avg,
       +    const unsigned int iteration,
       +    const unsigned int ndem,
       +    const unsigned int np)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -1304,7 +1311,7 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
            const Float dz = devC_grid.L[2]/nz;
        
            // Cell sphere radius
       -    const Float R = fmin(dx, fmin(dy,dz));       // diameter = 2*cell width
       +    const Float R = fmin(dx, fmin(dy,dz));  // diameter = 2*cell width
        
            Float4 xr;  // particle pos. and radius
        
       t@@ -1315,9 +1322,9 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
        
                    // Cell sphere center position
                    const Float3 X = MAKE_FLOAT3(
       -                    x*dx + 0.5*dx,
       -                    y*dy + 0.5*dy,
       -                    z*dz + 0.5*dz);
       +                x*dx + 0.5*dx,
       +                y*dy + 0.5*dy,
       +                z*dz + 0.5*dz);
        
                    Float d, r;
                    Float phi = 1.00;
       t@@ -1396,9 +1403,9 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
        
                                            // Find center distance and normal vector
                                            x_p = MAKE_FLOAT3(
       -                                            xr.x - X.x,
       -                                            xr.y - X.y,
       -                                            xr.z - X.z);
       +                                        xr.x - X.x,
       +                                        xr.y - X.y,
       +                                        xr.z - X.z);
                                            d = length(x_p);
                                            n_p = x_p/d;
                                            q = d/R;
       t@@ -1438,8 +1445,8 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
                    phi = phi_0 + dphi/(ndem*devC_dt);
        
                    //if (dot_epsilon_kk != 0.0)
       -                //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n",
       -                        //x,y,z, dot_epsilon_kk, dphi, phi);
       +            //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n",
       +            //x,y,z, dot_epsilon_kk, dphi, phi);
        
                    // Make sure that the porosity is in the interval [0.0;1.0]
                    phi = fmin(1.00, fmax(0.00, phi));
       t@@ -1484,11 +1491,11 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
        
        // Modulate the hydraulic pressure at the upper boundary
        __global__ void setUpperPressureNS(
       -        Float* dev_ns_p,
       -        Float* dev_ns_epsilon,
       -        Float* dev_ns_epsilon_new,
       -        Float  beta,
       -        const Float new_pressure)
       +    Float* __restrict__ dev_ns_p,
       +    Float* __restrict__ dev_ns_epsilon,
       +    Float* __restrict__ dev_ns_epsilon_new,
       +    const Float  beta,
       +    const Float new_pressure)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -1497,8 +1504,8 @@ __global__ void setUpperPressureNS(
            
            // check that the thread is located at the top boundary
            if (x < devC_grid.num[0] &&
       -            y < devC_grid.num[1] &&
       -            z == devC_grid.num[2]-1) {
       +        y < devC_grid.num[1] &&
       +        z == devC_grid.num[2]-1) {
        
                const unsigned int cellidx = idx(x,y,z);
        
       t@@ -1524,13 +1531,13 @@ __global__ void setUpperPressureNS(
        // Find the gradient in a cell in a homogeneous, cubic 3D scalar field using
        // finite central differences
        __device__ Float3 gradient(
       -        const Float* dev_scalarfield,
       -        const unsigned int x,
       -        const unsigned int y,
       -        const unsigned int z,
       -        const Float dx,
       -        const Float dy,
       -        const Float dz)
       +    const Float* __restrict__ dev_scalarfield,
       +    const unsigned int x,
       +    const unsigned int y,
       +    const unsigned int z,
       +    const Float dx,
       +    const Float dy,
       +    const Float dz)
        {
            // Read 6 neighbor cells
            __syncthreads();
       t@@ -1544,26 +1551,26 @@ __device__ Float3 gradient(
        
            //__syncthreads();
            //if (p != 0.0)
       -        //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p);
       +    //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p);
        
            // Calculate central-difference gradients
            return MAKE_FLOAT3(
       -            (xp - xn)/(2.0*dx),
       -            (yp - yn)/(2.0*dy),
       -            (zp - zn)/(2.0*dz));
       +        (xp - xn)/(2.0*dx),
       +        (yp - yn)/(2.0*dy),
       +        (zp - zn)/(2.0*dz));
        }
        
        // Find the divergence in a cell in a homogeneous, cubic, 3D vector field
        __device__ Float divergence(
       -        const Float* dev_vectorfield_x,
       -        const Float* dev_vectorfield_y,
       -        const Float* dev_vectorfield_z,
       -        const unsigned int x,
       -        const unsigned int y,
       -        const unsigned int z,
       -        const Float dx,
       -        const Float dy,
       -        const Float dz)
       +    const Float* __restrict__ dev_vectorfield_x,
       +    const Float* __restrict__ dev_vectorfield_y,
       +    const Float* __restrict__ dev_vectorfield_z,
       +    const unsigned int x,
       +    const unsigned int y,
       +    const unsigned int z,
       +    const Float dx,
       +    const Float dy,
       +    const Float dz)
        {
            // Read 6 cell-face values
            __syncthreads();
       t@@ -1583,13 +1590,13 @@ __device__ Float divergence(
        
        // Find the divergence of a tensor field
        __device__ Float3 divergence_tensor(
       -        Float*  dev_tensorfield,
       -        const unsigned int x,
       -        const unsigned int y,
       -        const unsigned int z,
       -        const Float dx,
       -        const Float dy,
       -        const Float dz)
       +    const Float* __restrict__ dev_tensorfield,
       +    const unsigned int x,
       +    const unsigned int y,
       +    const unsigned int z,
       +    const Float dx,
       +    const Float dy,
       +    const Float dz)
        {
            __syncthreads();
        
       t@@ -1638,18 +1645,18 @@ __device__ Float3 divergence_tensor(
        
            // Calculate div(phi*tau)
            const Float3 div_tensor = MAKE_FLOAT3(
       -            // x
       -            (t_xx_xp - t_xx_xn)/dx +
       -            (t_xy_yp - t_xy_yn)/dy +
       -            (t_xz_zp - t_xz_zn)/dz,
       -            // y
       -            (t_xy_xp - t_xy_xn)/dx +
       -            (t_yy_yp - t_yy_yn)/dy +
       -            (t_yz_zp - t_yz_zn)/dz,
       -            // z
       -            (t_xz_xp - t_xz_xn)/dx +
       -            (t_yz_yp - t_yz_yn)/dy +
       -            (t_zz_zp - t_zz_zn)/dz);
       +        // x
       +        (t_xx_xp - t_xx_xn)/dx +
       +        (t_xy_yp - t_xy_yn)/dy +
       +        (t_xz_zp - t_xz_zn)/dz,
       +        // y
       +        (t_xy_xp - t_xy_xn)/dx +
       +        (t_yy_yp - t_yy_yn)/dy +
       +        (t_yz_zp - t_yz_zn)/dz,
       +        // z
       +        (t_xz_xp - t_xz_xn)/dx +
       +        (t_yz_yp - t_yz_yn)/dy +
       +        (t_zz_zp - t_zz_zn)/dz);
        
        #ifdef CHECK_NS_FINITE
            (void)checkFiniteFloat3("div_tensor", x, y, z, div_tensor);
       t@@ -1661,8 +1668,8 @@ __device__ Float3 divergence_tensor(
        // Find the spatial gradient in e.g. pressures per cell
        // using first order central differences
        __global__ void findNSgradientsDev(
       -        Float* dev_scalarfield,     // in
       -        Float3* dev_vectorfield)    // out
       +    const Float* __restrict__ dev_scalarfield,     // in
       +    Float3* __restrict__ dev_vectorfield)    // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -1699,8 +1706,8 @@ __global__ void findNSgradientsDev(
        
        // Find the outer product of v v
        __global__ void findvvOuterProdNS(
       -        Float3* dev_ns_v,       // in
       -        Float*  dev_ns_v_prod)  // out
       +    const Float3* __restrict__ dev_ns_v,       // in
       +    Float*  __restrict__ dev_ns_v_prod)  // out
        {
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
            const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       t@@ -1750,8 +1757,8 @@ __global__ void findvvOuterProdNS(
        // Find the fluid stress tensor. It is symmetrical, and can thus be saved in 6
        // values in 3D.
        __global__ void findNSstressTensor(
       -        Float3* dev_ns_v,       // in
       -        Float*  dev_ns_tau)     // out
       +    const Float3* __restrict__ dev_ns_v,       // in
       +    Float* __restrict__ dev_ns_tau)     // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -1809,15 +1816,15 @@ __global__ void findNSstressTensor(
                    devC_params.mu*((zp.y - zn.y)/(2.0*dz) + (yp.z - yn.z)/(2.0*dy));
        
                /*
       -        if (x == 0 && y == 0 && z == 0)
       -            printf("mu = %f\n", mu);
       -        if (tau_xz > 1.0e-6)
       -            printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx);
       -        if (tau_yz > 1.0e-6)
       -            printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy);
       -        if (tau_zz > 1.0e-6)
       -            printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz);
       -            */
       +          if (x == 0 && y == 0 && z == 0)
       +          printf("mu = %f\n", mu);
       +          if (tau_xz > 1.0e-6)
       +          printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx);
       +          if (tau_yz > 1.0e-6)
       +          printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy);
       +          if (tau_zz > 1.0e-6)
       +          printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz);
       +        */
        
                // Store values in global memory
                __syncthreads();
       t@@ -1842,9 +1849,9 @@ __global__ void findNSstressTensor(
        
        // Find the divergence of phi*v*v
        __global__ void findNSdivphiviv(
       -        Float*  dev_ns_phi,          // in
       -        Float3* dev_ns_v,            // in
       -        Float3* dev_ns_div_phi_vi_v) // out
       +    const Float*  __restrict__ dev_ns_phi,          // in
       +    const Float3* __restrict__ dev_ns_v,            // in
       +    Float3* __restrict__ dev_ns_div_phi_vi_v) // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -1888,58 +1895,58 @@ __global__ void findNSdivphiviv(
                // Calculate upwind coefficients
                //*
                const Float3 a = MAKE_FLOAT3(
       -                copysign(1.0, v.x),
       -                copysign(1.0, v.y),
       -                copysign(1.0, v.z));
       +            copysign(1.0, v.x),
       +            copysign(1.0, v.y),
       +            copysign(1.0, v.z));
        
                // Calculate the divergence based on the upwind differences (Griebel et
                // al. 1998, eq. 3.9)
                const Float3 div_uw = MAKE_FLOAT3(
       -                // x
       -                ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) +
       -                (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) +
       +            // x
       +            ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) +
       +             (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) +
        
       -                ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) +
       -                (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) +
       +            ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) +
       +             (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) +
        
       -                ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) +
       -                (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz),
       +            ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) +
       +             (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz),
        
       -                // y
       -                ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) +
       -                (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) +
       +            // y
       +            ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) +
       +             (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) +
        
       -                ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) +
       -                (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) +
       +            ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) +
       +             (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) +
        
       -                ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) +
       -                (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz),
       +            ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) +
       +             (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz),
        
       -                // z
       -                ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) +
       -                (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) +
       +            // z
       +            ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) +
       +             (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) +
        
       -                ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) +
       -                (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) +
       +            ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) +
       +             (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) +
        
       -                ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) +
       -                (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz));
       +            ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) +
       +             (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz));
        
        
                // Calculate the divergence based on the central-difference gradients
                const Float3 div_cd = MAKE_FLOAT3(
       -                // x
       -                (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
       -                (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
       -                (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
       -                // y
       -                (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
       -                (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
       -                (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
       -                // z
       -                (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
       -                (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
       -                (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
       +            // x
       +            (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
       +            (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
       +            (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
       +            // y
       +            (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
       +            (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
       +            (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
       +            // z
       +            (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
       +            (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
       +            (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
        
                // Weighting parameter
                const Float tau = 0.5;
       t@@ -1951,26 +1958,26 @@ __global__ void findNSdivphiviv(
                /*
                // Calculate the divergence: div(phi*v_i*v)
                const Float3 div_phi_vi_v = MAKE_FLOAT3(
       -                // x
       -                (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
       -                (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
       -                (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
       -                // y
       -                (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
       -                (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
       -                (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
       -                // z
       -                (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
       -                (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
       -                (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
       -                // */
       +        // x
       +        (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
       +        (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
       +        (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
       +        // y
       +        (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
       +        (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
       +        (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
       +        // z
       +        (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
       +        (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
       +        (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
       +        // */
        
                // Write divergence
                __syncthreads();
                dev_ns_div_phi_vi_v[cellidx] = div_phi_vi_v;
        
                //printf("div(phi*v*v) [%d,%d,%d] = %f, %f, %f\n", x,y,z,
       -                //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z);
       +        //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z);
        
        #ifdef CHECK_NS_FINITE
                (void)checkFiniteFloat3("div_phi_vi_v", x, y, z, div_phi_vi_v);
       t@@ -1979,8 +1986,8 @@ __global__ void findNSdivphiviv(
        }
        
        __global__ void findNSdivtau(
       -        Float*  dev_ns_tau,      // in
       -        Float3* dev_ns_div_tau)  // out
       +    const Float* __restrict__ dev_ns_tau,      // in
       +    Float3* __restrict__ dev_ns_div_tau)  // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2015,9 +2022,9 @@ __global__ void findNSdivtau(
        
        // Find the divergence of phi*tau
        __global__ void findNSdivphitau(
       -        Float*  dev_ns_phi,          // in
       -        Float*  dev_ns_tau,          // in
       -        Float3* dev_ns_div_phi_tau)  // out
       +    const Float* __restrict__ dev_ns_phi,          // in
       +    const Float* __restrict__ dev_ns_tau,          // in
       +    Float3* __restrict__ dev_ns_div_phi_tau)  // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2094,18 +2101,18 @@ __global__ void findNSdivphitau(
        
                // Calculate div(phi*tau)
                const Float3 div_phi_tau = MAKE_FLOAT3(
       -                // x
       -                (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx +
       -                (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy +
       -                (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz,
       -                // y
       -                (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx +
       -                (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy +
       -                (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz,
       -                // z
       -                (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx +
       -                (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy +
       -                (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz);
       +            // x
       +            (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx +
       +            (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy +
       +            (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz,
       +            // y
       +            (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx +
       +            (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy +
       +            (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz,
       +            // z
       +            (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx +
       +            (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy +
       +            (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz);
        
                // Write divergence
                __syncthreads();
       t@@ -2120,9 +2127,9 @@ __global__ void findNSdivphitau(
        // Find the divergence of phi v v
        // Unused
        __global__ void findNSdivphivv(
       -        Float*  dev_ns_v_prod, // in
       -        Float*  dev_ns_phi,    // in
       -        Float3* dev_ns_div_phi_v_v) // out
       +    const Float* __restrict__ dev_ns_v_prod, // in
       +    const Float* __restrict__ dev_ns_phi,    // in
       +    Float3* __restrict__ dev_ns_div_phi_v_v) // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2174,24 +2181,24 @@ __global__ void findNSdivphivv(
                // The symmetry described in findvvOuterProdNS is used
                __syncthreads();
                const Float3 div = MAKE_FLOAT3(
       -                ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp
       -                  - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) +
       -                 (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp
       -                  - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) +
       -                 (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp
       -                  - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)),
       -                ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp
       -                  - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) +
       -                 (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp
       -                  - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) +
       -                 (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp
       -                  - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)),
       -                ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp
       -                  - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) +
       -                 (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp
       -                  - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) +
       -                 (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp
       -                  - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) );
       +            ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp
       +              - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) +
       +             (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp
       +              - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) +
       +             (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp
       +              - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)),
       +            ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp
       +              - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) +
       +             (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp
       +              - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) +
       +             (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp
       +              - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)),
       +            ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp
       +              - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) +
       +             (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp
       +              - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) +
       +             (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp
       +              - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) );
        
                //printf("div[%d,%d,%d] = %f\t%f\t%f\n", x, y, z, div.x, div.y, div.z);
        
       t@@ -2209,25 +2216,25 @@ __global__ void findNSdivphivv(
        // Find predicted fluid velocity
        // Launch per face.
        __global__ void findPredNSvelocities(
       -        Float*  dev_ns_p,               // in
       -        Float*  dev_ns_v_x,             // in
       -        Float*  dev_ns_v_y,             // in
       -        Float*  dev_ns_v_z,             // in
       -        Float*  dev_ns_phi,             // in
       -        Float*  dev_ns_dphi,            // in
       -        Float*  dev_ns_div_tau_x,       // in
       -        Float*  dev_ns_div_tau_y,       // in
       -        Float*  dev_ns_div_tau_z,       // in
       -        Float3* dev_ns_div_phi_vi_v,    // in
       -        int     bc_bot,                 // in
       -        int     bc_top,                 // in
       -        Float   beta,                   // in
       -        Float3* dev_ns_F_pf,            // in
       -        unsigned int ndem,              // in
       -        Float   c_grad_p,               // in
       -        Float*  dev_ns_v_p_x,           // out
       -        Float*  dev_ns_v_p_y,           // out
       -        Float*  dev_ns_v_p_z)           // out
       +    const Float*  __restrict__ dev_ns_p,               // in
       +    const Float*  __restrict__ dev_ns_v_x,             // in
       +    const Float*  __restrict__ dev_ns_v_y,             // in
       +    const Float*  __restrict__ dev_ns_v_z,             // in
       +    const Float*  __restrict__ dev_ns_phi,             // in
       +    const Float*  __restrict__ dev_ns_dphi,            // in
       +    const Float*  __restrict__ dev_ns_div_tau_x,       // in
       +    const Float*  __restrict__ dev_ns_div_tau_y,       // in
       +    const Float*  __restrict__ dev_ns_div_tau_z,       // in
       +    const Float3* __restrict__ dev_ns_div_phi_vi_v,    // in
       +    const int     bc_bot,                 // in
       +    const int     bc_top,                 // in
       +    const Float   beta,                   // in
       +    const Float3* __restrict__ dev_ns_F_pf,            // in
       +    const unsigned int ndem,              // in
       +    const Float   __restrict__ c_grad_p,               // in
       +    Float* __restrict__ dev_ns_v_p_x,           // out
       +    Float* __restrict__ dev_ns_v_p_y,           // out
       +    Float* __restrict__ dev_ns_v_p_z)           // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2255,9 +2262,9 @@ __global__ void findPredNSvelocities(
                // Values that are needed for calculating the predicted velocity
                __syncthreads();
                const Float3 v = MAKE_FLOAT3(
       -                dev_ns_v_x[fidx],
       -                dev_ns_v_y[fidx],
       -                dev_ns_v_z[fidx]);
       +            dev_ns_v_x[fidx],
       +            dev_ns_v_y[fidx],
       +            dev_ns_v_z[fidx]);
        
                Float3 div_tau = MAKE_FLOAT3(0.0, 0.0, 0.0);
                if (devC_params.mu > 0.0) {
       t@@ -2285,13 +2292,13 @@ __global__ void findPredNSvelocities(
        
                // component-wise average values
                const Float3 phi = MAKE_FLOAT3(
       -                amean(phi_c, phi_xn),
       -                amean(phi_c, phi_yn),
       -                amean(phi_c, phi_zn));
       +            amean(phi_c, phi_xn),
       +            amean(phi_c, phi_yn),
       +            amean(phi_c, phi_zn));
                const Float3 dphi = MAKE_FLOAT3(
       -                amean(dphi_c, dphi_xn),
       -                amean(dphi_c, dphi_yn),
       -                amean(dphi_c, dphi_zn));
       +            amean(dphi_c, dphi_xn),
       +            amean(dphi_c, dphi_yn),
       +            amean(dphi_c, dphi_zn));
        
                // The particle-fluid interaction force should only be incoorporated if
                // there is a fluid viscosity
       t@@ -2308,9 +2315,9 @@ __global__ void findPredNSvelocities(
                    f_i_zn = MAKE_FLOAT3(0.0, 0.0, 0.0);
                }
                const Float3 f_i = MAKE_FLOAT3(
       -                amean(f_i_c.x, f_i_xn.x),
       -                amean(f_i_c.y, f_i_yn.y),
       -                amean(f_i_c.z, f_i_zn.z));
       +            amean(f_i_c.x, f_i_xn.x),
       +            amean(f_i_c.y, f_i_yn.y),
       +            amean(f_i_c.z, f_i_zn.z));
        
                const Float dt = ndem*devC_dt;
                const Float rho = devC_params.rho_f;
       t@@ -2326,9 +2333,9 @@ __global__ void findPredNSvelocities(
                    const Float p_yn = dev_ns_p[idx(x,y-1,z)];
                    const Float p_zn = dev_ns_p[idx(x,y,z-1)];
                    const Float3 grad_p = MAKE_FLOAT3(
       -                    (p - p_xn)/dx,
       -                    (p - p_yn)/dy,
       -                    (p - p_zn)/dz) * c_grad_p;
       +                (p - p_xn)/dx,
       +                (p - p_yn)/dy,
       +                (p - p_zn)/dz) * c_grad_p;
        #ifdef SET_1
                    pressure_term = -beta*dt/(rho*phi)*grad_p;
        #endif
       t@@ -2338,16 +2345,16 @@ __global__ void findPredNSvelocities(
                }
        
                const Float3 div_phi_vi_v = MAKE_FLOAT3(
       -                amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x),
       -                amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y),
       -                amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z));
       +            amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x),
       +            amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y),
       +            amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z));
        
                // Determine the predicted velocity
        #ifdef SET_1
                const Float3 interaction_term = -dt/(rho*phi)*f_i;
                const Float3 diffusion_term = dt/(rho*phi)*div_tau;
                const Float3 gravity_term = MAKE_FLOAT3(
       -                    devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
       +            devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
                const Float3 porosity_term = -1.0*v*dphi/phi;
                const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
        #endif
       t@@ -2355,7 +2362,7 @@ __global__ void findPredNSvelocities(
                const Float3 interaction_term = -dt/(rho*phi)*f_i;
                const Float3 diffusion_term = dt/rho*div_tau;
                const Float3 gravity_term = MAKE_FLOAT3(
       -                devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
       +            devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
                const Float3 porosity_term = -1.0*v*dphi/phi;
                const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
        #endif
       t@@ -2376,31 +2383,31 @@ __global__ void findPredNSvelocities(
        
                // No slip
                /*if ((z == 0 && bc_bot == 2) || (z == nz-1 && bc_top == 2)) {
       -            v_p.x = 0.0;
       -            v_p.y = 0.0;
       -            v_p.z = 0.0;
       -            }*/
       +          v_p.x = 0.0;
       +          v_p.y = 0.0;
       +          v_p.z = 0.0;
       +          }*/
        
        
        #ifdef REPORT_V_P_COMPONENTS
                // Report velocity components to stdout for debugging
                if (z==0)
       -        printf("\n[%d,%d,%d]"
       -               "\tv_p      = %+e %+e %+e\n"
       -               "\tpres     = %+e %+e %+e\n"
       -               "\tinteract = %+e %+e %+e\n"
       -               "\tdiff     = %+e %+e %+e\n"
       -               "\tgrav     = %+e %+e %+e\n"
       -               "\tporos    = %+e %+e %+e\n"
       -               "\tadv      = %+e %+e %+e\n",
       -               x, y, z,
       -               v_p.x, v_p.y, v_p.z,
       -               pressure_term.x, pressure_term.y, pressure_term.z, 
       -               interaction_term.x, interaction_term.y, interaction_term.z, 
       -               diffusion_term.x, diffusion_term.y, diffusion_term.z, 
       -               gravity_term.x, gravity_term.y, gravity_term.z, 
       -               porosity_term.x, porosity_term.y, porosity_term.z, 
       -               advection_term.x, advection_term.y, advection_term.z);
       +            printf("\n[%d,%d,%d]"
       +                   "\tv_p      = %+e %+e %+e\n"
       +                   "\tpres     = %+e %+e %+e\n"
       +                   "\tinteract = %+e %+e %+e\n"
       +                   "\tdiff     = %+e %+e %+e\n"
       +                   "\tgrav     = %+e %+e %+e\n"
       +                   "\tporos    = %+e %+e %+e\n"
       +                   "\tadv      = %+e %+e %+e\n",
       +                   x, y, z,
       +                   v_p.x, v_p.y, v_p.z,
       +                   pressure_term.x, pressure_term.y, pressure_term.z, 
       +                   interaction_term.x, interaction_term.y, interaction_term.z, 
       +                   diffusion_term.x, diffusion_term.y, diffusion_term.z, 
       +                   gravity_term.x, gravity_term.y, gravity_term.z, 
       +                   porosity_term.x, porosity_term.y, porosity_term.z, 
       +                   advection_term.x, advection_term.y, advection_term.z);
        #endif
        
                // Save the predicted velocity
       t@@ -2421,18 +2428,18 @@ __global__ void findPredNSvelocities(
        // At each iteration, the value of the forcing function is found as:
        //   f = f1 - f2 dot grad(epsilon)
        __global__ void findNSforcing(
       -        Float*  dev_ns_epsilon,     // in
       -        Float*  dev_ns_phi,         // in
       -        Float*  dev_ns_dphi,        // in
       -        Float3* dev_ns_v_p,         // in
       -        Float*  dev_ns_v_p_x,       // in
       -        Float*  dev_ns_v_p_y,       // in
       -        Float*  dev_ns_v_p_z,       // in
       -        unsigned int nijac,         // in
       -        unsigned int ndem,          // in
       -        Float*  dev_ns_f1,          // out
       -        Float3* dev_ns_f2,          // out
       -        Float*  dev_ns_f)           // out
       +    const Float*  __restrict__ dev_ns_epsilon,     // in
       +    const Float*  __restrict__ dev_ns_phi,         // in
       +    const Float*  __restrict__ dev_ns_dphi,        // in
       +    const Float3* __restrict__ dev_ns_v_p,         // in
       +    const Float*  __restrict__ dev_ns_v_p_x,       // in
       +    const Float*  __restrict__ dev_ns_v_p_y,       // in
       +    const Float*  __restrict__ dev_ns_v_p_z,       // in
       +    const unsigned int nijac,                      // in
       +    const unsigned int ndem,                       // in
       +    Float*  __restrict__ dev_ns_f1,                // out
       +    Float3* __restrict__ dev_ns_f2,                // out
       +    Float*  __restrict__ dev_ns_f)                 // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2472,7 +2479,7 @@ __global__ void findNSforcing(
                    // Calculate derivatives
                    const Float  div_v_p
                        = divergence(dev_ns_v_p_x, dev_ns_v_p_y, dev_ns_v_p_z,
       -                        x, y, z, dx, dy, dz);
       +                             x, y, z, dx, dy, dz);
                    const Float3 grad_phi
                        = gradient(dev_ns_phi, x, y, z, dx, dy, dz);
        
       t@@ -2497,7 +2504,7 @@ __global__ void findNSforcing(
        #ifdef REPORT_FORCING_TERMS
                    // Report values terms in the forcing function for debugging
                    printf("[%d,%d,%d]\tt1 = %f\tt2 = %f\tt4 = %f\n",
       -                    x,y,z, t1, t2, t4);
       +                   x,y,z, t1, t2, t4);
        #endif
        
                    // Save values
       t@@ -2524,7 +2531,7 @@ __global__ void findNSforcing(
                const Float t3 = -dot(f2, grad_epsilon);
                if (z >= nz-3)
                    printf("[%d,%d,%d]\tf = %f\tf1 = %f\tt3 = %f\n",
       -                    x,y,z, f, f1, t3);
       +                   x,y,z, f, f1, t3);
        #endif
        
                // Save forcing function value
       t@@ -2542,10 +2549,10 @@ __global__ void findNSforcing(
        // non-smoothed and smoothed values.
        template<typename T>
        __global__ void smoothing(
       -        T* dev_arr,
       -        const Float gamma,
       -        const unsigned int bc_bot,
       -        const unsigned int bc_top)
       +    T* __restrict__ dev_arr,
       +    const Float gamma,
       +    const unsigned int bc_bot,
       +    const unsigned int bc_top)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2602,13 +2609,13 @@ __global__ void smoothing(
        
        // Perform a single Jacobi iteration
        __global__ void jacobiIterationNS(
       -        const Float* dev_ns_epsilon,
       -        Float* dev_ns_epsilon_new,
       -        Float* dev_ns_norm,
       -        const Float* dev_ns_f,
       -        const int bc_bot,
       -        const int bc_top,
       -        const Float theta)
       +    const Float* __restrict__ dev_ns_epsilon,
       +    Float* __restrict__ dev_ns_epsilon_new,
       +    Float* __restrict__ dev_ns_norm,
       +    const Float* __restrict__ dev_ns_f,
       +    const int bc_bot,
       +    const int bc_top,
       +    const Float theta)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2672,9 +2679,9 @@ __global__ void jacobiIterationNS(
                const Float dzdz = dz*dz;
                Float e_new
                    = (-dxdx*dydy*dzdz*f
       -                    + dydy*dzdz*(e_xn + e_xp)
       -                    + dxdx*dzdz*(e_yn + e_yp)
       -                    + dxdx*dydy*(e_zn + e_zp))
       +               + dydy*dzdz*(e_xn + e_xp)
       +               + dxdx*dzdz*(e_yn + e_yp)
       +               + dxdx*dydy*(e_zn + e_zp))
                    /(2.0*(dxdx*dydy + dxdx*dzdz + dydy*dzdz));
        
                // New value of epsilon in 1D update
       t@@ -2682,7 +2689,7 @@ __global__ void jacobiIterationNS(
        
                // Print values for debugging
                /*printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\n",
       -                x,y,z, e, f, e_new);*/
       +          x,y,z, e, f, e_new);*/
        
                const Float res_norm = (e_new - e)*(e_new - e)/(e_new*e_new + 1.0e-16);
                const Float e_relax = e*(1.0-theta) + e_new*theta;
       t@@ -2697,7 +2704,7 @@ __global__ void jacobiIterationNS(
                //(void)checkFiniteFloat("res_norm", x, y, z, res_norm);
                if (checkFiniteFloat("res_norm", x, y, z, res_norm)) {
                    printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\tres_norm = %f\n",
       -                    x,y,z, e, f, e_new, res_norm);
       +                   x,y,z, e, f, e_new, res_norm);
                }
        #endif
            }
       t@@ -2706,8 +2713,8 @@ __global__ void jacobiIterationNS(
        // Copy all values from one array to the other
        template<typename T>
        __global__ void copyValues(
       -        T* dev_read,
       -        T* dev_write)
       +    const T* __restrict__ dev_read,
       +    T* __restrict__ dev_write)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2717,10 +2724,10 @@ __global__ void copyValues(
            // Internal nodes only
            if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
        
       -    // Internal nodes + ghost nodes
       -    /*if (x <= devC_grid.num[0]+1 &&
       -            y <= devC_grid.num[1]+1 &&
       -            z <= devC_grid.num[2]+1) {*/
       +        // Internal nodes + ghost nodes
       +        /*if (x <= devC_grid.num[0]+1 &&
       +          y <= devC_grid.num[1]+1 &&
       +          z <= devC_grid.num[2]+1) {*/
        
                const unsigned int cellidx = idx(x,y,z); // without ghost nodes
                //const unsigned int cellidx = idx(x-1,y-1,z-1); // with ghost nodes
       t@@ -2730,7 +2737,7 @@ __global__ void copyValues(
                const T val = dev_read[cellidx];
        
                //if (z == devC_grid.num[2]-1)
       -            //printf("[%d,%d,%d] = %f\n", x, y, z, val);
       +        //printf("[%d,%d,%d] = %f\n", x, y, z, val);
        
                // Write
                __syncthreads();
       t@@ -2740,11 +2747,11 @@ __global__ void copyValues(
        
        // Find and store the normalized residuals
        __global__ void findNormalizedResiduals(
       -        Float* dev_ns_epsilon_old,
       -        Float* dev_ns_epsilon,
       -        Float* dev_ns_norm,
       -        const unsigned int bc_bot,
       -        const unsigned int bc_top)
       +    const Float* __restrict__ dev_ns_epsilon_old,
       +    const Float* __restrict__ dev_ns_epsilon,
       +    Float* __restrict__ dev_ns_norm,
       +    const unsigned int bc_bot,
       +    const unsigned int bc_top)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2791,9 +2798,9 @@ __global__ void findNormalizedResiduals(
        
        // Computes the new velocity and pressure using the corrector
        __global__ void updateNSpressure(
       -        Float* dev_ns_epsilon,  // in
       -        Float  beta,            // in
       -        Float* dev_ns_p)        // out
       +    const Float* __restrict__ dev_ns_epsilon,  // in
       +    const Float  __restrict__ beta,            // in
       +    Float* __restrict__ dev_ns_p)              // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2825,19 +2832,19 @@ __global__ void updateNSpressure(
        }
        
        __global__ void updateNSvelocity(
       -        Float* dev_ns_v_p_x,    // in
       -        Float* dev_ns_v_p_y,    // in
       -        Float* dev_ns_v_p_z,    // in
       -        Float* dev_ns_phi,      // in
       -        Float* dev_ns_epsilon,  // in
       -        Float  beta,            // in
       -        int    bc_bot,          // in
       -        int    bc_top,          // in
       -        unsigned int ndem,      // in
       -        Float  c_grad_p,        // in
       -        Float* dev_ns_v_x,      // out
       -        Float* dev_ns_v_y,      // out
       -        Float* dev_ns_v_z)      // out
       +    const Float* __restrict__ dev_ns_v_p_x,    // in
       +    const Float* __restrict__ dev_ns_v_p_y,    // in
       +    const Float* __restrict__ dev_ns_v_p_z,    // in
       +    const Float* __restrict__ dev_ns_phi,      // in
       +    const Float* __restrict__ dev_ns_epsilon,  // in
       +    const Float  beta,            // in
       +    const int    bc_bot,          // in
       +    const int    bc_top,          // in
       +    const unsigned int ndem,      // in
       +    const Float  c_grad_p,        // in
       +    Float* __restrict__ dev_ns_v_x,      // out
       +    Float* __restrict__ dev_ns_v_y,      // out
       +    Float* __restrict__ dev_ns_v_z)      // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2879,15 +2886,15 @@ __global__ void updateNSvelocity(
                const Float phi_zn = dev_ns_phi[idx(x,y,z-1)];
        
                const Float3 phi = MAKE_FLOAT3(
       -                amean(phi_c, phi_xn),
       -                amean(phi_c, phi_yn),
       -                amean(phi_c, phi_zn));
       +            amean(phi_c, phi_xn),
       +            amean(phi_c, phi_yn),
       +            amean(phi_c, phi_zn));
        
                // Find corrector gradient
                const Float3 grad_epsilon = MAKE_FLOAT3(
       -                (epsilon_c - epsilon_xn)/dx,
       -                (epsilon_c - epsilon_yn)/dy,
       -                (epsilon_c - epsilon_zn)/dz) * c_grad_p;
       +            (epsilon_c - epsilon_xn)/dx,
       +            (epsilon_c - epsilon_yn)/dy,
       +            (epsilon_c - epsilon_zn)/dz) * c_grad_p;
        
                // Find new velocity
        #ifdef SET_1
       t@@ -2900,16 +2907,16 @@ __global__ void updateNSvelocity(
        
                // Print values for debugging
                /* if (z == 0) {
       -            Float e_up = dev_ns_epsilon[idx(x,y,z+1)];
       -            Float e_down = dev_ns_epsilon[idx(x,y,z-1)];
       -            printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n",
       -                    x,y,z,
       -                    grad_epsilon.x,
       -                    grad_epsilon.y,
       -                    grad_epsilon.z,
       -                    e_up,
       -                    e_down);
       -        }*/
       +           Float e_up = dev_ns_epsilon[idx(x,y,z+1)];
       +           Float e_down = dev_ns_epsilon[idx(x,y,z-1)];
       +           printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n",
       +           x,y,z,
       +           grad_epsilon.x,
       +           grad_epsilon.y,
       +           grad_epsilon.z,
       +           e_up,
       +           e_down);
       +           }*/
        
                if ((z == 0 && bc_bot == 1) || (z == nz-1 && bc_top == 1))
                    v.z = 0.0;
       t@@ -2919,11 +2926,11 @@ __global__ void updateNSvelocity(
        
                // Check the advection term using the Courant-Friedrichs-Lewy condition
                if (v.x*ndem*devC_dt/dx
       -                + v.y*ndem*devC_dt/dy
       -                + v.z*ndem*devC_dt/dz > 1.0) {
       +            + v.y*ndem*devC_dt/dy
       +            + v.z*ndem*devC_dt/dz > 1.0) {
                    printf("[%d,%d,%d] Warning: Advection term in fluid may be "
       -                    "unstable (CFL condition), v = %f,%f,%f\n",
       -                    x,y,z, v.x, v.y, v.z);
       +                   "unstable (CFL condition), v = %f,%f,%f\n",
       +                   x,y,z, v.x, v.y, v.z);
                }
        
                // Write new values
       t@@ -2941,12 +2948,12 @@ __global__ void updateNSvelocity(
        // Find the average particle diameter and velocity for each CFD cell.
        // UNUSED: The values are estimated in the porosity estimation function instead
        __global__ void findAvgParticleVelocityDiameter(
       -        unsigned int* dev_cellStart, // in
       -        unsigned int* dev_cellEnd,   // in
       -        Float4* dev_vel_sorted,      // in
       -        Float4* dev_x_sorted,        // in
       -        Float3* dev_ns_vp_avg,       // out
       -        Float*  dev_ns_d_avg)        // out
       +    const unsigned int* __restrict__ dev_cellStart, // in
       +    const unsigned int* __restrict__ dev_cellEnd,   // in
       +    const Float4* __restrict__ dev_vel_sorted,      // in
       +    const Float4* __restrict__ dev_x_sorted,        // in
       +    Float3* dev_ns_vp_avg,       // out
       +    Float*  dev_ns_d_avg)        // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -3023,18 +3030,18 @@ __device__ Float dragCoefficient(Float re)
        // Find particle-fluid interaction force as outlined by Zhou et al. 2010, and
        // originally by Gidaspow 1992.
        __global__ void findInteractionForce(
       -        Float4* dev_x,           // in
       -        Float4* dev_vel,         // in
       -        Float*  dev_ns_phi,      // in
       -        Float*  dev_ns_p,        // in
       -        Float*  dev_ns_v_x,      // in
       -        Float*  dev_ns_v_y,      // in
       -        Float*  dev_ns_v_z,      // in
       -        Float*  dev_ns_div_tau_x,// in
       -        Float*  dev_ns_div_tau_y,// in
       -        Float*  dev_ns_div_tau_z,// in
       -        Float3* dev_ns_f_pf,     // out
       -        Float4* dev_force)       // out
       +    const Float4* __restrict__ dev_x,           // in
       +    const Float4* __restrict__ dev_vel,         // in
       +    const Float*  __restrict__ dev_ns_phi,      // in
       +    const Float*  __restrict__ dev_ns_p,        // in
       +    const Float*  __restrict__ dev_ns_v_x,      // in
       +    const Float*  __restrict__ dev_ns_v_y,      // in
       +    const Float*  __restrict__ dev_ns_v_z,      // in
       +    const Float*  __restrict__ dev_ns_div_tau_x,// in
       +    const Float*  __restrict__ dev_ns_div_tau_y,// in
       +    const Float*  __restrict__ dev_ns_div_tau_z,// in
       +    Float3* __restrict__ dev_ns_f_pf,     // out
       +    Float4* __restrict__ dev_force)       // out
        {
            unsigned int i = threadIdx.x + blockIdx.x*blockDim.x; // Particle index
        
       t@@ -3080,14 +3087,14 @@ __global__ void findInteractionForce(
                const Float div_tau_z_p = dev_ns_div_tau_z[vidx(i_x,i_y,i_z+1)];
        
                const Float3 v_f = MAKE_FLOAT3(
       -                amean(v_x, v_x_p),
       -                amean(v_y, v_y_p),
       -                amean(v_z, v_z_p));
       +            amean(v_x, v_x_p),
       +            amean(v_y, v_y_p),
       +            amean(v_z, v_z_p));
        
                const Float3 div_tau = MAKE_FLOAT3(
       -                amean(div_tau_x, div_tau_x_p),
       -                amean(div_tau_y, div_tau_y_p),
       -                amean(div_tau_z, div_tau_z_p));
       +            amean(div_tau_x, div_tau_x_p),
       +            amean(div_tau_y, div_tau_y_p),
       +            amean(div_tau_z, div_tau_z_p));
        
                const Float3 v_rel = v_f - v_p;
                const Float  v_rel_length = length(v_rel);
       t@@ -3119,15 +3126,15 @@ __global__ void findInteractionForce(
        
        #ifdef CHECK_NS_FINITE
                /*
       -        printf("\nfindInteractionForce %d [%d,%d,%d]\n"
       -               "\tV_p = %f Re=%f Cd=%f chi=%f\n"
       -               "\tf_d = %+e %+e %+e\n"
       -               "\tf_p = %+e %+e %+e\n"
       -               "\tf_v = %+e %+e %+e\n",
       -                i, i_x, i_y, i_z, V_p, Re, Cd, chi,
       -                f_d.x, f_d.y, f_d.z,
       -                f_p.x, f_p.y, f_p.z,
       -                f_v.x, f_v.y, f_v.z);// */
       +          printf("\nfindInteractionForce %d [%d,%d,%d]\n"
       +          "\tV_p = %f Re=%f Cd=%f chi=%f\n"
       +          "\tf_d = %+e %+e %+e\n"
       +          "\tf_p = %+e %+e %+e\n"
       +          "\tf_v = %+e %+e %+e\n",
       +          i, i_x, i_y, i_z, V_p, Re, Cd, chi,
       +          f_d.x, f_d.y, f_d.z,
       +          f_p.x, f_p.y, f_p.z,
       +          f_v.x, f_v.y, f_v.z);// */
                checkFiniteFloat3("f_d", i_x, i_y, i_z, f_d);
                checkFiniteFloat3("f_p", i_x, i_y, i_z, f_p);
                checkFiniteFloat3("f_v", i_x, i_y, i_z, f_v);
       t@@ -3149,11 +3156,11 @@ __global__ void findInteractionForce(
        // Apply the fluid-particle interaction force to the fluid cell based on the
        // interaction forces from each particle in it
        __global__ void applyInteractionForceToFluid(
       -        unsigned int* dev_gridParticleIndex,    // in
       -        unsigned int* dev_cellStart,            // in
       -        unsigned int* dev_cellEnd,              // in
       -        Float3* dev_ns_f_pf,                    // in
       -        Float3* dev_ns_F_pf)                    // out
       +    const unsigned int* __restrict__ dev_gridParticleIndex,    // in
       +    const unsigned int* __restrict__ dev_cellStart,            // in
       +    const unsigned int* __restrict__ dev_cellEnd,              // in
       +    const Float3* __restrict__ dev_ns_f_pf,                    // in
       +    Float3* __restrict__ dev_ns_F_pf)                    // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -3212,10 +3219,10 @@ __global__ void applyInteractionForceToFluid(
        // Launch per cell face node.
        // Cell center ghost nodes must be set prior to call.
        __global__ void interpolateCenterToFace(
       -        Float3* dev_in,
       -        Float*  dev_out_x,
       -        Float*  dev_out_y,
       -        Float*  dev_out_z)
       +    const Float3* __restrict__ dev_in,
       +    Float* __restrict__ dev_out_x,
       +    Float* __restrict__ dev_out_y,
       +    Float* __restrict__ dev_out_z)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -3224,8 +3231,8 @@ __global__ void interpolateCenterToFace(
        
            // Check that we are not outside the fluid grid
            if (x <= devC_grid.num[0]
       -            && y <= devC_grid.num[1]
       -            && z <= devC_grid.num[2]) {
       +        && y <= devC_grid.num[1]
       +        && z <= devC_grid.num[2]) {
        
                const unsigned int faceidx = vidx(x,y,z);
        
       t@@ -3249,10 +3256,10 @@ __global__ void interpolateCenterToFace(
        
        // Launch per cell center node
        __global__ void interpolateFaceToCenter(
       -        Float*  dev_in_x,
       -        Float*  dev_in_y,
       -        Float*  dev_in_z,
       -        Float3* dev_out)
       +    Float*  dev_in_x,
       +    Float*  dev_in_y,
       +    Float*  dev_in_z,
       +    Float3* dev_out)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -3277,9 +3284,9 @@ __global__ void interpolateFaceToCenter(
                const Float z_p = dev_in_z[vidx(x,y,z+1)];
        
                const Float3 val = MAKE_FLOAT3(
       -                amean(x_n, x_p),
       -                amean(y_n, y_p),
       -                amean(z_n, z_p));
       +            amean(x_n, x_p),
       +            amean(y_n, y_p),
       +            amean(z_n, z_p));
        
                __syncthreads();
                //printf("[%d,%d,%d] = %f, %f, %f\n", x,y,z, val.x, val.y, val.z);
       t@@ -3292,12 +3299,12 @@ __global__ void interpolateFaceToCenter(
        // Warning: The grid-corner values will be invalid, along with the non-normal
        // components of the ghost nodes
        __global__ void findFaceDivTau(
       -        Float* dev_ns_v_x,
       -        Float* dev_ns_v_y,
       -        Float* dev_ns_v_z,
       -        Float* dev_ns_div_tau_x,
       -        Float* dev_ns_div_tau_y,
       -        Float* dev_ns_div_tau_z)
       +    const Float* __restrict__ dev_ns_v_x,
       +    const Float* __restrict__ dev_ns_v_y,
       +    const Float* __restrict__ dev_ns_v_z,
       +    Float* __restrict__ dev_ns_div_tau_x,
       +    Float* __restrict__ dev_ns_div_tau_y,
       +    Float* __restrict__ dev_ns_div_tau_z)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -3347,19 +3354,19 @@ __global__ void findFaceDivTau(
        
                const Float div_tau_x =
                    devC_params.mu*(
       -                    (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
       -                    (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
       -                    (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
       +                (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
       +                (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
       +                (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
                const Float div_tau_y =
                    devC_params.mu*(
       -                    (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
       -                    (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
       -                    (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
       +                (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
       +                (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
       +                (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
                const Float div_tau_z =
                    devC_params.mu*(
       -                    (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
       -                    (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
       -                    (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));
       +                (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
       +                (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
       +                (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));
        
                __syncthreads();
                //printf("div_tau [%d,%d,%d] = %f, %f, %f\n", x,y,z,
 (DIR) diff --git a/src/raytracer.cuh b/src/raytracer.cuh
       t@@ -6,12 +6,14 @@
        //#include "cuPrintf.cu"
        
        // Template for discarding the last term in four-component vector structs
       -__device__ __inline__ float3 f4_to_f3(float4 in) {
       +__device__ __inline__ float3 f4_to_f3(const float4 in) {
            return make_float3(in.x, in.y, in.z);
        }
        
        // Kernel for initializing image data
       -__global__ void imageInit(unsigned char* dev_img, unsigned int pixels)
       +__global__ void imageInit(
       +    unsigned char* __restrict__ dev_img,
       +    const unsigned int pixels)
        {
            // Compute pixel position from threadIdx/blockIdx
            unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
       t@@ -24,11 +26,12 @@ __global__ void imageInit(unsigned char* dev_img, unsigned int pixels)
        }
        
        // Calculate ray origins and directions
       -__global__ void rayInitPerspective(float4* dev_ray_origo, 
       -        float4* dev_ray_direction, 
       -        float4 eye, 
       -        unsigned int width,
       -        unsigned int height)
       +__global__ void rayInitPerspective(
       +    float4* __restrict__ dev_ray_origo, 
       +    float4* __restrict__ dev_ray_direction, 
       +    const float4 eye, 
       +    const unsigned int width,
       +    const unsigned int height)
        {
            // Compute pixel position from threadIdx/blockIdx
            unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
       t@@ -47,15 +50,17 @@ __global__ void rayInitPerspective(float4* dev_ray_origo,
        
            // Write ray origo and direction to global memory
            dev_ray_origo[mempos]     = make_float4(devC_eye, 0.0f);
       -    dev_ray_direction[mempos] = make_float4(-devC_d*devC_w + p_u*devC_u + p_v*devC_v, 0.0f);
       +    dev_ray_direction[mempos] =
       +        make_float4(-devC_d*devC_w + p_u*devC_u + p_v*devC_v, 0.0f);
        }
        
        // Check wether the pixel's viewing ray intersects with the spheres,
        // and shade the pixel correspondingly
       -__global__ void rayIntersectSpheres(float4* dev_ray_origo, 
       -        float4* dev_ray_direction,
       -        Float4* dev_x, 
       -        unsigned char* dev_img)
       +__global__ void rayIntersectSpheres(
       +    const float4* __restrict__ dev_ray_origo, 
       +    const float4* __restrict__ dev_ray_direction,
       +    const Float4* __restrict__ dev_x, 
       +    unsigned char* __restrict__ dev_img)
        {
            // Compute pixel position from threadIdx/blockIdx
            unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
       t@@ -67,7 +72,8 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
            float3 d = f4_to_f3(dev_ray_direction[mempos]);
            //float  step = length(d);
        
       -    // Distance, in ray steps, between object and eye initialized with a large value
       +    // Distance, in ray steps, between object and eye initialized with
       +    // a large value
            float tdist = 1e10f;
        
            // Surface normal at closest sphere intersection
       t@@ -86,8 +92,6 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
                float3 c = make_float3(x.x, x.y, x.z);
                float  R = x.w;
        
       -        //cuPrintf("particle %d at: %f, %f, %f, radius: %f\n", i, c.x, c.y, c.z, R);
       -
                // Calculate the discriminant: d = B^2 - 4AC
                float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c)))  // B^2
                    - 4.0f*dot(d,d)        // -4*A
       t@@ -98,8 +102,10 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
                if (Delta > 0.0f) { 
        
                    // Calculate roots, Shirley 2009 p. 77
       -            float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
       -                            * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
       +            float t_minus = ((dot(-d,(e-c))
       +                              - sqrt( dot(d,(e-c))*dot(d,(e-c))
       +                                      - dot(d,d)*(dot((e-c),(e-c)) - R*R)))
       +                             / dot(d,d));
        
                    // Check wether intersection is closer than previous values
                    if (fabs(t_minus) < tdist) {
       t@@ -137,14 +143,15 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
        
        // Check wether the pixel's viewing ray intersects with the spheres,
        // and shade the pixel correspondingly using a colormap
       -__global__ void rayIntersectSpheresColormap(float4* dev_ray_origo, 
       -        float4* dev_ray_direction,
       -        Float4* dev_x, 
       -        Float4* dev_vel,
       -        Float*  dev_linarr,
       -        float max_val,
       -        float lower_cutoff,
       -        unsigned char* dev_img)
       +__global__ void rayIntersectSpheresColormap(
       +    const float4* __restrict__ dev_ray_origo, 
       +    const float4* __restrict__ dev_ray_direction,
       +    const Float4* __restrict__ dev_x, 
       +    const Float4* __restrict__ dev_vel,
       +    const Float*  __restrict__ dev_linarr,
       +    const float max_val,
       +    const float lower_cutoff,
       +    unsigned char* __restrict__ dev_img)
        {
            // Compute pixel position from threadIdx/blockIdx
            unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
       t@@ -155,7 +162,8 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
            float3 e = f4_to_f3(dev_ray_origo[mempos]);
            float3 d = f4_to_f3(dev_ray_direction[mempos]);
        
       -    // Distance, in ray steps, between object and eye initialized with a large value
       +    // Distance, in ray steps, between object and eye initialized with
       +    // a large value
            float tdist = 1e10f;
        
            // Surface normal at closest sphere intersection
       t@@ -181,8 +189,6 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
                    - 4.0f*dot(d,d)        // -4*A
                    * (dot((e-c),(e-c)) - R*R);  // C
        
       -
       -
                // If the determinant is positive, there are two solutions
                // One where the line enters the sphere, and one where it exits
                if (lower_cutoff > 0.0) {
       t@@ -198,8 +204,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
                    if (Delta > 0.0f && val > lower_cutoff && fixvel == 0.f) {
        
                        // Calculate roots, Shirley 2009 p. 77
       -                float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
       -                                * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
       +                float t_minus =
       +                    ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c))
       +                                           - dot(d,d)*(dot((e-c),(e-c)) - R*R)))
       +                     / dot(d,d));
        
                        // Check wether intersection is closer than previous values
                        if (fabs(t_minus) < tdist) {
       t@@ -217,8 +225,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
                    if (Delta > 0.0f) {
        
                        // Calculate roots, Shirley 2009 p. 77
       -                float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
       -                                * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
       +                float t_minus =
       +                    ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c))
       +                                           - dot(d,d)*(dot((e-c),(e-c)) - R*R)))
       +                     / dot(d,d));
        
                        // Check wether intersection is closer than previous values
                        if (fabs(t_minus) < tdist) {
       t@@ -269,25 +279,28 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
        
                // Write shading model values to pixel color channels
                dev_img[mempos*4]     = (unsigned char) ((k_d * I_d * dotprod 
       -                    + k_a * I_a)*redv);
       +                                                  + k_a * I_a)*redv);
                dev_img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
       -                    + k_a * I_a)*greenv);
       +                                                  + k_a * I_a)*greenv);
                dev_img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
       -                    + k_a * I_a)*bluev);
       +                                                  + k_a * I_a)*bluev);
            }
        }
        
        
        __host__ void DEM::cameraInit(
       -        const float3 eye,
       -        const float3 lookat, 
       -        const float imgw,
       -        const float focalLength)
       +    const float3 eye,
       +    const float3 lookat, 
       +    const float imgw,
       +    const float focalLength)
        {
            float hw_ratio = height/width;
        
            // Image dimensions in world space (l, r, b, t)
       -    float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.5f*imgw*hw_ratio);
       +    float4 imgplane = make_float4(
       +        -0.5f*imgw, 0.5f*imgw,
       +        -0.5f*imgw*hw_ratio,
       +        0.5f*imgw*hw_ratio);
        
            // The view vector
            float3 view = eye - lookat;
       t@@ -309,15 +322,6 @@ __host__ void DEM::cameraInit(
            if (verbose == 1)
                std::cout << "  Transfering camera values to constant memory: ";
        
       -    /* Reference by string removed in cuda 5.0
       -    cudaMemcpyToSymbol("devC_u", &u, sizeof(u));
       -    cudaMemcpyToSymbol("devC_v", &v, sizeof(v));
       -    cudaMemcpyToSymbol("devC_w", &w, sizeof(w));
       -    cudaMemcpyToSymbol("devC_eye", &eye, sizeof(eye));
       -    cudaMemcpyToSymbol("devC_imgplane", &imgplane, sizeof(imgplane));
       -    cudaMemcpyToSymbol("devC_d", &d, sizeof(d));
       -    cudaMemcpyToSymbol("devC_light", &light, sizeof(light));
       -    cudaMemcpyToSymbol("devC_pixels", &pixels, sizeof(pixels));*/
            cudaMemcpyToSymbol(devC_u, &u, sizeof(u));
            cudaMemcpyToSymbol(devC_v, &v, sizeof(v));
            cudaMemcpyToSymbol(devC_w, &w, sizeof(w));
       t@@ -333,7 +337,7 @@ __host__ void DEM::cameraInit(
        }
        
        // Allocate global device memory
       -__host__ void DEM::rt_allocateGlobalDeviceMemory(void)
       +__host__ void DEM::rt_allocateGlobalDeviceMemory()
        {
            if (verbose == 1)
                std::cout << "  Allocating device memory: ";
       t@@ -347,7 +351,7 @@ __host__ void DEM::rt_allocateGlobalDeviceMemory(void)
        
        
        // Free dynamically allocated device memory
       -__host__ void DEM::rt_freeGlobalDeviceMemory(void)
       +__host__ void DEM::rt_freeGlobalDeviceMemory()
        {
            if (verbose == 1)
                std::cout << "  Freeing device memory: ";
       t@@ -360,11 +364,12 @@ __host__ void DEM::rt_freeGlobalDeviceMemory(void)
        }
        
        // Transfer image data from device to host
       -__host__ void DEM::rt_transferFromGlobalDeviceMemory(void)
       +__host__ void DEM::rt_transferFromGlobalDeviceMemory()
        {
            if (verbose == 1)
                std::cout << "  Transfering image data: device -> host: ";
       -    cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char), cudaMemcpyDeviceToHost);
       +    cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char),
       +               cudaMemcpyDeviceToHost);
            if (verbose == 1)
                std::cout << "Done" << std::endl;
            checkForCudaErrors("During rt_transferFromGlobalDeviceMemory()");
       t@@ -372,24 +377,13 @@ __host__ void DEM::rt_transferFromGlobalDeviceMemory(void)
        
        // Wrapper for the rt kernel
        __host__ void DEM::render(
       -        const int method,
       -        const float maxval,
       -        const float lower_cutoff,
       -        const float focalLength,
       -        const unsigned int img_width,
       -        const unsigned int img_height)
       -    /*float4* p, unsigned int np,
       -      rgba* img, unsigned int width, unsigned int height,
       -      f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
       -      int visualize, float max_val,
       -      float* fixvel,
       -      float* xsum,
       -      float* pres,
       -      float* es_dot,
       -      float* es,
       -      float* vel)*/
       +    const int method,
       +    const float maxval,
       +    const float lower_cutoff,
       +    const float focalLength,
       +    const unsigned int img_width,
       +    const unsigned int img_height)
        {
       -    // Namespace directives
            using std::cout;
            using std::cerr;
            using std::endl;
       t@@ -418,15 +412,15 @@ __host__ void DEM::render(
            // Look at the centre of the mean positions
            float3 lookat = make_float3(maxpos.x, maxpos.y, maxpos.z) / 2.0f; 
            float3 eye = make_float3(
       -            grid.L[0] * 2.3f,
       -            grid.L[1] * -5.0f,
       -            grid.L[2] * 1.3f);
       +        grid.L[0] * 2.3f,
       +        grid.L[1] * -5.0f,
       +        grid.L[2] * 1.3f);
            cameraInit(eye, lookat, imgw, focalLength);
        
            // Construct rays for perspective projection
            rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>(
       -            dev_ray_origo, dev_ray_direction, 
       -            make_float4(eye.x, eye.y, eye.z, 0.0f), width, height);
       +        dev_ray_origo, dev_ray_direction, 
       +        make_float4(eye.x, eye.y, eye.z, 0.0f), width, height);
            cudaThreadSynchronize();
        
            Float* linarr;     // Linear array to use for color visualization
       t@@ -440,8 +434,8 @@ __host__ void DEM::render(
            // Visualize spheres without color scale overlay
            if (method == 0) {
                rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>(
       -                dev_ray_origo, dev_ray_direction,
       -                dev_x, dev_img);
       +            dev_ray_origo, dev_ray_direction,
       +            dev_x, dev_img);
            } else {
        
                if (method == 1) { // Visualize pressure
       t@@ -455,8 +449,8 @@ __host__ void DEM::render(
        #pragma omp parallel for if(np>100)
                    for (i = 0; i<np; ++i) {
                        linarr[i] = sqrt(k.vel[i].x*k.vel[i].x 
       -                        + k.vel[i].y*k.vel[i].y 
       -                        + k.vel[i].z*k.vel[i].z);
       +                                 + k.vel[i].y*k.vel[i].y 
       +                                 + k.vel[i].z*k.vel[i].z);
                    }
                    transfer = 1;
                    desc = "Linear velocity";
       t@@ -468,8 +462,8 @@ __host__ void DEM::render(
        #pragma omp parallel for if(np>100)
                    for (i = 0; i<np; ++i) {
                        linarr[i] = sqrt(k.angvel[i].x*k.angvel[i].x
       -                        + k.angvel[i].y*k.angvel[i].y 
       -                        + k.angvel[i].z*k.angvel[i].z);
       +                                 + k.angvel[i].y*k.angvel[i].y 
       +                                 + k.angvel[i].z*k.angvel[i].z);
                    }
                    transfer = 1;
                    desc = "Angular velocity";
       t@@ -492,8 +486,8 @@ __host__ void DEM::render(
        #pragma omp parallel for if(np>100)
                    for (i = 0; i<np; ++i) {
                        linarr[i] = sqrt(k.angpos[i].x*k.angpos[i].x
       -                        + k.angpos[i].y*k.angpos[i].y 
       -                        + k.angpos[i].z*k.angpos[i].z);
       +                                 + k.angpos[i].y*k.angpos[i].y 
       +                                 + k.angpos[i].z*k.angpos[i].z);
                    }
                    transfer = 1;
                    desc = "Angular positions";
       t@@ -504,23 +498,24 @@ __host__ void DEM::render(
                // Report color visualization method and color map range
                if (verbose == 1) {
                    cout << "  " << desc << " color map range: [0, " 
       -                << maxval << "] " << unit << endl;
       +                 << maxval << "] " << unit << endl;
                }
        
                // Copy linarr to dev_linarr if required
                if (transfer == 1) {
                    cudaMalloc((void**)&dev_linarr, np*sizeof(Float));
                    checkForCudaErrors("Error during cudaMalloc of linear array");
       -            cudaMemcpy(dev_linarr, linarr, np*sizeof(Float), cudaMemcpyHostToDevice);
       +            cudaMemcpy(dev_linarr, linarr, np*sizeof(Float),
       +                       cudaMemcpyHostToDevice);
                    checkForCudaErrors("Error during cudaMemcpy of linear array");
                }
        
                // Start raytracing kernel
                rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
       -                dev_ray_origo, dev_ray_direction,
       -                dev_x, dev_vel,
       -                dev_linarr, maxval, lower_cutoff,
       -                dev_img);
       +            dev_ray_origo, dev_ray_direction,
       +            dev_x, dev_vel,
       +            dev_linarr, maxval, lower_cutoff,
       +            dev_img);
        
            }
        
 (DIR) diff --git a/tests/fluid_particle_interaction.py b/tests/fluid_particle_interaction.py
       t@@ -50,5 +50,4 @@ test(sim.vel[0,0] > 0.0, 'Particle 0 velocity:')
        test(sim.vel[1,0] > 0.0, 'Particle 1 velocity:')
        test(sim.vel[2,0] > 0.0, 'Particle 2 velocity:')
        
       -
       -sim.cleanup()
       +#sim.cleanup()