implemented staggered grid in several functions - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
       ---
 (DIR) commit bd373edea562b18de93fe231cc1583108a4d1b05
 (DIR) parent 0548db26d1afead334b2d7be927411f8e6dfd0dc
 (HTM) Author: Anders Damsgaard <anders.damsgaard@geo.au.dk>
       Date:   Tue,  3 Jun 2014 10:51:51 +0200
       
       implemented staggered grid in several functions
       
       Diffstat:
         M python/fluidshear.py                |       4 ++--
         M src/device.cu                       |     180 +++++++++++++++++--------------
         M src/navierstokes.cuh                |     431 ++++++++++++++++++++++++++-----
         M src/sphere.h                        |      11 +++++++----
         M tests/stokes_law.py                 |       2 +-
       
       5 files changed, 469 insertions(+), 159 deletions(-)
       ---
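        The staggered (marker-and-cell) grid introduced here keeps scalar
        fields (p, phi, dphi, epsilon) at cell centers, addressed with idx(),
        and stores each velocity component on the cell face normal to it,
        addressed with vidx(). A minimal host-side sketch of such index
        helpers follows; it assumes one ghost layer per side, and the actual
        ghost layout in src/navierstokes.cuh may differ.

        // Sketch only: linear indices for a staggered grid with one ghost
        // layer on each side. nx and ny are the number of fluid cells in x
        // and y; the z extent is not needed for a row-major linear index.
        #include <cstdio>

        inline unsigned int cell_idx(int x, int y, int z, int nx, int ny)
        {
            // cell-centered fields hold (nx+2)*(ny+2)*(nz+2) values
            return (unsigned int)((x+1) + (nx+2)*((y+1) + (ny+2)*(z+1)));
        }

        inline unsigned int face_idx(int x, int y, int z, int nx, int ny)
        {
            // face-centered fields hold one more value per dimension
            return (unsigned int)((x+1) + (nx+3)*((y+1) + (ny+3)*(z+1)));
        }

        int main()
        {
            const int nx = 20, ny = 20;  // hypothetical grid.num[0], grid.num[1]
            printf("cell (0,0,0): %u, x-face (0,0,0): %u\n",
                    cell_idx(0, 0, 0, nx, ny), face_idx(0, 0, 0, nx, ny));
            return 0;
        }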
 (DIR) diff --git a/python/fluidshear.py b/python/fluidshear.py
        @@ -9,8 +9,8 @@ sid = 'fluidshear'
        sim = sphere.sim(sid + '-init', np = 24000, fluid = False)
        #sim.cleanup()
        sim.radius[:] = 0.05
       -sim.initRandomGridPos(gridnum = [16, 16, 9000])
       -sim.initTemporal(total = 5.0, file_dt = 0.05)
       +sim.initRandomGridPos(gridnum = [20, 20, 9000])
       +sim.initTemporal(total = 10.0, file_dt = 0.05)
        sim.g[2] = -9.81
        sim.run(dry=True)
        sim.run()
 (DIR) diff --git a/src/device.cu b/src/device.cu
        @@ -557,7 +557,7 @@ __host__ void DEM::startTime()
            unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock); 
            dim3 dimGridBonds(blocksPerGridBonds, 1, 1); // Blocks arranged in 1D grid
        
       -    // Use 3D block and grid layout for fluid calculations
       +    // Use 3D block and grid layout for cell-centered fluid calculations
            dim3 dimBlockFluid(8, 8, 8);    // 512 threads per block
            dim3 dimGridFluid(
                    iDivUp(grid.num[0], dimBlockFluid.x),
        @@ -568,6 +568,17 @@ __host__ void DEM::startTime()
                exit(1);
            }
        
       +    // Use 3D block and grid layout for cell-face fluid calculations
       +    dim3 dimBlockFluidFace(8, 8, 8);    // 512 threads per block
       +    dim3 dimGridFluidFace(
        +            iDivUp(grid.num[0]+1, dimBlockFluidFace.x),
        +            iDivUp(grid.num[1]+1, dimBlockFluidFace.y),
        +            iDivUp(grid.num[2]+1, dimBlockFluidFace.z));
        +    if (dimGridFluidFace.z > 64 && navierstokes == 1) {
        +        cerr << "Error: dimGridFluidFace.z > 64" << endl;
       +        exit(1);
       +    }
       +
        
            // Shared memory per block
            unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
        @@ -867,29 +878,42 @@ __host__ void DEM::startTime()
                    }*/
        
                    if (np > 0) {
       -                // Determine the interaction force
       -                /*findInteractionForce<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_phi,
       -                        dev_ns_d_avg,
       -                        dev_ns_vp_avg,
       -                        dev_ns_v,
       -                        dev_ns_fi);
       -                cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post findInteractionForce", iter);
        
       -                // Apply interaction force to the particles
       -                applyParticleInteractionForce<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_fi,
       -                        dev_ns_phi,
       -                        dev_ns_p,
       -                        dev_gridParticleIndex,
       -                        dev_cellStart,
       -                        dev_cellEnd,
       -                        dev_x_sorted,
       -                        dev_force);
       +                if (iter == 0) {
       +                    // set cell center ghost nodes
       +                    setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       +                            dev_ns_v, ns.bc_bot, ns.bc_top);
       +
       +                    // find cell face velocities
       +                    interpolateCenterToFace
       +                        <<<dimGridFluidFace, dimBlockFluidFace>>>(
       +                            dev_ns_v,
       +                            dev_ns_v_x,
       +                            dev_ns_v_y,
       +                            dev_ns_v_z);
       +                    cudaThreadSynchronize();
       +                    checkForCudaErrors("Post interpolateCenterToFace");
       +                }
       +
       +                setNSghostNodesFace<Float>
       +                    <<<dimGridFluidFace, dimBlockFluidFace>>>(
       +                        dev_ns_v_x,
       +                        dev_ns_v_y,
       +                        dev_ns_v_z,
       +                        ns.bc_bot,
       +                        ns.bc_top);
                        cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post applyParticleInteractionForce",
       -                        iter);*/
       +                checkForCudaErrorsIter("Post setNSghostNodesFace", iter);
       +
       +                findFaceDivTau<<<dimGridFluidFace, dimBlockFluidFace>>>(
       +                        dev_ns_v_x,
       +                        dev_ns_v_y,
       +                        dev_ns_v_z,
       +                        dev_ns_div_tau_x,
       +                        dev_ns_div_tau_y,
       +                        dev_ns_div_tau_z);
       +                cudaThreadSynchronize();
       +                checkForCudaErrorsIter("Post findFaceDivTau", iter);
        
                        // Per particle, find the fluid-particle interaction force f_pf
                        // and apply it to the particle
        @@ -898,9 +922,12 @@ __host__ void DEM::startTime()
                                dev_vel,
                                dev_ns_phi,
                                dev_ns_p,
       -                        dev_ns_v,
       -                        dev_ns_tau,
       -                        iter,
       +                        dev_ns_v_x,
       +                        dev_ns_v_y,
       +                        dev_ns_v_z,
       +                        dev_ns_div_tau_x,
       +                        dev_ns_div_tau_y,
       +                        dev_ns_div_tau_z,
                                dev_ns_f_pf,
                                dev_force);
                        cudaThreadSynchronize();
        @@ -961,21 +988,19 @@ __host__ void DEM::startTime()
                        // Set the values of the ghost nodes in the grid
                        if (PROFILING == 1)
                            startTimer(&kernel_tic);
       -                /*setNSghostNodesDev<<<dimGridFluid, dimBlockFluid>>>(
       -                  dev_ns_p,
       -                  dev_ns_v,
       -                  dev_ns_v_p,
       -                  dev_ns_phi,
       -                  dev_ns_dphi,
       -                  dev_ns_epsilon);*/
       +
                        setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
                                dev_ns_p, ns.bc_bot, ns.bc_top);
        
       -                setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_v, ns.bc_bot, ns.bc_top);
       +                //setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       +                        //dev_ns_v, ns.bc_bot, ns.bc_top);
        
       -                setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_v_p, ns.bc_bot, ns.bc_top);
       +                setNSghostNodesFace<Float>
       +                    <<<dimGridFluidFace, dimBlockFluidFace>>>(
       +                        dev_ns_v_p_x,
       +                        dev_ns_v_p_y,
       +                        dev_ns_v_p_z,
       +                        ns.bc_bot, ns.bc_top);
        
                        setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
                                dev_ns_phi, ns.bc_bot, ns.bc_top);
        @@ -993,26 +1018,15 @@ __host__ void DEM::startTime()
                          transferNSepsilonFromGlobalDeviceMemory();
                          printNSarray(stdout, ns.epsilon, "epsilon");*/
        
       -                // Find the fluid stress tensor, needed for predicting the fluid
       -                // velocities
       -                if (PROFILING == 1)
       -                    startTimer(&kernel_tic);
       -                findNSstressTensor<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_v,
       -                        dev_ns_tau);
        +                // interpolate face velocities to cell centers, which makes
        +                // the velocity prediction easier
       +                interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_v_x,
       +                        dev_ns_v_y,
       +                        dev_ns_v_z,
       +                        dev_ns_v);
                        cudaThreadSynchronize();
       -                if (PROFILING == 1)
       -                    stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                            &t_findNSstressTensor);
       -                checkForCudaErrorsIter("Post findNSstressTensor", iter);
       -
       -                // Set stress tensor values in the ghost nodes
       -                setNSghostNodes_tau<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_tau,
       -                        ns.bc_bot,
       -                        ns.bc_top); 
       -                cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post setNSghostNodes_tau", iter);
       +                checkForCudaErrorsIter("Post interpolateFaceToCenter", iter);
        
                        // Find the divergence of phi*vi*v, needed for predicting the
                        // fluid velocities
        @@ -1028,53 +1042,44 @@ __host__ void DEM::startTime()
                                    &t_findNSdivphiviv);
                        checkForCudaErrorsIter("Post findNSdivphiviv", iter);
        
       -                // Find the divergence of phi*tau, needed for predicting the
       -                // fluid velocities
       -                if (PROFILING == 1)
       -                    startTimer(&kernel_tic);
       -                /*findNSdivphitau<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_phi,
       -                        dev_ns_tau,
       -                        dev_ns_div_phi_tau);*/
       -                findNSdivtau<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_tau,
       -                        dev_ns_div_tau);
       -                cudaThreadSynchronize();
       -                if (PROFILING == 1)
       -                    stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                            &t_findNSdivtau);
       -                //checkForCudaErrorsIter("Post findNSdivtau", iter);
       -                checkForCudaErrorsIter("Post findNSdivtau", iter);
       -
                 // Predict the fluid velocities on the basis of the old pressure
                        // field and ignoring the incompressibility constraint
                        if (PROFILING == 1)
                            startTimer(&kernel_tic);
       -                findPredNSvelocities<<<dimGridFluid, dimBlockFluid>>>(
       +                findPredNSvelocities<<<dimGridFluidFace, dimBlockFluidFace>>>(
                                dev_ns_p,
       +                        dev_ns_v_x,
       +                        dev_ns_v_y,
       +                        dev_ns_v_z,
                                dev_ns_v,
                                dev_ns_phi,
                                dev_ns_dphi,
       -                        dev_ns_div_phi_vi_v,
       -                        //dev_ns_div_phi_tau,
       -                        dev_ns_div_tau,
       +                        dev_ns_div_tau_x,
       +                        dev_ns_div_tau_y,
        +                        dev_ns_div_tau_z,
        +                        dev_ns_div_phi_vi_v,
                                ns.bc_bot,
                                ns.bc_top,
                                ns.beta,
                                dev_ns_fi,
                                ns.ndem,
       -                        dev_ns_v_p);
       +                        dev_ns_v_p_x,
       +                        dev_ns_v_p_y,
       +                        dev_ns_v_p_z);
                        cudaThreadSynchronize();
                        if (PROFILING == 1)
                            stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
                                    &t_findPredNSvelocities);
                        checkForCudaErrorsIter("Post findPredNSvelocities", iter);
        
       -                setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_v_p);
       +                setNSghostNodesFace<Float>
       +                    <<<dimGridFluidFace, dimBlockFluidFace>>>(
       +                        dev_ns_v_p_x,
       +                        dev_ns_v_p_y,
       +                        dev_ns_v_p_z,
       +                        ns.bc_bot, ns.bc_top);
                        cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post setNSghostNodesFloat3(dev_ns_v_p)",
       -                        iter);
       +                checkForCudaErrorsIter(
       +                        "Post setNSghostNodesFaceFloat3(dev_ns_v_p)", iter);
        
                        // In the first iteration of the sphere program, we'll need to
                        // manually estimate the values of epsilon. In the subsequent
        @@ -1507,6 +1512,17 @@ __host__ void DEM::startTime()
                    cudaThreadSynchronize();
                    checkForCudaErrorsIter("Beginning of file output section", iter);
        
       +            // v_x, v_y, v_z -> v
       +            if (navierstokes == 1) {
       +                interpolateFaceToCenter<<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_v_x,
       +                        dev_ns_v_y,
       +                        dev_ns_v_z,
       +                        dev_ns_v);
       +                cudaThreadSynchronize();
       +                checkForCudaErrorsIter("Post interpolateFaceToCenter", iter);
       +            }
       +
                    //// Copy device data to host memory
                    transferFromGlobalDeviceMemory();
                    checkForCudaErrorsIter("After transferFromGlobalDeviceMemory()",
 (DIR) diff --git a/src/navierstokes.cuh b/src/navierstokes.cuh
        @@ -69,7 +69,7 @@ void DEM::initNSmemDev(void)
            cudaMalloc((void**)&dev_ns_v_x, memSizeFvel);// velocity in stag. grid
            cudaMalloc((void**)&dev_ns_v_y, memSizeFvel);// velocity in stag. grid
            cudaMalloc((void**)&dev_ns_v_z, memSizeFvel);// velocity in stag. grid
       -    cudaMalloc((void**)&dev_ns_v_p, memSizeF*3); // predicted cell velocity
       +    //cudaMalloc((void**)&dev_ns_v_p, memSizeF*3); // predicted cell velocity
            cudaMalloc((void**)&dev_ns_v_p_x, memSizeFvel); // pred. vel. in stag. grid
            cudaMalloc((void**)&dev_ns_v_p_y, memSizeFvel); // pred. vel. in stag. grid
            cudaMalloc((void**)&dev_ns_v_p_z, memSizeFvel); // pred. vel. in stag. grid
        @@ -78,7 +78,7 @@ void DEM::initNSmemDev(void)
            cudaMalloc((void**)&dev_ns_fi, memSizeF*3);  // interaction force
            cudaMalloc((void**)&dev_ns_phi, memSizeF);   // cell porosity
            cudaMalloc((void**)&dev_ns_dphi, memSizeF);  // cell porosity change
       -    cudaMalloc((void**)&dev_ns_div_phi_v_v, memSizeF*3); // div(phi v v)
       +    //cudaMalloc((void**)&dev_ns_div_phi_v_v, memSizeF*3); // div(phi v v)
            cudaMalloc((void**)&dev_ns_epsilon, memSizeF); // pressure difference
            cudaMalloc((void**)&dev_ns_epsilon_new, memSizeF); // new pressure diff.
            cudaMalloc((void**)&dev_ns_epsilon_old, memSizeF); // old pressure diff.
        @@ -87,7 +87,10 @@ void DEM::initNSmemDev(void)
            cudaMalloc((void**)&dev_ns_f1, memSizeF);    // constant addition in forcing
            cudaMalloc((void**)&dev_ns_f2, memSizeF*3);  // constant slope in forcing
            cudaMalloc((void**)&dev_ns_tau, memSizeF*6); // stress tensor (symmetrical)
       -    cudaMalloc((void**)&dev_ns_div_tau, memSizeF*6); // div(tau)
        +    cudaMalloc((void**)&dev_ns_div_tau, memSizeF*3); // div(tau), cell center
       +    cudaMalloc((void**)&dev_ns_div_tau_x, memSizeFvel); // div(tau), cell face
       +    cudaMalloc((void**)&dev_ns_div_tau_y, memSizeFvel); // div(tau), cell face
       +    cudaMalloc((void**)&dev_ns_div_tau_z, memSizeFvel); // div(tau), cell face
            cudaMalloc((void**)&dev_ns_div_phi_vi_v, memSizeF*3); // div(phi*vi*v)
            //cudaMalloc((void**)&dev_ns_div_phi_tau, memSizeF*3);  // div(phi*tau)
            cudaMalloc((void**)&dev_ns_f_pf, sizeof(Float3)*np); // particle fluid force
        @@ -103,7 +106,7 @@ void DEM::freeNSmemDev()
            cudaFree(dev_ns_v_x);
            cudaFree(dev_ns_v_y);
            cudaFree(dev_ns_v_z);
       -    cudaFree(dev_ns_v_p);
       +    //cudaFree(dev_ns_v_p);
            cudaFree(dev_ns_v_p_x);
            cudaFree(dev_ns_v_p_y);
            cudaFree(dev_ns_v_p_z);
        @@ -112,7 +115,7 @@ void DEM::freeNSmemDev()
            cudaFree(dev_ns_fi);
            cudaFree(dev_ns_phi);
            cudaFree(dev_ns_dphi);
       -    cudaFree(dev_ns_div_phi_v_v);
       +    //cudaFree(dev_ns_div_phi_v_v);
            cudaFree(dev_ns_epsilon);
            cudaFree(dev_ns_epsilon_new);
            cudaFree(dev_ns_epsilon_old);
        @@ -123,7 +126,10 @@ void DEM::freeNSmemDev()
            cudaFree(dev_ns_tau);
            cudaFree(dev_ns_div_phi_vi_v);
            //cudaFree(dev_ns_div_phi_tau);
       -    cudaFree(dev_ns_div_tau);
       +    //cudaFree(dev_ns_div_tau);
       +    cudaFree(dev_ns_div_tau_x);
       +    cudaFree(dev_ns_div_tau_y);
       +    cudaFree(dev_ns_div_tau_z);
            cudaFree(dev_ns_f_pf);
        }
        
        @@ -144,7 +150,7 @@ void DEM::transferNStoGlobalDeviceMemory(int statusmsg)
            cudaMemcpy(dev_ns_p, ns.p, memSizeF, cudaMemcpyHostToDevice);
            checkForCudaErrors("transferNStoGlobalDeviceMemory after first cudaMemcpy");
            cudaMemcpy(dev_ns_v, ns.v, memSizeF*3, cudaMemcpyHostToDevice);
       -    cudaMemcpy(dev_ns_v_p, ns.v_p, memSizeF*3, cudaMemcpyHostToDevice);
       +    //cudaMemcpy(dev_ns_v_p, ns.v_p, memSizeF*3, cudaMemcpyHostToDevice);
            cudaMemcpy(dev_ns_phi, ns.phi, memSizeF, cudaMemcpyHostToDevice);
            cudaMemcpy(dev_ns_dphi, ns.dphi, memSizeF, cudaMemcpyHostToDevice);
        
        @@ -165,7 +171,7 @@ void DEM::transferNSfromGlobalDeviceMemory(int statusmsg)
            cudaMemcpy(ns.p, dev_ns_p, memSizeF, cudaMemcpyDeviceToHost);
            checkForCudaErrors("In transferNSfromGlobalDeviceMemory, dev_ns_p", 0);
            cudaMemcpy(ns.v, dev_ns_v, memSizeF*3, cudaMemcpyDeviceToHost);
       -    cudaMemcpy(ns.v_p, dev_ns_v_p, memSizeF*3, cudaMemcpyDeviceToHost);
       +    //cudaMemcpy(ns.v_p, dev_ns_v_p, memSizeF*3, cudaMemcpyDeviceToHost);
            cudaMemcpy(ns.phi, dev_ns_phi, memSizeF, cudaMemcpyDeviceToHost);
            cudaMemcpy(ns.dphi, dev_ns_dphi, memSizeF, cudaMemcpyDeviceToHost);
            cudaMemcpy(ns.norm, dev_ns_norm, memSizeF, cudaMemcpyDeviceToHost);
        @@ -624,6 +630,63 @@ __global__ void setNSghostNodes(
            }
        }
        
       +// Update a field in the ghost nodes from their parent cell values. The edge
       +// (diagonal) cells are not written since they are not read.
       +// Launch per face.
       +template<typename T>
       +__global__ void setNSghostNodesFace(
       +        T* dev_scalarfield_x,
       +        T* dev_scalarfield_y,
       +        T* dev_scalarfield_z,
       +        int bc_bot,
       +        int bc_top)
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +    // Grid dimensions
       +    const unsigned int nx = devC_grid.num[0];
       +    const unsigned int ny = devC_grid.num[1];
       +    const unsigned int nz = devC_grid.num[2];
       +
       +    // check that we are not outside the fluid grid
       +    if (x <= nx && y <= ny && z <= nz) {
       +
       +        const T val_x = dev_scalarfield_x[vidx(x,y,z)];
       +        const T val_y = dev_scalarfield_y[vidx(x,y,z)];
       +        const T val_z = dev_scalarfield_z[vidx(x,y,z)];
       +
       +        // x
       +        if (x == 0)
       +            dev_scalarfield_x[vidx(nx+1,y,z)] = val_x;
       +        if (x == nx)
       +            dev_scalarfield_x[vidx(-1,y,z)] = val_x;
       +
       +        // y
       +        if (y == 0)
       +            dev_scalarfield_y[vidx(x,ny+1,z)] = val_y;
       +        if (y == ny)
       +            dev_scalarfield_y[vidx(x,-1,z)] = val_y;
       +
       +        // z
       +        if (z == 0 && bc_bot == 0)
       +            dev_scalarfield_z[vidx(x,y,-1)] = val_z;     // Dirichlet
       +        if (z == 1 && bc_bot == 1)
       +            dev_scalarfield_z[vidx(x,y,-1)] = val_z;     // Neumann
       +        if (z == 0 && bc_bot == 2)
       +            dev_scalarfield_z[vidx(x,y,nz+1)] = val_z;   // Periodic -z
       +
       +        if (z == nz && bc_top == 0)
       +            dev_scalarfield_z[vidx(x,y,nz)] = val_z;     // Dirichlet
       +        if (z == nz-1 && bc_top == 1)
       +            dev_scalarfield_z[vidx(x,y,nz)] = val_z;     // Neumann
       +        if (z == nz && bc_top == 2)
       +            dev_scalarfield_z[vidx(x,y,-1)] = val_z;     // Periodic +z
       +    }
       +}
       +
        // Update the tensor field for the ghost nodes from their parent cell values.
        // The edge (diagonal) cells are not written since they are not read. Launch
        // this kernel for all cells in the grid.
        @@ -1688,7 +1751,7 @@ __global__ void findNSdivphiviv(
                // Read porosity and velocity in the 6 neighbor cells
                __syncthreads();
                const Float  phi_xn = dev_ns_phi[idx(x-1,y,z)];
       -        //const Float  phi    = dev_ns_phi[idx(x,y,z)];
       +        const Float  phi    = dev_ns_phi[idx(x,y,z)];
                const Float  phi_xp = dev_ns_phi[idx(x+1,y,z)];
                const Float  phi_yn = dev_ns_phi[idx(x,y-1,z)];
                const Float  phi_yp = dev_ns_phi[idx(x,y+1,z)];
        @@ -1696,7 +1759,7 @@ __global__ void findNSdivphiviv(
                const Float  phi_zp = dev_ns_phi[idx(x,y,z+1)];
        
                const Float3 v_xn = dev_ns_v[idx(x-1,y,z)];
       -        //const Float3 v    = dev_ns_v[idx(x,y,z)];
       +        const Float3 v    = dev_ns_v[idx(x,y,z)];
                const Float3 v_xp = dev_ns_v[idx(x+1,y,z)];
                const Float3 v_yn = dev_ns_v[idx(x,y-1,z)];
                const Float3 v_yp = dev_ns_v[idx(x,y+1,z)];
        @@ -1704,7 +1767,8 @@ __global__ void findNSdivphiviv(
                const Float3 v_zp = dev_ns_v[idx(x,y,z+1)];
        
                // Calculate upwind coefficients
       -        /*const Float3 a = MAKE_FLOAT3(
       +        //*
       +        const Float3 a = MAKE_FLOAT3(
                        copysign(1.0, v.x),
                        copysign(1.0, v.y),
                        copysign(1.0, v.z));
        @@ -1763,8 +1827,9 @@ __global__ void findNSdivphiviv(
        
                // Determine the weighted average of both discretizations
                const Float3 div_phi_vi_v = tau*div_uw + (1.0 - tau)*div_cd;
       -        */
       +        //*/
        
       +        /*
                // Calculate the divergence: div(phi*v_i*v)
                const Float3 div_phi_vi_v = MAKE_FLOAT3(
                        // x
        @@ -1779,6 +1844,7 @@ __global__ void findNSdivphiviv(
                        (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
                        (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
                        (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
       +                // */
        
                // Write divergence
                __syncthreads();
        @@ -1929,6 +1995,7 @@ __global__ void findNSdivphitau(
        }
        
        // Find the divergence of phi v v
       +// Unused
        __global__ void findNSdivphivv(
                Float*  dev_ns_v_prod, // in
                Float*  dev_ns_phi,    // in
        @@ -2017,20 +2084,27 @@ __global__ void findNSdivphivv(
        
        
        // Find predicted fluid velocity
       +// Launch per face.
        __global__ void findPredNSvelocities(
                Float*  dev_ns_p,               // in
       +        Float*  dev_ns_v_x,             // in
       +        Float*  dev_ns_v_y,             // in
       +        Float*  dev_ns_v_z,             // in
                Float3* dev_ns_v,               // in
                Float*  dev_ns_phi,             // in
                Float*  dev_ns_dphi,            // in
       +        Float*  dev_ns_div_tau_x,       // in
       +        Float*  dev_ns_div_tau_y,       // in
       +        Float*  dev_ns_div_tau_z,       // in
                Float3* dev_ns_div_phi_vi_v,    // in
       -        //Float3* dev_ns_div_phi_tau,     // in
       -        Float3* dev_ns_div_tau,         // in
                int     bc_bot,                 // in
                int     bc_top,                 // in
                Float   beta,                   // in
                Float3* dev_ns_fi,              // in
                unsigned int ndem,              // in
       -        Float3* dev_ns_v_p)             // out
       +        Float*  dev_ns_v_p_x,           // out
       +        Float*  dev_ns_v_p_y,           // out
       +        Float*  dev_ns_v_p_z)           // out
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
        @@ -2048,76 +2122,123 @@ __global__ void findPredNSvelocities(
            const Float dz = devC_grid.L[2]/nz;
        
            // 1D thread index
        +    const unsigned int faceidx = vidx(x,y,z);
            const unsigned int cellidx = idx(x,y,z);
        
            // Check that we are not outside the fluid grid
       -    if (x < nx && y < ny && z < nz) {
       +    if (x <= nx && y <= ny && z <= nz) {
        
                // Values that are needed for calculating the predicted velocity
                __syncthreads();
       -        const Float3 v            = dev_ns_v[cellidx];
       -        const Float  phi          = dev_ns_phi[cellidx];
       -        const Float  dphi         = dev_ns_dphi[cellidx];
       -        const Float3 div_phi_vi_v = dev_ns_div_phi_vi_v[cellidx];
       -        //const Float3 div_phi_tau  = dev_ns_div_phi_tau[cellidx];
       -        const Float3 div_tau      = dev_ns_div_tau[cellidx];
        +        const Float3 v_face = MAKE_FLOAT3(
        +                dev_ns_v_x[faceidx],
        +                dev_ns_v_y[faceidx],
        +                dev_ns_v_z[faceidx]);
        +        const Float3 div_tau_face = MAKE_FLOAT3(
        +                dev_ns_div_tau_x[faceidx],
        +                dev_ns_div_tau_y[faceidx],
        +                dev_ns_div_tau_z[faceidx]);
       +
       +        // cell center values
       +        const Float phi_xn    = dev_ns_phi[idx(x-1,y,z)];
       +        const Float phi_c     = dev_ns_phi[cellidx];
       +        const Float phi_yn    = dev_ns_phi[idx(x,y-1,z)];
       +        const Float phi_zn    = dev_ns_phi[idx(x,y,z-1)];
       +
       +        const Float dphi_xn   = dev_ns_dphi[idx(x-1,y,z)];
       +        const Float dphi_c    = dev_ns_dphi[cellidx];
       +        const Float dphi_yn   = dev_ns_dphi[idx(x,y-1,z)];
       +        const Float dphi_zn   = dev_ns_dphi[idx(x,y,z-1)];
       +
       +        const Float3 v_xn = dev_ns_v[idx(x-1,y,z)];
       +        const Float3 v_c  = dev_ns_v[cellidx];
       +        const Float3 v_yn = dev_ns_v[idx(x,y-1,z)];
       +        const Float3 v_zn = dev_ns_v[idx(x,y,z-1)];
       +
       +        const Float3 div_phi_vi_v_xn = dev_ns_div_phi_vi_v[idx(x-1,y,z)];
       +        const Float3 div_phi_vi_v_c  = dev_ns_div_phi_vi_v[cellidx];
       +        const Float3 div_phi_vi_v_yn = dev_ns_div_phi_vi_v[idx(x,y-1,z)];
       +        const Float3 div_phi_vi_v_zn = dev_ns_div_phi_vi_v[idx(x,y,z-1)];
       +
       +        // component-wise average values
       +        const Float3 phi = MAKE_FLOAT3(
       +                (phi_c + phi_xn)/2.0,
       +                (phi_c + phi_yn)/2.0,
       +                (phi_c + phi_zn)/2.0);
       +        const Float3 dphi = MAKE_FLOAT3(
       +                (dphi_c + dphi_xn)/2.0,
       +                (dphi_c + dphi_yn)/2.0,
       +                (dphi_c + dphi_zn)/2.0);
        
                 // The particle-fluid interaction force should only be incorporated if
                // there is a fluid viscosity
       -        Float3 f_i;
       -        if (devC_params.mu > 0.0)
       -            f_i = dev_ns_fi[cellidx];
       -        else
       -            f_i = MAKE_FLOAT3(0.0, 0.0, 0.0);
       +        Float3 f_i_c, f_i_xn, f_i_yn, f_i_zn;
       +        if (devC_params.mu > 0.0) {
       +            f_i_c  = dev_ns_fi[cellidx];
       +            f_i_xn = dev_ns_fi[idx(x-1,y,z)];
       +            f_i_yn = dev_ns_fi[idx(x,y-1,z)];
       +            f_i_zn = dev_ns_fi[idx(x,y,z-1)];
       +        } else {
       +            f_i_c  = MAKE_FLOAT3(0.0, 0.0, 0.0);
       +            f_i_xn = MAKE_FLOAT3(0.0, 0.0, 0.0);
       +            f_i_yn = MAKE_FLOAT3(0.0, 0.0, 0.0);
       +            f_i_zn = MAKE_FLOAT3(0.0, 0.0, 0.0);
       +        }
       +        const Float3 f_i = MAKE_FLOAT3(
       +                (f_i_c.x + f_i_xn.x)/2.0,
       +                (f_i_c.y + f_i_yn.y)/2.0,
       +                (f_i_c.z + f_i_zn.z)/2.0);
        
                const Float dt = ndem*devC_dt;
                const Float rho = devC_params.rho_f;
        
       -        // Find pressure gradient
       -        Float3 grad_p = MAKE_FLOAT3(0.0, 0.0, 0.0);
       -
                // The pressure gradient is not needed in Chorin's projection method
                 // (ns.beta=0), so it only has to be looked up in pressure-dependent
                // projection methods
                Float3 pressure_term = MAKE_FLOAT3(0.0, 0.0, 0.0);
                if (beta > 0.0) {
       -            grad_p = gradient(dev_ns_p, x, y, z, dx, dy, dz);
       +            __syncthreads();
       +            const Float p    = dev_ns_p[cellidx];
       +            const Float p_xn = dev_ns_p[idx(x-1,y,z)];
       +            const Float p_yn = dev_ns_p[idx(x,y-1,z)];
       +            const Float p_zn = dev_ns_p[idx(x,y,z-1)];
       +            const Float3 grad_p = MAKE_FLOAT3(
       +                    (p - p_xn)/dx,
       +                    (p - p_yn)/dy,
       +                    (p - p_zn)/dz);
        #ifdef SET_1
       -            pressure_term = -beta*dt/(rho*phi)*grad_p;
        +            pressure_term = -beta*dt/(rho*phi)*grad_p;
        #endif
        #ifdef SET_2
                    pressure_term = -beta*dt/rho*grad_p;
        #endif
                }
        
       -        // Calculate the predicted velocity
       -        /*
       -        Float3 v_p = v
       -            + pressure_term
       -            + 1.0/devC_params.rho_f*div_phi_tau*ndem*devC_dt/phi // diffusion
       -            + MAKE_FLOAT3(devC_params.g[0], devC_params.g[1], devC_params.g[2])
       -                *ndem*devC_dt     // gravity term
       -            - ndem*devC_dt/(devC_params.rho_f*phi)*f_i
       -            //- ndem*devC_dt/(devC_params.rho_f*phi*dx*dy*dz)*f_i
       -            - v*dphi/phi
       -            - div_phi_vi_v*ndem*devC_dt/phi    // advection term
       -            ;*/
        +        // Average the cell-centered terms onto this cell face so that
        +        // they can be combined with the face velocity and face stress
        +        // divergence found above
         
        +        const Float3 v = v_face;
        +        const Float3 div_tau = div_tau_face;
        +        const Float3 div_phi_vi_v = MAKE_FLOAT3(
        +                (div_phi_vi_v_c.x + div_phi_vi_v_xn.x)/2.0,
        +                (div_phi_vi_v_c.y + div_phi_vi_v_yn.y)/2.0,
        +                (div_phi_vi_v_c.z + div_phi_vi_v_zn.z)/2.0);
       +
       +        // Determine the predicted velocity
        #ifdef SET_1
       -            const Float3 interaction_term = -dt/(rho*phi)*f_i;
       -            const Float3 diffusion_term = dt/(rho*phi)*div_tau;
       -            const Float3 gravity_term = MAKE_FLOAT3(
       +        const Float3 interaction_term = -dt/(rho*phi)*f_i;
       +        const Float3 diffusion_term = dt/(rho*phi)*div_tau;
       +        const Float3 gravity_term = MAKE_FLOAT3(
                            devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
       -            const Float3 porosity_term = -1.0*v*dphi/phi;
       -            const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
       +        const Float3 porosity_term = -1.0*v*dphi/phi;
       +        const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
        #endif
        #ifdef SET_2
       -            const Float3 interaction_term = -dt/(rho*phi)*f_i;
       -            const Float3 diffusion_term = dt/rho*div_tau;
       -            const Float3 gravity_term = MAKE_FLOAT3(
       -                    devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
       -            const Float3 porosity_term = -1.0*v*dphi/phi;
       -            const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
       +        const Float3 interaction_term = -dt/(rho*phi)*f_i;
       +        const Float3 diffusion_term = dt/rho*div_tau;
       +        const Float3 gravity_term = MAKE_FLOAT3(
       +                devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
       +        const Float3 porosity_term = -1.0*v*dphi/phi;
       +        const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
        #endif
        
                Float3 v_p = v
        @@ -2155,8 +2276,9 @@ __global__ void findPredNSvelocities(
        
                // Save the predicted velocity
                __syncthreads();
       -        //v_p = MAKE_FLOAT3(0.0, 0.0, 0.0);
       -        dev_ns_v_p[cellidx] = v_p;
        +        dev_ns_v_p_x[faceidx] = v_p.x;
        +        dev_ns_v_p_y[faceidx] = v_p.y;
        +        dev_ns_v_p_z[faceidx] = v_p.z;
        
        #ifdef CHECK_NS_FINITE
                (void)checkFiniteFloat3("v_p", x, y, z, v_p);
        @@ -2176,7 +2298,7 @@ __global__ void findNSforcing(
                Float*  dev_ns_f,
                Float*  dev_ns_phi,
                Float*  dev_ns_dphi,
       -        Float3* dev_ns_v_p,
       +        //Float3* dev_ns_v_p,
                unsigned int nijac,
                unsigned int ndem)
        {
        @@ -2786,9 +2908,12 @@ __global__ void findInteractionForce(
                Float4* dev_vel,         // in
                Float*  dev_ns_phi,      // in
                Float*  dev_ns_p,        // in
       -        Float3* dev_ns_v,        // in
       -        Float*  dev_ns_tau,      // in
       -        const unsigned int iter, // in
       +        Float*  dev_ns_v_x,      // in
       +        Float*  dev_ns_v_y,      // in
       +        Float*  dev_ns_v_z,      // in
       +        Float*  dev_ns_div_tau_x,// in
       +        Float*  dev_ns_div_tau_y,// in
       +        Float*  dev_ns_div_tau_z,// in
                Float3* dev_ns_f_pf,     // out
                Float4* dev_force)       // out
        {
        @@ -2819,8 +2944,30 @@ __global__ void findInteractionForce(
        
                // Fluid information
                __syncthreads();
       -        const Float  phi = dev_ns_phi[cellidx];
       -        const Float3 v_f = dev_ns_v[cellidx];
       +        const Float phi = dev_ns_phi[cellidx];
        +        const Float v_x   = dev_ns_v_x[vidx(i_x,i_y,i_z)];
        +        const Float v_x_p = dev_ns_v_x[vidx(i_x+1,i_y,i_z)];
        +        const Float v_y   = dev_ns_v_y[vidx(i_x,i_y,i_z)];
        +        const Float v_y_p = dev_ns_v_y[vidx(i_x,i_y+1,i_z)];
        +        const Float v_z   = dev_ns_v_z[vidx(i_x,i_y,i_z)];
        +        const Float v_z_p = dev_ns_v_z[vidx(i_x,i_y,i_z+1)];
        +
        +        const Float div_tau_x   = dev_ns_div_tau_x[vidx(i_x,i_y,i_z)];
        +        const Float div_tau_x_p = dev_ns_div_tau_x[vidx(i_x+1,i_y,i_z)];
        +        const Float div_tau_y   = dev_ns_div_tau_y[vidx(i_x,i_y,i_z)];
        +        const Float div_tau_y_p = dev_ns_div_tau_y[vidx(i_x,i_y+1,i_z)];
        +        const Float div_tau_z   = dev_ns_div_tau_z[vidx(i_x,i_y,i_z)];
        +        const Float div_tau_z_p = dev_ns_div_tau_z[vidx(i_x,i_y,i_z+1)];
       +
       +        const Float3 v_f = MAKE_FLOAT3(
       +                (v_x + v_x_p)/2.0,
       +                (v_y + v_y_p)/2.0,
       +                (v_z + v_z_p)/2.0);
       +
       +        const Float3 div_tau = MAKE_FLOAT3(
       +                (div_tau_x + div_tau_x_p)/2.0,
       +                (div_tau_y + div_tau_y_p)/2.0,
       +                (div_tau_z + div_tau_z_p)/2.0);
        
                const Float3 v_rel = v_f - v_p;
                const Float  v_rel_length = length(v_rel);
        @@ -2844,13 +2991,7 @@ __global__ void findInteractionForce(
                    -1.0*gradient(dev_ns_p, i_x, i_y, i_z, dx, dy, dz)*V_p;
        
                // Viscous force
       -        Float3 f_v;
       -        if (iter == 0) 
       -            f_v = MAKE_FLOAT3(0.0, 0.0, 0.0);
       -        else
       -            f_v =
       -                -1.0*divergence_tensor(dev_ns_tau, i_x, i_y, i_z, dx, dy, dz)
       -                *V_p;
       +        const Float3 f_v = -1.0*div_tau*V_p;
        
                // Interaction force on particle (force) and fluid (f_pf)
                __syncthreads();
        @@ -3020,6 +3161,156 @@ __global__ void applyInteractionForceToFluid(
            }
        }*/
        
       +// Launch per cell face node.
       +// Cell center ghost nodes must be set prior to call.
       +__global__ void interpolateCenterToFace(
       +        Float3* dev_in,
       +        Float*  dev_out_x,
       +        Float*  dev_out_y,
       +        Float*  dev_out_z)
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +    // Check that we are not outside the fluid grid
       +    if (x <= devC_grid.num[0]
       +            && y <= devC_grid.num[1]
       +            && z <= devC_grid.num[2]) {
       +
       +        const unsigned int faceidx = vidx(x,y,z);
       +
       +        __syncthreads();
       +        const Float3 xn = dev_in[idx(x-1,y,z)];
       +        const Float3 yn = dev_in[idx(x,y-1,z)];
       +        const Float3 zn = dev_in[idx(x,y,z-1)];
       +        const Float3 center = dev_in[idx(x,y,z)];
       +
        +        const Float x_val = (center.x + xn.x)/2.0;
        +        const Float y_val = (center.y + yn.y)/2.0;
        +        const Float z_val = (center.z + zn.z)/2.0;
       +
       +        __syncthreads();
       +        dev_out_x[faceidx] = x_val;
       +        dev_out_y[faceidx] = y_val;
       +        dev_out_z[faceidx] = z_val;
       +    }
       +}
       +
       +// Launch per cell center node
       +__global__ void interpolateFaceToCenter(
       +        Float*  dev_in_x,
       +        Float*  dev_in_y,
       +        Float*  dev_in_z,
       +        Float3* dev_out)
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +    // Check that we are not outside the fluid grid
       +    if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
       +
       +        const unsigned int cellidx = idx(x,y,z);
       +
       +        __syncthreads();
        +        const Float x_n = dev_in_x[vidx(x,y,z)];
        +        const Float x_p = dev_in_x[vidx(x+1,y,z)];
        +        const Float y_n = dev_in_y[vidx(x,y,z)];
        +        const Float y_p = dev_in_y[vidx(x,y+1,z)];
        +        const Float z_n = dev_in_z[vidx(x,y,z)];
        +        const Float z_p = dev_in_z[vidx(x,y,z+1)];
       +
        +        const Float3 val = MAKE_FLOAT3(
       +                (x_n + x_p)/2.0,
       +                (y_n + y_p)/2.0,
       +                (z_n + z_p)/2.0);
       +
       +        __syncthreads();
       +        dev_out[cellidx] = val;
       +    }
       +}
       +
       +// Launch per cell face node. Set velocity ghost nodes beforehand.
       +// Find div(tau) at all cell faces.
        +__global__ void findFaceDivTau(
       +        Float* dev_ns_v_x,
       +        Float* dev_ns_v_y,
       +        Float* dev_ns_v_z,
       +        Float* dev_ns_div_tau_x,
       +        Float* dev_ns_div_tau_y,
       +        Float* dev_ns_div_tau_z)
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +    // Grid dimensions
       +    const unsigned int nx = devC_grid.num[0];
       +    const unsigned int ny = devC_grid.num[1];
       +    const unsigned int nz = devC_grid.num[2];
       +
       +    // Cell sizes
       +    const Float dx = devC_grid.L[0]/nx;
       +    const Float dy = devC_grid.L[1]/ny;
       +    const Float dz = devC_grid.L[2]/nz;
       +
       +    // Check that we are not outside the fluid grid
       +    if (x <= nx && y <= ny && z <= nz) {
       +
       +        const unsigned int faceidx = vidx(x,y,z);
       +
       +        __syncthreads();
       +        const Float v_x_xn = dev_ns_v_x[vidx(x-1,y,z)];
       +        const Float v_x = dev_ns_v_x[faceidx];
       +        const Float v_x_xp = dev_ns_v_x[vidx(x+1,y,z)];
       +        const Float v_x_yn = dev_ns_v_x[vidx(x,y-1,z)];
       +        const Float v_x_yp = dev_ns_v_x[vidx(x,y+1,z)];
       +        const Float v_x_zn = dev_ns_v_x[vidx(x,y,z-1)];
       +        const Float v_x_zp = dev_ns_v_x[vidx(x,y,z+1)];
       +
        +        const Float v_y_xn = dev_ns_v_y[vidx(x-1,y,z)];
        +        const Float v_y_xp = dev_ns_v_y[vidx(x+1,y,z)];
        +        const Float v_y_yn = dev_ns_v_y[vidx(x,y-1,z)];
        +        const Float v_y = dev_ns_v_y[faceidx];
        +        const Float v_y_yp = dev_ns_v_y[vidx(x,y+1,z)];
        +        const Float v_y_zn = dev_ns_v_y[vidx(x,y,z-1)];
        +        const Float v_y_zp = dev_ns_v_y[vidx(x,y,z+1)];
       +
       +        const Float v_z_xn = dev_ns_v_z[vidx(x-1,y,z)];
       +        const Float v_z_xp = dev_ns_v_z[vidx(x+1,y,z)];
       +        const Float v_z_yn = dev_ns_v_z[vidx(x,y-1,z)];
       +        const Float v_z_yp = dev_ns_v_z[vidx(x,y+1,z)];
       +        const Float v_z_zn = dev_ns_v_z[vidx(x,y,z-1)];
       +        const Float v_z = dev_ns_v_z[faceidx];
       +        const Float v_z_zp = dev_ns_v_z[vidx(x,y,z+1)];
       +
       +        const Float div_tau_x =
       +            devC_params.mu*(
       +                    (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
       +                    (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
       +                    (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
       +        const Float div_tau_y =
       +            devC_params.mu*(
       +                    (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
       +                    (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
       +                    (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
       +        const Float div_tau_z =
       +            devC_params.mu*(
       +                    (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
       +                    (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
       +                    (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));
       +
       +        __syncthreads();
       +        dev_ns_div_tau_x[faceidx] = div_tau_x;
       +        dev_ns_div_tau_y[faceidx] = div_tau_y;
       +        dev_ns_div_tau_z[faceidx] = div_tau_z;
       +    }
       +}
       +
        // Print final heads and free memory
        void DEM::endNSdev()
        {
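        findFaceDivTau above evaluates the viscous term with a seven-point
        Laplacian stencil per velocity component. This matches a Newtonian
        fluid of constant viscosity mu and an (approximately) divergence-free
        velocity field, for which div(tau), with tau = mu*(grad(v) +
        grad(v)^T), reduces to mu*laplacian(v). The commit does not state
        these assumptions explicitly; under them, the x component stored in
        div_tau_x reads, in LaTeX notation,

        (\nabla \cdot \tau)_x \approx \mu \left(
            \frac{v_{x,i+1,j,k} - 2 v_{x,i,j,k} + v_{x,i-1,j,k}}{\Delta x^2} +
            \frac{v_{x,i,j+1,k} - 2 v_{x,i,j,k} + v_{x,i,j-1,k}}{\Delta y^2} +
            \frac{v_{x,i,j,k+1} - 2 v_{x,i,j,k} + v_{x,i,j,k-1}}{\Delta z^2} \right)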
 (DIR) diff --git a/src/sphere.h b/src/sphere.h
        @@ -175,7 +175,7 @@ class DEM {
                Float*  dev_ns_v_x;          // Cell fluid velocity in staggered grid
                Float*  dev_ns_v_y;          // Cell fluid velocity in staggered grid
                Float*  dev_ns_v_z;          // Cell fluid velocity in staggered grid
       -        Float3* dev_ns_v_p;          // Predicted cell fluid velocity
       +        //Float3* dev_ns_v_p;          // Predicted cell fluid velocity
                Float*  dev_ns_v_p_x;        // Predicted cell fluid velocity in st. gr.
                Float*  dev_ns_v_p_y;        // Predicted cell fluid velocity in st. gr.
                Float*  dev_ns_v_p_z;        // Predicted cell fluid velocity in st. gr.
        @@ -184,7 +184,7 @@ class DEM {
                Float3* dev_ns_fi;           // Particle-fluid interaction force
                Float*  dev_ns_phi;          // Cell porosity
                Float*  dev_ns_dphi;         // Cell porosity change
       -        Float3* dev_ns_div_phi_v_v;  // Divegence used in velocity prediction
        +        //Float3* dev_ns_div_phi_v_v;  // Divergence used in velocity prediction
                Float*  dev_ns_epsilon;      // Pressure difference
                Float*  dev_ns_epsilon_new;  // Pressure diff. after Jacobi iteration
                Float*  dev_ns_epsilon_old;  // Pressure diff. before Jacobi iteration
        @@ -193,10 +193,13 @@ class DEM {
                Float*  dev_ns_f1;           // Constant terms in forcing function
                Float3* dev_ns_f2;           // Constant slopes in forcing function
                Float*  dev_ns_v_prod;       // Outer product of fluid velocities
       -        Float*  dev_ns_tau;          // Fluid stress tensor
       -        Float3* dev_ns_div_phi_vi_v; // div(phi*vi*v)
        +        Float*  dev_ns_tau;          // Fluid stress tensor
        +        Float3* dev_ns_div_phi_vi_v; // div(phi*vi*v)
                //Float3* dev_ns_div_phi_tau;  // div(phi*tau)
                Float3* dev_ns_div_tau;      // div(tau)
       +        Float*  dev_ns_div_tau_x;    // div(tau) on x-face
       +        Float*  dev_ns_div_tau_y;    // div(tau) on y-face
       +        Float*  dev_ns_div_tau_z;    // div(tau) on z-face
                Float3* dev_ns_f_pf;         // Particle-fluid interaction force
        
                //// Navier Stokes functions
 (DIR) diff --git a/tests/stokes_law.py b/tests/stokes_law.py
        @@ -8,7 +8,7 @@ import matplotlib.pyplot as plt
        
        print("### Stokes test - single sphere sedimentation ###")
        ## Stokes drag
       -orig = sphere.sim(sid = "stokes_law_gradP", fluid = True)
       +orig = sphere.sim(sid = "stokes_law_set2", fluid = True)
        cleanup(orig)
        orig.defaultParams()
        orig.addParticle([0.5,0.5,1.46], 0.05)