tnavierstokes.cuh - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
       tnavierstokes.cuh (128731B)
       ---
            1 // navierstokes.cuh
            2 // CUDA implementation of porous flow
            3 
            4 #include <iostream>
            5 #include <cuda.h>
            6 //#include <cutil_math.h>
            7 #include <helper_math.h>
            8 
            9 #include "vector_arithmetic.h"  // for arbitrary precision vectors
           10 #include "sphere.h"
           11 #include "datatypes.h"
           12 #include "utility.h"
           13 #include "constants.cuh"
           14 #include "debug.h"
           15 
           16 // Arithmetic mean of two numbers
           17 __inline__ __device__ Float amean(Float a, Float b) {
           18     return (a+b)*0.5;
           19 }
           20 
           21 // Harmonic mean of two numbers
           22 __inline__ __device__ Float hmean(Float a, Float b) {
           23     return (2.0*a*b)/(a+b);
           24 }
           25 
           26 // Helper functions for checking whether a value is NaN or Inf
           27 __device__ int checkFiniteFloat(
           28     const char* desc,
           29     const unsigned int x,
           30     const unsigned int y,
           31     const unsigned int z,
           32     const Float s)
           33 {
           34     __syncthreads();
           35     if (!isfinite(s)) {
           36         printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s);
           37         return 1;
           38     }
           39     return 0;
           40 }
           41 
           42 __device__ int checkFiniteFloat3(
           43     const char* desc,
           44     const unsigned int x,
           45     const unsigned int y,
           46     const unsigned int z,
           47     const Float3 v)
           48 {
           49     __syncthreads();
           50     if (!isfinite(v.x) || !isfinite(v.y)  || !isfinite(v.z)) {
           51         printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n",
           52                x, y, z, desc, v.x, v.y, v.z);
           53         return 1;
           54     }
           55     return 0;
           56 }
           57 
           58 // Initialize memory
           59 void DEM::initNSmemDev(void)
           60 {
           61     // size of scalar field
           62     unsigned int memSizeF = sizeof(Float)*NScells();
           63 
           64     // size of cell-face arrays in staggered grid discretization
           65     unsigned int memSizeFface = sizeof(Float)*NScellsVelocity();
           66 
           67     cudaMalloc((void**)&dev_ns_p, memSizeF);     // hydraulic pressure
           68     cudaMalloc((void**)&dev_ns_v, memSizeF*3);   // cell hydraulic velocity
           69     cudaMalloc((void**)&dev_ns_v_x, memSizeFface);// velocity in stag. grid
           70     cudaMalloc((void**)&dev_ns_v_y, memSizeFface);// velocity in stag. grid
           71     cudaMalloc((void**)&dev_ns_v_z, memSizeFface);// velocity in stag. grid
           72     cudaMalloc((void**)&dev_ns_v_p, memSizeF*3); // predicted cell velocity
           73     cudaMalloc((void**)&dev_ns_v_p_x, memSizeFface); // pred. vel. in stag. grid
           74     cudaMalloc((void**)&dev_ns_v_p_y, memSizeFface); // pred. vel. in stag. grid
           75     cudaMalloc((void**)&dev_ns_v_p_z, memSizeFface); // pred. vel. in stag. grid
           76     cudaMalloc((void**)&dev_ns_vp_avg, memSizeF*3); // avg. particle velocity
           77     cudaMalloc((void**)&dev_ns_d_avg, memSizeF); // avg. particle diameter
           78     cudaMalloc((void**)&dev_ns_F_pf, memSizeF*3);  // interaction force
           79     //cudaMalloc((void**)&dev_ns_F_pf_x, memSizeFface);  // interaction force
           80     //cudaMalloc((void**)&dev_ns_F_pf_y, memSizeFface);  // interaction force
           81     //cudaMalloc((void**)&dev_ns_F_pf_z, memSizeFface);  // interaction force
           82     cudaMalloc((void**)&dev_ns_phi, memSizeF);   // cell porosity
           83     cudaMalloc((void**)&dev_ns_dphi, memSizeF);  // cell porosity change
           84     //cudaMalloc((void**)&dev_ns_div_phi_v_v, memSizeF*3); // div(phi v v)
           85     cudaMalloc((void**)&dev_ns_epsilon, memSizeF); // pressure difference
           86     cudaMalloc((void**)&dev_ns_epsilon_new, memSizeF); // new pressure diff.
           87     cudaMalloc((void**)&dev_ns_epsilon_old, memSizeF); // old pressure diff.
           88     cudaMalloc((void**)&dev_ns_norm, memSizeF);  // normalized residual
           89     cudaMalloc((void**)&dev_ns_f, memSizeF);     // forcing function value
           90     cudaMalloc((void**)&dev_ns_f1, memSizeF);    // constant addition in forcing
           91     cudaMalloc((void**)&dev_ns_f2, memSizeF*3);  // constant slope in forcing
           92     //cudaMalloc((void**)&dev_ns_tau, memSizeF*6); // stress tensor
           93     //cudaMalloc((void**)&dev_ns_div_tau, memSizeF*3); // div(tau), cell center
           94     cudaMalloc((void**)&dev_ns_div_tau_x, memSizeFface); // div(tau), cell face
           95     cudaMalloc((void**)&dev_ns_div_tau_y, memSizeFface); // div(tau), cell face
           96     cudaMalloc((void**)&dev_ns_div_tau_z, memSizeFface); // div(tau), cell face
           97     cudaMalloc((void**)&dev_ns_div_phi_vi_v, memSizeF*3); // div(phi*vi*v)
           98     //cudaMalloc((void**)&dev_ns_div_phi_tau, memSizeF*3);  // div(phi*tau)
           99     cudaMalloc((void**)&dev_ns_f_pf, sizeof(Float3)*np); // particle fluid force
          100     cudaMalloc((void**)&dev_ns_f_d, sizeof(Float4)*np); // drag force
          101     cudaMalloc((void**)&dev_ns_f_p, sizeof(Float4)*np); // pressure force
          102     cudaMalloc((void**)&dev_ns_f_v, sizeof(Float4)*np); // viscous force
          103     cudaMalloc((void**)&dev_ns_f_sum, sizeof(Float4)*np); // sum of int. forces
          104 
          105     checkForCudaErrors("End of initNSmemDev");
          106 }
          107 
          108 // Free memory
          109 void DEM::freeNSmemDev()
          110 {
          111     cudaFree(dev_ns_p);
          112     cudaFree(dev_ns_v);
          113     cudaFree(dev_ns_v_x);
          114     cudaFree(dev_ns_v_y);
          115     cudaFree(dev_ns_v_z);
          116     cudaFree(dev_ns_v_p);
          117     cudaFree(dev_ns_v_p_x);
          118     cudaFree(dev_ns_v_p_y);
          119     cudaFree(dev_ns_v_p_z);
          120     cudaFree(dev_ns_vp_avg);
          121     cudaFree(dev_ns_d_avg);
          122     cudaFree(dev_ns_F_pf);
          123     //cudaFree(dev_ns_F_pf_x);
          124     //cudaFree(dev_ns_F_pf_y);
          125     //cudaFree(dev_ns_F_pf_z);
          126     cudaFree(dev_ns_phi);
          127     cudaFree(dev_ns_dphi);
          128     //cudaFree(dev_ns_div_phi_v_v);
          129     cudaFree(dev_ns_epsilon);
          130     cudaFree(dev_ns_epsilon_new);
          131     cudaFree(dev_ns_epsilon_old);
          132     cudaFree(dev_ns_norm);
          133     cudaFree(dev_ns_f);
          134     cudaFree(dev_ns_f1);
          135     cudaFree(dev_ns_f2);
          136     //cudaFree(dev_ns_tau);
          137     cudaFree(dev_ns_div_phi_vi_v);
          138     //cudaFree(dev_ns_div_phi_tau);
          139     //cudaFree(dev_ns_div_tau);
          140     cudaFree(dev_ns_div_tau_x);
          141     cudaFree(dev_ns_div_tau_y);
          142     cudaFree(dev_ns_div_tau_z);
          143     cudaFree(dev_ns_f_pf);
          144     cudaFree(dev_ns_f_d);
          145     cudaFree(dev_ns_f_p);
          146     cudaFree(dev_ns_f_v);
          147     cudaFree(dev_ns_f_sum);
          148 }
          149 
          150 // Transfer to device
          151 void DEM::transferNStoGlobalDeviceMemory(int statusmsg)
          152 {
          153     checkForCudaErrors("Before attempting cudaMemcpy in "
          154                        "transferNStoGlobalDeviceMemory");
          155 
          156     //if (verbose == 1 && statusmsg == 1)
          157     //std::cout << "  Transfering fluid data to the device:           ";
          158 
          159     // memory size for a scalar field
          160     unsigned int memSizeF  = sizeof(Float)*NScells();
          161 
          162     //writeNSarray(ns.p, "ns.p.txt");
          163 
          164     cudaMemcpy(dev_ns_p, ns.p, memSizeF, cudaMemcpyHostToDevice);
          165     checkForCudaErrors("transferNStoGlobalDeviceMemory after first cudaMemcpy");
          166     cudaMemcpy(dev_ns_v, ns.v, memSizeF*3, cudaMemcpyHostToDevice);
          167     //cudaMemcpy(dev_ns_v_p, ns.v_p, memSizeF*3, cudaMemcpyHostToDevice);
          168     cudaMemcpy(dev_ns_phi, ns.phi, memSizeF, cudaMemcpyHostToDevice);
          169     cudaMemcpy(dev_ns_dphi, ns.dphi, memSizeF, cudaMemcpyHostToDevice);
          170 
          171     checkForCudaErrors("End of transferNStoGlobalDeviceMemory");
          172     //if (verbose == 1 && statusmsg == 1)
          173     //std::cout << "Done" << std::endl;
          174 }
          175 
          176 // Transfer from device
          177 void DEM::transferNSfromGlobalDeviceMemory(int statusmsg)
          178 {
          179     if (verbose == 1 && statusmsg == 1)
          180         std::cout << "  Transfering fluid data from the device:         ";
          181 
          182     // memory size for a scalar field
          183     unsigned int memSizeF  = sizeof(Float)*NScells();
          184 
          185     cudaMemcpy(ns.p, dev_ns_p, memSizeF, cudaMemcpyDeviceToHost);
          186     checkForCudaErrors("In transferNSfromGlobalDeviceMemory, dev_ns_p", 0);
          187     cudaMemcpy(ns.v, dev_ns_v, memSizeF*3, cudaMemcpyDeviceToHost);
          188     cudaMemcpy(ns.v_x, dev_ns_v_x, memSizeF, cudaMemcpyDeviceToHost);
          189     cudaMemcpy(ns.v_y, dev_ns_v_y, memSizeF, cudaMemcpyDeviceToHost);
          190     cudaMemcpy(ns.v_z, dev_ns_v_z, memSizeF, cudaMemcpyDeviceToHost);
          191     //cudaMemcpy(ns.v_p, dev_ns_v_p, memSizeF*3, cudaMemcpyDeviceToHost);
          192     cudaMemcpy(ns.phi, dev_ns_phi, memSizeF, cudaMemcpyDeviceToHost);
          193     cudaMemcpy(ns.dphi, dev_ns_dphi, memSizeF, cudaMemcpyDeviceToHost);
          194     cudaMemcpy(ns.norm, dev_ns_norm, memSizeF, cudaMemcpyDeviceToHost);
          195     cudaMemcpy(ns.f_d, dev_ns_f_d, sizeof(Float4)*np, cudaMemcpyDeviceToHost);
          196     cudaMemcpy(ns.f_p, dev_ns_f_p, sizeof(Float4)*np, cudaMemcpyDeviceToHost);
          197     cudaMemcpy(ns.f_v, dev_ns_f_v, sizeof(Float4)*np, cudaMemcpyDeviceToHost);
          198     cudaMemcpy(ns.f_sum, dev_ns_f_sum, sizeof(Float4)*np,
          199             cudaMemcpyDeviceToHost);
          200 
          201     checkForCudaErrors("End of transferNSfromGlobalDeviceMemory", 0);
          202     if (verbose == 1 && statusmsg == 1)
          203         std::cout << "Done" << std::endl;
          204 }
          205 
          206 // Transfer the normalized residuals from device to host
          207 void DEM::transferNSnormFromGlobalDeviceMemory()
          208 {
          209     cudaMemcpy(ns.norm, dev_ns_norm, sizeof(Float)*NScells(),
          210                cudaMemcpyDeviceToHost);
          211     checkForCudaErrors("End of transferNSnormFromGlobalDeviceMemory");
          212 }
          213 
          214 // Transfer the pressure change from device to host
          215 void DEM::transferNSepsilonFromGlobalDeviceMemory()
          216 {
          217     cudaMemcpy(ns.epsilon, dev_ns_epsilon, sizeof(Float)*NScells(),
          218                cudaMemcpyDeviceToHost);
          219     checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
          220 }
          221 
          222 // Transfer the pressure change from device to host
          223 void DEM::transferNSepsilonNewFromGlobalDeviceMemory()
          224 {
          225     cudaMemcpy(ns.epsilon_new, dev_ns_epsilon_new, sizeof(Float)*NScells(),
          226                cudaMemcpyDeviceToHost);
          227     checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
          228 }
          229 
          230 // Get linear index from 3D grid position
          231 __inline__ __device__ unsigned int idx(
          232     const int x, const int y, const int z)
          233 {
          234     // without ghost nodes
          235     //return x + dev_grid.num[0]*y + dev_grid.num[0]*dev_grid.num[1]*z;
          236 
          237     // with ghost nodes
          238     // the ghost nodes are placed at x,y,z = -1 and WIDTH
          239     return (x+1) + (devC_grid.num[0]+2)*(y+1) +
          240         (devC_grid.num[0]+2)*(devC_grid.num[1]+2)*(z+1);
          241 }
          242 
          243 // Get linear index of velocity node from 3D grid position in staggered grid
          244 __inline__ __device__ unsigned int vidx(
          245     const int x, const int y, const int z)
          246 {
          247     // without ghost nodes
          248     //return x + (devC_grid.num[0]+1)*y
          249     //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z;
          250 
          251     // with ghost nodes
          252     // the ghost nodes are placed at x,y,z = -1 and WIDTH+1
          253     return (x+1) + (devC_grid.num[0]+3)*(y+1)
          254         + (devC_grid.num[0]+3)*(devC_grid.num[1]+3)*(z+1);
          255 }
          256 
          257 // Find averaged cell velocities from cell-face velocities. This function works
          258 // for both normal and predicted velocities. Launch for every cell in the
          259 // dev_ns_v or dev_ns_v_p array. This function does not set the averaged
          260 // velocity values in the ghost node cells.
          261 __global__ void findNSavgVel(
          262     Float3* __restrict__ dev_ns_v,    // out
          263     const Float* __restrict__ dev_ns_v_x,  // in
          264     const Float* __restrict__ dev_ns_v_y,  // in
          265     const Float* __restrict__ dev_ns_v_z)  // in
          266 {
          267 
          268     // 3D thread index
          269     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          270     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          271     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          272 
          273     // check that we are not outside the fluid grid
          274     if (x<devC_grid.num[0] && y<devC_grid.num[1] && z<devC_grid.num[2]-1) {
          275         const unsigned int cellidx = idx(x,y,z);
          276 
          277         // Read cell-face velocities
          278         __syncthreads();
          279         const Float v_xn = dev_ns_v_x[vidx(x,y,z)];
          280         const Float v_xp = dev_ns_v_x[vidx(x+1,y,z)];
          281         const Float v_yn = dev_ns_v_y[vidx(x,y,z)];
          282         const Float v_yp = dev_ns_v_y[vidx(x,y+1,z)];
          283         const Float v_zn = dev_ns_v_z[vidx(x,y,z)];
          284         const Float v_zp = dev_ns_v_z[vidx(x,y,z+1)];
          285 
          286         // Find average velocity using arithmetic means
          287         const Float3 v_bar = MAKE_FLOAT3(
          288             amean(v_xn, v_xp),
          289             amean(v_yn, v_yp),
          290             amean(v_zn, v_zp));
          291 
          292         // Save value
          293         __syncthreads();
          294         dev_ns_v[idx(x,y,z)] = v_bar;
          295     }
          296 }
          297 
          298 // Find cell-face velocities from averaged velocities. This function works for
          299 // both normal and predicted velocities. Launch for every cell in the dev_ns_v
          300 // or dev_ns_v_p array. Make sure that the averaged velocity ghost nodes are set
          301 // beforehand.
          302 __global__ void findNScellFaceVel(
          303     const Float3* __restrict__ dev_ns_v,    // in
          304     Float* __restrict__ dev_ns_v_x,  // out
          305     Float* __restrict__ dev_ns_v_y,  // out
          306     Float* __restrict__ dev_ns_v_z)  // out
          307 {
          308 
          309     // 3D thread index
          310     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          311     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          312     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          313 
          314     // Grid dimensions
          315     const unsigned int nx = devC_grid.num[0];
          316     const unsigned int ny = devC_grid.num[1];
          317     const unsigned int nz = devC_grid.num[2];
          318 
          319     // check that we are not outside the fluid grid
          320     if (x < nx && y < ny && x < nz) {
          321         const unsigned int cellidx = idx(x,y,z);
          322 
          323         // Read the averaged velocity from this cell as well as the required
          324         // components from the neighbor cells
          325         __syncthreads();
          326         const Float3 v = dev_ns_v[idx(x,y,z)];
          327         const Float v_xn = dev_ns_v[idx(x-1,y,z)].x;
          328         const Float v_xp = dev_ns_v[idx(x+1,y,z)].x;
          329         const Float v_yn = dev_ns_v[idx(x,y-1,z)].y;
          330         const Float v_yp = dev_ns_v[idx(x,y+1,z)].y;
          331         const Float v_zn = dev_ns_v[idx(x,y,z-1)].z;
          332         const Float v_zp = dev_ns_v[idx(x,y,z+1)].z;
          333 
          334         // Find cell-face velocities and save them right away
          335         __syncthreads();
          336 
          337         // Values at the faces closest to the coordinate system origo
          338         dev_ns_v_x[vidx(x,y,z)] = amean(v_xn, v.x);
          339         dev_ns_v_y[vidx(x,y,z)] = amean(v_yn, v.y);
          340         dev_ns_v_z[vidx(x,y,z)] = amean(v_zn, v.z);
          341 
          342         // Values at the cell faces furthest from the coordinate system origo.
          343         // These values should only be written at the corresponding boundaries
          344         // in order to avoid write conflicts.
          345         if (x == nx-1)
          346             dev_ns_v_x[vidx(x+1,y,z)] = amean(v.x, v_xp);
          347         if (y == ny-1)
          348             dev_ns_v_x[vidx(x+1,y,z)] = amean(v.y, v_yp);
          349         if (z == nz-1)
          350             dev_ns_v_x[vidx(x+1,y,z)] = amean(v.z, v_zp);
          351     }
          352 }
          353 
          354 
          355 // Set the initial guess of the values of epsilon.
          356 __global__ void setNSepsilonInterior(
          357     Float* __restrict__ dev_ns_epsilon,
          358     const Float value)
          359 {
          360     // 3D thread index
          361     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          362     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          363     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          364 
          365     // check that we are not outside the fluid grid
          366     if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
          367         z > 0 && z < devC_grid.num[2]-1) {
          368         __syncthreads();
          369         const unsigned int cellidx = idx(x,y,z);
          370         dev_ns_epsilon[cellidx] = value;
          371     }
          372 }
          373 
          374 // The normalized residuals are given an initial value of 0, since the values at
          375 // the Dirichlet boundaries aren't written during the iterations.
          376 __global__ void setNSnormZero(Float* __restrict__ dev_ns_norm)
          377 {
          378     // 3D thread index
          379     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          380     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          381     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          382 
          383     // check that we are not outside the fluid grid
          384     if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
          385         __syncthreads();
          386         const unsigned int cellidx = idx(x,y,z);
          387         dev_ns_norm[idx(x,y,z)]    = 0.0;
          388     }
          389 }
          390 
          391 
          392 // Set the constant values of epsilon at the lower boundary.  Since the
          393 // Dirichlet boundary values aren't transfered during array swapping, the values
          394 // also need to be written to the new array of epsilons.  A value of 0 equals
          395 // the Dirichlet boundary condition: the new value should be identical to the
          396 // old value, i.e. the temporal gradient is 0
          397 __global__ void setNSepsilonBottom(
          398     Float* __restrict__ dev_ns_epsilon,
          399     Float* __restrict__ dev_ns_epsilon_new,
          400     const Float value)
          401 {
          402     // 3D thread index
          403     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          404     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          405     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          406 
          407     // check that we are not outside the fluid grid, and at the z boundaries
          408     //if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
          409     //        (z == devC_grid.num[2]-1 || z == 0)) {
          410     // check that we are not outside the fluid grid, and at the lower z boundary
          411     if (x < devC_grid.num[0] && y < devC_grid.num[1] && z == 0) {
          412 
          413         __syncthreads();
          414         const unsigned int cellidx = idx(x,y,z);
          415         dev_ns_epsilon[cellidx]     = value;
          416         dev_ns_epsilon_new[cellidx] = value;
          417     }
          418 }
          419 
          420 // Set the constant values of epsilon at the upper boundary.  Since the
          421 // Dirichlet boundary values aren't transfered during array swapping, the values
          422 // also need to be written to the new array of epsilons.
          423 __global__ void setNSepsilonTop(
          424     Float* __restrict__ dev_ns_epsilon,
          425     Float* __restrict__ dev_ns_epsilon_new,
          426     const Float value)
          427 {
          428     // 3D thread index
          429     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          430     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          431     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          432 
          433     // check that we are not outside the fluid grid, and at the upper z boundary
          434     if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
          435         z == devC_grid.num[2]-1) {
          436 
          437         __syncthreads();
          438         const unsigned int cellidx = idx(x,y,z);
          439         dev_ns_epsilon[cellidx]     = value;
          440         dev_ns_epsilon_new[cellidx] = value;
          441     }
          442 }
          443 
          444 // Set the constant values of epsilon and grad_z(epsilon) at the upper wall, if
          445 // it is dynamic (Dirichlet+Neumann). Since the Dirichlet boundary values aren't
          446 // transfered during array swapping, the values also need to be written to the
          447 // new array of epsilons.
          448 __global__ void setNSepsilonAtTopWall(
          449     Float* __restrict__ dev_ns_epsilon,
          450     Float* __restrict__ dev_ns_epsilon_new,
          451     const unsigned int iz,
          452     const Float value,
          453     const Float dp_dz)
          454 {
          455     // 3D thread index
          456     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          457     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          458     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          459 
          460     const unsigned int cellidx = idx(x,y,z);
          461 
          462     // cells containing the wall (Dirichlet BC)
          463     if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2] &&
          464             z == iz) {
          465         __syncthreads();
          466         dev_ns_epsilon[cellidx]     = value;
          467         dev_ns_epsilon_new[cellidx] = value;
          468     }
          469 
          470     // cells above the wall (Neumann BC)
          471     if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2] &&
          472             z == iz+1) {
          473 
          474         // Pressure value in order to obtain hydrostatic pressure distribution
          475         // for Neumann BC. The pressure should equal the value at the top wall
          476         // minus the contribution due to the fluid weight.
          477         // p_iz+1 = p_iz - rho_f*g*dz
          478         const Float p = value - dp_dz;
          479 
          480         __syncthreads();
          481         dev_ns_epsilon[cellidx]     = p;
          482         dev_ns_epsilon_new[cellidx] = p;
          483     }
          484 }
          485 
          486 __device__ void copyNSvalsDev(
          487     const unsigned int read,
          488     const unsigned int write,
          489     Float*  __restrict__ dev_ns_p,
          490     Float3* __restrict__ dev_ns_v,
          491     Float3* __restrict__ dev_ns_v_p,
          492     Float*  __restrict__ dev_ns_phi,
          493     Float*  __restrict__ dev_ns_dphi,
          494     Float*  __restrict__ dev_ns_epsilon)
          495 {
          496     // Coalesced read
          497     const Float  p       = dev_ns_p[read];
          498     const Float3 v       = dev_ns_v[read];
          499     const Float3 v_p     = dev_ns_v_p[read];
          500     const Float  phi     = dev_ns_phi[read];
          501     const Float  dphi    = dev_ns_dphi[read];
          502     const Float  epsilon = dev_ns_epsilon[read];
          503 
          504     // Coalesced write
          505     __syncthreads();
          506     dev_ns_p[write]       = p;
          507     dev_ns_v[write]       = v;
          508     dev_ns_v_p[write]     = v_p;
          509     dev_ns_phi[write]     = phi;
          510     dev_ns_dphi[write]    = dphi;
          511     dev_ns_epsilon[write] = epsilon;
          512 }
          513 
          514 
          515 // Update ghost nodes from their parent cell values. The edge (diagonal) cells
          516 // are not written since they are not read. Launch this kernel for all cells in
          517 // the grid
          518 __global__ void setNSghostNodesDev(
          519     Float*  __restrict__ dev_ns_p,
          520     Float3* __restrict__ dev_ns_v,
          521     Float3* __restrict__ dev_ns_v_p,
          522     Float*  __restrict__ dev_ns_phi,
          523     Float*  __restrict__ dev_ns_dphi,
          524     Float*  __restrict__ dev_ns_epsilon)
          525 {
          526     // 3D thread index
          527     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          528     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          529     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          530 
          531     // Grid dimensions
          532     const unsigned int nx = devC_grid.num[0];
          533     const unsigned int ny = devC_grid.num[1];
          534     const unsigned int nz = devC_grid.num[2];
          535 
          536     // 1D thread index
          537     const unsigned int cellidx = idx(x,y,z);
          538 
          539     // 1D position of ghost node
          540     unsigned int writeidx;
          541 
          542     // check that we are not outside the fluid grid
          543     if (x < nx && y < ny && z < nz) {
          544 
          545         if (x == 0) {
          546             writeidx = idx(nx,y,z);
          547             copyNSvalsDev(cellidx, writeidx,
          548                           dev_ns_p,
          549                           dev_ns_v, dev_ns_v_p,
          550                           dev_ns_phi, dev_ns_dphi,
          551                           dev_ns_epsilon);
          552         }
          553         if (x == nx-1) {
          554             writeidx = idx(-1,y,z);
          555             copyNSvalsDev(cellidx, writeidx,
          556                           dev_ns_p,
          557                           dev_ns_v, dev_ns_v_p,
          558                           dev_ns_phi, dev_ns_dphi,
          559                           dev_ns_epsilon);
          560         }
          561 
          562         if (y == 0) {
          563             writeidx = idx(x,ny,z);
          564             copyNSvalsDev(cellidx, writeidx,
          565                           dev_ns_p,
          566                           dev_ns_v, dev_ns_v_p,
          567                           dev_ns_phi, dev_ns_dphi,
          568                           dev_ns_epsilon);
          569         }
          570         if (y == ny-1) {
          571             writeidx = idx(x,-1,z);
          572             copyNSvalsDev(cellidx, writeidx,
          573                           dev_ns_p,
          574                           dev_ns_v, dev_ns_v_p,
          575                           dev_ns_phi, dev_ns_dphi,
          576                           dev_ns_epsilon);
          577         }
          578 
          579         // Z boundaries fixed
          580         if (z == 0) {
          581             writeidx = idx(x,y,-1);
          582             copyNSvalsDev(cellidx, writeidx,
          583                           dev_ns_p,
          584                           dev_ns_v, dev_ns_v_p,
          585                           dev_ns_phi, dev_ns_dphi,
          586                           dev_ns_epsilon);
          587         }
          588         if (z == nz-1) {
          589             writeidx = idx(x,y,nz);
          590             copyNSvalsDev(cellidx, writeidx,
          591                           dev_ns_p,
          592                           dev_ns_v, dev_ns_v_p,
          593                           dev_ns_phi, dev_ns_dphi,
          594                           dev_ns_epsilon);
          595         }
          596 
          597         // Z boundaries periodic
          598         /*if (z == 0) {
          599           writeidx = idx(x,y,nz);
          600           copyNSvalsDev(cellidx, writeidx,
          601           dev_ns_p,
          602           dev_ns_v, dev_ns_v_p,
          603           dev_ns_phi, dev_ns_dphi,
          604           dev_ns_epsilon);
          605           }
          606           if (z == nz-1) {
          607           writeidx = idx(x,y,-1);
          608           copyNSvalsDev(cellidx, writeidx,
          609           dev_ns_p,
          610           dev_ns_v, dev_ns_v_p,
          611           dev_ns_phi, dev_ns_dphi,
          612           dev_ns_epsilon);
          613           }*/
          614     }
          615 }
          616 
          617 // Update a field in the ghost nodes from their parent cell values. The edge
          618 // (diagonal) cells are not written since they are not read. Launch this kernel
          619 // for all cells in the grid usind setNSghostNodes<datatype><<<.. , ..>>>( .. );
          620 template<typename T>
          621 __global__ void setNSghostNodes(T* __restrict__ dev_scalarfield)
          622 {
          623     // 3D thread index
          624     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          625     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          626     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          627 
          628     // Grid dimensions
          629     const unsigned int nx = devC_grid.num[0];
          630     const unsigned int ny = devC_grid.num[1];
          631     const unsigned int nz = devC_grid.num[2];
          632 
          633     // check that we are not outside the fluid grid
          634     if (x < nx && y < ny && z < nz) {
          635 
          636         const T val = dev_scalarfield[idx(x,y,z)];
          637 
          638         if (x == 0)
          639             dev_scalarfield[idx(nx,y,z)] = val;
          640         if (x == nx-1)
          641             dev_scalarfield[idx(-1,y,z)] = val;
          642 
          643         if (y == 0)
          644             dev_scalarfield[idx(x,ny,z)] = val;
          645         if (y == ny-1)
          646             dev_scalarfield[idx(x,-1,z)] = val;
          647 
          648         if (z == 0)
          649             dev_scalarfield[idx(x,y,-1)] = val;     // Dirichlet
          650         //dev_scalarfield[idx(x,y,nz)] = val;    // Periodic -z
          651         if (z == nz-1)
          652             dev_scalarfield[idx(x,y,nz)] = val;     // Dirichlet
          653         //dev_scalarfield[idx(x,y,-1)] = val;    // Periodic +z
          654     }
          655 }
          656 
          657 // Update a field in the ghost nodes from their parent cell values. The edge
          658 // (diagonal) cells are not written since they are not read.
          659 template<typename T>
          660 __global__ void setNSghostNodes(
          661     T* __restrict__ dev_scalarfield,
          662     const int bc_bot,
          663     const int bc_top)
          664 {
          665     // 3D thread index
          666     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          667     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          668     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          669 
          670     // Grid dimensions
          671     const unsigned int nx = devC_grid.num[0];
          672     const unsigned int ny = devC_grid.num[1];
          673     const unsigned int nz = devC_grid.num[2];
          674 
          675     // check that we are not outside the fluid grid
          676     if (x < nx && y < ny && z < nz) {
          677 
          678         const T val = dev_scalarfield[idx(x,y,z)];
          679 
          680         // x
          681         if (x == 0)
          682             dev_scalarfield[idx(nx,y,z)] = val;
          683         if (x == nx-1)
          684             dev_scalarfield[idx(-1,y,z)] = val;
          685 
          686         // y
          687         if (y == 0)
          688             dev_scalarfield[idx(x,ny,z)] = val;
          689         if (y == ny-1)
          690             dev_scalarfield[idx(x,-1,z)] = val;
          691 
          692         // z
          693         if (z == 0 && bc_bot == 0)
          694             dev_scalarfield[idx(x,y,-1)] = val;     // Dirichlet
          695         //if (z == 1 && bc_bot == 1)
          696         if (z == 0 && bc_bot == 1)
          697             dev_scalarfield[idx(x,y,-1)] = val;     // Neumann
          698         if (z == 0 && bc_bot == 2)
          699             dev_scalarfield[idx(x,y,nz)] = val;     // Periodic -z
          700 
          701         if (z == nz-1 && bc_top == 0)
          702             dev_scalarfield[idx(x,y,nz)] = val;     // Dirichlet
          703         if (z == nz-2 && bc_top == 1)
          704             dev_scalarfield[idx(x,y,nz)] = val;     // Neumann
          705         if (z == nz-1 && bc_top == 2)
          706             dev_scalarfield[idx(x,y,-1)] = val;     // Periodic +z
          707     }
          708 }
          709 
          710 // Update a field in the ghost nodes from their parent cell values. The edge
          711 // (diagonal) cells are not written since they are not read.
          712 // Launch per face.
          713 // According to Griebel et al. 1998 "Numerical Simulation in Fluid Dynamics"
          714 template<typename T>
          715 __global__ void setNSghostNodesFace(
          716     T* __restrict__ dev_scalarfield_x,
          717     T* __restrict__ dev_scalarfield_y,
          718     T* __restrict__ dev_scalarfield_z,
          719     const int bc_bot,
          720     const int bc_top)
          721 {
          722     // 3D thread index
          723     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          724     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          725     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          726 
          727     // Grid dimensions
          728     const unsigned int nx = devC_grid.num[0];
          729     const unsigned int ny = devC_grid.num[1];
          730     const unsigned int nz = devC_grid.num[2];
          731 
          732     // check that we are not outside the fluid grid
          733     //if (x <= nx && y <= ny && z <= nz) {
          734     if (x < nx && y < ny && z < nz) {
          735 
          736         const T val_x = dev_scalarfield_x[vidx(x,y,z)];
          737         const T val_y = dev_scalarfield_y[vidx(x,y,z)];
          738         const T val_z = dev_scalarfield_z[vidx(x,y,z)];
          739 
          740         // x (periodic)
          741         if (x == 0) {
          742             dev_scalarfield_x[vidx(nx,y,z)] = val_x;
          743             dev_scalarfield_y[vidx(nx,y,z)] = val_y;
          744             dev_scalarfield_z[vidx(nx,y,z)] = val_z;
          745         }
          746         if (x == 1) {
          747             dev_scalarfield_x[vidx(nx+1,y,z)] = val_x;
          748         }
          749         if (x == nx-1) {
          750             dev_scalarfield_x[vidx(-1,y,z)] = val_x;
          751             dev_scalarfield_y[vidx(-1,y,z)] = val_y;
          752             dev_scalarfield_z[vidx(-1,y,z)] = val_z;
          753         }
          754 
          755         // z ghost nodes at x = -1 and z = nz,
          756         // equal to the ghost node at x = nx-1 and z = nz
          757         if (z == nz-1 && x == nx-1 && bc_top == 0) // Dirichlet +z
          758             dev_scalarfield_z[vidx(-1,y,nz)] = val_z;
          759 
          760         if (z == nz-1 && x == nx-1 && (bc_top == 1 || bc_top == 2)) //Neumann +z
          761             dev_scalarfield_z[vidx(-1,y,nz)] = 0.0;
          762 
          763         if (z == 0 && x == nx-1 && bc_top == 3) // Periodic +z
          764             dev_scalarfield_z[vidx(-1,y,nz)] = val_z;
          765 
          766         // z ghost nodes at y = -1 and z = nz,
          767         // equal to the ghost node at y = ny-1 and z = nz
          768         if (z == nz-1 && y == ny-1 && bc_top == 0) // Dirichlet +z
          769             dev_scalarfield_z[vidx(x,-1,nz)] = val_z;
          770 
          771         if (z == nz-1 && y == ny-1 && (bc_top == 1 || bc_top == 2)) //Neumann +z
          772             dev_scalarfield_z[vidx(x,-1,nz)] = 0.0;
          773 
          774         if (z == 0 && y == ny-1 && bc_top == 3) // Periodic +z
          775             dev_scalarfield_z[vidx(x,-1,nz)] = val_z;
          776 
          777 
          778         // x ghost nodes at x = nx and z = -1,
          779         // equal to the ghost nodes at x = 0 and z = -1
          780         // Dirichlet, Neumann free slip or periodic -z
          781         if (z == 0 && x == 0 && (bc_bot == 0 || bc_bot == 1 || bc_bot == 3))
          782             dev_scalarfield_x[vidx(nx,y,-1)] = val_x;
          783 
          784         if (z == 0 && x == 0 && bc_bot == 2) // Neumann no slip -z
          785             dev_scalarfield_x[vidx(nx,y,-1)] = -val_x;
          786 
          787         // y ghost nodes at y = ny and z = -1,
          788         // equal to the ghost node at x = 0 and z = -1
          789         // Dirichlet, Neumann free slip or periodic -z
          790         if (z == 0 && y == 0 && (bc_bot == 0 || bc_bot == 1 || bc_bot == 3))
          791             dev_scalarfield_y[vidx(x,ny,-1)] = val_y;
          792 
          793         if (z == 0 && y == 0 && bc_bot == 2) // Neumann no slip -z
          794             dev_scalarfield_y[vidx(x,ny,-1)] = -val_y;
          795 
          796 
          797         // z ghost nodes at x = nx and z = nz
          798         // equal to the ghost node at x = 0 and z = nz
          799         if (z == nz-1 && x == 0 && (bc_top == 0 || bc_top == 3)) // D. or p. +z
          800             dev_scalarfield_z[vidx(nx,y,nz)] = val_z;
          801 
          802         if (z == nz-1 && x == 0 && (bc_top == 1 || bc_top == 2)) // N. +z
          803             dev_scalarfield_z[vidx(nx,y,nz)] = 0.0;
          804 
          805         // z ghost nodes at y = ny and z = nz
          806         // equal to the ghost node at y = 0 and z = nz
          807         if (z == nz-1 && y == 0 && (bc_top == 0 || bc_top == 3)) // D. or p. +z
          808             dev_scalarfield_z[vidx(x,ny,nz)] = val_z;
          809 
          810         if (z == nz-1 && y == 0 && (bc_top == 1 || bc_top == 2)) // N. +z
          811             dev_scalarfield_z[vidx(x,ny,nz)] = 0.0;
          812 
          813 
          814         // x ghost nodes at x = nx and z = nz,
          815         // equal to the ghost nodes at x = 0 and z = nz
          816         // Dirichlet, Neumann free slip or periodic +z
          817         if (z == nz-1 && x == 0 && (bc_bot == 0 || bc_bot == 1 || bc_bot == 3))
          818             dev_scalarfield_x[vidx(nx,y,nz)] = val_x;
          819 
          820         if (z == nz-1 && x == 0 && bc_bot == 2) // Neumann no slip -z
          821             dev_scalarfield_x[vidx(nx,y,nz)] = -val_x;
          822 
          823         // y ghost nodes at y = ny and z = nz,
          824         // equal to the ghost nodes at y = 0 and z = nz
          825         // Dirichlet, Neumann free slip or periodic +z
          826         if (z == nz-1 && y == 0 && (bc_bot == 0 || bc_bot == 1 || bc_bot == 3))
          827             dev_scalarfield_y[vidx(x,ny,nz)] = val_y;
          828 
          829         if (z == nz-1 && y == 0 && bc_bot == 2) // Neumann no slip -z
          830             dev_scalarfield_y[vidx(x,ny,nz)] = -val_y;
          831 
          832 
          833         // y (periodic)
          834         if (y == 0) {
          835             dev_scalarfield_x[vidx(x,ny,z)] = val_x;
          836             dev_scalarfield_y[vidx(x,ny,z)] = val_y;
          837             dev_scalarfield_z[vidx(x,ny,z)] = val_z;
          838         }
          839         if (y == 1) {
          840             dev_scalarfield_y[vidx(x,ny+1,z)] = val_y;
          841         }
          842         if (y == ny-1) {
          843             dev_scalarfield_x[vidx(x,-1,z)] = val_x;
          844             dev_scalarfield_y[vidx(x,-1,z)] = val_y;
          845             dev_scalarfield_z[vidx(x,-1,z)] = val_z;
          846         }
          847 
          848         // z
          849         if (z == 0 && bc_bot == 0) {
          850             dev_scalarfield_x[vidx(x,y,-1)] = val_y;     // Dirichlet -z
          851             dev_scalarfield_y[vidx(x,y,-1)] = val_x;     // Dirichlet -z
          852             dev_scalarfield_z[vidx(x,y,-1)] = val_z;     // Dirichlet -z
          853         }
          854         if (z == 0 && bc_bot == 1) {
          855             //dev_scalarfield_x[vidx(x,y,-1)] = val_x;   // Neumann free slip -z
          856             //dev_scalarfield_y[vidx(x,y,-1)] = val_y;   // Neumann free slip -z
          857             //dev_scalarfield_z[vidx(x,y,-1)] = val_z;   // Neumann free slip -z
          858             dev_scalarfield_x[vidx(x,y,-1)] = val_x;     // Neumann free slip -z
          859             dev_scalarfield_y[vidx(x,y,-1)] = val_y;     // Neumann free slip -z
          860             dev_scalarfield_z[vidx(x,y,-1)] = 0.0;       // Neumann free slip -z
          861         }
          862         if (z == 0 && bc_bot == 2) {
          863             //dev_scalarfield_x[vidx(x,y,-1)] = val_x;     // Neumann no slip -z
          864             //dev_scalarfield_y[vidx(x,y,-1)] = val_y;     // Neumann no slip -z
          865             //dev_scalarfield_z[vidx(x,y,-1)] = val_z;     // Neumann no slip -z
          866             dev_scalarfield_x[vidx(x,y,-1)] = -val_x;    // Neumann no slip -z
          867             dev_scalarfield_y[vidx(x,y,-1)] = -val_y;    // Neumann no slip -z
          868             dev_scalarfield_z[vidx(x,y,-1)] = 0.0;       // Neumann no slip -z
          869         }
          870         if (z == 0 && bc_bot == 3) {
          871             dev_scalarfield_x[vidx(x,y,nz)] = val_x;     // Periodic -z
          872             dev_scalarfield_y[vidx(x,y,nz)] = val_y;     // Periodic -z
          873             dev_scalarfield_z[vidx(x,y,nz)] = val_z;     // Periodic -z
          874         }
          875         if (z == 1 && bc_bot == 3) {
          876             dev_scalarfield_z[vidx(x,y,nz+1)] = val_z;   // Periodic -z
          877         }
          878 
          879         if (z == nz-1 && bc_top == 0) {
          880             dev_scalarfield_z[vidx(x,y,nz)] = val_z;     // Dirichlet +z
          881         }
          882         if (z == nz-1 && bc_top == 1) {
          883             //dev_scalarfield_x[vidx(x,y,nz)] = val_x;   // Neumann free slip +z
          884             //dev_scalarfield_y[vidx(x,y,nz)] = val_y;   // Neumann free slip +z
          885             //dev_scalarfield_z[vidx(x,y,nz)] = val_z;   // Neumann free slip +z
          886             //dev_scalarfield_z[vidx(x,y,nz+1)] = val_z; // Neumann free slip +z
          887             dev_scalarfield_x[vidx(x,y,nz)] = val_x;     // Neumann free slip +z
          888             dev_scalarfield_y[vidx(x,y,nz)] = val_y;     // Neumann free slip +z
          889             dev_scalarfield_z[vidx(x,y,nz)] = 0.0;     // Neumann free slip +z
          890             dev_scalarfield_z[vidx(x,y,nz+1)] = 0.0;   // Neumann free slip +z
          891         }
          892         if (z == nz-1 && bc_top == 2) {
          893             //dev_scalarfield_x[vidx(x,y,nz)] = val_x;     // Neumann no slip +z
          894             //dev_scalarfield_y[vidx(x,y,nz)] = val_y;     // Neumann no slip +z
          895             //dev_scalarfield_z[vidx(x,y,nz)] = val_z;     // Neumann no slip +z
          896             //dev_scalarfield_z[vidx(x,y,nz+1)] = val_z;   // Neumann no slip +z
          897             dev_scalarfield_x[vidx(x,y,nz)] = -val_x;    // Neumann no slip +z
          898             dev_scalarfield_y[vidx(x,y,nz)] = -val_y;    // Neumann no slip +z
          899             dev_scalarfield_z[vidx(x,y,nz)] = 0.0;       // Neumann no slip +z
          900             dev_scalarfield_z[vidx(x,y,nz+1)] = 0.0;     // Neumann no slip +z
          901         }
          902         if (z == nz-1 && bc_top == 3) {
          903             dev_scalarfield_x[vidx(x,y,-1)] = val_x;     // Periodic +z
          904             dev_scalarfield_y[vidx(x,y,-1)] = val_y;     // Periodic +z
          905             dev_scalarfield_z[vidx(x,y,-1)] = val_z;     // Periodic +z
          906         }
          907     }
          908 }
          909 
          910 // Update the tensor field for the ghost nodes from their parent cell values.
          911 // The edge (diagonal) cells are not written since they are not read. Launch
          912 // this kernel for all cells in the grid.
          913 __global__ void setNSghostNodes_tau(
          914     Float* __restrict__ dev_ns_tau,
          915     const int bc_bot,
          916     const int bc_top)
          917 {
          918     // 3D thread index
          919     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
          920     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
          921     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
          922 
          923     // Grid dimensions
          924     const unsigned int nx = devC_grid.num[0];
          925     const unsigned int ny = devC_grid.num[1];
          926     const unsigned int nz = devC_grid.num[2];
          927 
          928     // check that we are not outside the fluid grid
          929     if (x < nx && y < ny && z < nz) {
          930 
          931         // Linear index of length-6 vector field entry
          932         unsigned int cellidx6 = idx(x,y,z)*6;
          933 
          934         // Read parent values
          935         __syncthreads();
          936         const Float tau_xx = dev_ns_tau[cellidx6];
          937         const Float tau_xy = dev_ns_tau[cellidx6+1];
          938         const Float tau_xz = dev_ns_tau[cellidx6+2];
          939         const Float tau_yy = dev_ns_tau[cellidx6+3];
          940         const Float tau_yz = dev_ns_tau[cellidx6+4];
          941         const Float tau_zz = dev_ns_tau[cellidx6+5];
          942 
          943         // x
          944         if (x == 0) {
          945             cellidx6 = idx(nx,y,z)*6;
          946             dev_ns_tau[cellidx6]   = tau_xx;
          947             dev_ns_tau[cellidx6+1] = tau_xy;
          948             dev_ns_tau[cellidx6+2] = tau_xz;
          949             dev_ns_tau[cellidx6+3] = tau_yy;
          950             dev_ns_tau[cellidx6+4] = tau_yz;
          951             dev_ns_tau[cellidx6+5] = tau_zz;
          952         }
          953         if (x == nx-1) {
          954             cellidx6 = idx(-1,y,z)*6;
          955             dev_ns_tau[cellidx6]   = tau_xx;
          956             dev_ns_tau[cellidx6+1] = tau_xy;
          957             dev_ns_tau[cellidx6+2] = tau_xz;
          958             dev_ns_tau[cellidx6+3] = tau_yy;
          959             dev_ns_tau[cellidx6+4] = tau_yz;
          960             dev_ns_tau[cellidx6+5] = tau_zz;
          961         }
          962 
          963         // y
          964         if (y == 0) {
          965             cellidx6 = idx(x,ny,z)*6;
          966             dev_ns_tau[cellidx6]   = tau_xx;
          967             dev_ns_tau[cellidx6+1] = tau_xy;
          968             dev_ns_tau[cellidx6+2] = tau_xz;
          969             dev_ns_tau[cellidx6+3] = tau_yy;
          970             dev_ns_tau[cellidx6+4] = tau_yz;
          971             dev_ns_tau[cellidx6+5] = tau_zz;
          972         }
          973         if (y == ny-1) {
          974             cellidx6 = idx(x,-1,z)*6;
          975             dev_ns_tau[cellidx6]   = tau_xx;
          976             dev_ns_tau[cellidx6+1] = tau_xy;
          977             dev_ns_tau[cellidx6+2] = tau_xz;
          978             dev_ns_tau[cellidx6+3] = tau_yy;
          979             dev_ns_tau[cellidx6+4] = tau_yz;
          980             dev_ns_tau[cellidx6+5] = tau_zz;
          981         }
          982 
          983         // z
          984         if (z == 0 && bc_bot == 0) {  // Dirichlet
          985             cellidx6 = idx(x,y,-1)*6;
          986             dev_ns_tau[cellidx6]   = tau_xx;
          987             dev_ns_tau[cellidx6+1] = tau_xy;
          988             dev_ns_tau[cellidx6+2] = tau_xz;
          989             dev_ns_tau[cellidx6+3] = tau_yy;
          990             dev_ns_tau[cellidx6+4] = tau_yz;
          991             dev_ns_tau[cellidx6+5] = tau_zz;
          992         }
          993         if (z == 1 && bc_bot == 1) {  // Neumann
          994             cellidx6 = idx(x,y,-1)*6;
          995             dev_ns_tau[cellidx6]   = tau_xx;
          996             dev_ns_tau[cellidx6+1] = tau_xy;
          997             dev_ns_tau[cellidx6+2] = tau_xz;
          998             dev_ns_tau[cellidx6+3] = tau_yy;
          999             dev_ns_tau[cellidx6+4] = tau_yz;
         1000             dev_ns_tau[cellidx6+5] = tau_zz;
         1001         }
         1002         if (z == 0 && bc_bot == 2) {  // Periodic
         1003             cellidx6 = idx(x,y,nz)*6;
         1004             dev_ns_tau[cellidx6]   = tau_xx;
         1005             dev_ns_tau[cellidx6+1] = tau_xy;
         1006             dev_ns_tau[cellidx6+2] = tau_xz;
         1007             dev_ns_tau[cellidx6+3] = tau_yy;
         1008             dev_ns_tau[cellidx6+4] = tau_yz;
         1009             dev_ns_tau[cellidx6+5] = tau_zz;
         1010         }
         1011 
         1012         if (z == nz-1 && bc_top == 0) {  // Dirichlet
         1013             cellidx6 = idx(x,y,nz)*6;
         1014             dev_ns_tau[cellidx6]   = tau_xx;
         1015             dev_ns_tau[cellidx6+1] = tau_xy;
         1016             dev_ns_tau[cellidx6+2] = tau_xz;
         1017             dev_ns_tau[cellidx6+3] = tau_yy;
         1018             dev_ns_tau[cellidx6+4] = tau_yz;
         1019             dev_ns_tau[cellidx6+5] = tau_zz;
         1020         }
         1021         if (z == nz-2 && bc_top == 1) {  // Neumann
         1022             cellidx6 = idx(x,y,nz)*6;
         1023             dev_ns_tau[cellidx6]   = tau_xx;
         1024             dev_ns_tau[cellidx6+1] = tau_xy;
         1025             dev_ns_tau[cellidx6+2] = tau_xz;
         1026             dev_ns_tau[cellidx6+3] = tau_yy;
         1027             dev_ns_tau[cellidx6+4] = tau_yz;
         1028             dev_ns_tau[cellidx6+5] = tau_zz;
         1029         }
         1030         if (z == nz-1 && bc_top == 2) {  // Periodic
         1031             cellidx6 = idx(x,y,-1)*6;
         1032             dev_ns_tau[cellidx6]   = tau_xx;
         1033             dev_ns_tau[cellidx6+1] = tau_xy;
         1034             dev_ns_tau[cellidx6+2] = tau_xz;
         1035             dev_ns_tau[cellidx6+3] = tau_yy;
         1036             dev_ns_tau[cellidx6+4] = tau_yz;
         1037             dev_ns_tau[cellidx6+5] = tau_zz;
         1038         }
         1039     }
         1040 }
         1041 
         1042 // Update a the forcing values in the ghost nodes from their parent cell values.
         1043 // The edge (diagonal) cells are not written since they are not read. Launch
         1044 // this kernel for all cells in the grid.
         1045 /*
         1046   __global__ void setNSghostNodesForcing(
         1047   Float*  dev_ns_f1,
         1048   Float3* dev_ns_f2,
         1049   Float*  dev_ns_f,
         1050   unsigned int nijac)
         1051 
         1052   {
         1053   // 3D thread index
         1054   const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         1055   const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         1056   const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         1057 
         1058   // Grid dimensions
         1059   const unsigned int nx = devC_grid.num[0];
         1060   const unsigned int ny = devC_grid.num[1];
         1061   const unsigned int nz = devC_grid.num[2];
         1062 
         1063   // 1D thread index
         1064   unsigned int cellidx = idx(x,y,z);
         1065 
         1066   // check that we are not outside the fluid grid
         1067   if (x < nx && y < ny && z < nz) {
         1068 
         1069   __syncthreads();
         1070   const Float f  = dev_ns_f[cellidx];
         1071   Float  f1;
         1072   Float3 f2;
         1073 
         1074   if (nijac == 0) {
         1075   __syncthreads();
         1076   f1 = dev_ns_f1[cellidx];
         1077   f2 = dev_ns_f2[cellidx];
         1078   }
         1079 
         1080   if (x == 0) {
         1081   cellidx = idx(nx,y,z);
         1082   dev_ns_f[cellidx] = f;
         1083   if (nijac == 0) {
         1084   dev_ns_f1[cellidx] = f1;
         1085   dev_ns_f2[cellidx] = f2;
         1086   }
         1087   }
         1088   if (x == nx-1) {
         1089   cellidx = idx(-1,y,z);
         1090   dev_ns_f[cellidx] = f;
         1091   if (nijac == 0) {
         1092   dev_ns_f1[cellidx] = f1;
         1093   dev_ns_f2[cellidx] = f2;
         1094   }
         1095   }
         1096 
         1097   if (y == 0) {
         1098   cellidx = idx(x,ny,z);
         1099   dev_ns_f[cellidx] = f;
         1100   if (nijac == 0) {
         1101   dev_ns_f1[cellidx] = f1;
         1102   dev_ns_f2[cellidx] = f2;
         1103   }
         1104   }
         1105   if (y == ny-1) {
         1106   cellidx = idx(x,-1,z);
         1107   dev_ns_f[cellidx] = f;
         1108   if (nijac == 0) {
         1109   dev_ns_f1[cellidx] = f1;
         1110   dev_ns_f2[cellidx] = f2;
         1111   }
         1112   }
         1113 
         1114   if (z == 0) {
         1115   cellidx = idx(x,y,nz);
         1116   dev_ns_f[cellidx] = f;
         1117   if (nijac == 0) {
         1118   dev_ns_f1[cellidx] = f1;
         1119   dev_ns_f2[cellidx] = f2;
         1120   }
         1121   }
         1122   if (z == nz-1) {
         1123   cellidx = idx(x,y,-1);
         1124   dev_ns_f[cellidx] = f;
         1125   if (nijac == 0) {
         1126   dev_ns_f1[cellidx] = f1;
         1127   dev_ns_f2[cellidx] = f2;
         1128   }
         1129   }
         1130   }
         1131   }
         1132 */
         1133 
         1134 // Find the porosity in each cell on the base of a sphere, centered at the cell
         1135 // center. 
         1136 __global__ void findPorositiesVelocitiesDiametersSpherical(
         1137     const unsigned int* __restrict__ dev_cellStart,
         1138     const unsigned int* __restrict__ dev_cellEnd,
         1139     const Float4* __restrict__ dev_x_sorted,
         1140     const Float4* __restrict__ dev_vel_sorted,
         1141     Float*  __restrict__ dev_ns_phi,
         1142     Float*  __restrict__ dev_ns_dphi,
         1143     Float3* __restrict__ dev_ns_vp_avg,
         1144     Float*  __restrict__ dev_ns_d_avg,
         1145     const unsigned int iteration,
         1146     const unsigned int np,
         1147     const Float c_phi)
         1148 {
         1149     // 3D thread index
         1150     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         1151     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         1152     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         1153     
         1154     // Grid dimensions
         1155     const unsigned int nx = devC_grid.num[0];
         1156     const unsigned int ny = devC_grid.num[1];
         1157     const unsigned int nz = devC_grid.num[2];
         1158 
         1159     // Cell dimensions
         1160     const Float dx = devC_grid.L[0]/nx;
         1161     const Float dy = devC_grid.L[1]/ny;
         1162     const Float dz = devC_grid.L[2]/nz;
         1163 
         1164     // Cell sphere radius
         1165     //const Float R = fmin(dx, fmin(dy,dz)) * 0.5; // diameter = cell width
         1166     const Float R = fmin(dx, fmin(dy,dz));       // diameter = 2*cell width
         1167     const Float cell_volume = 4.0/3.0*M_PI*R*R*R;
         1168 
         1169     Float void_volume = cell_volume;
         1170     Float4 xr;  // particle pos. and radius
         1171 
         1172     // check that we are not outside the fluid grid
         1173     if (x < nx && y < ny && z < nz) {
         1174 
         1175         if (np > 0) {
         1176 
         1177             // Cell sphere center position
         1178             const Float3 X = MAKE_FLOAT3(
         1179                 x*dx + 0.5*dx,
         1180                 y*dy + 0.5*dy,
         1181                 z*dz + 0.5*dz);
         1182 
         1183             Float d, r;
         1184             Float phi = 1.00;
         1185             Float4 v;
         1186             unsigned int n = 0;
         1187 
         1188             Float3 v_avg = MAKE_FLOAT3(0.0, 0.0, 0.0);
         1189             Float  d_avg = 0.0;
         1190 
         1191             // Read old porosity
         1192             __syncthreads();
         1193             Float phi_0 = dev_ns_phi[idx(x,y,z)];
         1194 
         1195             // The cell 3d index
         1196             const int3 gridPos = make_int3((int)x,(int)y,(int)z);
         1197 
         1198             // The neighbor cell 3d index
         1199             int3 targetCell;
         1200 
         1201             // The distance modifier for particles across periodic boundaries
         1202             Float3 dist, distmod;
         1203 
         1204             unsigned int cellID, startIdx, endIdx, i;
         1205 
         1206             // Iterate over 27 neighbor cells, R = cell width
         1207             /*for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
         1208               for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
         1209               for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/
         1210 
         1211             // Iterate over 27 neighbor cells, R = 2*cell width
         1212             for (int z_dim=-2; z_dim<3; ++z_dim) { // z-axis
         1213                 //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
         1214                 for (int y_dim=-2; y_dim<3; ++y_dim) { // y-axis
         1215                     for (int x_dim=-2; x_dim<3; ++x_dim) { // x-axis
         1216 
         1217                         // Index of neighbor cell this iteration is looking at
         1218                         targetCell = gridPos + make_int3(x_dim, y_dim, z_dim);
         1219 
         1220                         // Get distance modifier for interparticle
         1221                         // vector, if it crosses a periodic boundary
         1222                         distmod = MAKE_FLOAT3(0.0, 0.0, 0.0);
         1223                         if (findDistMod(&targetCell, &distmod) != -1) {
         1224 
         1225                             // Calculate linear cell ID
         1226                             cellID = targetCell.x
         1227                                 + targetCell.y * devC_grid.num[0]
         1228                                 + (devC_grid.num[0] * devC_grid.num[1])
         1229                                 * targetCell.z;
         1230 
         1231                             // Lowest particle index in cell
         1232                             __syncthreads();
         1233                             startIdx = dev_cellStart[cellID];
         1234 
         1235                             // Make sure cell is not empty
         1236                             if (startIdx != 0xffffffff) {
         1237 
         1238                                 // Highest particle index in cell
         1239                                 __syncthreads();
         1240                                 endIdx = dev_cellEnd[cellID];
         1241 
         1242                                 // Iterate over cell particles
         1243                                 for (i=startIdx; i<endIdx; ++i) {
         1244 
         1245                                     // Read particle position and radius
         1246                                     __syncthreads();
         1247                                     xr = dev_x_sorted[i];
         1248                                     v  = dev_vel_sorted[i];
         1249                                     r = xr.w;
         1250 
         1251                                     // Find center distance
         1252                                     dist = MAKE_FLOAT3(
         1253                                         X.x - xr.x, 
         1254                                         X.y - xr.y,
         1255                                         X.z - xr.z);
         1256                                     dist += distmod;
         1257                                     d = length(dist);
         1258 
         1259                                     // Lens shaped intersection
         1260                                     if ((R - r) < d && d < (R + r)) {
         1261                                         void_volume -=
         1262                                             1.0/(12.0*d) * (
         1263                                                 M_PI*(R + r - d)*(R + r - d)
         1264                                                 *(d*d + 2.0*d*r - 3.0*r*r
         1265                                                   + 2.0*d*R + 6.0*r*R
         1266                                                   - 3.0*R*R) );
         1267                                         v_avg += MAKE_FLOAT3(v.x, v.y, v.z);
         1268                                         d_avg += 2.0*r;
         1269                                         n++;
         1270                                     }
         1271 
         1272                                     // Particle fully contained in cell sphere
         1273                                     if (d <= R - r) {
         1274                                         void_volume -= 4.0/3.0*M_PI*r*r*r;
         1275                                         v_avg += MAKE_FLOAT3(v.x, v.y, v.z);
         1276                                         d_avg += 2.0*r;
         1277                                         n++;
         1278                                     }
         1279                                 }
         1280                             }
         1281                         }
         1282                     }
         1283                 }
         1284             }
         1285 
         1286             if (phi < 0.999) {
         1287                 v_avg /= n;
         1288                 d_avg /= n;
         1289             }
         1290 
         1291             // Make sure that the porosity is in the interval [0.0;1.0]
         1292             phi = fmin(1.00, fmax(0.00, void_volume/cell_volume));
         1293             //phi = void_volume/cell_volume;
         1294 
         1295             Float dphi = phi - phi_0;
         1296             if (iteration == 0)
         1297                 dphi = 0.0;
         1298 
         1299             // report values to stdout for debugging
         1300             //printf("%d,%d,%d\tphi = %f dphi = %f v_avg = %f,%f,%f d_avg = %f\n",
         1301             //       x,y,z, phi, dphi, v_avg.x, v_avg.y, v_avg.z, d_avg);
         1302 
         1303             // Save porosity, porosity change, average velocity and average diameter
         1304             __syncthreads();
         1305             //phi = 0.5; dphi = 0.0; // disable porosity effects
         1306             const unsigned int cellidx = idx(x,y,z);
         1307             dev_ns_phi[cellidx]  = phi*c_phi;
         1308             dev_ns_dphi[cellidx] = dphi*c_phi;
         1309             dev_ns_vp_avg[cellidx] = v_avg;
         1310             dev_ns_d_avg[cellidx]  = d_avg;
         1311 
         1312 #ifdef CHECK_FLUID_FINITE
         1313             (void)checkFiniteFloat("phi", x, y, z, phi);
         1314             (void)checkFiniteFloat("dphi", x, y, z, dphi);
         1315             (void)checkFiniteFloat3("v_avg", x, y, z, v_avg);
         1316             (void)checkFiniteFloat("d_avg", x, y, z, d_avg);
         1317 #endif
         1318         } else {
         1319 
         1320             __syncthreads();
         1321             const unsigned int cellidx = idx(x,y,z);
         1322 
         1323             //Float phi = 0.5;
         1324             //Float dphi = 0.0;
         1325             //if (iteration == 20 && x == nx/2 && y == ny/2 && z == nz/2) {
         1326             //phi = 0.4;
         1327             //dphi = 0.1;
         1328             //}
         1329             //dev_ns_phi[cellidx]  = phi;
         1330             //dev_ns_dphi[cellidx] = dphi;
         1331             dev_ns_phi[cellidx]  = 1.0;
         1332             dev_ns_dphi[cellidx] = 0.0;
         1333 
         1334             dev_ns_vp_avg[cellidx] = MAKE_FLOAT3(0.0, 0.0, 0.0);
         1335             dev_ns_d_avg[cellidx]  = 0.0;
         1336         }
         1337     }
         1338 }
         1339 
         1340 // Find the porosity in each cell on the base of a sphere, centered at the cell
         1341 // center. 
         1342 __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
         1343     const unsigned int* __restrict__ dev_cellStart,
         1344     const unsigned int* __restrict__ dev_cellEnd,
         1345     const Float4* __restrict__ dev_x_sorted,
         1346     const Float4* __restrict__ dev_vel_sorted,
         1347     Float*  __restrict__ dev_ns_phi,
         1348     Float*  __restrict__ dev_ns_dphi,
         1349     Float3* __restrict__ dev_ns_vp_avg,
         1350     Float*  __restrict__ dev_ns_d_avg,
         1351     const unsigned int iteration,
         1352     const unsigned int ndem,
         1353     const unsigned int np)
         1354 {
         1355     // 3D thread index
         1356     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         1357     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         1358     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         1359     
         1360     // Grid dimensions
         1361     const unsigned int nx = devC_grid.num[0];
         1362     const unsigned int ny = devC_grid.num[1];
         1363     const unsigned int nz = devC_grid.num[2];
         1364 
         1365     // Cell dimensions
         1366     const Float dx = devC_grid.L[0]/nx;
         1367     const Float dy = devC_grid.L[1]/ny;
         1368     const Float dz = devC_grid.L[2]/nz;
         1369 
         1370     // Cell sphere radius
         1371     const Float R = fmin(dx, fmin(dy,dz));  // diameter = 2*cell width
         1372 
         1373     Float4 xr;  // particle pos. and radius
         1374 
         1375     // check that we are not outside the fluid grid
         1376     if (x < nx && y < ny && z < nz) {
         1377 
         1378         if (np > 0) {
         1379 
         1380             // Cell sphere center position
         1381             const Float3 X = MAKE_FLOAT3(
         1382                 x*dx + 0.5*dx,
         1383                 y*dy + 0.5*dy,
         1384                 z*dz + 0.5*dz);
         1385 
         1386             Float d, r;
         1387             Float phi = 1.00;
         1388             Float4 v;
         1389             unsigned int n = 0;
         1390 
         1391             Float3 v_avg = MAKE_FLOAT3(0.0, 0.0, 0.0);
         1392             Float  d_avg = 0.0;
         1393 
         1394             // Read old porosity
         1395             __syncthreads();
         1396             Float phi_0 = dev_ns_phi[idx(x,y,z)];
         1397 
         1398             // The cell 3d index
         1399             const int3 gridPos = make_int3((int)x,(int)y,(int)z);
         1400 
         1401             // The neighbor cell 3d index
         1402             int3 targetCell;
         1403 
         1404             // The distance modifier for particles across periodic boundaries
         1405             Float3 distmod;
         1406 
         1407             unsigned int cellID, startIdx, endIdx, i;
         1408 
         1409             // Diagonal strain rate tensor components
         1410             Float3 dot_epsilon_ii = MAKE_FLOAT3(0.0, 0.0, 0.0);
         1411 
         1412             // Vector pointing from cell center to particle center
         1413             Float3 x_p;
         1414 
         1415             // Normal vector pointing from cell center towards particle center
         1416             Float3 n_p;
         1417 
         1418             // Normalized sphere-particle distance
         1419             Float q;
         1420 
         1421             // Kernel function derivative value
         1422             Float dw_q;
         1423 
         1424             // Iterate over 27 neighbor cells, R = 2*cell width
         1425             for (int z_dim=-2; z_dim<3; ++z_dim) { // z-axis
         1426                 for (int y_dim=-2; y_dim<3; ++y_dim) { // y-axis
         1427                     for (int x_dim=-2; x_dim<3; ++x_dim) { // x-axis
         1428 
         1429                         // Index of neighbor cell this iteration is looking at
         1430                         targetCell = gridPos + make_int3(x_dim, y_dim, z_dim);
         1431 
         1432                         // Get distance modifier for interparticle
         1433                         // vector, if it crosses a periodic boundary
         1434                         distmod = MAKE_FLOAT3(0.0, 0.0, 0.0);
         1435                         if (findDistMod(&targetCell, &distmod) != -1) {
         1436 
         1437                             // Calculate linear cell ID
         1438                             cellID = targetCell.x
         1439                                 + targetCell.y * devC_grid.num[0]
         1440                                 + (devC_grid.num[0] * devC_grid.num[1])
         1441                                 * targetCell.z; 
         1442 
         1443                             // Lowest particle index in cell
         1444                             startIdx = dev_cellStart[cellID];
         1445 
         1446                             // Make sure cell is not empty
         1447                             if (startIdx != 0xffffffff) {
         1448 
         1449                                 // Highest particle index in cell
         1450                                 endIdx = dev_cellEnd[cellID];
         1451 
         1452                                 // Iterate over cell particles
         1453                                 for (i=startIdx; i<endIdx; ++i) {
         1454 
         1455                                     // Read particle position and radius
         1456                                     __syncthreads();
         1457                                     xr = dev_x_sorted[i];
         1458                                     v  = dev_vel_sorted[i];
         1459                                     r = xr.w;
         1460 
         1461                                     // Find center distance and normal vector
         1462                                     x_p = MAKE_FLOAT3(
         1463                                         xr.x - X.x,
         1464                                         xr.y - X.y,
         1465                                         xr.z - X.z);
         1466                                     d = length(x_p);
         1467                                     n_p = x_p/d;
         1468                                     q = d/R;
         1469 
         1470 
         1471                                     dw_q = 0.0;
         1472                                     if (0.0 < q && q < 1.0) {
         1473                                         // kernel for 2d disc approximation
         1474                                         //dw_q = -1.0;
         1475 
         1476                                         // kernel for 3d sphere approximation
         1477                                         dw_q = -1.5*pow(-q + 1.0, 0.5)
         1478                                             *pow(q + 1.0, 0.5)
         1479                                             + 0.5*pow(-q + 1.0, 1.5)
         1480                                             *pow(q + 1.0, -0.5);
         1481                                     }
         1482 
         1483                                     v_avg += MAKE_FLOAT3(v.x, v.y, v.z);
         1484                                     d_avg += 2.0*r;
         1485                                     dot_epsilon_ii +=
         1486                                         dw_q*MAKE_FLOAT3(v.x, v.y, v.z)*n_p;
         1487                                     n++;
         1488 
         1489                                 }
         1490                             }
         1491                         }
         1492                     }
         1493                 }
         1494             }
         1495 
         1496             dot_epsilon_ii /= R;
         1497             const Float dot_epsilon_kk =
         1498                 dot_epsilon_ii.x + dot_epsilon_ii.y + dot_epsilon_ii.z;
         1499 
         1500             const Float dphi =
         1501                 (1.0 - fmin(phi_0,0.99))*dot_epsilon_kk*ndem*devC_dt;
         1502             phi = phi_0 + dphi/(ndem*devC_dt);
         1503 
         1504             //if (dot_epsilon_kk != 0.0)
         1505             //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n",
         1506             //x,y,z, dot_epsilon_kk, dphi, phi);
         1507 
         1508             // Make sure that the porosity is in the interval [0.0;1.0]
         1509             phi = fmin(1.00, fmax(0.00, phi));
         1510 
         1511             if (phi < 0.999) {
         1512                 v_avg /= n;
         1513                 d_avg /= n;
         1514             }
         1515 
         1516             // report values to stdout for debugging
         1517             //printf("%d,%d,%d\tphi = %f dphi = %f v_avg = %f,%f,%f d_avg = %f\n",
         1518             //       x,y,z, phi, dphi, v_avg.x, v_avg.y, v_avg.z, d_avg);
         1519 
         1520             // Save porosity, porosity change, average velocity and average diameter
         1521             __syncthreads();
         1522             const unsigned int cellidx = idx(x,y,z);
         1523             //phi = 0.5; dphi = 0.0; // disable porosity effects const unsigned int cellidx = idx(x,y,z);
         1524             dev_ns_phi[cellidx]  = phi;
         1525             dev_ns_dphi[cellidx] = dphi;
         1526             dev_ns_vp_avg[cellidx] = v_avg;
         1527             dev_ns_d_avg[cellidx]  = d_avg;
         1528 
         1529 #ifdef CHECK_FLUID_FINITE
         1530             (void)checkFiniteFloat("phi", x, y, z, phi);
         1531             (void)checkFiniteFloat("dphi", x, y, z, dphi);
         1532             (void)checkFiniteFloat3("v_avg", x, y, z, v_avg);
         1533             (void)checkFiniteFloat("d_avg", x, y, z, d_avg);
         1534 #endif
         1535         } else {
         1536             // np=0: there are no particles
         1537 
         1538             __syncthreads();
         1539             const unsigned int cellidx = idx(x,y,z);
         1540 
         1541             dev_ns_dphi[cellidx] = 0.0;
         1542 
         1543             dev_ns_vp_avg[cellidx] = MAKE_FLOAT3(0.0, 0.0, 0.0);
         1544             dev_ns_d_avg[cellidx]  = 0.0;
         1545         }
         1546     }
         1547 }
         1548 
         1549 // Modulate the hydraulic pressure at the upper boundary
         1550 __global__ void setUpperPressureNS(
         1551     Float* __restrict__ dev_ns_p,
         1552     Float* __restrict__ dev_ns_epsilon,
         1553     Float* __restrict__ dev_ns_epsilon_new,
         1554     const Float  beta,
         1555     const Float new_pressure)
         1556 {
         1557     // 3D thread index
         1558     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         1559     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         1560     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         1561     
         1562     // check that the thread is located at the top boundary
         1563     if (x < devC_grid.num[0] &&
         1564         y < devC_grid.num[1] &&
         1565         z == devC_grid.num[2]-1) {
         1566 
         1567         const unsigned int cellidx = idx(x,y,z);
         1568 
         1569         // Read the current pressure
         1570         const Float pressure = dev_ns_p[cellidx];
         1571 
         1572         // Determine the new epsilon boundary condition
         1573         const Float epsilon = new_pressure - beta*pressure;
         1574 
         1575         // Write the new pressure and epsilon values to the top boundary cells
         1576         __syncthreads();
         1577         dev_ns_epsilon[cellidx] = epsilon;
         1578         dev_ns_epsilon_new[cellidx] = epsilon;
         1579         dev_ns_p[cellidx] = new_pressure;
         1580 
         1581 #ifdef CHECK_FLUID_FINITE
         1582         (void)checkFiniteFloat("epsilon", x, y, z, epsilon);
         1583         (void)checkFiniteFloat("new_pressure", x, y, z, new_pressure);
         1584 #endif
         1585     }
         1586 }
         1587 
         1588 // Find the gradient in a cell in a homogeneous, cubic 3D scalar field using
         1589 // finite central differences
         1590 __device__ Float3 gradient(
         1591     const Float* __restrict__ dev_scalarfield,
         1592     const unsigned int x,
         1593     const unsigned int y,
         1594     const unsigned int z,
         1595     const Float dx,
         1596     const Float dy,
         1597     const Float dz)
         1598 {
         1599     // Read 6 neighbor cells
         1600     __syncthreads();
         1601     //const Float p  = dev_scalarfield[idx(x,y,z)];
         1602     const Float xn = dev_scalarfield[idx(x-1,y,z)];
         1603     const Float xp = dev_scalarfield[idx(x+1,y,z)];
         1604     const Float yn = dev_scalarfield[idx(x,y-1,z)];
         1605     const Float yp = dev_scalarfield[idx(x,y+1,z)];
         1606     const Float zn = dev_scalarfield[idx(x,y,z-1)];
         1607     const Float zp = dev_scalarfield[idx(x,y,z+1)];
         1608 
         1609     //__syncthreads();
         1610     //if (p != 0.0)
         1611     //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p);
         1612 
         1613     // Calculate central-difference gradients
         1614     return MAKE_FLOAT3(
         1615         (xp - xn)/(2.0*dx),
         1616         (yp - yn)/(2.0*dy),
         1617         (zp - zn)/(2.0*dz));
         1618 }
         1619 
         1620 // Find the divergence in a cell in a homogeneous, cubic, 3D vector field
         1621 __device__ Float divergence(
         1622     const Float* __restrict__ dev_vectorfield_x,
         1623     const Float* __restrict__ dev_vectorfield_y,
         1624     const Float* __restrict__ dev_vectorfield_z,
         1625     const unsigned int x,
         1626     const unsigned int y,
         1627     const unsigned int z,
         1628     const Float dx,
         1629     const Float dy,
         1630     const Float dz)
         1631 {
         1632     // Read 6 cell-face values
         1633     __syncthreads();
         1634     const Float xn = dev_vectorfield_x[vidx(x,y,z)];
         1635     const Float xp = dev_vectorfield_x[vidx(x+1,y,z)];
         1636     const Float yn = dev_vectorfield_y[vidx(x,y,z)];
         1637     const Float yp = dev_vectorfield_y[vidx(x,y+1,z)];
         1638     const Float zn = dev_vectorfield_z[vidx(x,y,z)];
         1639     const Float zp = dev_vectorfield_z[vidx(x,y,z+1)];
         1640 
         1641     // Calculate the central difference gradrients and the divergence
         1642     return
         1643         (xp - xn)/dx +
         1644         (yp - yn)/dy +
         1645         (zp - zn)/dz;
         1646 }
         1647 
         1648 // Find the divergence of a tensor field
         1649 __device__ Float3 divergence_tensor(
         1650     const Float* __restrict__ dev_tensorfield,
         1651     const unsigned int x,
         1652     const unsigned int y,
         1653     const unsigned int z,
         1654     const Float dx,
         1655     const Float dy,
         1656     const Float dz)
         1657 {
         1658     __syncthreads();
         1659 
         1660     // Read the tensor in the 6 neighbor cells
         1661     const Float t_xx_xp = dev_tensorfield[idx(x+1,y,z)*6];
         1662     const Float t_xy_xp = dev_tensorfield[idx(x+1,y,z)*6+1];
         1663     const Float t_xz_xp = dev_tensorfield[idx(x+1,y,z)*6+2];
         1664     const Float t_yy_xp = dev_tensorfield[idx(x+1,y,z)*6+3];
         1665     const Float t_yz_xp = dev_tensorfield[idx(x+1,y,z)*6+4];
         1666     const Float t_zz_xp = dev_tensorfield[idx(x+1,y,z)*6+5];
         1667 
         1668     const Float t_xx_xn = dev_tensorfield[idx(x-1,y,z)*6];
         1669     const Float t_xy_xn = dev_tensorfield[idx(x-1,y,z)*6+1];
         1670     const Float t_xz_xn = dev_tensorfield[idx(x-1,y,z)*6+2];
         1671     const Float t_yy_xn = dev_tensorfield[idx(x-1,y,z)*6+3];
         1672     const Float t_yz_xn = dev_tensorfield[idx(x-1,y,z)*6+4];
         1673     const Float t_zz_xn = dev_tensorfield[idx(x-1,y,z)*6+5];
         1674 
         1675     const Float t_xx_yp = dev_tensorfield[idx(x,y+1,z)*6];
         1676     const Float t_xy_yp = dev_tensorfield[idx(x,y+1,z)*6+1];
         1677     const Float t_xz_yp = dev_tensorfield[idx(x,y+1,z)*6+2];
         1678     const Float t_yy_yp = dev_tensorfield[idx(x,y+1,z)*6+3];
         1679     const Float t_yz_yp = dev_tensorfield[idx(x,y+1,z)*6+4];
         1680     const Float t_zz_yp = dev_tensorfield[idx(x,y+1,z)*6+5];
         1681 
         1682     const Float t_xx_yn = dev_tensorfield[idx(x,y-1,z)*6];
         1683     const Float t_xy_yn = dev_tensorfield[idx(x,y-1,z)*6+1];
         1684     const Float t_xz_yn = dev_tensorfield[idx(x,y-1,z)*6+2];
         1685     const Float t_yy_yn = dev_tensorfield[idx(x,y-1,z)*6+3];
         1686     const Float t_yz_yn = dev_tensorfield[idx(x,y-1,z)*6+4];
         1687     const Float t_zz_yn = dev_tensorfield[idx(x,y-1,z)*6+5];
         1688 
         1689     const Float t_xx_zp = dev_tensorfield[idx(x,y,z+1)*6];
         1690     const Float t_xy_zp = dev_tensorfield[idx(x,y,z+1)*6+1];
         1691     const Float t_xz_zp = dev_tensorfield[idx(x,y,z+1)*6+2];
         1692     const Float t_yy_zp = dev_tensorfield[idx(x,y,z+1)*6+3];
         1693     const Float t_yz_zp = dev_tensorfield[idx(x,y,z+1)*6+4];
         1694     const Float t_zz_zp = dev_tensorfield[idx(x,y,z+1)*6+5];
         1695 
         1696     const Float t_xx_zn = dev_tensorfield[idx(x,y,z-1)*6];
         1697     const Float t_xy_zn = dev_tensorfield[idx(x,y,z-1)*6+1];
         1698     const Float t_xz_zn = dev_tensorfield[idx(x,y,z-1)*6+2];
         1699     const Float t_yy_zn = dev_tensorfield[idx(x,y,z-1)*6+3];
         1700     const Float t_yz_zn = dev_tensorfield[idx(x,y,z-1)*6+4];
         1701     const Float t_zz_zn = dev_tensorfield[idx(x,y,z-1)*6+5];
         1702 
         1703     // Calculate div(phi*tau)
         1704     const Float3 div_tensor = MAKE_FLOAT3(
         1705         // x
         1706         (t_xx_xp - t_xx_xn)/dx +
         1707         (t_xy_yp - t_xy_yn)/dy +
         1708         (t_xz_zp - t_xz_zn)/dz,
         1709         // y
         1710         (t_xy_xp - t_xy_xn)/dx +
         1711         (t_yy_yp - t_yy_yn)/dy +
         1712         (t_yz_zp - t_yz_zn)/dz,
         1713         // z
         1714         (t_xz_xp - t_xz_xn)/dx +
         1715         (t_yz_yp - t_yz_yn)/dy +
         1716         (t_zz_zp - t_zz_zn)/dz);
         1717 
         1718 #ifdef CHECK_FLUID_FINITE
         1719     (void)checkFiniteFloat3("div_tensor", x, y, z, div_tensor);
         1720 #endif
         1721     return div_tensor;
         1722 }
         1723 
         1724 
         1725 // Find the spatial gradient in e.g. pressures per cell
         1726 // using first order central differences
         1727 __global__ void findNSgradientsDev(
         1728     const Float* __restrict__ dev_scalarfield,     // in
         1729     Float3* __restrict__ dev_vectorfield)    // out
         1730 {
         1731     // 3D thread index
         1732     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         1733     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         1734     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         1735 
         1736     // Grid dimensions
         1737     const unsigned int nx = devC_grid.num[0];
         1738     const unsigned int ny = devC_grid.num[1];
         1739     const unsigned int nz = devC_grid.num[2];
         1740 
         1741     // Grid sizes
         1742     const Float dx = devC_grid.L[0]/nx;
         1743     const Float dy = devC_grid.L[1]/ny;
         1744     const Float dz = devC_grid.L[2]/nz;
         1745 
         1746     // 1D thread index
         1747     const unsigned int cellidx = idx(x,y,z);
         1748 
         1749     // Check that we are not outside the fluid grid
         1750     if (x < nx && y < ny && z < nz) {
         1751 
         1752         const Float3 grad = gradient(dev_scalarfield, x, y, z, dx, dy, dz);
         1753 
         1754         // Write gradient
         1755         __syncthreads();
         1756         dev_vectorfield[cellidx] = grad;
         1757 
         1758 #ifdef CHECK_FLUID_FINITE
         1759         (void)checkFiniteFloat3("grad", x, y, z, grad);
         1760 #endif
         1761     }
         1762 }
         1763 
         1764 // Find the outer product of v v
         1765 __global__ void findvvOuterProdNS(
         1766     const Float3* __restrict__ dev_ns_v,       // in
         1767     Float*  __restrict__ dev_ns_v_prod)  // out
         1768 {
         1769     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         1770     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         1771     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         1772 
         1773     // 1D thread index
         1774     const unsigned int cellidx6 = idx(x,y,z)*6;
         1775 
         1776     // Check that we are not outside the fluid grid
         1777     if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
         1778 
         1779         __syncthreads();
         1780         const Float3 v = dev_ns_v[idx(x,y,z)];
         1781 
         1782         // The outer product (v v) looks like:
         1783         // [[ v_x^2    v_x*v_y  v_x*v_z ]
         1784         //  [ v_y*v_x  v_y^2    v_y*v_z ]
         1785         //  [ v_z*v_x  v_z*v_y  v_z^2   ]]
         1786 
         1787         // The tensor is symmetrical: value i,j = j,i.
         1788         // Only the upper triangle is saved, with the cells given a linear index
         1789         // enumerated as:
         1790         // [[ 0 1 2 ]
         1791         //  [   3 4 ]
         1792         //  [     5 ]]
         1793 
         1794         __syncthreads();
         1795         dev_ns_v_prod[cellidx6]   = v.x*v.x;
         1796         dev_ns_v_prod[cellidx6+1] = v.x*v.y;
         1797         dev_ns_v_prod[cellidx6+2] = v.x*v.z;
         1798         dev_ns_v_prod[cellidx6+3] = v.y*v.y;
         1799         dev_ns_v_prod[cellidx6+4] = v.y*v.z;
         1800         dev_ns_v_prod[cellidx6+5] = v.z*v.z;
         1801 
         1802 #ifdef CHECK_FLUID_FINITE
         1803         (void)checkFiniteFloat("v_prod[0]", x, y, z, v.x*v.x);
         1804         (void)checkFiniteFloat("v_prod[1]", x, y, z, v.x*v.y);
         1805         (void)checkFiniteFloat("v_prod[2]", x, y, z, v.x*v.z);
         1806         (void)checkFiniteFloat("v_prod[3]", x, y, z, v.y*v.y);
         1807         (void)checkFiniteFloat("v_prod[4]", x, y, z, v.y*v.z);
         1808         (void)checkFiniteFloat("v_prod[5]", x, y, z, v.z*v.z);
         1809 #endif
         1810     }
         1811 }
         1812 
         1813 
         1814 // Find the fluid stress tensor. It is symmetrical, and can thus be saved in 6
         1815 // values in 3D.
         1816 __global__ void findNSstressTensor(
         1817     const Float3* __restrict__ dev_ns_v,       // in
         1818     const Float mu,                            // in
         1819     Float* __restrict__ dev_ns_tau)     // out
         1820 {
         1821     // 3D thread index
         1822     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         1823     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         1824     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         1825 
         1826     // Grid dimensions
         1827     const unsigned int nx = devC_grid.num[0];
         1828     const unsigned int ny = devC_grid.num[1];
         1829     const unsigned int nz = devC_grid.num[2];
         1830 
         1831     // Cell sizes
         1832     const Float dx = devC_grid.L[0]/nx;
         1833     const Float dy = devC_grid.L[1]/ny;
         1834     const Float dz = devC_grid.L[2]/nz;
         1835 
         1836     // 1D thread index
         1837     const unsigned int cellidx6 = idx(x,y,z)*6;
         1838 
         1839     // Check that we are not outside the fluid grid
         1840     if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
         1841 
         1842         // The fluid stress tensor (tau) looks like
         1843         // [[ tau_xx  tau_xy  tau_xz ]
         1844         //  [ tau_yx  tau_xy  tau_yz ]
         1845         //  [ tau_zx  tau_zy  tau_zz ]]
         1846 
         1847         // The tensor is symmetrical: value i,j = j,i.
         1848         // Only the upper triangle is saved, with the cells given a linear index
         1849         // enumerated as:
         1850         // [[ 0 1 2 ]
         1851         //  [   3 4 ]
         1852         //  [     5 ]]
         1853 
         1854         // Read neighbor values for central differences
         1855         __syncthreads();
         1856         const Float3 xp = dev_ns_v[idx(x+1,y,z)];
         1857         const Float3 xn = dev_ns_v[idx(x-1,y,z)];
         1858         const Float3 yp = dev_ns_v[idx(x,y+1,z)];
         1859         const Float3 yn = dev_ns_v[idx(x,y-1,z)];
         1860         const Float3 zp = dev_ns_v[idx(x,y,z+1)];
         1861         const Float3 zn = dev_ns_v[idx(x,y,z-1)];
         1862 
         1863         // The diagonal stress tensor components
         1864         const Float tau_xx = 2.0*mu*(xp.x - xn.x)/(2.0*dx);
         1865         const Float tau_yy = 2.0*mu*(yp.y - yn.y)/(2.0*dy);
         1866         const Float tau_zz = 2.0*mu*(zp.z - zn.z)/(2.0*dz);
         1867 
         1868         // The off-diagonal stress tensor components
         1869         const Float tau_xy =
         1870             mu*((yp.x - yn.x)/(2.0*dy) + (xp.y - xn.y)/(2.0*dx));
         1871         const Float tau_xz =
         1872             mu*((zp.x - zn.x)/(2.0*dz) + (xp.z - xn.z)/(2.0*dx));
         1873         const Float tau_yz =
         1874             mu*((zp.y - zn.y)/(2.0*dz) + (yp.z - yn.z)/(2.0*dy));
         1875 
         1876         /*
         1877           if (x == 0 && y == 0 && z == 0)
         1878           printf("mu = %f\n", mu);
         1879           if (tau_xz > 1.0e-6)
         1880           printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx);
         1881           if (tau_yz > 1.0e-6)
         1882           printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy);
         1883           if (tau_zz > 1.0e-6)
         1884           printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz);
         1885         */
         1886 
         1887         // Store values in global memory
         1888         __syncthreads();
         1889         dev_ns_tau[cellidx6]   = tau_xx;
         1890         dev_ns_tau[cellidx6+1] = tau_xy;
         1891         dev_ns_tau[cellidx6+2] = tau_xz;
         1892         dev_ns_tau[cellidx6+3] = tau_yy;
         1893         dev_ns_tau[cellidx6+4] = tau_yz;
         1894         dev_ns_tau[cellidx6+5] = tau_zz;
         1895 
         1896 #ifdef CHECK_FLUID_FINITE
         1897         (void)checkFiniteFloat("tau_xx", x, y, z, tau_xx);
         1898         (void)checkFiniteFloat("tau_xy", x, y, z, tau_xy);
         1899         (void)checkFiniteFloat("tau_xz", x, y, z, tau_xz);
         1900         (void)checkFiniteFloat("tau_yy", x, y, z, tau_yy);
         1901         (void)checkFiniteFloat("tau_yz", x, y, z, tau_yz);
         1902         (void)checkFiniteFloat("tau_zz", x, y, z, tau_zz);
         1903 #endif
         1904     }
         1905 }
         1906 
         1907 
         1908 // Find the divergence of phi*v*v
         1909 __global__ void findNSdivphiviv(
         1910     const Float*  __restrict__ dev_ns_phi,          // in
         1911     const Float3* __restrict__ dev_ns_v,            // in
         1912     Float3* __restrict__ dev_ns_div_phi_vi_v) // out
         1913 {
         1914     // 3D thread index
         1915     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         1916     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         1917     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         1918 
         1919     // Grid dimensions
         1920     const unsigned int nx = devC_grid.num[0];
         1921     const unsigned int ny = devC_grid.num[1];
         1922     const unsigned int nz = devC_grid.num[2];
         1923 
         1924     // Cell sizes
         1925     const Float dx = devC_grid.L[0]/nx;
         1926     const Float dy = devC_grid.L[1]/ny;
         1927     const Float dz = devC_grid.L[2]/nz;
         1928 
         1929     // 1D thread index
         1930     const unsigned int cellidx = idx(x,y,z);
         1931 
         1932     // Check that we are not outside the fluid grid
         1933     if (x < nx && y < ny && z < nz) {
         1934 
         1935         // Read porosity and velocity in the 6 neighbor cells
         1936         __syncthreads();
         1937         const Float  phi_xn = dev_ns_phi[idx(x-1,y,z)];
         1938         const Float  phi    = dev_ns_phi[idx(x,y,z)];
         1939         const Float  phi_xp = dev_ns_phi[idx(x+1,y,z)];
         1940         const Float  phi_yn = dev_ns_phi[idx(x,y-1,z)];
         1941         const Float  phi_yp = dev_ns_phi[idx(x,y+1,z)];
         1942         const Float  phi_zn = dev_ns_phi[idx(x,y,z-1)];
         1943         const Float  phi_zp = dev_ns_phi[idx(x,y,z+1)];
         1944 
         1945         const Float3 v_xn = dev_ns_v[idx(x-1,y,z)];
         1946         const Float3 v    = dev_ns_v[idx(x,y,z)];
         1947         const Float3 v_xp = dev_ns_v[idx(x+1,y,z)];
         1948         const Float3 v_yn = dev_ns_v[idx(x,y-1,z)];
         1949         const Float3 v_yp = dev_ns_v[idx(x,y+1,z)];
         1950         const Float3 v_zn = dev_ns_v[idx(x,y,z-1)];
         1951         const Float3 v_zp = dev_ns_v[idx(x,y,z+1)];
         1952 
         1953         // Calculate upwind coefficients
         1954         //*
         1955         const Float3 a = MAKE_FLOAT3(
         1956             copysign(1.0, v.x),
         1957             copysign(1.0, v.y),
         1958             copysign(1.0, v.z));
         1959 
         1960         // Calculate the divergence based on the upwind differences (Griebel et
         1961         // al. 1998, eq. 3.9)
         1962         const Float3 div_uw = MAKE_FLOAT3(
         1963             // x
         1964             ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) +
         1965              (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) +
         1966 
         1967             ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) +
         1968              (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) +
         1969 
         1970             ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) +
         1971              (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz),
         1972 
         1973             // y
         1974             ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) +
         1975              (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) +
         1976 
         1977             ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) +
         1978              (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) +
         1979 
         1980             ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) +
         1981              (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz),
         1982 
         1983             // z
         1984             ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) +
         1985              (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) +
         1986 
         1987             ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) +
         1988              (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) +
         1989 
         1990             ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) +
         1991              (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz));
         1992 
         1993 
         1994         // Calculate the divergence based on the central-difference gradients
         1995         const Float3 div_cd = MAKE_FLOAT3(
         1996             // x
         1997             (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
         1998             (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
         1999             (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
         2000             // y
         2001             (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
         2002             (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
         2003             (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
         2004             // z
         2005             (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
         2006             (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
         2007             (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
         2008 
         2009         // Weighting parameter
         2010         const Float tau = 0.5;
         2011 
         2012         // Determine the weighted average of both discretizations
         2013         const Float3 div_phi_vi_v = tau*div_uw + (1.0 - tau)*div_cd;
         2014         //*/
         2015 
         2016         /*
         2017         // Calculate the divergence: div(phi*v_i*v)
         2018         const Float3 div_phi_vi_v = MAKE_FLOAT3(
         2019         // x
         2020         (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
         2021         (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
         2022         (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
         2023         // y
         2024         (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
         2025         (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
         2026         (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
         2027         // z
         2028         (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
         2029         (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
         2030         (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
         2031         // */
         2032 
         2033         // Write divergence
         2034         __syncthreads();
         2035         dev_ns_div_phi_vi_v[cellidx] = div_phi_vi_v;
         2036 
         2037         //printf("div(phi*v*v) [%d,%d,%d] = %f, %f, %f\n", x,y,z,
         2038         //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z);
         2039 
         2040 #ifdef CHECK_FLUID_FINITE
         2041         (void)checkFiniteFloat3("div_phi_vi_v", x, y, z, div_phi_vi_v);
         2042 #endif
         2043     }
         2044 }
         2045 
         2046 __global__ void findNSdivtau(
         2047     const Float* __restrict__ dev_ns_tau,      // in
         2048     Float3* __restrict__ dev_ns_div_tau)  // out
         2049 {
         2050     // 3D thread index
         2051     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2052     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2053     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2054 
         2055     // Grid dimensions
         2056     const unsigned int nx = devC_grid.num[0];
         2057     const unsigned int ny = devC_grid.num[1];
         2058     const unsigned int nz = devC_grid.num[2];
         2059 
         2060     // Cell sizes
         2061     const Float dx = devC_grid.L[0]/nx;
         2062     const Float dy = devC_grid.L[1]/ny;
         2063     const Float dz = devC_grid.L[2]/nz;
         2064 
         2065     // 1D thread index
         2066     const unsigned int cellidx = idx(x,y,z);
         2067 
         2068     // Check that we are not outside the fluid grid
         2069     if (x < nx && y < ny && z < nz) {
         2070         
         2071         __syncthreads();
         2072         const Float3 div_tau =
         2073             divergence_tensor(dev_ns_tau, x, y, z, dx, dy, dz);
         2074 
         2075         __syncthreads();
         2076         dev_ns_div_tau[cellidx] = div_tau;
         2077     }
         2078 }
         2079 
         2080 
         2081 // Find the divergence of phi*tau
         2082 __global__ void findNSdivphitau(
         2083     const Float* __restrict__ dev_ns_phi,          // in
         2084     const Float* __restrict__ dev_ns_tau,          // in
         2085     Float3* __restrict__ dev_ns_div_phi_tau)  // out
         2086 {
         2087     // 3D thread index
         2088     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2089     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2090     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2091 
         2092     // Grid dimensions
         2093     const unsigned int nx = devC_grid.num[0];
         2094     const unsigned int ny = devC_grid.num[1];
         2095     const unsigned int nz = devC_grid.num[2];
         2096 
         2097     // Cell sizes
         2098     const Float dx = devC_grid.L[0]/nx;
         2099     const Float dy = devC_grid.L[1]/ny;
         2100     const Float dz = devC_grid.L[2]/nz;
         2101 
         2102     // 1D thread index
         2103     const unsigned int cellidx = idx(x,y,z);
         2104 
         2105     // Check that we are not outside the fluid grid
         2106     if (x < nx && y < ny && z < nz) {
         2107 
         2108         // Read the porosity in the 6 neighbor cells
         2109         __syncthreads();
         2110         const Float phi_xn = dev_ns_phi[idx(x-1,y,z)];
         2111         const Float phi_xp = dev_ns_phi[idx(x+1,y,z)];
         2112         const Float phi_yn = dev_ns_phi[idx(x,y-1,z)];
         2113         const Float phi_yp = dev_ns_phi[idx(x,y+1,z)];
         2114         const Float phi_zn = dev_ns_phi[idx(x,y,z-1)];
         2115         const Float phi_zp = dev_ns_phi[idx(x,y,z+1)];
         2116 
         2117         // Read the stress tensor in the 6 neighbor cells
         2118         const Float tau_xx_xp = dev_ns_tau[idx(x+1,y,z)*6];
         2119         const Float tau_xy_xp = dev_ns_tau[idx(x+1,y,z)*6+1];
         2120         const Float tau_xz_xp = dev_ns_tau[idx(x+1,y,z)*6+2];
         2121         const Float tau_yy_xp = dev_ns_tau[idx(x+1,y,z)*6+3];
         2122         const Float tau_yz_xp = dev_ns_tau[idx(x+1,y,z)*6+4];
         2123         const Float tau_zz_xp = dev_ns_tau[idx(x+1,y,z)*6+5];
         2124 
         2125         const Float tau_xx_xn = dev_ns_tau[idx(x-1,y,z)*6];
         2126         const Float tau_xy_xn = dev_ns_tau[idx(x-1,y,z)*6+1];
         2127         const Float tau_xz_xn = dev_ns_tau[idx(x-1,y,z)*6+2];
         2128         const Float tau_yy_xn = dev_ns_tau[idx(x-1,y,z)*6+3];
         2129         const Float tau_yz_xn = dev_ns_tau[idx(x-1,y,z)*6+4];
         2130         const Float tau_zz_xn = dev_ns_tau[idx(x-1,y,z)*6+5];
         2131 
         2132         const Float tau_xx_yp = dev_ns_tau[idx(x,y+1,z)*6];
         2133         const Float tau_xy_yp = dev_ns_tau[idx(x,y+1,z)*6+1];
         2134         const Float tau_xz_yp = dev_ns_tau[idx(x,y+1,z)*6+2];
         2135         const Float tau_yy_yp = dev_ns_tau[idx(x,y+1,z)*6+3];
         2136         const Float tau_yz_yp = dev_ns_tau[idx(x,y+1,z)*6+4];
         2137         const Float tau_zz_yp = dev_ns_tau[idx(x,y+1,z)*6+5];
         2138 
         2139         const Float tau_xx_yn = dev_ns_tau[idx(x,y-1,z)*6];
         2140         const Float tau_xy_yn = dev_ns_tau[idx(x,y-1,z)*6+1];
         2141         const Float tau_xz_yn = dev_ns_tau[idx(x,y-1,z)*6+2];
         2142         const Float tau_yy_yn = dev_ns_tau[idx(x,y-1,z)*6+3];
         2143         const Float tau_yz_yn = dev_ns_tau[idx(x,y-1,z)*6+4];
         2144         const Float tau_zz_yn = dev_ns_tau[idx(x,y-1,z)*6+5];
         2145 
         2146         const Float tau_xx_zp = dev_ns_tau[idx(x,y,z+1)*6];
         2147         const Float tau_xy_zp = dev_ns_tau[idx(x,y,z+1)*6+1];
         2148         const Float tau_xz_zp = dev_ns_tau[idx(x,y,z+1)*6+2];
         2149         const Float tau_yy_zp = dev_ns_tau[idx(x,y,z+1)*6+3];
         2150         const Float tau_yz_zp = dev_ns_tau[idx(x,y,z+1)*6+4];
         2151         const Float tau_zz_zp = dev_ns_tau[idx(x,y,z+1)*6+5];
         2152 
         2153         const Float tau_xx_zn = dev_ns_tau[idx(x,y,z-1)*6];
         2154         const Float tau_xy_zn = dev_ns_tau[idx(x,y,z-1)*6+1];
         2155         const Float tau_xz_zn = dev_ns_tau[idx(x,y,z-1)*6+2];
         2156         const Float tau_yy_zn = dev_ns_tau[idx(x,y,z-1)*6+3];
         2157         const Float tau_yz_zn = dev_ns_tau[idx(x,y,z-1)*6+4];
         2158         const Float tau_zz_zn = dev_ns_tau[idx(x,y,z-1)*6+5];
         2159 
         2160         // Calculate div(phi*tau)
         2161         const Float3 div_phi_tau = MAKE_FLOAT3(
         2162             // x
         2163             (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx +
         2164             (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy +
         2165             (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz,
         2166             // y
         2167             (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx +
         2168             (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy +
         2169             (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz,
         2170             // z
         2171             (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx +
         2172             (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy +
         2173             (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz);
         2174 
         2175         // Write divergence
         2176         __syncthreads();
         2177         dev_ns_div_phi_tau[cellidx] = div_phi_tau;
         2178 
         2179 #ifdef CHECK_FLUID_FINITE
         2180         (void)checkFiniteFloat3("div_phi_tau", x, y, z, div_phi_tau);
         2181 #endif
         2182     }
         2183 }
         2184 
         2185 // Find the divergence of phi v v
         2186 // Unused
         2187 __global__ void findNSdivphivv(
         2188     const Float* __restrict__ dev_ns_v_prod, // in
         2189     const Float* __restrict__ dev_ns_phi,    // in
         2190     Float3* __restrict__ dev_ns_div_phi_v_v) // out
         2191 {
         2192     // 3D thread index
         2193     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2194     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2195     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2196 
         2197     // Grid dimensions
         2198     const unsigned int nx = devC_grid.num[0];
         2199     const unsigned int ny = devC_grid.num[1];
         2200     const unsigned int nz = devC_grid.num[2];
         2201 
         2202     // Cell sizes
         2203     const Float dx = devC_grid.L[0]/nx;
         2204     const Float dy = devC_grid.L[1]/ny;
         2205     const Float dz = devC_grid.L[2]/nz;
         2206 
         2207     // 1D thread index
         2208     const unsigned int cellidx = idx(x,y,z);
         2209 
         2210     // Check that we are not outside the fluid grid
         2211     if (x < nx && y < ny && z < nz) {
         2212 
         2213         // Read cell and 6 neighbor cells
         2214         __syncthreads();
         2215         //const Float  phi    = dev_ns_phi[cellidx];
         2216         const Float  phi_xn = dev_ns_phi[idx(x-1,y,z)];
         2217         const Float  phi_xp = dev_ns_phi[idx(x+1,y,z)];
         2218         const Float  phi_yn = dev_ns_phi[idx(x,y-1,z)];
         2219         const Float  phi_yp = dev_ns_phi[idx(x,y+1,z)];
         2220         const Float  phi_zn = dev_ns_phi[idx(x,y,z-1)];
         2221         const Float  phi_zp = dev_ns_phi[idx(x,y,z+1)];
         2222 
         2223         // The tensor is symmetrical: value i,j = j,i.
         2224         // Only the upper triangle is saved, with the cells given a linear index
         2225         // enumerated as:
         2226         // [[ 0 1 2 ]
         2227         //  [   3 4 ]
         2228         //  [     5 ]]
         2229 
         2230         // div(T) = 
         2231         //  [ de_xx/dx + de_xy/dy + de_xz/dz ,
         2232         //    de_yx/dx + de_yy/dy + de_yz/dz ,
         2233         //    de_zx/dx + de_zy/dy + de_zz/dz ]
         2234 
         2235         // This function finds the divergence of (phi v v), which is a vector
         2236 
         2237         // Calculate the divergence. See
         2238         // https://en.wikipedia.org/wiki/Divergence#Application_in_Cartesian_coordinates
         2239         // The symmetry described in findvvOuterProdNS is used
         2240         __syncthreads();
         2241         const Float3 div = MAKE_FLOAT3(
         2242             ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp
         2243               - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) +
         2244              (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp
         2245               - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) +
         2246              (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp
         2247               - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)),
         2248             ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp
         2249               - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) +
         2250              (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp
         2251               - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) +
         2252              (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp
         2253               - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)),
         2254             ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp
         2255               - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) +
         2256              (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp
         2257               - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) +
         2258              (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp
         2259               - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) );
         2260 
         2261         //printf("div[%d,%d,%d] = %f\t%f\t%f\n", x, y, z, div.x, div.y, div.z);
         2262 
         2263         // Write divergence
         2264         __syncthreads();
         2265         dev_ns_div_phi_v_v[cellidx] = div;
         2266 
         2267 #ifdef CHECK_FLUID_FINITE
         2268         (void)checkFiniteFloat3("div_phi_v_v", x, y, z, div);
         2269 #endif
         2270     }
         2271 }
         2272 
         2273 
         2274 // Find predicted fluid velocity
         2275 // Launch per face.
         2276 __global__ void findPredNSvelocities(
         2277     const Float*  __restrict__ dev_ns_p,               // in
         2278     const Float*  __restrict__ dev_ns_v_x,             // in
         2279     const Float*  __restrict__ dev_ns_v_y,             // in
         2280     const Float*  __restrict__ dev_ns_v_z,             // in
         2281     const Float*  __restrict__ dev_ns_phi,             // in
         2282     const Float*  __restrict__ dev_ns_dphi,            // in
         2283     const Float*  __restrict__ dev_ns_div_tau_x,       // in
         2284     const Float*  __restrict__ dev_ns_div_tau_y,       // in
         2285     const Float*  __restrict__ dev_ns_div_tau_z,       // in
         2286     const Float3* __restrict__ dev_ns_div_phi_vi_v,    // in
         2287     const int     bc_bot,                              // in
         2288     const int     bc_top,                              // in
         2289     const Float   beta,                                // in
         2290     const Float3* __restrict__ dev_ns_F_pf,            // in
         2291     const unsigned int ndem,                           // in
         2292     const unsigned int wall0_iz,                       // in
         2293     const Float   c_v,                                 // in
         2294     const Float   mu,                                  // in
         2295     const Float   rho_f,                               // in
         2296     Float* __restrict__ dev_ns_v_p_x,           // out
         2297     Float* __restrict__ dev_ns_v_p_y,           // out
         2298     Float* __restrict__ dev_ns_v_p_z)           // out
         2299 {
         2300     // 3D thread index
         2301     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2302     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2303     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2304 
         2305     // Grid dimensions
         2306     const unsigned int nx = devC_grid.num[0];
         2307     const unsigned int ny = devC_grid.num[1];
         2308     const unsigned int nz = devC_grid.num[2];
         2309 
         2310     // Cell sizes
         2311     const Float dx = devC_grid.L[0]/nx;
         2312     const Float dy = devC_grid.L[1]/ny;
         2313     const Float dz = devC_grid.L[2]/nz;
         2314 
         2315     // 1D thread index
         2316     const unsigned int fidx = vidx(x,y,z);
         2317     const unsigned int cellidx = idx(x,y,z);
         2318 
         2319     // Check that we are not outside the fluid grid
         2320     //if (x <= nx && y <= ny && z <= nz) {
         2321     if (x < nx && y < ny && z < nz) {
         2322 
         2323         // Values that are needed for calculating the predicted velocity
         2324         __syncthreads();
         2325         const Float3 v = MAKE_FLOAT3(
         2326             dev_ns_v_x[fidx],
         2327             dev_ns_v_y[fidx],
         2328             dev_ns_v_z[fidx]);
         2329         //printf("v in v* [%d,%d,%d] = %f, %f, %f\n", x,y,z, v.x, v.y, v.z);
         2330 
         2331         Float3 div_tau = MAKE_FLOAT3(0.0, 0.0, 0.0);
         2332         if (mu > 0.0) {
         2333             div_tau = MAKE_FLOAT3(
         2334                 dev_ns_div_tau_x[fidx],
         2335                 dev_ns_div_tau_y[fidx],
         2336                 dev_ns_div_tau_z[fidx]);
         2337         }
         2338 
         2339         // cell center values
         2340         const Float phi_xn    = dev_ns_phi[idx(x-1,y,z)];
         2341         const Float phi_c     = dev_ns_phi[cellidx];
         2342         const Float phi_yn    = dev_ns_phi[idx(x,y-1,z)];
         2343         const Float phi_zn    = dev_ns_phi[idx(x,y,z-1)];
         2344 
         2345         const Float dphi_xn   = dev_ns_dphi[idx(x-1,y,z)];
         2346         const Float dphi_c    = dev_ns_dphi[cellidx];
         2347         const Float dphi_yn   = dev_ns_dphi[idx(x,y-1,z)];
         2348         const Float dphi_zn   = dev_ns_dphi[idx(x,y,z-1)];
         2349 
         2350         const Float3 div_phi_vi_v_xn = dev_ns_div_phi_vi_v[idx(x-1,y,z)];
         2351         const Float3 div_phi_vi_v_c  = dev_ns_div_phi_vi_v[cellidx];
         2352         const Float3 div_phi_vi_v_yn = dev_ns_div_phi_vi_v[idx(x,y-1,z)];
         2353         const Float3 div_phi_vi_v_zn = dev_ns_div_phi_vi_v[idx(x,y,z-1)];
         2354 
         2355         // component-wise average values
         2356         const Float3 phi = MAKE_FLOAT3(
         2357             amean(phi_c, phi_xn),
         2358             amean(phi_c, phi_yn),
         2359             amean(phi_c, phi_zn));
         2360         const Float3 dphi = MAKE_FLOAT3(
         2361             amean(dphi_c, dphi_xn),
         2362             amean(dphi_c, dphi_yn),
         2363             amean(dphi_c, dphi_zn));
         2364 
         2365         // The particle-fluid interaction force should only be incoorporated if
         2366         // there is a fluid viscosity
         2367         Float3 f_i_c, f_i_xn, f_i_yn, f_i_zn;
         2368         if (mu > 0.0 && devC_np > 0) {
         2369             f_i_c  = dev_ns_F_pf[cellidx];
         2370             f_i_xn = dev_ns_F_pf[idx(x-1,y,z)];
         2371             f_i_yn = dev_ns_F_pf[idx(x,y-1,z)];
         2372             f_i_zn = dev_ns_F_pf[idx(x,y,z-1)];
         2373         } else {
         2374             f_i_c  = MAKE_FLOAT3(0.0, 0.0, 0.0);
         2375             f_i_xn = MAKE_FLOAT3(0.0, 0.0, 0.0);
         2376             f_i_yn = MAKE_FLOAT3(0.0, 0.0, 0.0);
         2377             f_i_zn = MAKE_FLOAT3(0.0, 0.0, 0.0);
         2378         }
         2379         const Float3 f_i = MAKE_FLOAT3(
         2380             amean(f_i_c.x, f_i_xn.x),
         2381             amean(f_i_c.y, f_i_yn.y),
         2382             amean(f_i_c.z, f_i_zn.z));
         2383 
         2384         const Float dt = ndem*devC_dt;
         2385 
         2386         // The pressure gradient is not needed in Chorin's projection method
         2387         // (ns.beta=0), so only has to be looked up in pressure-dependant
         2388         // projection methods
         2389         Float3 pressure_term = MAKE_FLOAT3(0.0, 0.0, 0.0);
         2390         if (beta > 0.0) {
         2391             __syncthreads();
         2392             const Float p    = dev_ns_p[cellidx];
         2393             const Float p_xn = dev_ns_p[idx(x-1,y,z)];
         2394             const Float p_yn = dev_ns_p[idx(x,y-1,z)];
         2395             const Float p_zn = dev_ns_p[idx(x,y,z-1)];
         2396             const Float3 grad_p = MAKE_FLOAT3(
         2397                 (p - p_xn)/dx,
         2398                 (p - p_yn)/dy,
         2399                 (p - p_zn)/dz);
         2400 #ifdef SET_1
         2401             pressure_term = -beta*dt/(rho_f*phi)*grad_p;
         2402 #endif
         2403 #ifdef SET_2
         2404             pressure_term = -beta*dt/rho_f*grad_p;
         2405 #endif
         2406         }
         2407 
         2408         const Float3 div_phi_vi_v = MAKE_FLOAT3(
         2409             amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x),
         2410             amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y),
         2411             amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z));
         2412 
         2413         // Determine the terms of the predicted velocity change
         2414         const Float3 interaction_term = -dt/(rho_f*phi)*f_i;
         2415         const Float3 gravity_term = MAKE_FLOAT3(
         2416             devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
         2417         const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
         2418         const Float3 porosity_term = -1.0*v*dphi/phi;
         2419 #ifdef SET_1
         2420         const Float3 diffusion_term = dt/(rho_f*phi)*div_tau;
         2421 #endif
         2422 #ifdef SET_2
         2423         const Float3 diffusion_term = dt/rho_f*div_tau;
         2424 #endif
         2425 
         2426         // Predict new velocity and add scaling parameters
         2427         Float3 v_p = v + c_v*(
         2428                 pressure_term
         2429                 + interaction_term
         2430                 + diffusion_term
         2431                 + gravity_term
         2432                 + porosity_term
         2433                 + advection_term);
         2434 
         2435         //// Neumann BCs
         2436         if ((z == 0 && bc_bot == 1) || (z > nz-1 && bc_top == 1))
         2437             v_p.z = 0.0;
         2438 
         2439         if ((z == 0 && bc_bot == 2) || (z > nz-1 && bc_top == 2))
         2440             v_p = MAKE_FLOAT3(0.0, 0.0, 0.0);
         2441 
         2442         // Set velocities to zero above the dynamic wall
         2443         if (z >= wall0_iz)
         2444             v_p = MAKE_FLOAT3(0.0, 0.0, 0.0);
         2445 
         2446 #ifdef REPORT_V_P_COMPONENTS
         2447         // Report velocity components to stdout for debugging
         2448         printf("\n[%d,%d,%d] REPORT_V_P_COMPONENTS\n"
         2449                "\tv_p      = %+e %+e %+e\n"
         2450                "\tv_old    = %+e %+e %+e\n"
         2451                "\tpres     = %+e %+e %+e\n"
         2452                "\tinteract = %+e %+e %+e\n"
         2453                "\tdiff     = %+e %+e %+e\n"
         2454                "\tgrav     = %+e %+e %+e\n"
         2455                "\tporos    = %+e %+e %+e\n"
         2456                "\tadv      = %+e %+e %+e\n",
         2457                x, y, z,
         2458                v_p.x, v_p.y, v_p.z,
         2459                v.x, v.y, v.z,
         2460                c_v*pressure_term.x,
         2461                c_v*pressure_term.y,
         2462                c_v*pressure_term.z,
         2463                c_v*interaction_term.x,
         2464                c_v*interaction_term.y,
         2465                c_v*interaction_term.z,
         2466                c_v*diffusion_term.x,
         2467                c_v*diffusion_term.y,
         2468                c_v*diffusion_term.z,
         2469                c_v*gravity_term.x,
         2470                c_v*gravity_term.y,
         2471                c_v*gravity_term.z,
         2472                c_v*porosity_term.x,
         2473                c_v*porosity_term.y,
         2474                c_v*porosity_term.z,
         2475                c_v*advection_term.x,
         2476                c_v*advection_term.y,
         2477                c_v*advection_term.z);
         2478 #endif
         2479 
         2480         // Save the predicted velocity
         2481         __syncthreads();
         2482         dev_ns_v_p_x[fidx] = v_p.x;
         2483         dev_ns_v_p_y[fidx] = v_p.y;
         2484         dev_ns_v_p_z[fidx] = v_p.z;
         2485 
         2486 #ifdef CHECK_FLUID_FINITE
         2487         (void)checkFiniteFloat3("v_p", x, y, z, v_p);
         2488 #endif
         2489     }
         2490 }
         2491 
         2492 // Find the value of the forcing function. Only grad(epsilon) changes during
         2493 // the Jacobi iterations. The remaining, constant terms are only calculated
         2494 // during the first iteration.
         2495 // At each iteration, the value of the forcing function is found as:
         2496 //   f = f1 - f2 dot grad(epsilon)
         2497 __global__ void findNSforcing(
         2498     const Float*  __restrict__ dev_ns_epsilon,     // in
         2499     const Float*  __restrict__ dev_ns_phi,         // in
         2500     const Float*  __restrict__ dev_ns_dphi,        // in
         2501     const Float3* __restrict__ dev_ns_v_p,         // in
         2502     const Float*  __restrict__ dev_ns_v_p_x,       // in
         2503     const Float*  __restrict__ dev_ns_v_p_y,       // in
         2504     const Float*  __restrict__ dev_ns_v_p_z,       // in
         2505     const unsigned int nijac,                      // in
         2506     const unsigned int ndem,                       // in
         2507     const Float c_v,                               // in
         2508     const Float rho_f,                             // in
         2509     Float*  __restrict__ dev_ns_f1,                // out
         2510     Float3* __restrict__ dev_ns_f2,                // out
         2511     Float*  __restrict__ dev_ns_f)                 // out
         2512 {
         2513     // 3D thread index
         2514     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2515     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2516     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2517 
         2518     // Grid dimensions
         2519     const unsigned int nx = devC_grid.num[0];
         2520     const unsigned int ny = devC_grid.num[1];
         2521     const unsigned int nz = devC_grid.num[2];
         2522 
         2523     // Cell sizes
         2524     const Float dx = devC_grid.L[0]/nx;
         2525     const Float dy = devC_grid.L[1]/ny;
         2526     const Float dz = devC_grid.L[2]/nz;
         2527 
         2528     // 1D thread index
         2529     const unsigned int cellidx = idx(x,y,z);
         2530 
         2531 
         2532     // Check that we are not outside the fluid grid
         2533     if (x < nx && y < ny && z < nz) {
         2534 
         2535         // Constant forcing function terms
         2536         Float f1;
         2537         Float3 f2;
         2538 
         2539         // Check if this is the first Jacobi iteration. If it is, find f1 and f2
         2540         if (nijac == 0) {
         2541 
         2542             // Read needed values
         2543             __syncthreads();
         2544             const Float3 v_p  = dev_ns_v_p[cellidx];
         2545             const Float  phi  = dev_ns_phi[cellidx];
         2546             const Float  dphi = dev_ns_dphi[cellidx];
         2547 
         2548             // Calculate derivatives
         2549             const Float  div_v_p
         2550                 = divergence(dev_ns_v_p_x, dev_ns_v_p_y, dev_ns_v_p_z,
         2551                              x, y, z, dx, dy, dz);
         2552             const Float3 grad_phi
         2553                 = gradient(dev_ns_phi, x, y, z, dx, dy, dz);
         2554 
         2555             // CFD time step
         2556             const Float dt = devC_dt*ndem;
         2557 
         2558             // Find forcing function terms
         2559 #ifdef SET_1
         2560             const Float t1 = rho_f*phi*div_v_p/(c_v*dt);
         2561             const Float t2 = rho_f*dot(grad_phi, v_p)/(c_v*dt);
         2562             const Float t4 = rho_f*dphi/(dt*dt*c_v);
         2563 #endif
         2564 #ifdef SET_2
         2565             const Float t1 = rho_f*div_v_p/(c_v*dt);
         2566             const Float t2 = rho_f*dot(grad_phi, v_p)/(c_v*dt*phi);
         2567             const Float t4 = rho_f*dphi/(dt*dt*phi*c_v);
         2568 #endif
         2569             f1 = t1 + t2 + t4;
         2570             f2 = grad_phi/phi; // t3/grad(epsilon)
         2571 
         2572 #ifdef REPORT_FORCING_TERMS
         2573             // Report values terms in the forcing function for debugging
         2574             printf("[%d,%d,%d] REPORT_FORCING_TERMS\t"
         2575                     "t1 = %f\tt2 = %f\tt4 = %f\n", x,y,z, t1, t2, t4);
         2576 #endif
         2577 
         2578             // Save values
         2579             __syncthreads();
         2580             dev_ns_f1[cellidx] = f1;
         2581             dev_ns_f2[cellidx] = f2;
         2582 
         2583         } else {
         2584 
         2585             // Read previously found values
         2586             __syncthreads();
         2587             f1 = dev_ns_f1[cellidx];
         2588             f2 = dev_ns_f2[cellidx];
         2589         }
         2590 
         2591         // Find the gradient of epsilon, which changes during Jacobi iterations
         2592         const Float3 grad_epsilon
         2593             = gradient(dev_ns_epsilon, x, y, z, dx, dy, dz);
         2594 
         2595         // Forcing function value
         2596         const Float f = f1 - dot(grad_epsilon, f2);
         2597 
         2598 #ifdef REPORT_FORCING_TERMS
         2599         const Float t3 = -dot(f2, grad_epsilon);
         2600         if (z >= nz-3)
         2601             printf("[%d,%d,%d] REPORT_FORCING_TERMS\tf= %f\tf1 = %f\tt3 = %f\n",
         2602                    x,y,z, f, f1, t3);
         2603 #endif
         2604 
         2605         // Save forcing function value
         2606         __syncthreads();
         2607         dev_ns_f[cellidx] = f;
         2608 
         2609 #ifdef CHECK_FLUID_FINITE
         2610         (void)checkFiniteFloat("f", x, y, z, f);
         2611 #endif
         2612     }
         2613 }
         2614 
         2615 // Spatial smoothing, used for the epsilon values. If there are several blocks,
         2616 // there will be small errors at the block boundaries, since the update will mix
         2617 // non-smoothed and smoothed values.
         2618 template<typename T>
         2619 __global__ void smoothing(
         2620     T* __restrict__ dev_arr,
         2621     const Float gamma,
         2622     const unsigned int bc_bot,
         2623     const unsigned int bc_top)
         2624 {
         2625     // 3D thread index
         2626     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2627     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2628     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2629 
         2630     // Grid dimensions
         2631     const unsigned int nx = devC_grid.num[0];
         2632     const unsigned int ny = devC_grid.num[1];
         2633     const unsigned int nz = devC_grid.num[2];
         2634 
         2635     // 1D thread index
         2636     const unsigned int cellidx = idx(x,y,z);
         2637 
         2638     // Perform the epsilon updates for all non-ghost nodes except the
         2639     // Dirichlet boundaries at z=0 and z=nz-1.
         2640     // Adjust z range if a boundary has the Dirichlet boundary condition.
         2641     int z_min = 0;
         2642     int z_max = nz-1;
         2643     if (bc_bot == 0)
         2644         z_min = 1;
         2645     if (bc_top == 0)
         2646         z_max = nz-2;
         2647 
         2648     if (x < nx && y < ny && z >= z_min && z <= z_max) {
         2649 
         2650         __syncthreads();
         2651         const T e_xn = dev_arr[idx(x-1,y,z)];
         2652         const T e    = dev_arr[cellidx];
         2653         const T e_xp = dev_arr[idx(x+1,y,z)];
         2654         const T e_yn = dev_arr[idx(x,y-1,z)];
         2655         const T e_yp = dev_arr[idx(x,y+1,z)];
         2656         const T e_zn = dev_arr[idx(x,y,z-1)];
         2657         const T e_zp = dev_arr[idx(x,y,z+1)];
         2658 
         2659         const T e_avg_neigbors = 1.0/6.0 *
         2660             (e_xn + e_xp + e_yn + e_yp + e_zn + e_zp);
         2661 
         2662         const T e_smooth = (1.0 - gamma)*e + gamma*e_avg_neigbors;
         2663 
         2664         __syncthreads();
         2665         dev_arr[cellidx] = e_smooth;
         2666 
         2667         //printf("%d,%d,%d\te = %f e_smooth = %f\n", x,y,z, e, e_smooth);
         2668         /*printf("%d,%d,%d\te_xn = %f, e_xp = %f, e_yn = %f, e_yp = %f,"
         2669           " e_zn = %f, e_zp = %f\n", x,y,z, e_xn, e_xp,
         2670           e_yn, e_yp, e_zn, e_zp);*/
         2671 
         2672 #ifdef CHECK_FLUID_FINITE
         2673         (void)checkFiniteFloat("e_smooth", x, y, z, e_smooth);
         2674 #endif
         2675     }
         2676 }
         2677 
         2678 // Perform a single Jacobi iteration
         2679 __global__ void jacobiIterationNS(
         2680     const Float* __restrict__ dev_ns_epsilon,
         2681     Float* __restrict__ dev_ns_epsilon_new,
         2682     Float* __restrict__ dev_ns_norm,
         2683     const Float* __restrict__ dev_ns_f,
         2684     const int bc_bot,
         2685     const int bc_top,
         2686     const Float theta,
         2687     const unsigned int wall0_iz,
         2688     const Float dp_dz)
         2689 {
         2690     // 3D thread index
         2691     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2692     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2693     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2694 
         2695     // Grid dimensions
         2696     const unsigned int nx = devC_grid.num[0];
         2697     const unsigned int ny = devC_grid.num[1];
         2698     const unsigned int nz = devC_grid.num[2];
         2699 
         2700     // Cell sizes
         2701     const Float dx = devC_grid.L[0]/nx;
         2702     const Float dy = devC_grid.L[1]/ny;
         2703     const Float dz = devC_grid.L[2]/nz;
         2704 
         2705     // 1D thread index
         2706     const unsigned int cellidx = idx(x,y,z);
         2707 
         2708     // Check that we are not outside the fluid grid
         2709     //if (x < nx && y < ny && z < nz) {
         2710 
         2711     // internal nodes only
         2712     //if (x > 0 && x < nx-1 && y > 0 && y < ny-1 && z > 0 && z < nz-1) {
         2713 
         2714     // Lower boundary: Dirichlet. Upper boundary: Dirichlet
         2715     //if (x < nx && y < ny && z > 0 && z < nz-1) {
         2716 
         2717     // Lower boundary: Neumann. Upper boundary: Dirichlet
         2718     //if (x < nx && y < ny && z < nz-1) {
         2719 
         2720     // Perform the epsilon updates for all non-ghost nodes except the Dirichlet
         2721     // boundaries at z=0 and z=nz-1.
         2722     // Adjust z range if a boundary has the Dirichlet boundary condition.
         2723     int z_min = 0;
         2724     int z_max = nz-1;
         2725     if (bc_bot == 0)
         2726         z_min = 1;
         2727     if (bc_top == 0)
         2728         z_max = nz-2;
         2729 
         2730     if (x < nx && y < ny && z >= z_min && z <= z_max) {
         2731 
         2732         // Read the epsilon values from the cell and its 6 neighbors
         2733         __syncthreads();
         2734         const Float e_xn = dev_ns_epsilon[idx(x-1,y,z)];
         2735         const Float e    = dev_ns_epsilon[cellidx];
         2736         const Float e_xp = dev_ns_epsilon[idx(x+1,y,z)];
         2737         const Float e_yn = dev_ns_epsilon[idx(x,y-1,z)];
         2738         const Float e_yp = dev_ns_epsilon[idx(x,y+1,z)];
         2739         const Float e_zn = dev_ns_epsilon[idx(x,y,z-1)];
         2740         const Float e_zp = dev_ns_epsilon[idx(x,y,z+1)];
         2741 
         2742         // Read the value of the forcing function
         2743         const Float f = dev_ns_f[cellidx];
         2744 
         2745         // New value of epsilon in 3D update, derived by rearranging the
         2746         // discrete Laplacian
         2747         const Float dxdx = dx*dx;
         2748         const Float dydy = dy*dy;
         2749         const Float dzdz = dz*dz;
         2750         Float e_new
         2751             = (-dxdx*dydy*dzdz*f
         2752                + dydy*dzdz*(e_xn + e_xp)
         2753                + dxdx*dzdz*(e_yn + e_yp)
         2754                + dxdx*dydy*(e_zn + e_zp))
         2755             /(2.0*(dxdx*dydy + dxdx*dzdz + dydy*dzdz));
         2756 
         2757 
         2758         // Dirichlet BC at dynamic top wall. wall0_iz will be larger than the
         2759         // grid if the wall isn't dynamic
         2760         if (z == wall0_iz)
         2761             e_new = e;
         2762 
         2763         // Neumann BC at dynamic top wall
         2764         if (z == wall0_iz + 1)
         2765             e_new = e_zn + dp_dz;
         2766 
         2767         // New value of epsilon in 1D update
         2768         //const Float e_new = (e_zp + e_zn - dz*dz*f)/2.0;
         2769 
         2770         // Print values for debugging
         2771         /*printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\n",
         2772           x,y,z, e, f, e_new);*/
         2773 
         2774         const Float res_norm = (e_new - e)*(e_new - e)/(e_new*e_new + 1.0e-16);
         2775         const Float e_relax = e*(1.0-theta) + e_new*theta;
         2776 
         2777         __syncthreads();
         2778         dev_ns_epsilon_new[cellidx] = e_relax;
         2779         dev_ns_norm[cellidx] = res_norm;
         2780 
         2781 #ifdef CHECK_FLUID_FINITE
         2782         (void)checkFiniteFloat("e_new", x, y, z, e_new);
         2783         (void)checkFiniteFloat("e_relax", x, y, z, e_relax);
         2784         //(void)checkFiniteFloat("res_norm", x, y, z, res_norm);
         2785         if (checkFiniteFloat("res_norm", x, y, z, res_norm)) {
         2786             printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\tres_norm = %f\n",
         2787                    x,y,z, e, f, e_new, res_norm);
         2788         }
         2789 #endif
         2790     }
         2791 }
         2792 
         2793 // Copy all values from one array to the other
         2794 template<typename T>
         2795 __global__ void copyValues(
         2796     const T* __restrict__ dev_read,
         2797     T* __restrict__ dev_write)
         2798 {
         2799     // 3D thread index
         2800     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2801     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2802     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2803 
         2804     // Internal nodes only
         2805     if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
         2806 
         2807         // Internal nodes + ghost nodes
         2808         /*if (x <= devC_grid.num[0]+1 &&
         2809           y <= devC_grid.num[1]+1 &&
         2810           z <= devC_grid.num[2]+1) {*/
         2811 
         2812         const unsigned int cellidx = idx(x,y,z); // without ghost nodes
         2813         //const unsigned int cellidx = idx(x-1,y-1,z-1); // with ghost nodes
         2814 
         2815         // Read
         2816         __syncthreads();
         2817         const T val = dev_read[cellidx];
         2818 
         2819         //if (z == devC_grid.num[2]-1)
         2820         //printf("[%d,%d,%d] = %f\n", x, y, z, val);
         2821 
         2822         // Write
         2823         __syncthreads();
         2824         dev_write[cellidx] = val;
         2825     }
         2826 }
         2827 
         2828 // Find and store the normalized residuals
         2829 __global__ void findNormalizedResiduals(
         2830     const Float* __restrict__ dev_ns_epsilon_old,
         2831     const Float* __restrict__ dev_ns_epsilon,
         2832     Float* __restrict__ dev_ns_norm,
         2833     const unsigned int bc_bot,
         2834     const unsigned int bc_top)
         2835 {
         2836     // 3D thread index
         2837     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2838     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2839     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2840 
         2841     // Grid dimensions
         2842     const unsigned int nx = devC_grid.num[0];
         2843     const unsigned int ny = devC_grid.num[1];
         2844     const unsigned int nz = devC_grid.num[2];
         2845 
         2846     // 1D thread index
         2847     const unsigned int cellidx = idx(x,y,z);
         2848 
         2849     // Perform the epsilon updates for all non-ghost nodes except the
         2850     // Dirichlet boundaries at z=0 and z=nz-1.
         2851     // Adjust z range if a boundary has the Dirichlet boundary condition.
         2852     int z_min = 0;
         2853     int z_max = nz-1;
         2854     if (bc_bot == 0)
         2855         z_min = 1;
         2856     if (bc_top == 0)
         2857         z_max = nz-2;
         2858 
         2859     if (x < nx && y < ny && z >= z_min && z <= z_max) {
         2860 
         2861         __syncthreads();
         2862         const Float e = dev_ns_epsilon_old[cellidx];
         2863         const Float e_new = dev_ns_epsilon[cellidx];
         2864 
         2865         // Find the normalized residual value. A small value is added to the
         2866         // denominator to avoid a divide by zero.
         2867         //const Float res_norm = (e_new - e)*(e_new - e)/(e_new*e_new + 1.0e-16);
         2868         const Float res_norm = (e_new - e)/(e + 1.0e-16);
         2869 
         2870         __syncthreads();
         2871         dev_ns_norm[cellidx] = res_norm;
         2872 
         2873 #ifdef CHECK_FLUID_FINITE
         2874         checkFiniteFloat("res_norm", x, y, z, res_norm);
         2875 #endif
         2876     }
         2877 }
         2878 
         2879 
         2880 // Computes the new velocity and pressure using the corrector
         2881 __global__ void updateNSpressure(
         2882     const Float* __restrict__ dev_ns_epsilon,  // in
         2883     const Float  __restrict__ beta,            // in
         2884     Float* __restrict__ dev_ns_p)              // out
         2885 {
         2886     // 3D thread index
         2887     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2888     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2889     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2890 
         2891     // 1D thread index
         2892     const unsigned int cellidx = idx(x,y,z);
         2893 
         2894     // Check that we are not outside the fluid grid
         2895     if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
         2896 
         2897         // Read values
         2898         __syncthreads();
         2899         const Float  p_old   = dev_ns_p[cellidx];
         2900         const Float  epsilon = dev_ns_epsilon[cellidx];
         2901 
         2902         // New pressure
         2903         Float p = beta*p_old + epsilon;
         2904 
         2905         // Write new values
         2906         __syncthreads();
         2907         dev_ns_p[cellidx] = p;
         2908 
         2909 #ifdef CHECK_FLUID_FINITE
         2910         checkFiniteFloat("p", x, y, z, p);
         2911 #endif
         2912     }
         2913 }
         2914 
         2915 __global__ void updateNSvelocity(
         2916     const Float* __restrict__ dev_ns_v_p_x,    // in
         2917     const Float* __restrict__ dev_ns_v_p_y,    // in
         2918     const Float* __restrict__ dev_ns_v_p_z,    // in
         2919     const Float* __restrict__ dev_ns_phi,      // in
         2920     const Float* __restrict__ dev_ns_epsilon,  // in
         2921     const Float  beta,            // in
         2922     const int    bc_bot,          // in
         2923     const int    bc_top,          // in
         2924     const unsigned int ndem,      // in
         2925     const Float  c_v,             // in
         2926     const Float  rho_f,           // in
         2927     const unsigned int wall0_iz,  // in
         2928     const unsigned int iter,      // in
         2929     Float* __restrict__ dev_ns_v_x,      // out
         2930     Float* __restrict__ dev_ns_v_y,      // out
         2931     Float* __restrict__ dev_ns_v_z)      // out
         2932 {
         2933     // 3D thread index
         2934     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         2935     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         2936     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         2937 
         2938     // Grid dimensions
         2939     const unsigned int nx = devC_grid.num[0];
         2940     const unsigned int ny = devC_grid.num[1];
         2941     const unsigned int nz = devC_grid.num[2];
         2942 
         2943     // Cell sizes
         2944     const Float dx = devC_grid.L[0]/nx;
         2945     const Float dy = devC_grid.L[1]/ny;
         2946     const Float dz = devC_grid.L[2]/nz;
         2947 
         2948     // 1D thread index
         2949     const unsigned int cellidx = vidx(x,y,z);
         2950 
         2951     // Check that we are not outside the fluid grid
         2952     //if (x <= nx && y <= ny && z <= nz) {
         2953     if (x < nx && y < ny && z < nz) {
         2954 
         2955         // Read values
         2956         __syncthreads();
         2957         const Float v_p_x = dev_ns_v_p_x[cellidx];
         2958         const Float v_p_y = dev_ns_v_p_y[cellidx];
         2959         const Float v_p_z = dev_ns_v_p_z[cellidx];
         2960         const Float3 v_p = MAKE_FLOAT3(v_p_x, v_p_y, v_p_z);
         2961 
         2962         const Float epsilon_xn = dev_ns_epsilon[idx(x-1,y,z)];
         2963         const Float epsilon_c  = dev_ns_epsilon[idx(x,y,z)];
         2964         const Float epsilon_yn = dev_ns_epsilon[idx(x,y-1,z)];
         2965         const Float epsilon_zn = dev_ns_epsilon[idx(x,y,z-1)];
         2966 
         2967         const Float phi_xn = dev_ns_phi[idx(x-1,y,z)];
         2968         const Float phi_c  = dev_ns_phi[idx(x,y,z)];
         2969         const Float phi_yn = dev_ns_phi[idx(x,y-1,z)];
         2970         const Float phi_zn = dev_ns_phi[idx(x,y,z-1)];
         2971 
         2972         const Float3 phi = MAKE_FLOAT3(
         2973             amean(phi_c, phi_xn),
         2974             amean(phi_c, phi_yn),
         2975             amean(phi_c, phi_zn));
         2976 
         2977         // Find corrector gradient
         2978         const Float3 grad_epsilon = MAKE_FLOAT3(
         2979             (epsilon_c - epsilon_xn)/dx,
         2980             (epsilon_c - epsilon_yn)/dy,
         2981             (epsilon_c - epsilon_zn)/dz);
         2982 
         2983         // Find new velocity
         2984         Float3 v = v_p
         2985 #ifdef SET_1
         2986             - c_v*ndem*devC_dt/(phi*rho_f)*grad_epsilon;
         2987 #endif
         2988 #ifdef SET_2
         2989             - c_v*ndem*devC_dt/rho_f*grad_epsilon;
         2990 #endif
         2991 
         2992         if ((z == 0 && bc_bot == 1) || (z > nz-1 && bc_top == 1))
         2993             v.z = 0.0;
         2994 
         2995         if ((z == 0 && bc_bot == 2) || (z > nz-1 && bc_top == 2))
         2996             v = MAKE_FLOAT3(0.0, 0.0, 0.0);
         2997 
         2998         // Do not calculate all components at the outer grid edges (nx, ny, nz)
         2999         if (x == nx) {
         3000             v.y = 0.0;
         3001             v.z = 0.0;
         3002         }
         3003         if (y == ny) {
         3004             v.x = 0.0;
         3005             v.z = 0.0;
         3006         }
         3007         if (z == nz) {
         3008             v.x = 0.0;
         3009             v.y = 0.0;
         3010         }
         3011 
         3012         // Set velocities to zero above the dynamic wall
         3013         if (z >= wall0_iz)
         3014             v = MAKE_FLOAT3(0.0, 0.0, 0.0);
         3015 
         3016 #ifdef REPORT_V_C_COMPONENTS
         3017         printf("[%d,%d,%d] REPORT_V_C_COMPONENTS\n"
         3018                 "\tv_p           = % f\t% f\t% f\n"
         3019                 "\tv_c           = % f\t% f\t% f\n"
         3020                 "\tv             = % f\t% f\t% f\n"
         3021                 "\tgrad(epsilon) = % f\t% f\t% f\n"
         3022                 "\tphi           = % f\t% f\t% f\n",
         3023                 x, y, z,
         3024                 v_p.x, v_p.y, v_p.z,
         3025                 v.x-v_p.x, v.y-v_p.y, v.z-v_p.z,
         3026                 v.x, v.y, v.z,
         3027                 grad_epsilon.x, grad_epsilon.y, grad_epsilon.z,
         3028                 phi.x, phi.y, phi.z);
         3029 #endif
         3030 
         3031         // Check the advection term using the Courant-Friedrichs-Lewy condition
         3032         __syncthreads();
         3033         if (v.x*ndem*devC_dt/dx
         3034             + v.y*ndem*devC_dt/dy
         3035             + v.z*ndem*devC_dt/dz > 1.0) {
         3036             printf("[%d,%d,%d] Warning: Advection term in fluid may be "
         3037                    "unstable (CFL condition)\n"
         3038                    "\tv = %.2e,%.2e,%.2e\n"
         3039                    "\te_c,e_xn,e_yn,e_zn = %.2e,%.2e,%.2e,%.2e\n",
         3040                    x,y,z, v.x, v.y, v.z,
         3041                    epsilon_c, epsilon_xn, epsilon_yn, epsilon_zn
         3042                    );
         3043         }
         3044 
         3045         // Write new values
         3046         __syncthreads();
         3047         dev_ns_v_x[cellidx] = v.x;
         3048         dev_ns_v_y[cellidx] = v.y;
         3049         dev_ns_v_z[cellidx] = v.z;
         3050 
         3051 #ifdef CHECK_FLUID_FINITE
         3052         checkFiniteFloat3("v", x, y, z, v);
         3053 #endif
         3054     }
         3055 }
         3056 
         3057 // Find the average particle diameter and velocity for each CFD cell.
         3058 // UNUSED: The values are estimated in the porosity estimation function instead
         3059 __global__ void findAvgParticleVelocityDiameter(
         3060     const unsigned int* __restrict__ dev_cellStart, // in
         3061     const unsigned int* __restrict__ dev_cellEnd,   // in
         3062     const Float4* __restrict__ dev_vel_sorted,      // in
         3063     const Float4* __restrict__ dev_x_sorted,        // in
         3064     Float3* dev_ns_vp_avg,       // out
         3065     Float*  dev_ns_d_avg)        // out
         3066 {
         3067     // 3D thread index
         3068     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         3069     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         3070     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         3071 
         3072     // check that we are not outside the fluid grid
         3073     if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
         3074 
         3075         Float4 v;
         3076         Float d;
         3077         unsigned int startIdx, endIdx, i;
         3078         unsigned int n = 0;
         3079 
         3080         // average particle velocity
         3081         Float3 v_avg = MAKE_FLOAT3(0.0, 0.0, 0.0);  
         3082 
         3083         // average particle diameter
         3084         Float d_avg = 0.0;
         3085 
         3086         const unsigned int cellID = x + y * devC_grid.num[0]
         3087             + (devC_grid.num[0] * devC_grid.num[1]) * z; 
         3088 
         3089         // Lowest particle index in cell
         3090         startIdx = dev_cellStart[cellID];
         3091 
         3092         // Make sure cell is not empty
         3093         if (startIdx != 0xffffffff) {
         3094 
         3095             // Highest particle index in cell
         3096             endIdx = dev_cellEnd[cellID];
         3097 
         3098             // Iterate over cell particles
         3099             for (i=startIdx; i<endIdx; ++i) {
         3100 
         3101                 // Read particle velocity
         3102                 __syncthreads();
         3103                 v = dev_vel_sorted[i];
         3104                 d = 2.0*dev_x_sorted[i].w;
         3105                 n++;
         3106                 v_avg += MAKE_FLOAT3(v.x, v.y, v.z);
         3107                 d_avg += d;
         3108             }
         3109 
         3110             v_avg /= n;
         3111             d_avg /= n;
         3112         }
         3113 
         3114         // save average radius and velocity
         3115         const unsigned int cellidx = idx(x,y,z);
         3116         __syncthreads();
         3117         dev_ns_vp_avg[cellidx] = v_avg;
         3118         dev_ns_d_avg[cellidx]  = d_avg;
         3119 
         3120 #ifdef CHECK_FLUID_FINITE
         3121         checkFiniteFloat3("v_avg", x, y, z, v_avg);
         3122         checkFiniteFloat("d_avg", x, y, z, d_avg);
         3123 #endif
         3124     }
         3125 }
         3126 
         3127 // Find the drag coefficient as dictated by the Reynold's number
         3128 // Shamy and Zeghal (2005).
         3129 __device__ Float dragCoefficient(Float re)
         3130 {
         3131     Float cd;
         3132     if (re >= 1000.0)
         3133         cd = 0.44;
         3134     else
         3135         cd = 24.0/re*(1.0 + 0.15*pow(re, 0.687));
         3136     return cd;
         3137 }
         3138 
         3139 // Find particle-fluid interaction force as outlined by Zhou et al. 2010, and
         3140 // originally by Gidaspow 1992.
         3141 __global__ void findInteractionForce(
         3142     const Float4* __restrict__ dev_x,           // in
         3143     const Float4* __restrict__ dev_vel,         // in
         3144     const Float*  __restrict__ dev_ns_phi,      // in
         3145     const Float*  __restrict__ dev_ns_p,        // in
         3146     const Float*  __restrict__ dev_ns_v_x,      // in
         3147     const Float*  __restrict__ dev_ns_v_y,      // in
         3148     const Float*  __restrict__ dev_ns_v_z,      // in
         3149     const Float*  __restrict__ dev_ns_div_tau_x,// in
         3150     const Float*  __restrict__ dev_ns_div_tau_y,// in
         3151     const Float*  __restrict__ dev_ns_div_tau_z,// in
         3152     const Float mu,                             // in
         3153     const Float rho_f,                          // in
         3154     //const Float c_v,                       // in
         3155     Float3* __restrict__ dev_ns_f_pf,     // out
         3156     Float4* __restrict__ dev_force,       // out
         3157     Float4* __restrict__ dev_ns_f_d,      // out
         3158     Float4* __restrict__ dev_ns_f_p,      // out
         3159     Float4* __restrict__ dev_ns_f_v,      // out
         3160     Float4* __restrict__ dev_ns_f_sum)    // out
         3161 {
         3162     unsigned int i = threadIdx.x + blockIdx.x*blockDim.x; // Particle index
         3163 
         3164     if (i < devC_np) {
         3165 
         3166         // Particle information
         3167         __syncthreads();
         3168         const Float4 x   = dev_x[i];
         3169         const Float4 vel = dev_vel[i];
         3170         const Float3 v_p = MAKE_FLOAT3(vel.x, vel.y, vel.z);
         3171         const Float  d = x.w;
         3172 
         3173         // Fluid cell
         3174         const unsigned int i_x =
         3175             floor((x.x - devC_grid.origo[0])/(devC_grid.L[0]/devC_grid.num[0]));
         3176         const unsigned int i_y =
         3177             floor((x.y - devC_grid.origo[1])/(devC_grid.L[1]/devC_grid.num[1]));
         3178         const unsigned int i_z =
         3179             floor((x.z - devC_grid.origo[2])/(devC_grid.L[2]/devC_grid.num[2]));
         3180         const unsigned int cellidx = idx(i_x, i_y, i_z);
         3181 
         3182         // Cell sizes
         3183         const Float dx = devC_grid.L[0]/devC_grid.num[0];
         3184         const Float dy = devC_grid.L[1]/devC_grid.num[1];
         3185         const Float dz = devC_grid.L[2]/devC_grid.num[2];
         3186 
         3187         // Fluid information
         3188         __syncthreads();
         3189         const Float phi = dev_ns_phi[cellidx];
         3190 
         3191         const Float v_x   = dev_ns_v_x[vidx(i_x,i_y,i_z)];
         3192         const Float v_x_p = dev_ns_v_x[vidx(i_x+1,i_y,i_z)];
         3193         const Float v_y   = dev_ns_v_y[vidx(i_x,i_y,i_z)];
         3194         const Float v_y_p = dev_ns_v_y[vidx(i_x,i_y+1,i_z)];
         3195         const Float v_z   = dev_ns_v_z[vidx(i_x,i_y,i_z)];
         3196         const Float v_z_p = dev_ns_v_z[vidx(i_x,i_y,i_z+1)];
         3197 
         3198         const Float div_tau_x   = dev_ns_div_tau_x[vidx(i_x,i_y,i_z)];
         3199         const Float div_tau_x_p = dev_ns_div_tau_x[vidx(i_x+1,i_y,i_z)];
         3200         const Float div_tau_y   = dev_ns_div_tau_y[vidx(i_x,i_y,i_z)];
         3201         const Float div_tau_y_p = dev_ns_div_tau_y[vidx(i_x,i_y+1,i_z)];
         3202         const Float div_tau_z   = dev_ns_div_tau_z[vidx(i_x,i_y,i_z)];
         3203         const Float div_tau_z_p = dev_ns_div_tau_z[vidx(i_x,i_y,i_z+1)];
         3204 
         3205         const Float3 v_f = MAKE_FLOAT3(
         3206             amean(v_x, v_x_p),
         3207             amean(v_y, v_y_p),
         3208             amean(v_z, v_z_p));
         3209 
         3210         const Float3 div_tau = MAKE_FLOAT3(
         3211             amean(div_tau_x, div_tau_x_p),
         3212             amean(div_tau_y, div_tau_y_p),
         3213             amean(div_tau_z, div_tau_z_p));
         3214 
         3215         const Float3 v_rel = v_f - v_p;
         3216         const Float  v_rel_length = length(v_rel);
         3217 
         3218         const Float V_p = dx*dy*dz - phi*dx*dy*dz;
         3219         const Float Re  = rho_f*d*phi*v_rel_length/mu;
         3220         Float Cd  = pow(0.63 + 4.8/pow(Re, 0.5), 2.0);
         3221         Float chi = 3.7 - 0.65*exp(-pow(1.5 - log10(Re), 2.0)/2.0);
         3222 
         3223         if (v_rel_length < 1.0e-6) { // avoid Re=0 -> Cd=inf, chi=-nan
         3224             Cd = 0.0;
         3225             chi = 0.0;
         3226         }
         3227 
         3228         // Drag force
         3229         const Float3 f_d = 0.125*Cd*rho_f*M_PI*d*d*phi*phi
         3230             *v_rel_length*v_rel*pow(phi, -chi);
         3231 
         3232         // Pressure gradient force
         3233         const Float3 f_p =
         3234             -1.0*gradient(dev_ns_p, i_x, i_y, i_z, dx, dy, dz)*V_p;
         3235 
         3236         // Viscous force
         3237         const Float3 f_v = -1.0*div_tau*V_p;
         3238 
         3239         // Interaction force on particle (force) and fluid (f_pf)
         3240         __syncthreads();
         3241         const Float3 f_pf = f_d + f_p + f_v;
         3242 
         3243 #ifdef CHECK_FLUID_FINITE
         3244         /*
         3245           printf("\nfindInteractionForce %d [%d,%d,%d]\n"
         3246           "\tV_p = %f Re=%f Cd=%f chi=%f\n"
         3247           "\tf_d = %+e %+e %+e\n"
         3248           "\tf_p = %+e %+e %+e\n"
         3249           "\tf_v = %+e %+e %+e\n",
         3250           i, i_x, i_y, i_z, V_p, Re, Cd, chi,
         3251           f_d.x, f_d.y, f_d.z,
         3252           f_p.x, f_p.y, f_p.z,
         3253           f_v.x, f_v.y, f_v.z);// */
         3254         checkFiniteFloat3("f_d", i_x, i_y, i_z, f_d);
         3255         checkFiniteFloat3("f_p", i_x, i_y, i_z, f_p);
         3256         checkFiniteFloat3("f_v", i_x, i_y, i_z, f_v);
         3257         checkFiniteFloat3("f_pf", i_x, i_y, i_z, f_pf);
         3258 #endif
         3259 
         3260         __syncthreads();
         3261 #ifdef SET_1
         3262         dev_ns_f_pf[i] = f_pf;
         3263 #endif
         3264 
         3265 #ifdef SET_2
         3266         dev_ns_f_pf[i] = f_d;
         3267 #endif
         3268 
         3269         __syncthreads();
         3270         dev_force[i] += MAKE_FLOAT4(f_pf.x, f_pf.y, f_pf.z, 0.0);
         3271         dev_ns_f_d[i] = MAKE_FLOAT4(f_d.x, f_d.y, f_d.z, 0.0);
         3272         dev_ns_f_p[i] = MAKE_FLOAT4(f_p.x, f_p.y, f_p.z, 0.0);
         3273         dev_ns_f_v[i] = MAKE_FLOAT4(f_v.x, f_v.y, f_v.z, 0.0);
         3274         dev_ns_f_sum[i] = MAKE_FLOAT4(
         3275                 f_d.x + f_p.x + f_v.x,
         3276                 f_d.y + f_p.y + f_v.y,
         3277                 f_d.z + f_p.z + f_v.z,
         3278                 0.0);
         3279     }
         3280 }
         3281 
         3282 // Apply the fluid-particle interaction force to the fluid cell based on the
         3283 // interaction forces from each particle in it
         3284 __global__ void applyInteractionForceToFluid(
         3285     const unsigned int* __restrict__ dev_gridParticleIndex,    // in
         3286     const unsigned int* __restrict__ dev_cellStart,            // in
         3287     const unsigned int* __restrict__ dev_cellEnd,              // in
         3288     const Float3* __restrict__ dev_ns_f_pf,                    // in
         3289     Float3* __restrict__ dev_ns_F_pf)                    // out
         3290 {
         3291     // 3D thread index
         3292     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         3293     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         3294     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         3295 
         3296     // Grid dimensions
         3297     const unsigned int nx = devC_grid.num[0];
         3298     const unsigned int ny = devC_grid.num[1];
         3299     const unsigned int nz = devC_grid.num[2];
         3300 
         3301     // Cell sizes
         3302     const Float dx = devC_grid.L[0]/nx;
         3303     const Float dy = devC_grid.L[1]/ny;
         3304     const Float dz = devC_grid.L[2]/nz;
         3305 
         3306     // Check that we are not outside the fluid grid
         3307     if (x < nx && y < ny && z < nz) {
         3308 
         3309         const unsigned int cellidx = idx(x,y,z);
         3310 
         3311         // Calculate linear cell ID
         3312         const unsigned int cellID = x + y * devC_grid.num[0]
         3313             + (devC_grid.num[0] * devC_grid.num[1]) * z; 
         3314 
         3315         const unsigned int startidx = dev_cellStart[cellID];
         3316         unsigned int endidx, i, origidx;
         3317 
         3318         Float3 fi;
         3319 
         3320         if (startidx != 0xffffffff) {
         3321 
         3322             __syncthreads();
         3323             endidx = dev_cellEnd[cellID];
         3324 
         3325             for (i=startidx; i<endidx; ++i) {
         3326 
         3327                 __syncthreads();
         3328                 origidx = dev_gridParticleIndex[i];
         3329                 fi += dev_ns_f_pf[origidx];
         3330             }
         3331         }
         3332 
         3333         const Float3 F_pf = fi/(dx*dy*dz);
         3334 
         3335 #ifdef CHECK_FLUID_FINITE
         3336         checkFiniteFloat3("F_pf", x, y, z, F_pf);
         3337 #endif
         3338         //printf("F_pf [%d,%d,%d] = %f,%f,%f\n", x,y,z, F_pf.x, F_pf.y, F_pf.z);
         3339         __syncthreads();
         3340         dev_ns_F_pf[cellidx] = F_pf;
         3341     }
         3342 }
         3343 
         3344 
         3345 // Launch per cell face node.
         3346 // Cell center ghost nodes must be set prior to call.
         3347 __global__ void interpolateCenterToFace(
         3348     const Float3* __restrict__ dev_in,
         3349     Float* __restrict__ dev_out_x,
         3350     Float* __restrict__ dev_out_y,
         3351     Float* __restrict__ dev_out_z)
         3352 {
         3353     // 3D thread index
         3354     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         3355     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         3356     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         3357 
         3358     // Check that we are not outside the fluid grid
         3359     if (x <= devC_grid.num[0]
         3360         && y <= devC_grid.num[1]
         3361         && z <= devC_grid.num[2]) {
         3362 
         3363         const unsigned int faceidx = vidx(x,y,z);
         3364 
         3365         __syncthreads();
         3366         const Float3 xn = dev_in[idx(x-1,y,z)];
         3367         const Float3 yn = dev_in[idx(x,y-1,z)];
         3368         const Float3 zn = dev_in[idx(x,y,z-1)];
         3369         const Float3 center = dev_in[idx(x,y,z)];
         3370 
         3371         const Float x_val = amean(center.x, xn.x);
         3372         const Float y_val = amean(center.y, yn.y);
         3373         const Float z_val = amean(center.z, zn.z);
         3374 
         3375         __syncthreads();
         3376         //printf("c2f [%d,%d,%d] = %f,%f,%f\n", x,y,z, x_val, y_val, z_val);
         3377         dev_out_x[faceidx] = x_val;
         3378         dev_out_y[faceidx] = y_val;
         3379         dev_out_z[faceidx] = z_val;
         3380     }
         3381 }
         3382 
         3383 // Launch per cell center node
         3384 __global__ void interpolateFaceToCenter(
         3385     Float*  dev_in_x,
         3386     Float*  dev_in_y,
         3387     Float*  dev_in_z,
         3388     Float3* dev_out)
         3389 {
         3390     // 3D thread index
         3391     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         3392     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         3393     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         3394 
         3395     // Grid dimensions
         3396     const unsigned int nx = devC_grid.num[0];
         3397     const unsigned int ny = devC_grid.num[1];
         3398     const unsigned int nz = devC_grid.num[2];
         3399 
         3400     // Check that we are not outside the fluid grid
         3401     //if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
         3402     if (x < nx && y < ny && z < nz) {
         3403 
         3404         __syncthreads();
         3405         const Float x_n = dev_in_x[vidx(x,y,z)];
         3406         const Float x_p = dev_in_x[vidx(x+1,y,z)];
         3407         const Float y_n = dev_in_y[vidx(x,y,z)];
         3408         const Float y_p = dev_in_y[vidx(x,y+1,z)];
         3409         const Float z_n = dev_in_z[vidx(x,y,z)];
         3410         const Float z_p = dev_in_z[vidx(x,y,z+1)];
         3411 
         3412         const Float3 val = MAKE_FLOAT3(
         3413             amean(x_n, x_p),
         3414             amean(y_n, y_p),
         3415             amean(z_n, z_p));
         3416 
         3417         __syncthreads();
         3418         //printf("f2c [%d,%d,%d] = %f, %f, %f\n", x,y,z, val.x, val.y, val.z);
         3419         dev_out[idx(x,y,z)] = val;
         3420     }
         3421 }
         3422 
         3423 // Launch per cell face node. Set velocity ghost nodes beforehand.
         3424 // Find div(tau) at all cell faces.
         3425 // Warning: The grid-corner values will be invalid, along with the non-normal
         3426 // components of the ghost nodes
         3427 __global__ void findFaceDivTau(
         3428     const Float* __restrict__ dev_ns_v_x,   // in
         3429     const Float* __restrict__ dev_ns_v_y,   // in
         3430     const Float* __restrict__ dev_ns_v_z,   // in
         3431     const Float mu,                         // in
         3432     Float* __restrict__ dev_ns_div_tau_x,   // out
         3433     Float* __restrict__ dev_ns_div_tau_y,   // out
         3434     Float* __restrict__ dev_ns_div_tau_z)   // out
         3435 {
         3436     // 3D thread index
         3437     const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
         3438     const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
         3439     const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
         3440 
         3441     // Grid dimensions
         3442     const unsigned int nx = devC_grid.num[0];
         3443     const unsigned int ny = devC_grid.num[1];
         3444     const unsigned int nz = devC_grid.num[2];
         3445 
         3446     // Cell sizes
         3447     const Float dx = devC_grid.L[0]/nx;
         3448     const Float dy = devC_grid.L[1]/ny;
         3449     const Float dz = devC_grid.L[2]/nz;
         3450 
         3451     // Check that we are not outside the fluid grid
         3452     //if (x <= nx && y <= ny && z <= nz) {
         3453     if (x < nx && y < ny && z < nz) {
         3454 
         3455         const unsigned int faceidx = vidx(x,y,z);
         3456 
         3457         __syncthreads();
         3458         const Float v_x_xn = dev_ns_v_x[vidx(x-1,y,z)];
         3459         const Float v_x = dev_ns_v_x[faceidx];
         3460         const Float v_x_xp = dev_ns_v_x[vidx(x+1,y,z)];
         3461         const Float v_x_yn = dev_ns_v_x[vidx(x,y-1,z)];
         3462         const Float v_x_yp = dev_ns_v_x[vidx(x,y+1,z)];
         3463         const Float v_x_zn = dev_ns_v_x[vidx(x,y,z-1)];
         3464         const Float v_x_zp = dev_ns_v_x[vidx(x,y,z+1)];
         3465 
         3466         const Float v_y_xn = dev_ns_v_y[vidx(x-1,y,z)];
         3467         const Float v_y_xp = dev_ns_v_y[vidx(x+1,y,z)];
         3468         const Float v_y_yn = dev_ns_v_y[vidx(x,y-1,z)];
         3469         const Float v_y = dev_ns_v_y[faceidx];
         3470         const Float v_y_yp = dev_ns_v_y[vidx(x,y+1,z)];
         3471         const Float v_y_zn = dev_ns_v_y[vidx(x,y,z-1)];
         3472         const Float v_y_zp = dev_ns_v_y[vidx(x,y,z+1)];
         3473 
         3474         const Float v_z_xn = dev_ns_v_z[vidx(x-1,y,z)];
         3475         const Float v_z_xp = dev_ns_v_z[vidx(x+1,y,z)];
         3476         const Float v_z_yn = dev_ns_v_z[vidx(x,y-1,z)];
         3477         const Float v_z_yp = dev_ns_v_z[vidx(x,y+1,z)];
         3478         const Float v_z_zn = dev_ns_v_z[vidx(x,y,z-1)];
         3479         const Float v_z = dev_ns_v_z[faceidx];
         3480         const Float v_z_zp = dev_ns_v_z[vidx(x,y,z+1)];
         3481 
         3482         const Float div_tau_x =
         3483             mu*(
         3484                 (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
         3485                 (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
         3486                 (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
         3487         const Float div_tau_y =
         3488             mu*(
         3489                 (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
         3490                 (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
         3491                 (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
         3492         const Float div_tau_z =
         3493             mu*(
         3494                 (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
         3495                 (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
         3496                 (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));
         3497 
         3498         __syncthreads();
         3499         //printf("div_tau [%d,%d,%d] = %f, %f, %f\n", x,y,z,
         3500         //div_tau_x, div_tau_y, div_tau_z);
         3501         dev_ns_div_tau_x[faceidx] = div_tau_x;
         3502         dev_ns_div_tau_y[faceidx] = div_tau_y;
         3503         dev_ns_div_tau_z[faceidx] = div_tau_z;
         3504     }
         3505 
         3506 }
         3507 
         3508 // Print final heads and free memory
         3509 void DEM::endNSdev()
         3510 {
         3511     freeNSmemDev();
         3512 }
         3513 
         3514 // vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4