tported Darcy routines to CUDA, compiles, needs testing - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit 1658688f8ba0d701c10244801f13fd46821c799c
 (DIR) parent 27f9122518041ed1ed936e25643d2f9f61cc8100
 (HTM) Author: Anders Damsgaard <anders.damsgaard@geo.au.dk>
       Date:   Wed,  9 Oct 2013 11:11:55 +0200
       
       ported Darcy routines to CUDA, compiles, needs testing
       
       Diffstat:
         M src/darcy.cpp                       |     221 ++++++++++++++++++-------------
         A src/darcy.cuh                       |     616 +++++++++++++++++++++++++++++++
         M src/device.cu                       |     172 ++++++++++++++++++++++++++-----
         M src/sphere.h                        |       5 +++++
       
       4 files changed, 898 insertions(+), 116 deletions(-)
       ---
 (DIR) diff --git a/src/darcy.cpp b/src/darcy.cpp
       t@@ -10,6 +10,9 @@
        #include "sphere.h"
        #include "utility.h"
        
       +// Enable line below to make x and y boundaries periodic
       +#define PERIODIC_XY
       +
        // Initialize memory
        void DEM::initDarcyMem()
        {
       t@@ -50,7 +53,7 @@ unsigned int DEM::idx(
            //return x + d_nx*y + d_nx*d_ny*z;
        
            // with ghost nodes
       -    // the ghost nodes are placed at -1 and WIDTH
       +    // the ghost nodes are placed at x,y,z = -1 and WIDTH
            return (x+1) + (d_nx+2)*(y+1) + (d_nx+2)*(d_ny+2)*(z+1);
        }
        
       t@@ -75,17 +78,26 @@ void DEM::initDarcyVals()
                        // read from input binary
        
                        // Hydraulic permeability [m^2]
       -                d_K[cellidx] = k*rho*-params.g[2]/params.nu;
       +                //d_K[cellidx] = k*rho*-params.g[2]/params.nu;
       +                d_K[cellidx] = 0.5;
        
                        // Hydraulic storativity [-]
                        d_Ss[cellidx] = 8.0e-3;
                        //d_Ss[cellidx] = 1.0;
        
       -                // Hydraulic recharge [Pa/s]
       +                // Hydraulic recharge [s^-1]
                        d_W[cellidx] = 0.0;
                    }
                }
            }
       +
       +    // Extract water from all cells in center
       +    ix = d_nx/2-1; iy = d_ny/2-1;
       +    Float cellvolume = d_dx*d_dy*d_dz;
       +    for (iz=0; iz<d_nz; ++iz) {
       +        //d_W[idx(ix,iy,iz)] = -0.1*cellvolume;
       +        d_W[idx(ix,iy,iz)] = -1.0;
       +    }
        }
        
        
       t@@ -169,7 +181,12 @@ void DEM::findDarcyTransmissivities()
        
            // Kozeny-Carman parameter
            //Float a = 1.0e-8;
       -    Float a = 1.0;
       +    //Float a = 1.0;
       +    
       +    // Representative grain radius
       +    Float r_bar2 = meanRadius()*2.0;
       +    // Grain size factor for Kozeny-Carman relationship
       +    Float d_factor = r_bar2*r_bar2/180.0;
        
            unsigned int ix, iy, iz, cellidx;
            Float K, k;
       t@@ -185,7 +202,9 @@ void DEM::findDarcyTransmissivities()
                        // Calculate permeability from the Kozeny-Carman relationship
                        // Nelson 1994 eq. 1c
                        // Boek 2012 eq. 16
       -                k = a*phi*phi*phi/(1.0 - phi*phi);
       +                //k = a*phi*phi*phi/(1.0 - phi*phi);
       +                // Schwartz and Zhang 2003
       +                k = phi*phi*phi/((1.0-phi)*(1.0-phi)) * d_factor;
        
                        // Save hydraulic conductivity [m/s]
                        //K = d_K[cellidx];
       t@@ -261,7 +280,8 @@ void DEM::findDarcyGradients()
            // With ghost-nodes
            for (ix=0; ix<d_nx; ++ix) {
                for (iy=0; iy<d_ny; ++iy) {
       -            for (iz=1; iz<d_nz-1; ++iz) {
       +            //for (iz=1; iz<d_nz-1; ++iz) {
       +            for (iz=0; iz<d_nz; ++iz) {
        
                        cellidx = idx(ix,iy,iz);
        
       t@@ -335,9 +355,10 @@ void DEM::explDarcyStep()
            checkDarcyTimestep();
        
            // Cell dims squared
       -    const Float dx2 = d_dx*d_dx;
       -    const Float dy2 = d_dy*d_dy;
       -    const Float dz2 = d_dz*d_dz;
       +    const Float dxdx = d_dx*d_dx;
       +    const Float dydy = d_dy*d_dy;
       +    const Float dzdz = d_dz*d_dz;
       +    const Float dxdydz = d_dx*d_dy*d_dz;
        
            //setDarcyBCNeumannZero();
        
       t@@ -358,95 +379,104 @@ void DEM::explDarcyStep()
                        // Cell linear index
                        cellidx = idx(ix,iy,iz);
        
       -                // If x,y,z boundaries are fixed values:
       -                // Enforce Dirichlet BC
       -                if (ix == 0 || iy == 0 || iz == 0 ||
       +                // If x,y,z boundaries are fixed values: Enforce Dirichlet BC
       +                /*if (ix == 0 || iy == 0 || iz == 0 ||
                                ix == d_nx-1 || iy == d_ny-1 || iz == d_nz-1) {
       +                    d_H_new[cellidx] = d_H[cellidx];*/
       +
       +                // If z boundaries are fixed val, x and y are periodic:
       +                /*if (iz == 0 || iz == d_nz-1) {
                            d_H_new[cellidx] = d_H[cellidx];
       -                // If z boundaries are periodic:
       -                //if (iz == 0 || iz == d_nz-1) {
       -                    //d_H_new[cellidx] = d_H[cellidx];
       -                } else {
        
       -                    // Cell hydraulic conductivity
       -                    K = d_K[cellidx];
       -
       -                    // Cell hydraulic transmissivities
       -                    Tx = K*d_dx;
       -                    Ty = K*d_dy;
       -                    Tz = K*d_dz;
       -
       -                    // Cell hydraulic head
       -                    H = d_H[cellidx];
       -
       -                    // Harmonic mean of transmissivity
       -                    // (in neg. and pos. direction along axis from cell)
       -                    // with periodic x and y boundaries
       -                    // without ghost nodes
       -                    /*
       -                    if (ix == 0)
       -                        gradx_n = hmean(Tx, d_T[idx(d_nx-1,iy,iz)].x)
       -                            * (d_H[idx(d_nx-1,iy,iz)] - H)/dx2;
       -                    else
       -                        gradx_n = hmean(Tx, d_T[idx(ix-1,iy,iz)].x)
       -                            * (d_H[idx(ix-1,iy,iz)] - H)/dx2;
       -
       -                    if (ix == d_nx-1)
       -                        gradx_p = hmean(Tx, d_T[idx(0,iy,iz)].x)
       -                            * (d_H[idx(0,iy,iz)] - H)/dx2;
       -                    else
       -                        gradx_p = hmean(Tx, d_T[idx(ix+1,iy,iz)].x)
       -                            * (d_H[idx(ix+1,iy,iz)] - H)/dx2;
       -
       -                    if (iy == 0)
       -                        grady_n = hmean(Ty, d_T[idx(ix,d_ny-1,iz)].y)
       -                            * (d_H[idx(ix,d_ny-1,iz)] - H)/dy2;
       -                    else
       -                        grady_n = hmean(Ty, d_T[idx(ix,iy-1,iz)].y)
       -                            * (d_H[idx(ix,iy-1,iz)] - H)/dy2;
       -
       -                    if (iy == d_ny-1)
       -                        grady_p = hmean(Ty, d_T[idx(ix,0,iz)].y)
       -                            * (d_H[idx(ix,0,iz)] - H)/dy2;
       -                    else
       -                        grady_p = hmean(Ty, d_T[idx(ix,iy+1,iz)].y)
       -                            * (d_H[idx(ix,iy+1,iz)] - H)/dy2;
       -                            */
       +                } else {*/
       +
       +                // Cell hydraulic conductivity
       +                K = d_K[cellidx];
       +
       +                // Cell hydraulic transmissivities
       +                Tx = K*d_dx;
       +                Ty = K*d_dy;
       +                Tz = K*d_dz;
        
       +                // Cell hydraulic head
       +                H = d_H[cellidx];
       +
       +                // Harmonic mean of transmissivity
       +                // (in neg. and pos. direction along axis from cell)
       +                // with periodic x and y boundaries
       +                // without ghost nodes
       +                /*
       +                if (ix == 0)
       +                    gradx_n = hmean(Tx, d_T[idx(d_nx-1,iy,iz)].x)
       +                        * (d_H[idx(d_nx-1,iy,iz)] - H)/dx2;
       +                else
                            gradx_n = hmean(Tx, d_T[idx(ix-1,iy,iz)].x)
                                * (d_H[idx(ix-1,iy,iz)] - H)/dx2;
       +
       +                if (ix == d_nx-1)
       +                    gradx_p = hmean(Tx, d_T[idx(0,iy,iz)].x)
       +                        * (d_H[idx(0,iy,iz)] - H)/dx2;
       +                else
                            gradx_p = hmean(Tx, d_T[idx(ix+1,iy,iz)].x)
                                * (d_H[idx(ix+1,iy,iz)] - H)/dx2;
        
       +                if (iy == 0)
       +                    grady_n = hmean(Ty, d_T[idx(ix,d_ny-1,iz)].y)
       +                        * (d_H[idx(ix,d_ny-1,iz)] - H)/dy2;
       +                else
                            grady_n = hmean(Ty, d_T[idx(ix,iy-1,iz)].y)
                                * (d_H[idx(ix,iy-1,iz)] - H)/dy2;
       +
       +                if (iy == d_ny-1)
       +                    grady_p = hmean(Ty, d_T[idx(ix,0,iz)].y)
       +                        * (d_H[idx(ix,0,iz)] - H)/dy2;
       +                else
                            grady_p = hmean(Ty, d_T[idx(ix,iy+1,iz)].y)
                                * (d_H[idx(ix,iy+1,iz)] - H)/dy2;
       -
       +                        */
       +
       +                gradx_n = hmean(Tx, d_T[idx(ix-1,iy,iz)].x)
       +                    * (d_H[idx(ix-1,iy,iz)] - H)/dxdx;
       +                gradx_p = hmean(Tx, d_T[idx(ix+1,iy,iz)].x)
       +                    * (d_H[idx(ix+1,iy,iz)] - H)/dxdx;
       +
       +                grady_n = hmean(Ty, d_T[idx(ix,iy-1,iz)].y)
       +                    * (d_H[idx(ix,iy-1,iz)] - H)/dydy;
       +                grady_p = hmean(Ty, d_T[idx(ix,iy+1,iz)].y)
       +                    * (d_H[idx(ix,iy+1,iz)] - H)/dydy;
       +
       +                // Neumann (no-flow) boundary condition at +z and -z boundaries
       +                // enforced by a gradient value of 0.0
       +                if (iz == 0)
       +                    gradz_n = 0.0;
       +                else
                            gradz_n = hmean(Tz, d_T[idx(ix,iy,iz-1)].z)
       -                        * (d_H[idx(ix,iy,iz-1)] - H)/dz2;
       +                        * (d_H[idx(ix,iy,iz-1)] - H)/dzdz;
       +                if (iz == d_nz-1)
       +                    gradz_p = 0.0;
       +                else
                            gradz_p = hmean(Tz, d_T[idx(ix,iy,iz+1)].z)
       -                        * (d_H[idx(ix,iy,iz+1)] - H)/dz2;
       -
       -                    /*std::cerr << ix << ',' << iy << ',' << iz << '\t'
       -                        << H << '\t' << Tx << ',' << Ty << ',' << Tz << '\t'
       -                        << gradx_n << ',' << gradx_p << '\t'
       -                        << grady_n << ',' << grady_p << '\t'
       -                        << gradz_n << ',' << gradz_p << std::endl;*/
       -
       -                    // Cell hydraulic storativity
       -                    S = d_Ss[cellidx]*d_dx*d_dy*d_dz;
       -
       -                    // Laplacian operator
       -                    deltaH = time.dt/S *
       -                        (  gradx_n + gradx_p
       -                         + grady_n + grady_p
       -                         + gradz_n + gradz_p
       -                         + d_W[cellidx] );
       -
       -                    // Calculate new hydraulic pressure in cell
       -                    d_H_new[cellidx] = H + deltaH;
       -                }
       +                        * (d_H[idx(ix,iy,iz+1)] - H)/dzdz;
       +
       +                /*std::cerr << ix << ',' << iy << ',' << iz << '\t'
       +                    << H << '\t' << Tx << ',' << Ty << ',' << Tz << '\t'
       +                    << gradx_n << ',' << gradx_p << '\t'
       +                    << grady_n << ',' << grady_p << '\t'
       +                    << gradz_n << ',' << gradz_p << std::endl;*/
       +
       +                // Cell hydraulic storativity
       +                S = d_Ss[cellidx]*dxdydz;
       +
       +                // Laplacian operator
       +                deltaH = time.dt/S *
       +                    (  gradx_n + gradx_p
       +                     + grady_n + grady_p
       +                     + gradz_n + gradz_p
       +                     + d_W[cellidx] );
       +
       +                // Calculate new hydraulic pressure in cell
       +                d_H_new[cellidx] = H + deltaH;
       +                //}
                    }
                }
            }
       t@@ -607,6 +637,7 @@ Float DEM::cellPorosity(
            return phi;
        }
        
       +// Calculate the porosity for each cell
        void DEM::findPorosities()
        {
            unsigned int ix, iy, iz, cellidx;
       t@@ -619,6 +650,16 @@ void DEM::findPorosities()
            }
        }
        
       +// Returns the mean particle radius
       +Float DEM::meanRadius()
       +{
       +    unsigned int i;
       +    Float r_sum;
       +    for (i=0; i<np; ++i)
       +        r_sum += k.x[i].w;
       +    return r_sum/((Float)np);
       +}
       +
        // Find particles with centres inside a spatial interval
        // NOTE: This function is untested and unused
        std::vector<unsigned int> DEM::particlesInCell(
       t@@ -709,8 +750,8 @@ void DEM::checkDarcyTimestep()
            if (value > 0.5) {
                std::cerr << "Error! The explicit darcy solution will be unstable.\n"
                    << "This happens due to a combination of the following:\n"
       -            << " - The transmissivity T (i.e. hydraulic conductivity, K) is too large"
       -            << " (" << T_max << ")\n"
       +            << " - The transmissivity T (i.e. hydraulic conductivity, K)"
       +            << " is too large (" << T_max << ")\n"
                    << " - The storativity S is too small"
                    << " (" << S_min << ")\n"
                    << " - The time step is too large"
       t@@ -722,7 +763,7 @@ void DEM::checkDarcyTimestep()
            }
        }
        
       -// Solve Darcy flow on a regular, cubic grid
       +// Initialize darcy arrays, their values, and check the time step length
        void DEM::initDarcy(const Float cellsizemultiplier)
        {
            if (params.nu <= 0.0) {
       t@@ -762,15 +803,15 @@ void DEM::initDarcy(const Float cellsizemultiplier)
        // Print final heads and free memory
        void DEM::endDarcy()
        {
       -    FILE* Kfile;
       +    /*FILE* Kfile;
            if ((Kfile = fopen("d_K.txt","w"))) {
                printDarcyArray(Kfile, d_K);
                fclose(Kfile);
            } else {
                fprintf(stderr, "Error, could not open d_K.txt\n");
       -    }
       -    printDarcyArray(stdout, d_phi, "d_phi");
       -    printDarcyArray(stdout, d_K, "d_K");
       +    }*/
       +    //printDarcyArray(stdout, d_phi, "d_phi");
       +    //printDarcyArray(stdout, d_K, "d_K");
            //printDarcyArray(stdout, d_H, "d_H");
            //printDarcyArray(stdout, d_V, "d_V");
            freeDarcyMem();
 (DIR) diff --git a/src/darcy.cuh b/src/darcy.cuh
       t@@ -0,0 +1,616 @@
       +// darcy.cu
       +// CUDA implementation of Darcy flow
       +
       +// Enable line below to perform Darcy flow computations on the GPU, disable for
       +// CPU computation
       +#define DARCY_GPU
       +
       +#include <iostream>
       +#include <cuda.h>
       +//#include <cutil_math.h>
       +#include <helper_math.h>
       +
       +#include "vector_arithmetic.h"        // for arbitrary prec. vectors
       +#include "sphere.h"
       +#include "datatypes.h"
       +#include "utility.cuh"
       +#include "utility.h"
       +#include "constants.cuh"
       +#include "debug.h"
       +
       +// Initialize memory
       +void DEM::initDarcyMemDev(void)
       +{
       +    // number of cells
       +    //unsigned int ncells = d_nx*d_ny*d_nz; // without ghost nodes
       +    unsigned int ncells = (d_nx+2)*(d_ny+2)*(d_nz+2); // with ghost nodes
       +    unsigned int memSizeF  = sizeof(Float) * ncells;
       +
       +    cudaMalloc((void**)&dev_d_H, memSizeF);     // hydraulic pressure
       +    cudaMalloc((void**)&dev_d_H_new, memSizeF); // new pressure matrix
       +    cudaMalloc((void**)&dev_d_V, memSizeF*3);   // cell hydraulic velocity
       +    cudaMalloc((void**)&dev_d_dH, memSizeF*3);  // hydraulic pressure gradient
       +    cudaMalloc((void**)&dev_d_K, memSizeF);     // hydraulic conductivity
       +    cudaMalloc((void**)&dev_d_T, memSizeF*3);   // hydraulic transmissivity
       +    cudaMalloc((void**)&dev_d_Ss, memSizeF);     // hydraulic storativi
       +    cudaMalloc((void**)&dev_d_W, memSizeF);     // hydraulic recharge
       +    cudaMalloc((void**)&dev_d_phi, memSizeF);     // cell porosity
       +
       +    checkForCudaErrors("End of initDarcyMemDev");
       +}
       +
       +// Free memory
       +void DEM::freeDarcyMemDev()
       +{
       +    cudaFree(dev_d_H);
       +    cudaFree(dev_d_H_new);
       +    cudaFree(dev_d_V);
       +    cudaFree(dev_d_dH);
       +    cudaFree(dev_d_K);
       +    cudaFree(dev_d_T);
       +    cudaFree(dev_d_Ss);
       +    cudaFree(dev_d_W);
       +    cudaFree(dev_d_phi);
       +}
       +
       +// Transfer to device
       +void DEM::transferDarcyToGlobalDeviceMemory(int statusmsg)
       +{
       +    checkForCudaErrors("Before attempting cudaMemcpy in "
       +            "transferDarcyToGlobalDeviceMemory");
       +
       +    if (verbose == 1 && statusmsg == 1)
       +        std::cout << "  Transfering darcy data to the device:           ";
       +
       +    // number of cells
       +    //unsigned int ncells = d_nx*d_ny*d_nz; // without ghost nodes
       +    unsigned int ncells = (d_nx+2)*(d_ny+2)*(d_nz+2); // with ghost nodes
       +    unsigned int memSizeF  = sizeof(Float) * ncells;
       +
       +    // Kinematic particle values
       +    cudaMemcpy(dev_d_H, d_H, memSizeF, cudaMemcpyHostToDevice);
       +    checkForCudaErrors("transferDarcyToGlobalDeviceMemory after first cudaMemcpy");
       +    cudaMemcpy(dev_d_H_new, d_H_new, memSizeF, cudaMemcpyHostToDevice);
       +    cudaMemcpy(dev_d_V, d_V, memSizeF*3, cudaMemcpyHostToDevice);
       +    cudaMemcpy(dev_d_dH, d_dH, memSizeF*3, cudaMemcpyHostToDevice);
       +    cudaMemcpy(dev_d_K, d_K, memSizeF, cudaMemcpyHostToDevice);
       +    cudaMemcpy(dev_d_T, d_T, memSizeF*3, cudaMemcpyHostToDevice);
       +    cudaMemcpy(dev_d_Ss, d_Ss, memSizeF, cudaMemcpyHostToDevice);
       +    cudaMemcpy(dev_d_W, d_W, memSizeF, cudaMemcpyHostToDevice);
       +    cudaMemcpy(dev_d_phi, d_phi, memSizeF, cudaMemcpyHostToDevice);
       +
       +    checkForCudaErrors("End of transferDarcyToGlobalDeviceMemory");
       +    if (verbose == 1 && statusmsg == 1)
       +        std::cout << "Done" << std::endl;
       +}
       +
       +// Transfer from device
       +void DEM::transferDarcyFromGlobalDeviceMemory(int statusmsg)
       +{
       +    if (verbose == 1 && statusmsg == 1)
       +        std::cout << "  Transfering darcy data from the device:         ";
       +
       +    // number of cells
       +    //unsigned int ncells = d_nx*d_ny*d_nz; // without ghost nodes
       +    unsigned int ncells = (d_nx+2)*(d_ny+2)*(d_nz+2); // with ghost nodes
       +    unsigned int memSizeF  = sizeof(Float) * ncells;
       +
       +    // Kinematic particle values
       +    cudaMemcpy(d_H, dev_d_H, memSizeF, cudaMemcpyDeviceToHost);
       +    cudaMemcpy(d_H_new, dev_d_H_new, memSizeF, cudaMemcpyDeviceToHost);
       +    cudaMemcpy(d_V, dev_d_V, memSizeF*3, cudaMemcpyDeviceToHost);
       +    cudaMemcpy(d_dH, dev_d_dH, memSizeF*3, cudaMemcpyDeviceToHost);
       +    cudaMemcpy(d_K, dev_d_K, memSizeF, cudaMemcpyDeviceToHost);
       +    cudaMemcpy(d_T, dev_d_T, memSizeF*3, cudaMemcpyDeviceToHost);
       +    cudaMemcpy(d_Ss, dev_d_Ss, memSizeF, cudaMemcpyDeviceToHost);
       +    cudaMemcpy(d_W, dev_d_W, memSizeF, cudaMemcpyDeviceToHost);
       +    cudaMemcpy(d_phi, dev_d_phi, memSizeF, cudaMemcpyDeviceToHost);
       +
       +    checkForCudaErrors("End of transferDarcyFromGlobalDeviceMemory");
       +    if (verbose == 1 && statusmsg == 1)
       +        std::cout << "Done" << std::endl;
       +}
       +
       +// Get linear index from 3D grid position
       +__device__ unsigned int idx(
       +        const int x, const int y, const int z)
       +{
       +    // without ghost nodes
       +    //return x + dev_grid.num[0]*y + dev_grid.num[0]*dev_grid.num[1]*z;
       +
       +    // with ghost nodes
       +    // the ghost nodes are placed at x,y,z = -1 and WIDTH
       +    return (x+1) + (devC_grid.num[0]+2)*(y+1) +
       +        (devC_grid.num[0]+2)*(devC_grid.num[1]+2)*(z+1);
       +}
       +
       +__device__ void copyDarcyValsDev(
       +        unsigned int read, unsigned int write,
       +        Float* dev_d_H, Float* dev_d_H_new,
       +        Float3* dev_d_V, Float3* dev_d_dH,
       +        Float* dev_d_K, Float3* dev_d_T,
       +        Float* dev_d_Ss, Float* dev_d_W,
       +        Float* dev_d_phi)
       +{
       +    // Coalesced read
       +    const Float  H     = dev_d_H[read];
       +    const Float  H_new = dev_d_H_new[read];
       +    const Float3 V     = dev_d_V[read];
       +    const Float3 dH    = dev_d_dH[read];
       +    const Float  K     = dev_d_K[read];
       +    const Float3 T     = dev_d_T[read];
       +    const Float  Ss    = dev_d_Ss[read];
       +    const Float  W     = dev_d_W[read];
       +    const Float  phi   = dev_d_phi[read];
       +
       +    // Coalesced write
       +    __syncthreads();
       +    dev_d_H[write]     = H;
       +    dev_d_H_new[write] = H_new;
       +    dev_d_V[write]     = V;
       +    dev_d_dH[write]    = dH;
       +    dev_d_K[write]     = K;
       +    dev_d_T[write]     = T;
       +    dev_d_Ss[write]    = Ss;
       +    dev_d_W[write]     = W;
       +    dev_d_phi[write]   = phi;
       +}
       +
       +// Update ghost nodes from their parent cell values
       +// The edge (diagonal) cells are not written since they are note read
       +// Launch this kernel for all cells in the grid
       +__global__ void setDarcyGhostNodesDev(
       +        Float* dev_d_H, Float* dev_d_H_new,
       +        Float3* dev_d_V, Float3* dev_d_dH,
       +        Float* dev_d_K, Float3* dev_d_T,
       +        Float* dev_d_Ss, Float* dev_d_W,
       +        Float* dev_d_phi)
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +    // Grid dimensions
       +    const unsigned int nx = devC_grid.num[0];
       +    const unsigned int ny = devC_grid.num[1];
       +    const unsigned int nz = devC_grid.num[2];
       +
       +    // 1D thread index
       +    const unsigned int cellidx = idx(x,y,z);
       +
       +    // 1D position of ghost node
       +    unsigned int writeidx;
       +
       +    // check that we are not outside the fluid grid
       +    if (x < nx && y < ny && z < nz) {
       +
       +        if (x == 0) {
       +            writeidx = idx(nx,y,z);
       +            copyDarcyValsDev(cellidx, writeidx,
       +                    dev_d_H, dev_d_H_new,
       +                    dev_d_V, dev_d_dH,
       +                    dev_d_K, dev_d_T,
       +                    dev_d_Ss, dev_d_W,
       +                    dev_d_phi);
       +        }
       +        if (x == nx-1) {
       +            writeidx = idx(-1,y,z);
       +            copyDarcyValsDev(cellidx, writeidx,
       +                    dev_d_H, dev_d_H_new,
       +                    dev_d_V, dev_d_dH,
       +                    dev_d_K, dev_d_T,
       +                    dev_d_Ss, dev_d_W,
       +                    dev_d_phi);
       +        }
       +
       +        if (y == 0) {
       +            writeidx = idx(x,ny,z);
       +            copyDarcyValsDev(cellidx, writeidx,
       +                    dev_d_H, dev_d_H_new,
       +                    dev_d_V, dev_d_dH,
       +                    dev_d_K, dev_d_T,
       +                    dev_d_Ss, dev_d_W,
       +                    dev_d_phi);
       +        }
       +        if (y == ny-1) {
       +            writeidx = idx(x,-1,z);
       +            copyDarcyValsDev(cellidx, writeidx,
       +                    dev_d_H, dev_d_H_new,
       +                    dev_d_V, dev_d_dH,
       +                    dev_d_K, dev_d_T,
       +                    dev_d_Ss, dev_d_W,
       +                    dev_d_phi);
       +        }
       +
       +        if (z == 0) {
       +            writeidx = idx(x,y,nz);
       +            copyDarcyValsDev(cellidx, writeidx,
       +                    dev_d_H, dev_d_H_new,
       +                    dev_d_V, dev_d_dH,
       +                    dev_d_K, dev_d_T,
       +                    dev_d_Ss, dev_d_W,
       +                    dev_d_phi);
       +        }
       +        if (z == nz-1) {
       +            writeidx = idx(x,y,-1);
       +            copyDarcyValsDev(cellidx, writeidx,
       +                    dev_d_H, dev_d_H_new,
       +                    dev_d_V, dev_d_dH,
       +                    dev_d_K, dev_d_T,
       +                    dev_d_Ss, dev_d_W,
       +                    dev_d_phi);
       +        }
       +    }
       +}
       +
       +// Find the porosity in each cell
       +__global__ void findPorositiesDev(
       +        unsigned int* dev_cellStart,
       +        unsigned int* dev_cellEnd,
       +        Float4* dev_x_sorted,
       +        Float* dev_d_phi)
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +    
       +    // Grid dimensions
       +    const unsigned int nx = devC_grid.num[0];
       +    const unsigned int ny = devC_grid.num[1];
       +    const unsigned int nz = devC_grid.num[2];
       +
       +    // Cell dimensions
       +    const Float dx = devC_grid.L[0]/nx;
       +    const Float dy = devC_grid.L[1]/ny;
       +    const Float dz = devC_grid.L[2]/nz;
       +    const Float cell_volume = dx*dy*dz;
       +
       +    Float void_volume = cell_volume;
       +    Float4 xr;  // particle pos. and radius
       +
       +    // check that we are not outside the fluid grid
       +    if (x < nx && y < ny && z < nz) {
       +
       +        // Calculate linear cell ID
       +        const unsigned int cellID = x + y*devC_grid.num[0]
       +            + (devC_grid.num[0] * devC_grid.num[1])*z;
       +
       +        // Lowest particle index in cell
       +        const unsigned int startIdx = dev_cellStart[cellID];
       +
       +        // Highest particle index in cell
       +        const unsigned int endIdx = dev_cellEnd[cellID];
       +
       +        // Iterate over cell particles
       +        for (unsigned int i = startIdx; i<endIdx; ++i) {
       +
       +            // Read particle position and radius
       +            __syncthreads();
       +            xr = dev_x_sorted[i];
       +
       +            // Subtract particle volume from void volume
       +            void_volume -= 4.0/3.0*M_PI*xr.w*xr.w*xr.w;
       +        }
       +
       +        // Make sure that the porosity is in the interval ]0.0;1.0[
       +        const Float phi = fmin(0.99, fmax(0.01, void_volume/cell_volume));
       +
       +        // Save porosity
       +        __syncthreads();
       +
       +        dev_d_phi[idx(x,y,z)] = phi;
       +    }
       +}
       +
       +
       +// Find cell transmissivities from hydraulic conductivities and cell dimensions
       +// Make sure to compute the porosities (d_phi) beforehand
       +// d_factor: Grain size factor for Kozeny-Carman relationship
       +__global__ void findDarcyTransmissivitiesDev(
       +        Float* dev_d_K,
       +        Float3* dev_d_T,
       +        Float* dev_d_phi,
       +        Float d_factor)
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +    // Grid dimensions
       +    const unsigned int nx = devC_grid.num[0];
       +    const unsigned int ny = devC_grid.num[1];
       +    const unsigned int nz = devC_grid.num[2];
       +
       +    // Grid sizes
       +    const Float d_dx = devC_grid.L[0]/nx;
       +    const Float d_dy = devC_grid.L[1]/ny;
       +    const Float d_dz = devC_grid.L[2]/nz;
       +
       +    // Density of the fluid [kg/m^3]
       +    const Float rho = 1000.0;
       +
       +    // Check that we are not outside the fluid grid
       +    if (x < nx && y < ny && z < nz) {
       +
       +        // 1D thread index
       +        const unsigned int cellidx = idx(x,y,z);
       +
       +        __syncthreads();
       +
       +        // Cell porosity [-]
       +        const Float phi = dev_d_phi[cellidx];
       +
       +        // Calculate permeability from the Kozeny-Carman relationship
       +        // Nelson 1994 eq. 1c
       +        // Boek 2012 eq. 16
       +        //k = a*phi*phi*phi/(1.0 - phi*phi);
       +        // Schwartz and Zhang 2003
       +        Float k = phi*phi*phi/((1.0-phi)*(1.0-phi)) * d_factor;
       +
       +
       +        __syncthreads();
       +
       +        // Save hydraulic conductivity [m/s]
       +        const Float K = k*rho*-devC_params.g[2]/devC_params.nu;
       +        //K = 0.5; 
       +        dev_d_K[cellidx] = K;
       +
       +        // Hydraulic transmissivity [m2/s]
       +        Float3 T = {K*d_dx, K*d_dy, K*d_dz};
       +        dev_d_T[cellidx] = T;
       +
       +    }
       +}
       +
       +// Find the spatial gradient in e.g.pressures per cell
       +// using first order central differences
       +__global__ void findDarcyGradientsDev(
       +        Float* dev_scalarfield,     // in
       +        Float3* dev_vectorfield)    // out
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +    // Grid dimensions
       +    const unsigned int nx = devC_grid.num[0];
       +    const unsigned int ny = devC_grid.num[1];
       +    const unsigned int nz = devC_grid.num[2];
       +
       +    // Grid sizes
       +    const Float dx = devC_grid.L[0]/nx;
       +    const Float dy = devC_grid.L[1]/ny;
       +    const Float dz = devC_grid.L[2]/nz;
       +
       +    // 1D thread index
       +    const unsigned int cellidx = idx(x,y,z);
       +
       +    // Check that we are not outside the fluid grid
       +    Float3 gradient;
       +    if (x < nx && y < ny && z < nz) {
       +
       +        __syncthreads();
       +
       +        // x
       +        gradient.x =
       +            (dev_scalarfield[idx(x+1,y,z)] - dev_scalarfield[idx(x-1,y,z)])
       +            /(2.0*dx);
       +
       +        // y
       +        gradient.y =
       +            (dev_scalarfield[idx(x,y+1,z)] - dev_scalarfield[idx(x,y-1,z)])
       +            /(2.0*dy);
       +
       +        // z
       +        gradient.z =
       +            (dev_scalarfield[idx(x,y,z+1)] - dev_scalarfield[idx(x,y,z-1)])
       +            /(2.0*dz);
       +
       +        __syncthreads();
       +        dev_vectorfield[cellidx] = gradient;
       +    }
       +}
       +
       +// Arithmetic mean of two numbers
       +__device__ Float ameanDev(Float a, Float b) {
       +    return (a+b)*0.5;
       +}
       +
       +// Harmonic mean of two numbers
       +__device__ Float hmeanDev(Float a, Float b) {
       +    return (2.0*a*b)/(a+b);
       +}
       +
       +// Perform an explicit step.
       +__global__ void explDarcyStepDev(
       +        Float* dev_d_H,
       +        Float* dev_d_H_new,
       +        Float3* dev_d_T,
       +        Float* dev_d_Ss,
       +        Float* dev_d_W)
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +    // Grid dimensions
       +    const unsigned int nx = devC_grid.num[0];
       +    const unsigned int ny = devC_grid.num[1];
       +    const unsigned int nz = devC_grid.num[2];
       +
       +    // Grid sizes
       +    const Float dx = devC_grid.L[0]/nx;
       +    const Float dy = devC_grid.L[1]/ny;
       +    const Float dz = devC_grid.L[2]/nz;
       +
       +    // 1D thread index
       +    const unsigned int cellidx = idx(x,y,z);
       +
       +    // Check that we are not outside the fluid grid
       +    if (x < nx && y < ny && z < nz) {
       +
       +        // Explicit 3D finite difference scheme
       +        // new = old + production*timestep + gradient*timestep
       +
       +        // Enforce Dirichlet BC
       +        if (x == 0 || y == 0 || z == 0 ||
       +                x == nx-1 || y == ny-1 || z == nz-1) {
       +            __syncthreads();
       +            dev_d_H_new[cellidx] = dev_d_H[cellidx];
       +        } else {
       +
       +            // Cell hydraulic conductivity
       +            __syncthreads();
       +            //const Float K = dev_d_K[cellidx];
       +
       +            // Cell hydraulic transmissivities
       +            const Float3 T = dev_d_T[cellidx];
       +            //const Float Tx = K*dx;
       +            //const Float Ty = K*dy;
       +            //const Float Tz = K*dz;
       +
       +            // Harmonic mean of transmissivity
       +            // (in neg. and pos. direction along axis from cell)
       +            __syncthreads();
       +            const Float Tx_n = hmeanDev(T.x, dev_d_T[idx(x-1,y,z)].x);
       +            const Float Tx_p = hmeanDev(T.x, dev_d_T[idx(x+1,y,z)].x);
       +            const Float Ty_n = hmeanDev(T.y, dev_d_T[idx(x,y-1,z)].y);
       +            const Float Ty_p = hmeanDev(T.y, dev_d_T[idx(x,y+1,z)].y);
       +            const Float Tz_n = hmeanDev(T.z, dev_d_T[idx(x,y,z-1)].z);
       +            const Float Tz_p = hmeanDev(T.z, dev_d_T[idx(x,y,z+1)].z);
       +
       +            // Cell hydraulic storativity
       +            const Float S = dev_d_Ss[cellidx]*dx*dy*dz;
       +
       +            // Cell hydraulic head
       +            const Float H = dev_d_H[cellidx];
       +
       +            // Laplacian operator
       +            const Float deltaH = devC_dt/S *
       +                (    Tx_n * (dev_d_H[idx(x-1,y,z)] - H)/(dx*dx)
       +                   + Tx_p * (dev_d_H[idx(x+1,y,z)] - H)/(dx*dx)
       +                   + Ty_n * (dev_d_H[idx(x,y-1,z)] - H)/(dy*dy)
       +                   + Ty_p * (dev_d_H[idx(x,y+1,z)] - H)/(dy*dy)
       +                   + Tz_n * (dev_d_H[idx(x,y,z-1)] - H)/(dy*dz)
       +                   + Tz_p * (dev_d_H[idx(x,y,z+1)] - H)/(dy*dz)
       +                   + dev_d_W[cellidx] );
       +
       +            // Calculate new hydraulic pressure in cell
       +            __syncthreads();
       +            dev_d_H_new[cellidx] = H + deltaH;
       +        }
       +    }
       +}
       +
       +// Find cell velocity
       +__global__ void findDarcyVelocitiesDev(
       +        Float* dev_d_H,
       +        Float3* dev_d_dH,
       +        Float3* dev_d_V,
       +        Float* dev_d_phi,
       +        Float* dev_d_K)
       +{
       +    // 3D thread index
       +    const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       +    const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
       +    const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
       +
       +    // 1D thread index
       +    const unsigned int cellidx = idx(x,y,z);
       +
       +    // Check that we are not outside the fluid grid
       +    if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
       +
       +        // Flux [m/s]: q = -k/nu * dH
       +        // Pore velocity [m/s]: v = q/n
       +
       +        // Dynamic viscosity
       +        Float nu = devC_params.nu;
       +
       +        __syncthreads();
       +        const Float3 dH = dev_d_dH[cellidx];
       +        const Float K = dev_d_K[cellidx];
       +        const Float phi = dev_d_phi[cellidx];
       +
       +        // Calculate flux
       +        // The sign might need to be reversed, depending on the
       +        // grid orientation
       +        Float3 q = MAKE_FLOAT3(
       +                -K/nu * dH.x,
       +                -K/nu * dH.y,
       +                -K/nu * dH.z);
       +
       +        // Calculate velocity
       +        Float3 v = MAKE_FLOAT3(
       +                v.x = q.x/phi,
       +                v.y = q.y/phi,
       +                v.z = q.z/phi);
       +
       +        // Save velocity
       +        __syncthreads();
       +        dev_d_V[cellidx] = v;
       +    }
       +}
       +
       +// Solve Darcy flow on a regular, cubic grid
       +/*void DEM::initDarcyDev(const Float cellsizemultiplier)
       +{
       +    if (params.nu <= 0.0) {
       +        std::cerr << "Error in initDarcy. The dymamic viscosity (params.nu), "
       +            << "should be larger than 0.0, but is " << params.nu << std::endl;
       +        exit(1);
       +    }
       +
       +    // Number of cells
       +    d_nx = floor(grid.num[0]*cellsizemultiplier);
       +    d_ny = floor(grid.num[1]*cellsizemultiplier);
       +    d_nz = floor(grid.num[2]*cellsizemultiplier);
       +
       +    // Cell size 
       +    d_dx = grid.L[0]/d_nx;
       +    d_dy = grid.L[1]/d_ny;
       +    d_dz = grid.L[2]/d_nz;
       +
       +    if (verbose == 1) {
       +        std::cout << "  - Fluid grid dimensions: "
       +            << d_nx << "*"
       +            << d_ny << "*"
       +            << d_nz << std::endl;
       +        std::cout << "  - Fluid grid cell size: "
       +            << d_dx << "*"
       +            << d_dy << "*"
       +            << d_dz << std::endl;
       +    }
       +
       +    initDarcyMemDev();
       +    initDarcyVals();
       +    findDarcyTransmissivities();
       +
       +    checkDarcyTimestep();
       +
       +    transferDarcyToGlobalDeviceMemory(1);
       +}*/
       +
       +// Print final heads and free memory
       +void DEM::endDarcyDev()
       +{
       +    /*FILE* Kfile;
       +    if ((Kfile = fopen("d_K.txt","w"))) {
       +        printDarcyArray(Kfile, d_K);
       +        fclose(Kfile);
       +    } else {
       +        fprintf(stderr, "Error, could not open d_K.txt\n");
       +    }*/
       +    //printDarcyArray(stdout, d_phi, "d_phi");
       +    //printDarcyArray(stdout, d_K, "d_K");
       +    //printDarcyArray(stdout, d_H, "d_H");
       +    //printDarcyArray(stdout, d_V, "d_V");
       +    freeDarcyMemDev();
       +}
       +
       +// vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
 (DIR) diff --git a/src/device.cu b/src/device.cu
       t@@ -27,7 +27,7 @@
        #include "integration.cuh"
        #include "raytracer.cuh"
        #include "latticeboltzmann.cuh"
       -//#include "darcy.cuh"
       +#include "darcy.cuh"
        
        
        // Wrapper function for initializing the CUDA components.
       t@@ -450,15 +450,20 @@ __host__ void DEM::transferToGlobalDeviceMemory(int statusmsg)
            }
        
            // Fluid arrays
       -    if (params.nu > 0.0 && darcy == 0) {
       +    if (params.nu > 0.0) {
       +        if (darcy == 0) {
        #ifdef LBM_GPU
       -        cudaMemcpy( dev_f, f,
       -                sizeof(Float)*grid.num[0]*grid.num[1]*grid.num[2]*19,
       -                cudaMemcpyHostToDevice);
       -        cudaMemcpy( dev_v_rho, v_rho,
       -                sizeof(Float4)*grid.num[0]*grid.num[1]*grid.num[2],
       -                cudaMemcpyHostToDevice);
       +            cudaMemcpy( dev_f, f,
       +                    sizeof(Float)*grid.num[0]*grid.num[1]*grid.num[2]*19,
       +                    cudaMemcpyHostToDevice);
       +            cudaMemcpy( dev_v_rho, v_rho,
       +                    sizeof(Float4)*grid.num[0]*grid.num[1]*grid.num[2],
       +                    cudaMemcpyHostToDevice);
        #endif
       +        } //else {
       +            //transferDarcyToGlobalDeviceMemory(1);
       +        //}
       +            // Darcy arrays aren't ready yet
            }
        
            checkForCudaErrors("End of transferToGlobalDeviceMemory");
       t@@ -530,16 +535,20 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
                    sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
        
            // Fluid arrays
       +    if (params.nu > 0.0) {
       +        if (darcy == 0) {
        #ifdef LBM_GPU
       -    if (params.nu > 0.0 && darcy == 0) {
                cudaMemcpy( f, dev_f,
                        sizeof(Float)*grid.num[0]*grid.num[1]*grid.num[2]*19,
                        cudaMemcpyDeviceToHost);
                cudaMemcpy(v_rho, dev_v_rho,
                        sizeof(Float4)*grid.num[0]*grid.num[1]*grid.num[2],
                        cudaMemcpyDeviceToHost);
       -    }
        #endif
       +        } else {
       +            transferDarcyFromGlobalDeviceMemory(0);
       +        }
       +    }
        
            checkForCudaErrors("End of transferFromGlobalDeviceMemory");
        }
       t@@ -580,13 +589,13 @@ __host__ void DEM::startTime()
            unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock); 
            dim3 dimGridBonds(blocksPerGridBonds, 1, 1); // Blocks arranged in 1D grid
        
       -    // Use 3D block and grid layout for Lattice-Boltzmann fluid calculations
       +    // Use 3D block and grid layout for fluid calculations
            dim3 dimBlockFluid(8, 8, 8);    // 512 threads per block
            dim3 dimGridFluid(
                    iDivUp(grid.num[0], dimBlockFluid.x),
                    iDivUp(grid.num[1], dimBlockFluid.y),
                    iDivUp(grid.num[2], dimBlockFluid.z));
       -    if (dimGridFluid.z > 64) {
       +    if (dimGridFluid.z > 64 && params.nu > 0.0) {
                cerr << "Error: dimGridFluid.z > 64" << endl;
                exit(1);
            }
       t@@ -634,18 +643,32 @@ __host__ void DEM::startTime()
            fclose(fp);
        
            // Initialize fluid distribution array
       -    if (params.nu > 0.0 && darcy == 0) {
       +    Float d_factor;
       +    if (params.nu > 0.0) {
       +        if (darcy == 0) {
        #ifdef LBM_GPU
       -        initFluid<<< dimGridFluid, dimBlockFluid >>>(dev_v_rho, dev_f);
       -        cudaThreadSynchronize();
       -#else
       +            initFluid<<< dimGridFluid, dimBlockFluid >>>(dev_v_rho, dev_f);
       +            cudaThreadSynchronize();
       +#endif
       +        } else if (darcy == 1) {
        #ifdef DARCY_GPU
       -        initFluid(v_rho, f, grid.num[0], grid.num[1], grid.num[2]);
       +            const Float cellsizemultiplier = 1.0;
       +            initDarcy(cellsizemultiplier);
       +            initDarcyMemDev();
       +            transferDarcyToGlobalDeviceMemory(1);
       +
       +            // Representative grain radius
       +            const Float r_bar2 = meanRadius()*2.0;
       +            // Grain size factor for Kozeny-Carman relationship
       +            d_factor = r_bar2*r_bar2/180.0;
        #else
       -        const Float cellsizemultiplier = 1.0;
       -        initDarcy(cellsizemultiplier);
       -#endif
       +            const Float cellsizemultiplier = 1.0;
       +            initDarcy(cellsizemultiplier);
        #endif
       +        } else {
       +            std::cerr << "Error, darcy value (" << darcy 
       +                << ") not understood." << std::endl;
       +        }
            }
        
            if (verbose == 1) {
       t@@ -677,6 +700,12 @@ __host__ void DEM::startTime()
            double t_summation = 0.0;
            double t_integrateWalls = 0.0;
        
       +    double t_findPorositiesDev = 0.0;
       +    double t_findDarcyTransmissivitiesDev = 0.0;
       +    double t_explDarcyStepDev = 0.0;
       +    double t_findDarcyGradientsDev = 0.0;
       +    double t_findDarcyVelocitiesDev = 0.0;
       +
            if (PROFILING == 1) {
                cudaEventCreate(&kernel_tic);
                cudaEventCreate(&kernel_toc);
       t@@ -866,17 +895,90 @@ __host__ void DEM::startTime()
                }
        
                // Solve darcy flow through grid
       -        if (darcy == 1) {
       +        if (params.nu > 0.0 && darcy == 1) {
        
        #ifdef DARCY_GPU
       -            std::cout << "GPU darcy" << std::endl;
       +            checkForCudaErrors("Before findPorositiesDev", iter);
       +            // Find cell porosities
       +            if (PROFILING == 1)
       +                startTimer(&kernel_tic);
       +            findPorositiesDev<<<dimGridFluid, dimBlockFluid>>>(
       +                    dev_cellStart,
       +                    dev_cellEnd,
       +                    dev_x_sorted,
       +                    dev_d_phi);
       +            cudaThreadSynchronize();
       +            if (PROFILING == 1)
       +                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                        &t_findPorositiesDev);
       +            checkForCudaErrors("Post findPorositiesDev", iter);
       +
       +            // Find resulting cell transmissivities
       +            if (PROFILING == 1)
       +                startTimer(&kernel_tic);
       +            findDarcyTransmissivitiesDev<<<dimGridFluid, dimBlockFluid>>>(
       +                    dev_d_K,
       +                    dev_d_T,
       +                    dev_d_phi,
       +                    d_factor);
       +            cudaThreadSynchronize();
       +            if (PROFILING == 1)
       +                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                        &t_findDarcyTransmissivitiesDev);
       +            checkForCudaErrors("Post findDarcyTransmissivitiesDev", iter);
       +
       +            // Perform explicit Darcy time step
       +            if (PROFILING == 1)
       +                startTimer(&kernel_tic);
       +            explDarcyStepDev<<<dimGridFluid, dimBlockFluid>>>(
       +                    dev_d_H,
       +                    dev_d_H_new,
       +                    dev_d_T,
       +                    dev_d_Ss,
       +                    dev_d_W);
       +            cudaThreadSynchronize();
       +            if (PROFILING == 1)
       +                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                        &t_explDarcyStepDev);
       +            checkForCudaErrors("Post explDarcyStepDev", iter);
       +
       +            // Flop flop
       +            swapFloatArrays(dev_d_H, dev_d_H_new);
       +
       +            // Find the pressure gradients
       +            if (PROFILING == 1)
       +                startTimer(&kernel_tic);
       +            findDarcyGradientsDev<<<dimGridFluid, dimBlockFluid>>>(
       +                    dev_d_H, dev_d_dH);
       +            cudaThreadSynchronize();
       +            if (PROFILING == 1)
       +                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                        &t_findDarcyGradientsDev);
       +            checkForCudaErrors("Post findDarcyGradientsDev", iter);
       +
       +            // Find the pressure gradients
       +            if (PROFILING == 1)
       +                startTimer(&kernel_tic);
       +            findDarcyVelocitiesDev<<<dimGridFluid, dimBlockFluid>>>(
       +                    dev_d_H,
       +                    dev_d_dH,
       +                    dev_d_V,
       +                    dev_d_phi,
       +                    dev_d_K);
       +            cudaThreadSynchronize();
       +            if (PROFILING == 1)
       +                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                        &t_findDarcyVelocitiesDev);
       +            checkForCudaErrors("Post findDarcyVelocitiesDev", iter);
       +
       +#else
                    // Copy device data to host memory
                    transferFromGlobalDeviceMemory();
        
                    // Pause the CPU thread until all CUDA calls previously issued are completed
                    cudaThreadSynchronize();
        
       -            // Perform explicit Darcy time step
       +            // Perform a Darcy time step on the CPU
                    explDarcyStep();
        
                    // Transfer data from host to device memory
       t@@ -884,9 +986,6 @@ __host__ void DEM::startTime()
        
                    // Pause the CPU thread until all CUDA calls previously issued are completed
                    cudaThreadSynchronize();
       -#else
       -            // Perform a Darcy time step on the CPU
       -            explDarcyStep();
        #endif
                }
        
       t@@ -1086,6 +1185,22 @@ __host__ void DEM::startTime()
                    << "\t(" << 100.0*t_summation/t_sum << " %)\n"
                    << "  - integrateWalls:\t" << t_integrateWalls/1000.0 << " s"
                    << "\t(" << 100.0*t_integrateWalls/t_sum << " %)\n";
       +        if (darcy == 1) {
       +            cout 
       +            << "  - findPorositiesDev:\t" << t_findPorositiesDev/1000.0 << " s"
       +            << "\t(" << 100.0*t_findPorositiesDev/t_sum << " %)\n"
       +            << "  - findDarcyTransmissivitiesDev:\t" <<
       +            t_findDarcyTransmissivitiesDev/1000.0 << " s"
       +            << "\t(" << 100.0*t_findDarcyTransmissivitiesDev/t_sum << " %)\n"
       +            << "  - explDarcyStepDev:\t" << t_explDarcyStepDev/1000.0 << " s"
       +            << "\t(" << 100.0*t_explDarcyStepDev/t_sum << " %)\n"
       +            << "  - findDarcyGradientsDev:\t" << t_findDarcyGradientsDev/1000.0
       +            << " s"
       +            << "\t(" << 100.0*t_findDarcyGradientsDev/t_sum << " %)\n"
       +            << "  - findDarcyVelocitiesDev:\t"
       +            << t_findDarcyVelocitiesDev/1000.0 << " s"
       +            << "\t(" << 100.0*t_findDarcyVelocitiesDev/t_sum << " %)\n";
       +        }
            }
        
        
       t@@ -1098,6 +1213,11 @@ __host__ void DEM::startTime()
            delete[] k.delta_t;
        
        #ifndef DARCY_GPU
       +    if (darcy == 1) {
       +        endDarcyDev();
       +        endDarcy();
       +    }
       +#else
            if (darcy == 1)
                endDarcy();
        #endif
 (DIR) diff --git a/src/sphere.h b/src/sphere.h
       t@@ -230,6 +230,9 @@ class DEM {
                // Find darcy flow velocities from specific flux (q)
                void findDarcyVelocities();
        
       +        // Returns the mean particle radius
       +        Float meanRadius();
       +
                // Get linear (1D) index from 3D coordinate
                unsigned int idx(
                        const unsigned int x,
       t@@ -238,9 +241,11 @@ class DEM {
        
                // Initialize Darcy values and arrays
                void initDarcy(const Float cellsizemultiplier = 1.0);
       +        void initDarcyDev(const Float cellsizemultiplier = 1.0);
        
                // Clean up Darcy arrays
                void endDarcy();
       +        void endDarcyDev();
        
                // Check whether the explicit integration is going to meet the
                // stability criteria