timplemented ndem parameter for DEM/CFD computation ratio - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit f8ab5f09ac2e8cd7d89525a6e619d6f8c382fa1d
 (DIR) parent e567138fef8fd2ec15afe7a583c3274c757839d0
 (HTM) Author: Anders Damsgaard <anders.damsgaard@geo.au.dk>
       Date:   Thu,  1 May 2014 10:51:45 +0200
       
       implemented ndem parameter for DEM/CFD computation ratio
       
       Diffstat:
         M src/device.cu                       |     839 ++++++++++++++++---------------
         M src/navierstokes.cuh                |      30 +++++++++++++++++-------------
       
       2 files changed, 440 insertions(+), 429 deletions(-)
       ---
 (DIR) diff --git a/src/device.cu b/src/device.cu
       t@@ -761,7 +761,6 @@ __host__ void DEM::startTime()
                                dev_contacts,
                                dev_distmod);
        
       -
                        // Synchronization point
                        cudaThreadSynchronize();
                        if (PROFILING == 1)
       t@@ -774,7 +773,6 @@ __host__ void DEM::startTime()
                                " too large?", iter);
                    }
        
       -
                    // For each particle process collisions and compute resulting forces
                    //cudaPrintfInit();
                    if (PROFILING == 1)
       t@@ -897,484 +895,493 @@ __host__ void DEM::startTime()
                    }
        #endif
        
       -            // Initial guess for the top epsilon values. These may be changed in
       -            // setUpperPressureNS
       -            Float pressure = ns.p[idx(0,0,ns.nz-1)];
       -            Float pressure_new = pressure; // Dirichlet
       -            Float epsilon_value = pressure_new - ns.beta*pressure;
       -            setNSepsilonTop<<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_epsilon,
       -                    dev_ns_epsilon_new,
       -                    epsilon_value);
       -            cudaThreadSynchronize();
       -            checkForCudaErrorsIter("Post setNSepsilonTop", iter);
       -                
       -            // Modulate the pressures at the upper boundary cells
       -            if ((ns.p_mod_A > 1.0e-5 || ns.p_mod_A < -1.0e-5) &&
       -                    ns.p_mod_f > 1.0e-7) {
       -                Float new_pressure = ns.p[idx(0,0,ns.nz-1)] // original pressure
       -                    + ns.p_mod_A*sin(2.0*M_PI*ns.p_mod_f*time.current
       -                            + ns.p_mod_phi);
       -                setUpperPressureNS<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_p,
       +            if ((iter % ns.ndem) == 0) {
       +                // Initial guess for the top epsilon values. These may be
       +                // changed in setUpperPressureNS
       +                Float pressure = ns.p[idx(0,0,ns.nz-1)];
       +                Float pressure_new = pressure; // Dirichlet
       +                Float epsilon_value = pressure_new - ns.beta*pressure;
       +                setNSepsilonTop<<<dimGridFluid, dimBlockFluid>>>(
                                dev_ns_epsilon,
                                dev_ns_epsilon_new,
       -                        ns.beta,
       -                        new_pressure);
       +                        epsilon_value);
                        cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post setUpperPressureNS", iter);
       -
       -                if (report_even_more_epsilon == 1) {
       -                    std::cout
       -                        << "\n@@@@@@ TIME STEP " << iter << " @@@@@@"
       -                        << "\n###### EPSILON AFTER setUpperPressureNS ######"
       -                        << std::endl;
       -                    transferNSepsilonFromGlobalDeviceMemory();
       -                    printNSarray(stdout, ns.epsilon, "epsilon");
       +                checkForCudaErrorsIter("Post setNSepsilonTop", iter);
       +
       +                // Modulate the pressures at the upper boundary cells
       +                if ((ns.p_mod_A > 1.0e-5 || ns.p_mod_A < -1.0e-5) &&
       +                        ns.p_mod_f > 1.0e-7) {
       +                                         // original pressure
       +                    Float new_pressure = ns.p[idx(0,0,ns.nz-1)]
       +                        + ns.p_mod_A*sin(2.0*M_PI*ns.p_mod_f*time.current
       +                                + ns.p_mod_phi);
       +                    setUpperPressureNS<<<dimGridFluid, dimBlockFluid>>>(
       +                            dev_ns_p,
       +                            dev_ns_epsilon,
       +                            dev_ns_epsilon_new,
       +                            ns.beta,
       +                            new_pressure);
       +                    cudaThreadSynchronize();
       +                    checkForCudaErrorsIter("Post setUpperPressureNS", iter);
       +
       +                    if (report_even_more_epsilon == 1) {
       +                        std::cout
       +                            << "\n@@@@@@ TIME STEP " << iter << " @@@@@@"
       +                            << "\n###### EPSILON AFTER setUpperPressureNS "
       +                            << "######" << std::endl;
       +                        transferNSepsilonFromGlobalDeviceMemory();
       +                        printNSarray(stdout, ns.epsilon, "epsilon");
       +                    }
                        }
       -            }
       -
       -            // Set the values of the ghost nodes in the grid
       -            if (PROFILING == 1)
       -                startTimer(&kernel_tic);
       -            /*setNSghostNodesDev<<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_p,
       -                    dev_ns_v,
       -                    dev_ns_v_p,
       -                    dev_ns_phi,
       -                    dev_ns_dphi,
       -                    dev_ns_epsilon);*/
       -            setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_p, ns.bc_bot, ns.bc_top);
       -
       -            setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_v, ns.bc_bot, ns.bc_top);
       -
       -            setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_v_p, ns.bc_bot, ns.bc_top);
       -
       -            setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_phi, ns.bc_bot, ns.bc_top);
       -
       -            setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_dphi, ns.bc_bot, ns.bc_top);
       -
       -            cudaThreadSynchronize();
       -            if (PROFILING == 1)
       -                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                        &t_setNSghostNodesDev);
       -            checkForCudaErrorsIter("Post setNSghostNodesDev", iter);
       -            /*std::cout << "\n###### EPSILON AFTER setNSghostNodesDev ######"
       -                << std::endl;
       -            transferNSepsilonFromGlobalDeviceMemory();
       -            printNSarray(stdout, ns.epsilon, "epsilon");*/
       -
       -
       -            // Find the fluid stress tensor, needed for predicting the fluid
       -            // velocities
       -            if (PROFILING == 1)
       -                startTimer(&kernel_tic);
       -            findNSstressTensor<<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_v,
       -                    dev_ns_tau);
       -            cudaThreadSynchronize();
       -            if (PROFILING == 1)
       -                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                        &t_findNSstressTensor);
       -            checkForCudaErrorsIter("Post findNSstressTensor", iter);
       -
       -            // Set stress tensor values in the ghost nodes
       -            setNSghostNodes_tau<<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_tau,
       -                    ns.bc_bot,
       -                    ns.bc_top); 
       -            cudaThreadSynchronize();
       -            checkForCudaErrorsIter("Post setNSghostNodes_tau", iter);
        
       -            // Find the divergence of phi*vi*v, needed for predicting the fluid
       -            // velocities
       -            if (PROFILING == 1)
       -                startTimer(&kernel_tic);
       -            findNSdivphiviv<<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_phi,
       -                    dev_ns_v,
       -                    dev_ns_div_phi_vi_v);
       -            cudaThreadSynchronize();
       -            if (PROFILING == 1)
       -                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                        &t_findNSdivphiviv);
       -            checkForCudaErrorsIter("Post findNSdivphiviv", iter);
       -
       -            // Find the divergence of phi*tau, needed for predicting the fluid
       -            // velocities
       -            if (PROFILING == 1)
       -                startTimer(&kernel_tic);
       -            findNSdivphitau<<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_phi,
       -                    dev_ns_tau,
       -                    dev_ns_div_phi_tau);
       -            cudaThreadSynchronize();
       -            if (PROFILING == 1)
       -                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                        &t_findNSdivphitau);
       -            checkForCudaErrorsIter("Post findNSdivphitau", iter);
       +                // Set the values of the ghost nodes in the grid
       +                if (PROFILING == 1)
       +                    startTimer(&kernel_tic);
       +                /*setNSghostNodesDev<<<dimGridFluid, dimBlockFluid>>>(
       +                  dev_ns_p,
       +                  dev_ns_v,
       +                  dev_ns_v_p,
       +                  dev_ns_phi,
       +                  dev_ns_dphi,
       +                  dev_ns_epsilon);*/
       +                setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_p, ns.bc_bot, ns.bc_top);
        
       -            // Predict the fluid velocities on the base of the old pressure
       -            // field and ignoring the incompressibility constraint
       -            if (PROFILING == 1)
       -                startTimer(&kernel_tic);
       -            findPredNSvelocities<<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_p,
       -                    dev_ns_v,
       -                    dev_ns_phi,
       -                    dev_ns_dphi,
       -                    dev_ns_div_phi_vi_v,
       -                    dev_ns_div_phi_tau,
       -                    ns.bc_bot,
       -                    ns.bc_top,
       -                    ns.beta,
       -                    dev_ns_fi,
       -                    dev_ns_v_p);
       -            cudaThreadSynchronize();
       -            if (PROFILING == 1)
       -                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                        &t_findPredNSvelocities);
       -            checkForCudaErrorsIter("Post findPredNSvelocities", iter);
       +                setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_v, ns.bc_bot, ns.bc_top);
        
       -            setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_v_p);
       -            cudaThreadSynchronize();
       -            checkForCudaErrorsIter("Post setNSghostNodesFloat3(dev_ns_v_p)",
       -                    iter);
       +                setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_v_p, ns.bc_bot, ns.bc_top);
        
       -            // In the first iteration of the sphere program, we'll need to
       -            // manually estimate the values of epsilon. In the subsequent
       -            // iterations, the previous values are  used.
       -            if (iter == 0) {
       +                setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_phi, ns.bc_bot, ns.bc_top);
        
       -                // Define the first estimate of the values of epsilon.
       -                // The initial guess depends on the value of ns.beta.
       -                Float pressure = ns.p[idx(2,2,2)];
       -                Float pressure_new = pressure; // Guess p_current = p_new
       -                Float epsilon_value = pressure_new - ns.beta*pressure;
       -                if (PROFILING == 1)
       -                    startTimer(&kernel_tic);
       -                setNSepsilonInterior<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon, epsilon_value);
       -                cudaThreadSynchronize();
       +                setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_dphi, ns.bc_bot, ns.bc_top);
        
       -                setNSnormZero<<<dimGridFluid, dimBlockFluid>>>(dev_ns_norm);
                        cudaThreadSynchronize();
       -
                        if (PROFILING == 1)
                            stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                            &t_setNSepsilon);
       -                checkForCudaErrorsIter("Post setNSepsilonInterior", iter);
       -
       -                if (report_even_more_epsilon == 1) {
       -                    std::cout
       -                        << "\n###### EPSILON AFTER setNSepsilonInterior ######"
       -                        << std::endl;
       -                    transferNSepsilonFromGlobalDeviceMemory();
       -                    printNSarray(stdout, ns.epsilon, "epsilon");
       -                }
       -
       -                // Set the epsilon values at the lower boundary
       -                pressure = ns.p[idx(0,0,0)];
       -                pressure_new = pressure; // Guess p_current = p_new
       -                epsilon_value = pressure_new - ns.beta*pressure;
       +                            &t_setNSghostNodesDev);
       +                checkForCudaErrorsIter("Post setNSghostNodesDev", iter);
       +                /*std::cout << "\n###### EPSILON AFTER setNSghostNodesDev #####"
       +                  << std::endl;
       +                  transferNSepsilonFromGlobalDeviceMemory();
       +                  printNSarray(stdout, ns.epsilon, "epsilon");*/
       +
       +                // Find the fluid stress tensor, needed for predicting the fluid
       +                // velocities
                        if (PROFILING == 1)
                            startTimer(&kernel_tic);
       -                setNSepsilonBottom<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon,
       -                        dev_ns_epsilon_new,
       -                        epsilon_value);
       +                findNSstressTensor<<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_v,
       +                        dev_ns_tau);
                        cudaThreadSynchronize();
                        if (PROFILING == 1)
                            stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                            &t_setNSdirichlet);
       -                checkForCudaErrorsIter("Post setNSepsilonBottom", iter);
       +                            &t_findNSstressTensor);
       +                checkForCudaErrorsIter("Post findNSstressTensor", iter);
        
       -                if (report_even_more_epsilon == 1) {
       -                    std::cout
       -                        << "\n###### EPSILON AFTER setNSepsilonBottom ######"
       -                        << std::endl;
       -                    transferNSepsilonFromGlobalDeviceMemory();
       -                    printNSarray(stdout, ns.epsilon, "epsilon");
       -                }
       -
       -                /*setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                  dev_ns_epsilon);
       -                  cudaThreadSynchronize();
       -                  checkForCudaErrors("Post setNSghostNodesFloat(dev_ns_epsilon)",
       -                  iter);*/
       -                setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon,
       -                        ns.bc_bot, ns.bc_top);
       +                // Set stress tensor values in the ghost nodes
       +                setNSghostNodes_tau<<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_tau,
       +                        ns.bc_bot,
       +                        ns.bc_top); 
                        cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post setNSghostNodesEpsilon(1)",
       -                        iter);
       +                checkForCudaErrorsIter("Post setNSghostNodes_tau", iter);
        
       -                if (report_even_more_epsilon == 1) {
       -                    std::cout <<
       -                        "\n###### EPSILON AFTER setNSghostNodes(epsilon) ######"
       -                        << std::endl;
       -                    transferNSepsilonFromGlobalDeviceMemory();
       -                    printNSarray(stdout, ns.epsilon, "epsilon");
       -                }
       -            }
       -
       -            // Solve the system of epsilon using a Jacobi iterative solver.
       -            // The average normalized residual is initialized to a large value.
       -            //double avg_norm_res;
       -            double max_norm_res;
       -
       -            // Write a log file of the normalized residuals during the Jacobi
       -            // iterations
       -            std::ofstream reslog;
       -            if (write_res_log == 1)
       -                reslog.open("max_res_norm.dat");
       -
       -            // transfer normalized residuals from GPU to CPU
       -            if (report_epsilon == 1) {
       -                std::cout << "\n###### BEFORE FIRST JACOBI ITERATION ######"
       -                    << "\n@@@@@@ TIME STEP " << iter << " @@@@@@"
       -                    << std::endl;
       -                transferNSepsilonFromGlobalDeviceMemory();
       -                printNSarray(stdout, ns.epsilon, "epsilon");
       -            }
       -
       -            for (unsigned int nijac = 0; nijac<ns.maxiter; ++nijac) {
       -
       -                // Only grad(epsilon) changes during the Jacobi iterations. The
       -                // remaining terms of the forcing function are only calculated
       -                // during the first iteration.
       +                // Find the divergence of phi*vi*v, needed for predicting the
       +                // fluid velocities
                        if (PROFILING == 1)
                            startTimer(&kernel_tic);
       -                findNSforcing<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon,
       -                        dev_ns_f1,
       -                        dev_ns_f2,
       -                        dev_ns_f,
       +                findNSdivphiviv<<<dimGridFluid, dimBlockFluid>>>(
                                dev_ns_phi,
       -                        dev_ns_dphi,
       -                        dev_ns_v_p,
       -                        nijac);
       +                        dev_ns_v,
       +                        dev_ns_div_phi_vi_v);
                        cudaThreadSynchronize();
                        if (PROFILING == 1)
                            stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                            &t_findNSforcing);
       -                checkForCudaErrorsIter("Post findNSforcing", iter);
       -                /*setNSghostNodesForcing<<<dimGridFluid, dimBlockFluid>>>(
       -                  dev_ns_f1,
       -                  dev_ns_f2,
       -                  dev_ns_f,
       -                  nijac);
       -                  cudaThreadSynchronize();
       -                  checkForCudaErrors("Post setNSghostNodesForcing", iter);*/
       -
       -                setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon,
       -                        ns.bc_bot, ns.bc_top);
       -                cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post setNSghostNodesEpsilon(2)",
       -                        iter);
       -
       -                if (report_epsilon == 1) {
       -                    std::cout << "\n###### JACOBI ITERATION "
       -                        << nijac << " after setNSghostNodes(epsilon,2) ######"
       -                        << std::endl;
       -                    transferNSepsilonFromGlobalDeviceMemory();
       -                    printNSarray(stdout, ns.epsilon, "epsilon");
       -                }
       -
       -                /*smoothing<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon,
       -                        ns.bc_bot, ns.bc_top);
       -                cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post smoothing", iter);
       -
       -                if (report_epsilon == 1) {
       -                    std::cout << "\n###### JACOBI ITERATION "
       -                        << nijac << " after smoothing(epsilon) ######"
       -                        << std::endl;
       -                    transferNSepsilonFromGlobalDeviceMemory();
       -                    printNSarray(stdout, ns.epsilon, "epsilon");
       -                }
       -
       -                setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon,
       -                        ns.bc_bot, ns.bc_top);
       -                cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post setNSghostNodesEpsilon(3)",
       -                        iter);
       -                        */
       -
       -                /*if (report_epsilon == 1) {
       -                    std::cout << "\n###### JACOBI ITERATION "
       -                        << nijac
       -                        << " after setNSghostNodesEpsilon(epsilon,3) ######"
       -                        << std::endl;
       -                    transferNSepsilonFromGlobalDeviceMemory();
       -                    printNSarray(stdout, ns.epsilon, "epsilon");
       -                }*/
       +                            &t_findNSdivphiviv);
       +                checkForCudaErrorsIter("Post findNSdivphiviv", iter);
        
       -                // Store old values
       -                /*copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon,
       -                        dev_ns_epsilon_old);
       +                // Find the divergence of phi*tau, needed for predicting the
       +                // fluid velocities
       +                if (PROFILING == 1)
       +                    startTimer(&kernel_tic);
       +                findNSdivphitau<<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_phi,
       +                        dev_ns_tau,
       +                        dev_ns_div_phi_tau);
                        cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post copyValues (epsilon->epsilon_old)",
       -                        iter);*/
       +                if (PROFILING == 1)
       +                    stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                            &t_findNSdivphitau);
       +                checkForCudaErrorsIter("Post findNSdivphitau", iter);
        
       -                // Perform a single Jacobi iteration
       +                // Predict the fluid velocities on the base of the old pressure
       +                // field and ignoring the incompressibility constraint
                        if (PROFILING == 1)
                            startTimer(&kernel_tic);
       -                jacobiIterationNS<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon,
       -                        dev_ns_epsilon_new,
       -                        dev_ns_norm,
       -                        dev_ns_f,
       +                findPredNSvelocities<<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_p,
       +                        dev_ns_v,
       +                        dev_ns_phi,
       +                        dev_ns_dphi,
       +                        dev_ns_div_phi_vi_v,
       +                        dev_ns_div_phi_tau,
                                ns.bc_bot,
                                ns.bc_top,
       -                        ns.theta);
       +                        ns.beta,
       +                        dev_ns_fi,
       +                        ns.ndem,
       +                        dev_ns_v_p);
                        cudaThreadSynchronize();
                        if (PROFILING == 1)
                            stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                            &t_jacobiIterationNS);
       -                checkForCudaErrorsIter("Post jacobiIterationNS", iter);
       -
       -                // Flip flop: swap new and current array pointers
       -                /*Float* tmp         = dev_ns_epsilon;
       -                  dev_ns_epsilon     = dev_ns_epsilon_new;
       -                  dev_ns_epsilon_new = tmp;*/
       +                            &t_findPredNSvelocities);
       +                checkForCudaErrorsIter("Post findPredNSvelocities", iter);
        
       -                // Copy new values to current values
       -                copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon_new,
       -                        dev_ns_epsilon);
       +                setNSghostNodes<Float3><<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_v_p);
                        cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post copyValues (epsilon_new->epsilon)",
       +                checkForCudaErrorsIter("Post setNSghostNodesFloat3(dev_ns_v_p)",
                                iter);
        
       -                /*findNormalizedResiduals<<<dimGridFluid, dimBlockFluid>>>(
       -                        dev_ns_epsilon_old,
       -                        dev_ns_epsilon,
       -                        dev_ns_norm,
       -                        ns.bc_bot, ns.bc_top);
       -                cudaThreadSynchronize();
       -                checkForCudaErrorsIter("Post findNormalizedResiduals",
       -                        iter);*/
       +                // In the first iteration of the sphere program, we'll need to
       +                // manually estimate the values of epsilon. In the subsequent
       +                // iterations, the previous values are  used.
       +                if (iter == 0) {
       +
       +                    // Define the first estimate of the values of epsilon.
       +                    // The initial guess depends on the value of ns.beta.
       +                    Float pressure = ns.p[idx(2,2,2)];
       +                    Float pressure_new = pressure; // Guess p_current = p_new
       +                    Float epsilon_value = pressure_new - ns.beta*pressure;
       +                    if (PROFILING == 1)
       +                        startTimer(&kernel_tic);
       +                    setNSepsilonInterior<<<dimGridFluid, dimBlockFluid>>>(
       +                            dev_ns_epsilon, epsilon_value);
       +                    cudaThreadSynchronize();
       +
       +                    setNSnormZero<<<dimGridFluid, dimBlockFluid>>>(dev_ns_norm);
       +                    cudaThreadSynchronize();
       +
       +                    if (PROFILING == 1)
       +                        stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                                &t_setNSepsilon);
       +                    checkForCudaErrorsIter("Post setNSepsilonInterior", iter);
       +
       +                    if (report_even_more_epsilon == 1) {
       +                        std::cout
       +                            << "\n###### EPSILON AFTER setNSepsilonInterior "
       +                            << "######" << std::endl;
       +                        transferNSepsilonFromGlobalDeviceMemory();
       +                        printNSarray(stdout, ns.epsilon, "epsilon");
       +                    }
        
       +                    // Set the epsilon values at the lower boundary
       +                    pressure = ns.p[idx(0,0,0)];
       +                    pressure_new = pressure; // Guess p_current = p_new
       +                    epsilon_value = pressure_new - ns.beta*pressure;
       +                    if (PROFILING == 1)
       +                        startTimer(&kernel_tic);
       +                    setNSepsilonBottom<<<dimGridFluid, dimBlockFluid>>>(
       +                            dev_ns_epsilon,
       +                            dev_ns_epsilon_new,
       +                            epsilon_value);
       +                    cudaThreadSynchronize();
       +                    if (PROFILING == 1)
       +                        stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                                &t_setNSdirichlet);
       +                    checkForCudaErrorsIter("Post setNSepsilonBottom", iter);
       +
       +                    if (report_even_more_epsilon == 1) {
       +                        std::cout
       +                            << "\n###### EPSILON AFTER setNSepsilonBottom "
       +                            << "######" << std::endl;
       +                        transferNSepsilonFromGlobalDeviceMemory();
       +                        printNSarray(stdout, ns.epsilon, "epsilon");
       +                    }
       +
       +                    /*setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                      dev_ns_epsilon);
       +                      cudaThreadSynchronize();
       +                      checkForCudaErrors("Post setNSghostNodesFloat(dev_ns_epsilon)",
       +                      iter);*/
       +                    setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                            dev_ns_epsilon,
       +                            ns.bc_bot, ns.bc_top);
       +                    cudaThreadSynchronize();
       +                    checkForCudaErrorsIter("Post setNSghostNodesEpsilon(1)",
       +                            iter);
       +
       +                    if (report_even_more_epsilon == 1) {
       +                        std::cout <<
       +                            "\n###### EPSILON AFTER setNSghostNodes(epsilon) "
       +                            << "######" << std::endl;
       +                        transferNSepsilonFromGlobalDeviceMemory();
       +                        printNSarray(stdout, ns.epsilon, "epsilon");
       +                    }
       +                }
       +
       +                // Solve the system of epsilon using a Jacobi iterative solver.
       +                // The average normalized residual is initialized to a large
       +                // value.
       +                //double avg_norm_res;
       +                double max_norm_res;
       +
       +                // Write a log file of the normalized residuals during the Jacobi
       +                // iterations
       +                std::ofstream reslog;
       +                if (write_res_log == 1)
       +                    reslog.open("max_res_norm.dat");
       +
       +                // transfer normalized residuals from GPU to CPU
                        if (report_epsilon == 1) {
       -                    std::cout << "\n###### JACOBI ITERATION "
       -                        << nijac << " after jacobiIterationNS ######"
       +                    std::cout << "\n###### BEFORE FIRST JACOBI ITERATION ######"
       +                        << "\n@@@@@@ TIME STEP " << iter << " @@@@@@"
                                << std::endl;
                            transferNSepsilonFromGlobalDeviceMemory();
                            printNSarray(stdout, ns.epsilon, "epsilon");
                        }
        
       -                if (nijac % nijacnorm == 0) {
       -
       -                    // Read the normalized residuals from the device
       -                    transferNSnormFromGlobalDeviceMemory();
       -
       -                    // Write the normalized residuals to the terminal
       -                    //printNSarray(stdout, ns.norm, "norm");
       -
       -                    // Find the maximum value of the normalized residuals
       -                    max_norm_res = maxNormResNS();
       -
       -                    // Write the Jacobi iteration number and maximum value of
       -                    // the normalized residual to the log file
       -                    if (write_res_log == 1)
       -                        reslog << nijac << '\t' << max_norm_res << std::endl;
       -                }
       +                for (unsigned int nijac = 0; nijac<ns.maxiter; ++nijac) {
       +
       +                    // Only grad(epsilon) changes during the Jacobi iterations.
       +                    // The remaining terms of the forcing function are only
       +                    // calculated during the first iteration.
       +                    if (PROFILING == 1)
       +                        startTimer(&kernel_tic);
       +                    findNSforcing<<<dimGridFluid, dimBlockFluid>>>(
       +                            dev_ns_epsilon,
       +                            dev_ns_f1,
       +                            dev_ns_f2,
       +                            dev_ns_f,
       +                            dev_ns_phi,
       +                            dev_ns_dphi,
       +                            dev_ns_v_p,
       +                            nijac,
       +                            ns.ndem);
       +                    cudaThreadSynchronize();
       +                    if (PROFILING == 1)
       +                        stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                                &t_findNSforcing);
       +                    checkForCudaErrorsIter("Post findNSforcing", iter);
       +                    /*setNSghostNodesForcing<<<dimGridFluid, dimBlockFluid>>>(
       +                      dev_ns_f1,
       +                      dev_ns_f2,
       +                      dev_ns_f,
       +                      nijac);
       +                      cudaThreadSynchronize();
       +                      checkForCudaErrors("Post setNSghostNodesForcing", iter);*/
       +
       +                    setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                            dev_ns_epsilon,
       +                            ns.bc_bot, ns.bc_top);
       +                    cudaThreadSynchronize();
       +                    checkForCudaErrorsIter("Post setNSghostNodesEpsilon(2)",
       +                            iter);
        
       -                if (max_norm_res < ns.tolerance) {
       -
       -                    if (write_conv_log == 1 && iter % conv_log_interval == 0)
       -                        convlog << iter << '\t' << nijac << std::endl;
       -
       -                    // Apply smoothing if requested
       -                    if (ns.gamma > 0.0) {
       -                        setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                                dev_ns_epsilon,
       -                                ns.bc_bot, ns.bc_top);
       -                        cudaThreadSynchronize();
       -                        checkForCudaErrorsIter("Post setNSghostNodesEpsilon(4)",
       -                                iter);
       -
       -                        smoothing<<<dimGridFluid, dimBlockFluid>>>(
       -                                dev_ns_epsilon,
       -                                ns.gamma,
       -                                ns.bc_bot, ns.bc_top);
       -                        cudaThreadSynchronize();
       -                        checkForCudaErrorsIter("Post smoothing", iter);
       -
       -                        setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       -                                dev_ns_epsilon,
       -                                ns.bc_bot, ns.bc_top);
       -                        cudaThreadSynchronize();
       -                        checkForCudaErrorsIter("Post setNSghostNodesEpsilon(4)",
       -                                iter);
       +                    if (report_epsilon == 1) {
       +                        std::cout << "\n###### JACOBI ITERATION "
       +                            << nijac
       +                            << " after setNSghostNodes(epsilon,2) ######"
       +                            << std::endl;
       +                        transferNSepsilonFromGlobalDeviceMemory();
       +                        printNSarray(stdout, ns.epsilon, "epsilon");
                            }
        
       +                    /*smoothing<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                      dev_ns_epsilon,
       +                      ns.bc_bot, ns.bc_top);
       +                      cudaThreadSynchronize();
       +                      checkForCudaErrorsIter("Post smoothing", iter);
       +
       +                      if (report_epsilon == 1) {
       +                      std::cout << "\n###### JACOBI ITERATION "
       +                      << nijac << " after smoothing(epsilon) ######"
       +                      << std::endl;
       +                      transferNSepsilonFromGlobalDeviceMemory();
       +                      printNSarray(stdout, ns.epsilon, "epsilon");
       +                      }
       +
       +                      setNSghostNodes<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                      dev_ns_epsilon,
       +                      ns.bc_bot, ns.bc_top);
       +                      cudaThreadSynchronize();
       +                      checkForCudaErrorsIter("Post setNSghostNodesEpsilon(3)",
       +                      iter);
       +                     */
       +
       +                    /*if (report_epsilon == 1) {
       +                      std::cout << "\n###### JACOBI ITERATION "
       +                      << nijac
       +                      << " after setNSghostNodesEpsilon(epsilon,3) ######"
       +                      << std::endl;
       +                      transferNSepsilonFromGlobalDeviceMemory();
       +                      printNSarray(stdout, ns.epsilon, "epsilon");
       +                      }*/
       +
       +                    // Store old values
       +                    /*copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                      dev_ns_epsilon,
       +                      dev_ns_epsilon_old);
       +                      cudaThreadSynchronize();
       +                      checkForCudaErrorsIter
       +                          ("Post copyValues (epsilon->epsilon_old)", iter);*/
       +
       +                    // Perform a single Jacobi iteration
       +                    if (PROFILING == 1)
       +                        startTimer(&kernel_tic);
       +                    jacobiIterationNS<<<dimGridFluid, dimBlockFluid>>>(
       +                            dev_ns_epsilon,
       +                            dev_ns_epsilon_new,
       +                            dev_ns_norm,
       +                            dev_ns_f,
       +                            ns.bc_bot,
       +                            ns.bc_top,
       +                            ns.theta);
       +                    cudaThreadSynchronize();
       +                    if (PROFILING == 1)
       +                        stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                                &t_jacobiIterationNS);
       +                    checkForCudaErrorsIter("Post jacobiIterationNS", iter);
       +
       +                    // Flip flop: swap new and current array pointers
       +                    /*Float* tmp         = dev_ns_epsilon;
       +                      dev_ns_epsilon     = dev_ns_epsilon_new;
       +                      dev_ns_epsilon_new = tmp;*/
       +
       +                    // Copy new values to current values
       +                    copyValues<Float><<<dimGridFluid, dimBlockFluid>>>(
       +                            dev_ns_epsilon_new,
       +                            dev_ns_epsilon);
       +                    cudaThreadSynchronize();
       +                    checkForCudaErrorsIter
       +                        ("Post copyValues (epsilon_new->epsilon)", iter);
       +
       +                    /*findNormalizedResiduals<<<dimGridFluid, dimBlockFluid>>>(
       +                      dev_ns_epsilon_old,
       +                      dev_ns_epsilon,
       +                      dev_ns_norm,
       +                      ns.bc_bot, ns.bc_top);
       +                      cudaThreadSynchronize();
       +                      checkForCudaErrorsIter("Post findNormalizedResiduals",
       +                      iter);*/
       +
                            if (report_epsilon == 1) {
                                std::cout << "\n###### JACOBI ITERATION "
       -                            << nijac << " after smoothing ######"
       +                            << nijac << " after jacobiIterationNS ######"
                                    << std::endl;
                                transferNSepsilonFromGlobalDeviceMemory();
                                printNSarray(stdout, ns.epsilon, "epsilon");
                            }
        
       -                    break;  // solution has converged, exit Jacobi iter. loop
       -                }
       +                    if (nijac % nijacnorm == 0) {
        
       -                if (nijac >= ns.maxiter-1) {
       +                        // Read the normalized residuals from the device
       +                        transferNSnormFromGlobalDeviceMemory();
        
       -                    if (write_conv_log == 1)
       -                        convlog << iter << '\t' << nijac << std::endl;
       +                        // Write the normalized residuals to the terminal
       +                        //printNSarray(stdout, ns.norm, "norm");
        
       -                    std::cerr << "\nIteration " << iter << ", time " 
       -                        << iter*time.dt << " s: "
       -                        "Error, the epsilon solution in the fluid "
       -                        "calculations did not converge. Try increasing the "
       -                        "value of 'ns.maxiter' (" << ns.maxiter
       -                        << ") or increase 'ns.tolerance' ("
       -                        << ns.tolerance << ")." << std::endl;
       -                }
       -                //break; // end after Jacobi first iteration
       -            } // end Jacobi iteration loop
       +                        // Find the maximum value of the normalized residuals
       +                        max_norm_res = maxNormResNS();
        
       -            if (write_res_log == 1)
       -                reslog.close();
       +                        // Write the Jacobi iteration number and maximum value
       +                        // of the normalized residual to the log file
       +                        if (write_res_log == 1)
       +                            reslog << nijac << '\t' << max_norm_res << std::endl;
       +                    }
        
       -            // Find the new pressures and velocities
       -            if (PROFILING == 1)
       -                startTimer(&kernel_tic);
       -            updateNSvelocityPressure<<<dimGridFluid, dimBlockFluid>>>(
       -                    dev_ns_p,
       -                    dev_ns_v,
       -                    dev_ns_v_p,
       -                    dev_ns_phi,
       -                    dev_ns_epsilon,
       -                    ns.beta,
       -                    ns.bc_bot,
       -                    ns.bc_top);
       -            cudaThreadSynchronize();
       -            if (PROFILING == 1)
       -                stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       -                        &t_updateNSvelocityPressure);
       -            checkForCudaErrorsIter("Post updateNSvelocityPressure", iter);
       -        }
       +                    if (max_norm_res < ns.tolerance) {
       +
       +                        if (write_conv_log == 1 && iter % conv_log_interval == 0)
       +                            convlog << iter << '\t' << nijac << std::endl;
       +
       +                        // Apply smoothing if requested
       +                        if (ns.gamma > 0.0) {
       +                            setNSghostNodes<Float>
       +                                <<<dimGridFluid, dimBlockFluid>>>(
       +                                    dev_ns_epsilon,
       +                                    ns.bc_bot, ns.bc_top);
       +                            cudaThreadSynchronize();
       +                            checkForCudaErrorsIter
       +                                ("Post setNSghostNodesEpsilon(4)", iter);
       +
       +                            smoothing<<<dimGridFluid, dimBlockFluid>>>(
       +                                    dev_ns_epsilon,
       +                                    ns.gamma,
       +                                    ns.bc_bot, ns.bc_top);
       +                            cudaThreadSynchronize();
       +                            checkForCudaErrorsIter("Post smoothing", iter);
       +
       +                            setNSghostNodes<Float>
       +                                <<<dimGridFluid, dimBlockFluid>>>(
       +                                    dev_ns_epsilon,
       +                                    ns.bc_bot, ns.bc_top);
       +                            cudaThreadSynchronize();
       +                            checkForCudaErrorsIter
       +                                ("Post setNSghostNodesEpsilon(4)", iter);
       +                        }
       +
       +                        if (report_epsilon == 1) {
       +                            std::cout << "\n###### JACOBI ITERATION "
       +                                << nijac << " after smoothing ######"
       +                                << std::endl;
       +                            transferNSepsilonFromGlobalDeviceMemory();
       +                            printNSarray(stdout, ns.epsilon, "epsilon");
       +                        }
       +
       +                        break;  // solution has converged, exit Jacobi loop
       +                    }
        
       -        /*std::cout << "\n###### ITERATION "
       -          << iter << " ######" << std::endl;
       -          transferNSepsilonFromGlobalDeviceMemory();
       -          printNSarray(stdout, ns.epsilon, "epsilon");*/
       -        //transferNSepsilonNewFromGlobalDeviceMemory();
       -        //printNSarray(stdout, ns.epsilon_new, "epsilon_new");
       +                    if (nijac >= ns.maxiter-1) {
       +
       +                        if (write_conv_log == 1)
       +                            convlog << iter << '\t' << nijac << std::endl;
       +
       +                        std::cerr << "\nIteration " << iter << ", time " 
       +                            << iter*time.dt << " s: "
       +                            "Error, the epsilon solution in the fluid "
       +                            "calculations did not converge. Try increasing the "
       +                            "value of 'ns.maxiter' (" << ns.maxiter
       +                            << ") or increase 'ns.tolerance' ("
       +                            << ns.tolerance << ")." << std::endl;
       +                    }
       +                    //break; // end after Jacobi first iteration
       +                } // end Jacobi iteration loop
       +
       +                if (write_res_log == 1)
       +                    reslog.close();
       +
       +                // Find the new pressures and velocities
       +                if (PROFILING == 1)
       +                    startTimer(&kernel_tic);
       +                updateNSvelocityPressure<<<dimGridFluid, dimBlockFluid>>>(
       +                        dev_ns_p,
       +                        dev_ns_v,
       +                        dev_ns_v_p,
       +                        dev_ns_phi,
       +                        dev_ns_epsilon,
       +                        ns.beta,
       +                        ns.bc_bot,
       +                        ns.bc_top,
       +                        ns.ndem);
       +                cudaThreadSynchronize();
       +                if (PROFILING == 1)
       +                    stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed,
       +                            &t_updateNSvelocityPressure);
       +                checkForCudaErrorsIter("Post updateNSvelocityPressure", iter);
       +            }
       +
       +            /*std::cout << "\n###### ITERATION "
       +              << iter << " ######" << std::endl;
       +              transferNSepsilonFromGlobalDeviceMemory();
       +              printNSarray(stdout, ns.epsilon, "epsilon");*/
       +            //transferNSepsilonNewFromGlobalDeviceMemory();
       +            //printNSarray(stdout, ns.epsilon_new, "epsilon_new");
       +        }
        
                if (np > 0) {
                    // Update particle kinematics
 (DIR) diff --git a/src/navierstokes.cuh b/src/navierstokes.cuh
       t@@ -1059,6 +1059,7 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
                Float3* dev_ns_vp_avg,
                Float*  dev_ns_d_avg,
                const unsigned int iteration,
       +        const unsigned int ndem,
                const unsigned int np)
        {
            // 3D thread index
       t@@ -1207,8 +1208,8 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
                        dot_epsilon_ii.x + dot_epsilon_ii.y + dot_epsilon_ii.z;
        
                    const Float dphi =
       -                (1.0 - fmin(phi_0,0.99))*dot_epsilon_kk*devC_dt;
       -            phi = phi_0 + dphi/devC_dt;
       +                (1.0 - fmin(phi_0,0.99))*dot_epsilon_kk*ndem*devC_dt;
       +            phi = phi_0 + dphi/(ndem*devC_dt);
        
                    //if (dot_epsilon_kk != 0.0)
                        //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n",
       t@@ -1911,6 +1912,7 @@ __global__ void findPredNSvelocities(
                int     bc_top,                 // in
                Float   beta,                   // in
                Float3* dev_ns_fi,              // in
       +        unsigned int ndem,              // in
                Float3* dev_ns_v_p)             // out
        {
            // 3D thread index
       t@@ -1959,18 +1961,18 @@ __global__ void findPredNSvelocities(
                Float3 pressure_term = MAKE_FLOAT3(0.0, 0.0, 0.0);
                if (beta > 0.0) {
                    grad_p = gradient(dev_ns_p, x, y, z, dx, dy, dz);
       -            pressure_term = -beta/devC_params.rho_f*grad_p*devC_dt/phi;
       +            pressure_term = -beta/devC_params.rho_f*grad_p*ndem*devC_dt/phi;
                }
        
                // Calculate the predicted velocity
                Float3 v_p = v
                    + pressure_term
       -            + 1.0/devC_params.rho_f*div_phi_tau*devC_dt/phi
       +            + 1.0/devC_params.rho_f*div_phi_tau*ndem*devC_dt/phi
                    + MAKE_FLOAT3(devC_params.g[0], devC_params.g[1], devC_params.g[2])
       -                *devC_dt
       -            - devC_dt/(devC_params.rho_f*phi)*f_i
       +                *ndem*devC_dt
       +            - ndem*devC_dt/(devC_params.rho_f*phi)*f_i
                    - v*dphi/phi
       -            - div_phi_vi_v*devC_dt/phi            // advection term
       +            - div_phi_vi_v*ndem*devC_dt/phi            // advection term
                    ;
        
                // Report velocity components to stdout for debugging
       t@@ -2017,7 +2019,8 @@ __global__ void findNSforcing(
                Float*  dev_ns_phi,
                Float*  dev_ns_dphi,
                Float3* dev_ns_v_p,
       -        unsigned int nijac)
       +        unsigned int nijac,
       +        unsigned int ndem)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2066,9 +2069,9 @@ __global__ void findNSforcing(
                        + dot(grad_phi, v_p)*devC_params.rho_f/(devC_dt*phi)
                        + dphi*devC_params.rho_f/(devC_dt*devC_dt*phi);
                    f2 = grad_phi/phi;*/
       -            f1 = div_v_p*devC_params.rho_f*phi/devC_dt
       -                + dot(grad_phi, v_p)*devC_params.rho_f/devC_dt
       -                + dphi*devC_params.rho_f/(devC_dt*devC_dt);
       +            f1 = div_v_p*devC_params.rho_f*phi/(ndem*devC_dt)
       +                + dot(grad_phi, v_p)*devC_params.rho_f/(ndem*devC_dt)
       +                + dphi*devC_params.rho_f/(ndem*devC_dt*devC_dt);
                    f2 = grad_phi/phi;
        
                    // Report values terms in the forcing function for debugging
       t@@ -2392,7 +2395,8 @@ __global__ void updateNSvelocityPressure(
                Float*  dev_ns_epsilon,
                Float   beta,
                int     bc_bot,
       -        int     bc_top)
       +        int     bc_top,
       +        unsigned int ndem)
        {
            // 3D thread index
            const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
       t@@ -2431,7 +2435,7 @@ __global__ void updateNSvelocityPressure(
        
                // Find new velocity
                //Float3 v = v_p - devC_dt/devC_params.rho_f*grad_epsilon;
       -        Float3 v = v_p - devC_dt/(devC_params.rho_f*phi)*grad_epsilon;
       +        Float3 v = v_p - ndem*devC_dt/(devC_params.rho_f*phi)*grad_epsilon;
        
                // Print values for debugging
                /* if (z == 0) {