tfix various minor issues related to memory initialization, suggest clang-3.8 to nvcc as ccbin - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit f892592dcad73329dce16040772caa240511678b
 (DIR) parent 103a8713606c2aa6257b0d5bcf4ae4b6562fea66
 (HTM) Author: Anders Damsgaard <andersd@riseup.net>
       Date:   Thu,  7 Sep 2017 12:54:25 -0700
       
       fix various minor issues related to memory initialization, suggest clang-3.8 to nvcc as ccbin
       
       Diffstat:
         M src/CMakeLists.txt                  |      11 +++++++----
         M src/device.cu                       |      74 ++++++++++++++++---------------
         M src/sphere.cpp                      |       4 ++--
         M tests/io_tests.py                   |       1 +
       
       4 files changed, 49 insertions(+), 41 deletions(-)
       ---
 (DIR) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
       t@@ -3,7 +3,7 @@
        #LINK_LIBRARIES("-L${CUDA_SDK_ROOT_DIR}/lib -lcutil_x86_64") # For 64 bit systems
        
        # Ohter folders to include
       -SET(CUDA_SDK_ROOT_DIR "/usr/local/cuda-5.0/samples")
       +#SET(CUDA_SDK_ROOT_DIR "/usr/local/cuda-5.0/samples")
        INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
        INCLUDE_DIRECTORIES("${CMAKE_BINARY_DIR}/src")
        SET(EXECUTABLE_OUTPUT_PATH "../")
       t@@ -16,13 +16,16 @@ INCLUDE(FindCUDA)
        IF (GPU_GENERATION EQUAL 1) # Kepler
            SET(CUDA_NVCC_FLAGS
                #"--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false -ccbin gcc-4.6")
       -        "--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false -ccbin gcc")
       +        #"--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false -ccbin gcc-4.6")
       +        #"--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false -ccbin gcc;-Xcompiler -fPIC")
       +        "--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false;-ccbin clang-3.8")
        ELSE()  # Fermi
            SET(CUDA_NVCC_FLAGS
                #"--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\";--fmad=false -ccbin gcc-4.6")
       -        "--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\";--fmad=false -ccbin gcc")
       +        #"--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\";--fmad=false -ccbin gcc;-Xcompiler -fPIC")
       +        "--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\";--fmad=false;-ccbin clang-3.8")
        ENDIF (GPU_GENERATION EQUAL 1)
       -SET(CMAKE_CXX_FLAGS "-fPIC ${CMAKE_CXX_FLAGS}")
       +#SET(CMAKE_CXX_FLAGS "-fPIC ${CMAKE_CXX_FLAGS}")
        
        # Rule to build executable program 
        CUDA_ADD_EXECUTABLE(sphere
 (DIR) diff --git a/src/device.cu b/src/device.cu
       t@@ -48,15 +48,14 @@ int cudaCoresPerSM(int major, int minor)
            else if (major == 6 && minor == 1)
                return 128;
            else
       -        printf("Error in cudaCoresPerSM",
       -               "Device compute capability value (%d.%d) not recognized.",
       -               major, minor);
       +        printf("Error in cudaCoresPerSM Device compute capability value "
       +                "(%d.%d) not recognized.", major, minor);
            return -1;
        }
        
        // Wrapper function for initializing the CUDA components.
        // Called from main.cpp
       -__host__ void DEM::initializeGPU(void)
       +void DEM::initializeGPU(void)
        {
            using std::cout; // stdout
        
       t@@ -149,13 +148,13 @@ __host__ void DEM::initializeGPU(void)
        }
        
        // Start timer for kernel profiling
       -__host__ void startTimer(cudaEvent_t* kernel_tic)
       +void startTimer(cudaEvent_t* kernel_tic)
        {
            cudaEventRecord(*kernel_tic);
        }
        
        // Stop timer for kernel profiling and time to function sum
       -__host__ void stopTimer(cudaEvent_t *kernel_tic,
       +void stopTimer(cudaEvent_t *kernel_tic,
                cudaEvent_t *kernel_toc,
                float *kernel_elapsed,
                double* sum)
       t@@ -280,7 +279,7 @@ __global__ void checkParticlePositions(
        // Copy the constant data components to device memory,
        // and check whether the values correspond to the 
        // values in constant memory.
       -__host__ void DEM::checkConstantMemory()
       +void DEM::checkConstantMemory()
        {
            // Allocate space in global device memory
            Grid* dev_grid;
       t@@ -322,7 +321,7 @@ __host__ void DEM::checkConstantMemory()
        }
        
        // Copy selected constant components to constant device memory.
       -__host__ void DEM::transferToConstantDeviceMemory()
       +void DEM::transferToConstantDeviceMemory()
        {
            using std::cout;
        
       t@@ -361,7 +360,7 @@ __global__ void printWorldSize(Float4* dev_walls_nx)
                    dev_walls_nx[0].w);
        }
        
       -__host__ void DEM::updateGridSize()
       +void DEM::updateGridSize()
        {
            //printf("\nDEM::updateGridSize() start\n");
            Float* Lz = new Float;
       t@@ -396,7 +395,7 @@ __host__ void DEM::updateGridSize()
        
        // Allocate device memory for particle variables,
        // tied to previously declared pointers in structures
       -__host__ void DEM::allocateGlobalDeviceMemory(void)
       +void DEM::allocateGlobalDeviceMemory(void)
        {
            // Particle memory size
            unsigned int memSizeF  = sizeof(Float) * np;
       t@@ -482,7 +481,7 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
        
        // Allocate global memory on other devices required for "interact" function.
        // The values of domain_size[ndevices] must be set beforehand.
       -__host__ void DEM::allocateHelperDeviceMemory(void)
       +void DEM::allocateHelperDeviceMemory(void)
        {
            // Particle memory size
            unsigned int memSizeF4 = sizeof(Float4) * np;
       t@@ -555,7 +554,7 @@ __host__ void DEM::allocateHelperDeviceMemory(void)
            cudaSetDevice(device); // select main GPU
        }
        
       -__host__ void DEM::freeHelperDeviceMemory()
       +void DEM::freeHelperDeviceMemory()
        {
            for (int d=0; d<ndevices; d++) {
        
       t@@ -593,7 +592,7 @@ __host__ void DEM::freeHelperDeviceMemory()
            cudaSetDevice(device); // select primary GPU
        }
        
       -__host__ void DEM::freeGlobalDeviceMemory()
       +void DEM::freeGlobalDeviceMemory()
        {
            if (verbose == 1)
                printf("\nFreeing device memory:                           ");
       t@@ -658,7 +657,7 @@ __host__ void DEM::freeGlobalDeviceMemory()
        }
        
        
       -__host__ void DEM::transferToGlobalDeviceMemory(int statusmsg)
       +void DEM::transferToGlobalDeviceMemory(int statusmsg)
        {
            if (verbose == 1 && statusmsg == 1)
                std::cout << "  Transfering data to the device:                 ";
       t@@ -745,7 +744,7 @@ __host__ void DEM::transferToGlobalDeviceMemory(int statusmsg)
                std::cout << "Done" << std::endl;
        }
        
       -__host__ void DEM::transferFromGlobalDeviceMemory()
       +void DEM::transferFromGlobalDeviceMemory()
        {
            //std::cout << "  Transfering data from the device:               ";
        
       t@@ -824,7 +823,7 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
        
        
        // Iterate through time by explicit time integration
       -__host__ void DEM::startTime()
       +void DEM::startTime()
        {
            using std::cout;
            using std::cerr;
       t@@ -1002,16 +1001,18 @@ __host__ void DEM::startTime()
            unsigned int wall0_iz = 10000000;
            // weight of fluid between two cells in z direction
            Float dp_dz;
       -    if (cfd_solver == 0)
       -        dp_dz = fabs(ns.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
       -    else if (cfd_solver == 1) {
       -        dp_dz = fabs(darcy.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
       -
       -        // determine pressure at top wall at t=0
       -        darcy.p_top_orig = darcy.p[d_idx(0,0,darcy.nz-1)]
       -                            - darcy.p_mod_A
       -                            *sin(2.0*M_PI*darcy.p_mod_f*time.current
       -                                    + darcy.p_mod_phi);
       +    if (fluid == 1) {
       +        if (cfd_solver == 0)
       +            dp_dz = fabs(ns.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
       +        else if (cfd_solver == 1) {
       +            dp_dz = fabs(darcy.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
       +
       +            // determine pressure at top wall at t=0
       +            darcy.p_top_orig = darcy.p[d_idx(0,0,darcy.nz-1)]
       +                                - darcy.p_mod_A
       +                                *sin(2.0*M_PI*darcy.p_mod_f*time.current
       +                                        + darcy.p_mod_phi);
       +        }
            }
            //std::cout << "dp_dz = " << dp_dz << std::endl;
        
       t@@ -2589,13 +2590,15 @@ __host__ void DEM::startTime()
                            iter);
        
                    // Empty the dphi values after device to host transfer
       -            if (fluid == 1 && cfd_solver == 1) {
       -                setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
       -                    (dev_darcy_dphi);
       -                cudaThreadSynchronize();
       -                checkForCudaErrorsIter(
       -                        "After setDarcyZeros(dev_darcy_dphi) after transfer",
       -                        iter);
       +            if (fluid == 1) {
       +                if (cfd_solver == 1) {
       +                    setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
       +                        (dev_darcy_dphi);
       +                    cudaThreadSynchronize();
       +                    checkForCudaErrorsIter(
       +                            "After setDarcyZeros(dev_darcy_dphi) after transfer",
       +                            iter);
       +                }
                    }
        
                    // Pause the CPU thread until all CUDA calls previously issued are
       t@@ -2603,8 +2606,9 @@ __host__ void DEM::startTime()
                    cudaThreadSynchronize();
        
                    // Check the numerical stability of the NS solver
       -            if (fluid == 1 && cfd_solver == 0)
       -                checkNSstability();
       +            if (fluid == 1)
       +                if (cfd_solver == 0)
       +                    checkNSstability();
        
                    // Write binary output file
                    time.step_count += 1;
 (DIR) diff --git a/src/sphere.cpp b/src/sphere.cpp
       t@@ -21,7 +21,7 @@ DEM::DEM(const std::string inputbin,
                 const int transferConstMem,
                 const int fluidFlow,
                 const int device)
       -: verbose(verbosity), fluid(fluidFlow), device(device)
       +: verbose(verbosity), device(device), fluid(fluidFlow)
        {
            using std::cout;
            using std::cerr;
       t@@ -874,7 +874,7 @@ void DEM::forcechains(const std::string format, const int threedim,
                        cout << k.x[i].z;
                        cout << " to " << k.x[j].x << ',';
                        if (threedim == 1)
       -                    cout << k.x[j].y, ',';
       +                    cout << k.x[j].y << ',';
                        cout << k.x[j].z;
                        cout << " nohead "
                            << "lw " << ratio * thickness_scaling
 (DIR) diff --git a/tests/io_tests.py b/tests/io_tests.py
       t@@ -27,6 +27,7 @@ compare(orig, py, "Python IO:")
        # Test C++ IO routines
        #orig.run(verbose=True, hideinputfile=True)
        orig.run(dry=True)
       +#orig.run(valgrind=True)
        orig.run()
        cpp = sphere.sim()
        cpp.readbin("../output/" + orig.sid + ".output00000.bin", verbose=False)