tfix various minor issues related to memory initialization, suggest clang-3.8 to nvcc as ccbin - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
(HTM) git clone git://src.adamsgaard.dk/sphere
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) LICENSE
---
(DIR) commit f892592dcad73329dce16040772caa240511678b
(DIR) parent 103a8713606c2aa6257b0d5bcf4ae4b6562fea66
(HTM) Author: Anders Damsgaard <andersd@riseup.net>
Date: Thu, 7 Sep 2017 12:54:25 -0700
fix various minor issues related to memory initialization, suggest clang-3.8 to nvcc as ccbin
Diffstat:
M src/CMakeLists.txt | 11 +++++++----
M src/device.cu | 74 ++++++++++++++++---------------
M src/sphere.cpp | 4 ++--
M tests/io_tests.py | 1 +
4 files changed, 49 insertions(+), 41 deletions(-)
---
(DIR) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
t@@ -3,7 +3,7 @@
#LINK_LIBRARIES("-L${CUDA_SDK_ROOT_DIR}/lib -lcutil_x86_64") # For 64 bit systems
# Ohter folders to include
-SET(CUDA_SDK_ROOT_DIR "/usr/local/cuda-5.0/samples")
+#SET(CUDA_SDK_ROOT_DIR "/usr/local/cuda-5.0/samples")
INCLUDE_DIRECTORIES("${CUDA_SDK_ROOT_DIR}/common/inc")
INCLUDE_DIRECTORIES("${CMAKE_BINARY_DIR}/src")
SET(EXECUTABLE_OUTPUT_PATH "../")
t@@ -16,13 +16,16 @@ INCLUDE(FindCUDA)
IF (GPU_GENERATION EQUAL 1) # Kepler
SET(CUDA_NVCC_FLAGS
#"--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false -ccbin gcc-4.6")
- "--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false -ccbin gcc")
+ #"--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false -ccbin gcc-4.6")
+ #"--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false -ccbin gcc;-Xcompiler -fPIC")
+ "--use_fast_math;-O3;-gencode=arch=compute_35,code=\"sm_35,compute_35\";--fmad=false;-ccbin clang-3.8")
ELSE() # Fermi
SET(CUDA_NVCC_FLAGS
#"--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\";--fmad=false -ccbin gcc-4.6")
- "--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\";--fmad=false -ccbin gcc")
+ #"--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\";--fmad=false -ccbin gcc;-Xcompiler -fPIC")
+ "--use_fast_math;-O3;-gencode=arch=compute_20,code=\"sm_20,compute_20\";--fmad=false;-ccbin clang-3.8")
ENDIF (GPU_GENERATION EQUAL 1)
-SET(CMAKE_CXX_FLAGS "-fPIC ${CMAKE_CXX_FLAGS}")
+#SET(CMAKE_CXX_FLAGS "-fPIC ${CMAKE_CXX_FLAGS}")
# Rule to build executable program
CUDA_ADD_EXECUTABLE(sphere
(DIR) diff --git a/src/device.cu b/src/device.cu
t@@ -48,15 +48,14 @@ int cudaCoresPerSM(int major, int minor)
else if (major == 6 && minor == 1)
return 128;
else
- printf("Error in cudaCoresPerSM",
- "Device compute capability value (%d.%d) not recognized.",
- major, minor);
+ printf("Error in cudaCoresPerSM Device compute capability value "
+ "(%d.%d) not recognized.", major, minor);
return -1;
}
// Wrapper function for initializing the CUDA components.
// Called from main.cpp
-__host__ void DEM::initializeGPU(void)
+void DEM::initializeGPU(void)
{
using std::cout; // stdout
t@@ -149,13 +148,13 @@ __host__ void DEM::initializeGPU(void)
}
// Start timer for kernel profiling
-__host__ void startTimer(cudaEvent_t* kernel_tic)
+void startTimer(cudaEvent_t* kernel_tic)
{
cudaEventRecord(*kernel_tic);
}
// Stop timer for kernel profiling and time to function sum
-__host__ void stopTimer(cudaEvent_t *kernel_tic,
+void stopTimer(cudaEvent_t *kernel_tic,
cudaEvent_t *kernel_toc,
float *kernel_elapsed,
double* sum)
t@@ -280,7 +279,7 @@ __global__ void checkParticlePositions(
// Copy the constant data components to device memory,
// and check whether the values correspond to the
// values in constant memory.
-__host__ void DEM::checkConstantMemory()
+void DEM::checkConstantMemory()
{
// Allocate space in global device memory
Grid* dev_grid;
t@@ -322,7 +321,7 @@ __host__ void DEM::checkConstantMemory()
}
// Copy selected constant components to constant device memory.
-__host__ void DEM::transferToConstantDeviceMemory()
+void DEM::transferToConstantDeviceMemory()
{
using std::cout;
t@@ -361,7 +360,7 @@ __global__ void printWorldSize(Float4* dev_walls_nx)
dev_walls_nx[0].w);
}
-__host__ void DEM::updateGridSize()
+void DEM::updateGridSize()
{
//printf("\nDEM::updateGridSize() start\n");
Float* Lz = new Float;
t@@ -396,7 +395,7 @@ __host__ void DEM::updateGridSize()
// Allocate device memory for particle variables,
// tied to previously declared pointers in structures
-__host__ void DEM::allocateGlobalDeviceMemory(void)
+void DEM::allocateGlobalDeviceMemory(void)
{
// Particle memory size
unsigned int memSizeF = sizeof(Float) * np;
t@@ -482,7 +481,7 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
// Allocate global memory on other devices required for "interact" function.
// The values of domain_size[ndevices] must be set beforehand.
-__host__ void DEM::allocateHelperDeviceMemory(void)
+void DEM::allocateHelperDeviceMemory(void)
{
// Particle memory size
unsigned int memSizeF4 = sizeof(Float4) * np;
t@@ -555,7 +554,7 @@ __host__ void DEM::allocateHelperDeviceMemory(void)
cudaSetDevice(device); // select main GPU
}
-__host__ void DEM::freeHelperDeviceMemory()
+void DEM::freeHelperDeviceMemory()
{
for (int d=0; d<ndevices; d++) {
t@@ -593,7 +592,7 @@ __host__ void DEM::freeHelperDeviceMemory()
cudaSetDevice(device); // select primary GPU
}
-__host__ void DEM::freeGlobalDeviceMemory()
+void DEM::freeGlobalDeviceMemory()
{
if (verbose == 1)
printf("\nFreeing device memory: ");
t@@ -658,7 +657,7 @@ __host__ void DEM::freeGlobalDeviceMemory()
}
-__host__ void DEM::transferToGlobalDeviceMemory(int statusmsg)
+void DEM::transferToGlobalDeviceMemory(int statusmsg)
{
if (verbose == 1 && statusmsg == 1)
std::cout << " Transfering data to the device: ";
t@@ -745,7 +744,7 @@ __host__ void DEM::transferToGlobalDeviceMemory(int statusmsg)
std::cout << "Done" << std::endl;
}
-__host__ void DEM::transferFromGlobalDeviceMemory()
+void DEM::transferFromGlobalDeviceMemory()
{
//std::cout << " Transfering data from the device: ";
t@@ -824,7 +823,7 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
// Iterate through time by explicit time integration
-__host__ void DEM::startTime()
+void DEM::startTime()
{
using std::cout;
using std::cerr;
t@@ -1002,16 +1001,18 @@ __host__ void DEM::startTime()
unsigned int wall0_iz = 10000000;
// weight of fluid between two cells in z direction
Float dp_dz;
- if (cfd_solver == 0)
- dp_dz = fabs(ns.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
- else if (cfd_solver == 1) {
- dp_dz = fabs(darcy.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
-
- // determine pressure at top wall at t=0
- darcy.p_top_orig = darcy.p[d_idx(0,0,darcy.nz-1)]
- - darcy.p_mod_A
- *sin(2.0*M_PI*darcy.p_mod_f*time.current
- + darcy.p_mod_phi);
+ if (fluid == 1) {
+ if (cfd_solver == 0)
+ dp_dz = fabs(ns.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
+ else if (cfd_solver == 1) {
+ dp_dz = fabs(darcy.rho_f*params.g[2]*grid.L[2]/grid.num[2]);
+
+ // determine pressure at top wall at t=0
+ darcy.p_top_orig = darcy.p[d_idx(0,0,darcy.nz-1)]
+ - darcy.p_mod_A
+ *sin(2.0*M_PI*darcy.p_mod_f*time.current
+ + darcy.p_mod_phi);
+ }
}
//std::cout << "dp_dz = " << dp_dz << std::endl;
t@@ -2589,13 +2590,15 @@ __host__ void DEM::startTime()
iter);
// Empty the dphi values after device to host transfer
- if (fluid == 1 && cfd_solver == 1) {
- setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
- (dev_darcy_dphi);
- cudaThreadSynchronize();
- checkForCudaErrorsIter(
- "After setDarcyZeros(dev_darcy_dphi) after transfer",
- iter);
+ if (fluid == 1) {
+ if (cfd_solver == 1) {
+ setDarcyZeros<Float> <<<dimGridFluid, dimBlockFluid>>>
+ (dev_darcy_dphi);
+ cudaThreadSynchronize();
+ checkForCudaErrorsIter(
+ "After setDarcyZeros(dev_darcy_dphi) after transfer",
+ iter);
+ }
}
// Pause the CPU thread until all CUDA calls previously issued are
t@@ -2603,8 +2606,9 @@ __host__ void DEM::startTime()
cudaThreadSynchronize();
// Check the numerical stability of the NS solver
- if (fluid == 1 && cfd_solver == 0)
- checkNSstability();
+ if (fluid == 1)
+ if (cfd_solver == 0)
+ checkNSstability();
// Write binary output file
time.step_count += 1;
(DIR) diff --git a/src/sphere.cpp b/src/sphere.cpp
t@@ -21,7 +21,7 @@ DEM::DEM(const std::string inputbin,
const int transferConstMem,
const int fluidFlow,
const int device)
-: verbose(verbosity), fluid(fluidFlow), device(device)
+: verbose(verbosity), device(device), fluid(fluidFlow)
{
using std::cout;
using std::cerr;
t@@ -874,7 +874,7 @@ void DEM::forcechains(const std::string format, const int threedim,
cout << k.x[i].z;
cout << " to " << k.x[j].x << ',';
if (threedim == 1)
- cout << k.x[j].y, ',';
+ cout << k.x[j].y << ',';
cout << k.x[j].z;
cout << " nohead "
<< "lw " << ratio * thickness_scaling
(DIR) diff --git a/tests/io_tests.py b/tests/io_tests.py
t@@ -27,6 +27,7 @@ compare(orig, py, "Python IO:")
# Test C++ IO routines
#orig.run(verbose=True, hideinputfile=True)
orig.run(dry=True)
+#orig.run(valgrind=True)
orig.run()
cpp = sphere.sim()
cpp.readbin("../output/" + orig.sid + ".output00000.bin", verbose=False)