IO now works in Python, C++ and CUDA, see python/tests.py. Now working on implementing raytracer - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
git clone git://src.adamsgaard.dk/sphere
---
commit 472d81e85b29f2ad92df2031ff56a9ac2cdf8699
parent c494a43f245cf27c2c498ce27636347fbbb1a909
Author: Anders Damsgaard <adc@geo.au.dk>
Date: Fri, 2 Nov 2012 09:29:51 +0100
IO now works in Python, C++ and CUDA, see python/tests.py. Now working on implementing raytracer
Diffstat:
M python/sphere.py | 77 +++++++++++++++----------------
M python/tests.py | 20 +++++++++++---------
M src/Makefile | 4 ++--
M src/contactsearch.cuh | 29 +++++++++++++++++++----------
M src/datatypes.h | 1 -
M src/device.cu | 346 ++++++++++++++++---------------
M src/file_io.cpp | 1 -
M src/integration.cuh | 18 ++++++++++--------
M src/sphere.cpp | 22 ++++++++++++----------
M src/sphere.h | 51 +++++++++++++++++++++++++------
M src/utility.cu | 4 ++--
11 files changed, 315 insertions(+), 258 deletions(-)
---
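Note: the IO round trip mentioned in the commit message is exercised by python/tests.py (diffed below): a Spherebin is written from Python, the same binary is passed through the C++ and CUDA programs, and their output files are read back into Python and compared field by field through Spherebin.__cmp__. A condensed, Python-only sketch of that round trip, simplified from the updated test and assuming it is run from the python/ directory next to sphere.py:

from sphere import Spherebin
import numpy

# Build a wall-less configuration, as the updated test does
orig = Spherebin(np = 100, nw = 0)
orig.generateRadii()
orig.defaultParams()
orig.initRandomGridPos(g = numpy.zeros(orig.nd))
orig.initTemporal(current = 0.0, total = 0.0)
orig.writebin("orig.bin", verbose=False)

# Read the file back and compare; Spherebin.__cmp__ returns 0 when
# every field matches, so == holds only for identical objects
copy = Spherebin()
copy.readbin("orig.bin", verbose=False)
print("Python IO: " + ("Passed" if orig == copy else "Failed"))
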
diff --git a/python/sphere.py b/python/sphere.py
@@ -51,17 +51,21 @@ class Spherebin:
self.ev = numpy.zeros(self.np, dtype=numpy.float64)
self.p = numpy.zeros(self.np, dtype=numpy.float64)
- self.g = numpy.array([0.0, 0.0, 0.0], dtype=numpy.float64)
- self.k_n = numpy.ones(1, dtype=numpy.float64) * 1.16e9
- self.k_t = numpy.ones(1, dtype=numpy.float64) * 1.16e9
- self.k_r = numpy.zeros(1, dtype=numpy.float64)
- self.gamma_n = numpy.zeros(1, dtype=numpy.float64)
- self.gamma_t = numpy.zeros(1, dtype=numpy.float64)
- self.gamma_r = numpy.zeros(1, dtype=numpy.float64)
- self.mu_s = numpy.ones(1, dtype=numpy.float64)
- self.mu_d = numpy.ones(1, dtype=numpy.float64)
- self.mu_r = numpy.zeros(1, dtype=numpy.float64)
- self.rho = numpy.ones(1, dtype=numpy.float64) * 2600.0
+ self.g = numpy.array([0.0, 0.0, 0.0], dtype=numpy.float64)
+ self.k_n = numpy.ones(1, dtype=numpy.float64) * 1.16e9
+ self.k_t = numpy.ones(1, dtype=numpy.float64) * 1.16e9
+ self.k_r = numpy.zeros(1, dtype=numpy.float64)
+ self.gamma_n = numpy.zeros(1, dtype=numpy.float64)
+ self.gamma_t = numpy.zeros(1, dtype=numpy.float64)
+ self.gamma_r = numpy.zeros(1, dtype=numpy.float64)
+ self.mu_s = numpy.ones(1, dtype=numpy.float64)
+ self.mu_d = numpy.ones(1, dtype=numpy.float64)
+ self.mu_r = numpy.zeros(1, dtype=numpy.float64)
+ self.gamma_wn = numpy.ones(1, dtype=numpy.float64) * 1.0e3
+ self.gamma_wt = numpy.ones(1, dtype=numpy.float64) * 1.0e3
+ self.mu_ws = numpy.ones(1, dtype=numpy.float64)
+ self.mu_wd = numpy.ones(1, dtype=numpy.float64)
+ self.rho = numpy.ones(1, dtype=numpy.float64) * 2600.0
self.contactmodel = numpy.ones(1, dtype=numpy.uint32) * 2 # contactLinear default
self.kappa = numpy.zeros(1, dtype=numpy.float64)
self.db = numpy.zeros(1, dtype=numpy.float64)
@@ -70,19 +74,16 @@ class Spherebin:
# Wall data
self.nw = numpy.ones(1, dtype=numpy.uint32) * nw
self.wmode = numpy.zeros(self.nw, dtype=numpy.int32)
+
self.w_n = numpy.zeros(self.nw*self.nd, dtype=numpy.float64).reshape(self.nw,self.nd)
- self.w_n[nw-1,nd-1] = -1.0
+ if (self.nw > 0):
+ self.w_n[0,2] = -1.0
self.w_x = numpy.ones(self.nw, dtype=numpy.float64)
self.w_m = numpy.zeros(self.nw, dtype=numpy.float64)
self.w_vel = numpy.zeros(self.nw, dtype=numpy.float64)
self.w_force = numpy.zeros(self.nw, dtype=numpy.float64)
self.w_devs = numpy.zeros(self.nw, dtype=numpy.float64)
- # x- and y-boundary behavior
- self.gamma_wn = numpy.ones(1, dtype=numpy.float64) * 1.0e3
- self.gamma_wt = numpy.ones(1, dtype=numpy.float64) * 1.0e3
- self.gamma_wr = numpy.ones(1, dtype=numpy.float64) * 1.0e3
-
# Compare the values of two Spherebin objects, and check
# whether the values are identical
def __cmp__(self, other):
@@ -136,8 +137,7 @@ class Spherebin:
(self.w_force == other.w_force).all() and\
(self.w_devs == other.w_devs).all() and\
self.gamma_wn == other.gamma_wn and\
- self.gamma_wt == other.gamma_wt and\
- self.gamma_wr == other.gamma_wr\
+ self.gamma_wt == other.gamma_wt\
).all() == True):
return 0 # All equal
else:
@@ -224,6 +224,10 @@ class Spherebin:
self.mu_s = numpy.fromfile(fh, dtype=numpy.float64, count=1)
self.mu_d = numpy.fromfile(fh, dtype=numpy.float64, count=1)
self.mu_r = numpy.fromfile(fh, dtype=numpy.float64, count=1)
+ self.gamma_wn = numpy.fromfile(fh, dtype=numpy.float64, count=1)
+ self.gamma_wt = numpy.fromfile(fh, dtype=numpy.float64, count=1)
+ self.mu_ws = numpy.fromfile(fh, dtype=numpy.float64, count=1)
+ self.mu_wd = numpy.fromfile(fh, dtype=numpy.float64, count=1)
self.rho = numpy.fromfile(fh, dtype=numpy.float64, count=1)
self.contactmodel = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
self.kappa = numpy.fromfile(fh, dtype=numpy.float64, count=1)
@@ -250,10 +254,6 @@ class Spherebin:
self.w_force = numpy.fromfile(fh, dtype=numpy.float64, count=1)
self.w_devs = numpy.fromfile(fh, dtype=numpy.float64, count=1)
- # x- and y-boundary behavior
- self.gamma_wn = numpy.fromfile(fh, dtype=numpy.float64, count=1)
- self.gamma_wt = numpy.fromfile(fh, dtype=numpy.float64, count=1)
- self.gamma_wr = numpy.fromfile(fh, dtype=numpy.float64, count=1)
fh.close()
@@ -325,6 +325,10 @@ class Spherebin:
fh.write(self.mu_s.astype(numpy.float64))
fh.write(self.mu_d.astype(numpy.float64))
fh.write(self.mu_r.astype(numpy.float64))
+ fh.write(self.gamma_wn.astype(numpy.float64))
+ fh.write(self.gamma_wt.astype(numpy.float64))
+ fh.write(self.mu_ws.astype(numpy.float64))
+ fh.write(self.mu_wd.astype(numpy.float64))
fh.write(self.rho.astype(numpy.float64))
fh.write(self.contactmodel.astype(numpy.uint32))
fh.write(self.kappa.astype(numpy.float64))
@@ -343,11 +347,6 @@ class Spherebin:
fh.write(self.w_force[i].astype(numpy.float64))
fh.write(self.w_devs[i].astype(numpy.float64))
- # x- and y-boundary behavior
- fh.write(self.gamma_wn.astype(numpy.float64))
- fh.write(self.gamma_wt.astype(numpy.float64))
- fh.write(self.gamma_wr.astype(numpy.float64))
-
fh.close()
finally:
@@ -633,14 +632,15 @@ class Spherebin:
self.L = self.num * cellsize
# Initialize upper wall
- self.wmode[0] = 0
- self.w_n[0,2] = -1.0
- self.w_x[0] = self.L[2]
- self.w_m[0] = self.rho[0] * self.np * math.pi * r_max**3
- self.w_vel[0] = 0.0
- self.w_force[0] = 0.0
- self.w_devs[0] = 0.0
- self.nw = numpy.ones(1, dtype=numpy.uint32) * 1
+ if (self.nw > 0):
+ self.wmode[0] = 0
+ self.w_n[0,2] = -1.0
+ self.w_x[0] = self.L[2]
+ self.w_m[0] = self.rho[0] * self.np * math.pi * r_max**3
+ self.w_vel[0] = 0.0
+ self.w_force[0] = 0.0
+ self.w_devs[0] = 0.0
+ self.nw = numpy.ones(1, dtype=numpy.uint32) * 1
# Adjust grid and upper wall for consolidation under deviatoric stress
def consolidate(self, deviatoric_stress = 10e3,
t@@ -755,8 +755,6 @@ class Spherebin:
# Set wall viscosities to zero
self.gamma_wn[0] = 0.0
self.gamma_wt[0] = 0.0
- self.gamma_wr[0] = 0.0
-
def initTemporal(self, total,
@@ -792,7 +790,6 @@ class Spherebin:
gamma_r = 0,
gamma_wn = 1e3,
gamma_wt = 1e3,
- gamma_wr = 2e3,
capillaryCohesion = 0):
""" Initialize particle parameters to default values.
Radii must be set prior to calling this function.
@@ -837,8 +834,6 @@ class Spherebin:
# Wall viscosities
self.gamma_wn[0] = gamma_wn # normal
self.gamma_wt[0] = gamma_wt # sliding
- self.gamma_wr[0] = gamma_wr # rolling
-
### Parameters related to capillary bonds
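Note: with this change the wall viscosities gamma_wn and gamma_wt, together with the new wall friction coefficients mu_ws and mu_wd, live in the main scalar parameter block of the binary format (between mu_r and rho) instead of trailing the per-wall data, and gamma_wr is dropped. A minimal sketch of the ordering that readbin() and writebin() now have to keep in sync; the helper names below are illustrative only and not part of sphere.py:

import numpy

# Field order for this part of the scalar parameter block, as written by
# writebin() and expected by readbin() after this commit (surrounding
# fields omitted).
PARAM_ORDER = ("mu_s", "mu_d", "mu_r",
               "gamma_wn", "gamma_wt", "mu_ws", "mu_wd",
               "rho")

def write_param_block(fh, sb):
    # fh: file opened in binary write mode, sb: Spherebin instance
    for name in PARAM_ORDER:
        fh.write(getattr(sb, name).astype(numpy.float64))

def read_param_block(fh, sb):
    # Must consume the fields in exactly the order they were written
    for name in PARAM_ORDER:
        setattr(sb, name, numpy.fromfile(fh, dtype=numpy.float64, count=1))

Any further reordering of these fields would have to be mirrored by the C++ and CUDA readers for the IO tests below to keep passing.
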
diff --git a/python/tests.py b/python/tests.py
@@ -12,17 +12,13 @@ def compare(first, second, string):
print("### Input/output tests ###")
# Generate data in python
-orig = Spherebin(100)
+orig = Spherebin(np = 100, nw = 0)
orig.generateRadii()
orig.defaultParams()
-orig.initRandomGridPos()
-orig.initTemporal(total = 0.0, current = 0.0)
-orig.xysum = numpy.ones(orig.np*2, dtype=numpy.float64).reshape(orig.np, 2) * 1
-orig.vel = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 2
-orig.force = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 3
-orig.angpos = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 4
-orig.angvel = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 5
-orig.torque = numpy.ones(orig.np*orig.nd, dtype=numpy.float64).reshape(orig.np, orig.nd) * 6
+orig.initRandomGridPos(g = numpy.zeros(orig.nd))
+orig.initTemporal(current = 0.0, total = 0.0)
+orig.time_total = 2.0*orig.time_dt;
+orig.time_file_dt = orig.time_dt;
orig.writebin("orig.bin", verbose=False)
# Test Python IO routines
@@ -36,4 +32,10 @@ cpp = Spherebin()
cpp.readbin("../output/orig.output0.bin", verbose=False)
compare(orig, cpp, "C++ IO: ")
+# Test CUDA IO routines
+cuda = Spherebin()
+cuda.readbin("../output/orig.output1.bin", verbose=False)
+cuda.time_current = orig.time_current
+cuda.time_step_count = orig.time_step_count
+compare(orig, cuda, "CUDA IO: ")
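Note: the compare() helper the test calls is only visible by its signature in the hunk header above; its body is not part of this diff. A minimal placeholder consistent with how it is used here, relying on Spherebin.__cmp__ returning 0 when all fields are equal (an assumption, not the committed implementation):

def compare(first, second, string):
    # Hypothetical body: Spherebin.__cmp__ makes == hold only when every
    # particle, wall and parameter field matches exactly.
    if first == second:
        print(string + "Passed")
    else:
        print(string + "Failed")
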
diff --git a/src/Makefile b/src/Makefile
@@ -27,8 +27,8 @@ NVCCFLAGS=--use_fast_math -O3 -m64 -gencode=arch=compute_20,code=\"sm_20,compute
# Debugable code? Beware that enabling this option will
# considerably slow down the execution.
-CCFLAGS=-g -O0 -Wall
-NVCCFLAGS=-g -G -O0 -m64 -gencode=arch=compute_20,code=\"sm_20,compute_20\" -Xcompiler "-O0 -g"
+#CCFLAGS=-g -O0 -Wall
+#NVCCFLAGS=-g -G -O0 -m64 -gencode=arch=compute_20,code=\"sm_20,compute_20\" -Xcompiler "-O0 -g"
DATE=`date +'%Y.%m.%d-%H:%M:%S'`
BACKUPNAME=sphere.$(DATE).tar.gz
diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
@@ -91,8 +91,8 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
Float4* dev_angvel_sorted,
unsigned int* dev_cellStart,
unsigned int* dev_cellEnd,
- Float4* dev_w_nx,
- Float4* dev_w_mvfd)
+ Float4* dev_walls_nx,
+ Float4* dev_walls_mvfd)
//uint4 bonds)
{
@@ -382,9 +382,9 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
Float* dev_es,
Float* dev_ev,
Float* dev_p,
- Float4* dev_w_nx,
- Float4* dev_w_mvfd,
- Float* dev_w_force, //uint4* dev_bonds_sorted,
+ Float4* dev_walls_nx,
+ Float4* dev_walls_mvfd,
+ Float* dev_walls_force_pp, //uint4* dev_bonds_sorted,
unsigned int* dev_contacts,
Float4* dev_distmod,
Float4* dev_delta_t)
@@ -399,9 +399,6 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
Float4 x_a = dev_x_sorted[idx_a];
Float radius_a = x_a.w;
- // Fetch wall data in global read
- Float4 w_up_nx = dev_w_nx[0];
- Float4 w_up_mvfd = dev_w_mvfd[0];
// Fetch world dimensions in constant memory read
Float3 origo = MAKE_FLOAT3(devC_grid.origo[0],
@@ -411,6 +408,17 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
devC_grid.L[1],
devC_grid.L[2]);
+ // Fetch wall data in global read
+ Float4 w_up_nx;
+ Float4 w_up_mvfd;
+ if (devC_nw > 0) {
+ w_up_nx = dev_walls_nx[0];
+ w_up_mvfd = dev_walls_mvfd[0];
+ } else {
+ w_up_nx = MAKE_FLOAT4(0.0f, 0.0f, -1.0f, L.z);
+ w_up_mvfd = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
+ }
+
// Index of particle which is bonded to particle A.
// The index is equal to the particle no (p.np)
// if particle A is bond-less.
@@ -526,7 +534,7 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
dev_x_sorted,
dev_vel_sorted, dev_angvel_sorted,
dev_cellStart, dev_cellEnd,
- dev_w_nx, dev_w_mvfd);
+ dev_walls_nx, dev_walls_mvfd);
}
}
}
@@ -629,7 +637,8 @@ __global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted
dev_es[orig_idx] += es_dot * devC_dt;
dev_ev[orig_idx] += ev_dot * devC_dt;
dev_p[orig_idx] = p;
- dev_w_force[orig_idx] = w_force;
+ if (devC_nw > 0)
+ dev_walls_force_pp[orig_idx] = w_force;
}
} // End of interact(...)
diff --git a/src/datatypes.h b/src/datatypes.h
@@ -95,7 +95,6 @@ struct Walls {
int wmode[MAXWALLS]; // Wall modes
Float4* nx; // Wall normal and position
Float4* mvfd; // Wall mass, velocity, force and dev. stress
- Float* force; // Resulting forces on walls per particle
};
#endif
diff --git a/src/device.cu b/src/device.cu
@@ -220,38 +220,44 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
if (verbose == 1)
std::cout << " Allocating global device memory: ";
- // Particle arrays
- cudaMalloc((void**)&dev_k->x, memSizeF4);
- cudaMalloc((void**)&dev_sort->x_sorted, memSizeF4);
- cudaMalloc((void**)&dev_k->vel, memSizeF4);
- cudaMalloc((void**)&dev_sort->vel_sorted, memSizeF4);
- cudaMalloc((void**)&dev_k->angvel, memSizeF4);
- cudaMalloc((void**)&dev_sort->angvel_sorted, memSizeF4);
- cudaMalloc((void**)&dev_k->acc, memSizeF4);
k.acc = new Float4[np];
- cudaMalloc((void**)&dev_k->angacc, memSizeF4);
k.angacc = new Float4[np];
- cudaMalloc((void**)&dev_k->force, memSizeF4);
- cudaMalloc((void**)&dev_k->torque, memSizeF4);
- cudaMalloc((void**)&dev_k->angpos, memSizeF4);
- cudaMalloc((void**)&dev_e->es_dot, memSizeF);
- cudaMalloc((void**)&dev_e->ev_dot, memSizeF);
- cudaMalloc((void**)&dev_e->es, memSizeF);
- cudaMalloc((void**)&dev_e->ev, memSizeF);
- cudaMalloc((void**)&dev_e->p, memSizeF);
- // Cell-related arrays
- cudaMalloc((void**)&dev_sort->gridParticleCellID, sizeof(unsigned int)*np);
- cudaMalloc((void**)&dev_sort->gridParticleIndex, sizeof(unsigned int)*np);
- cudaMalloc((void**)&dev_sort->cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
- cudaMalloc((void**)&dev_sort->cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
+ // Kinematics arrays
+ cudaMalloc((void**)&dev_x, memSizeF4);
+ cudaMalloc((void**)&dev_xysum, memSizeF4);
+ cudaMalloc((void**)&dev_vel, memSizeF4);
+ cudaMalloc((void**)&dev_acc, memSizeF4);
+ cudaMalloc((void**)&dev_force, memSizeF4);
+ cudaMalloc((void**)&dev_angpos, memSizeF4);
+ cudaMalloc((void**)&dev_angvel, memSizeF4);
+ cudaMalloc((void**)&dev_angacc, memSizeF4);
+ cudaMalloc((void**)&dev_torque, memSizeF4);
// Particle contact bookkeeping arrays
- cudaMalloc((void**)&dev_k->contacts, sizeof(unsigned int)*np*NC); // Max NC contacts per particle
- cudaMalloc((void**)&dev_k->distmod, sizeof(Float4)*np*NC);
- cudaMalloc((void**)&dev_k->delta_t, sizeof(Float4)*np*NC);
+ cudaMalloc((void**)&dev_contacts, sizeof(unsigned int)*np*NC); // Max NC contacts per particle
+ cudaMalloc((void**)&dev_distmod, memSizeF4*NC);
+ cudaMalloc((void**)&dev_delta_t, memSizeF4*NC);
+
+ // Sorted arrays
+ cudaMalloc((void**)&dev_x_sorted, memSizeF4);
+ cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
+ cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
+
+ // Energy arrays
+ cudaMalloc((void**)&dev_es_dot, memSizeF);
+ cudaMalloc((void**)&dev_ev_dot, memSizeF);
+ cudaMalloc((void**)&dev_es, memSizeF);
+ cudaMalloc((void**)&dev_ev, memSizeF);
+ cudaMalloc((void**)&dev_p, memSizeF);
+
+ // Cell-related arrays
+ cudaMalloc((void**)&dev_gridParticleCellID, sizeof(unsigned int)*np);
+ cudaMalloc((void**)&dev_gridParticleIndex, sizeof(unsigned int)*np);
+ cudaMalloc((void**)&dev_cellStart, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
+ cudaMalloc((void**)&dev_cellEnd, sizeof(unsigned int)*grid.num[0]*grid.num[1]*grid.num[2]);
- // Host contact bookkeeping arrays
+ // Host contact bookkeeping arrays
k.contacts = new unsigned int[np*NC];
// Initialize contacts lists to np
for (unsigned int i=0; i<(np*NC); ++i)
@@ -260,10 +266,11 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
k.delta_t = new Float4[np*NC];
// Wall arrays
- cudaMalloc((void**)&dev_walls->nx, sizeof(Float4)*walls.nw);
- cudaMalloc((void**)&dev_walls->mvfd, sizeof(Float4)*walls.nw);
- cudaMalloc((void**)&dev_walls->force, sizeof(Float)*walls.nw*np);
- // dev_w_force_partial allocated later
+ cudaMalloc((void**)&dev_walls_wmode, sizeof(int)*walls.nw);
+ cudaMalloc((void**)&dev_walls_nx, sizeof(Float4)*walls.nw);
+ cudaMalloc((void**)&dev_walls_mvfd, sizeof(Float4)*walls.nw);
+ cudaMalloc((void**)&dev_walls_force_pp, sizeof(Float)*walls.nw*np);
+ // dev_walls_force_partial allocated later
checkForCudaErrors("End of allocateGlobalDeviceMemory");
if (verbose == 1)
@@ -275,36 +282,40 @@ __host__ void DEM::freeGlobalDeviceMemory()
if (verbose == 1)
printf("\nLiberating device memory: ");
// Particle arrays
- cudaFree(dev_k->x);
- cudaFree(dev_sort->x_sorted);
- cudaFree(dev_k->vel);
- cudaFree(dev_sort->vel_sorted);
- cudaFree(dev_k->angvel);
- cudaFree(dev_sort->angvel_sorted);
- cudaFree(dev_k->acc);
- cudaFree(dev_k->angacc);
- cudaFree(dev_k->force);
- cudaFree(dev_k->torque);
- cudaFree(dev_k->angpos);
- cudaFree(dev_e->es_dot);
- cudaFree(dev_e->ev_dot);
- cudaFree(dev_e->es);
- cudaFree(dev_e->ev);
- cudaFree(dev_e->p);
- cudaFree(dev_k->contacts);
- cudaFree(dev_k->distmod);
- cudaFree(dev_k->delta_t);
+ cudaFree(dev_x);
+ cudaFree(dev_xysum);
+ cudaFree(dev_vel);
+ cudaFree(dev_acc);
+ cudaFree(dev_force);
+ cudaFree(dev_angpos);
+ cudaFree(dev_angvel);
+ cudaFree(dev_angacc);
+ cudaFree(dev_torque);
+
+ cudaFree(dev_contacts);
+ cudaFree(dev_distmod);
+ cudaFree(dev_delta_t);
+
+ cudaFree(dev_es_dot);
+ cudaFree(dev_es);
+ cudaFree(dev_ev_dot);
+ cudaFree(dev_ev);
+ cudaFree(dev_p);
+
+ cudaFree(dev_x_sorted);
+ cudaFree(dev_vel_sorted);
+ cudaFree(dev_angvel_sorted);
// Cell-related arrays
- cudaFree(dev_sort->gridParticleIndex);
- cudaFree(dev_sort->cellStart);
- cudaFree(dev_sort->cellEnd);
+ cudaFree(dev_gridParticleIndex);
+ cudaFree(dev_cellStart);
+ cudaFree(dev_cellEnd);
// Wall arrays
- cudaFree(dev_walls->nx);
- cudaFree(dev_walls->mvfd);
- cudaFree(dev_walls->force);
- //cudaFree(dev_w_force_partial);
+ cudaFree(dev_walls_nx);
+ cudaFree(dev_walls_mvfd);
+ cudaFree(dev_walls_force_partial);
+ cudaFree(dev_walls_force_pp);
if (verbose == 1)
printf("Done\n");
@@ -321,55 +332,53 @@ __host__ void DEM::transferToGlobalDeviceMemory()
unsigned int memSizeF4 = sizeof(Float4) * np;
// Copy static-size structure data from host to global device memory
- cudaMemcpy(dev_time, &time, sizeof(Time), cudaMemcpyHostToDevice);
+ //cudaMemcpy(dev_time, &time, sizeof(Time), cudaMemcpyHostToDevice);
// Kinematic particle values
- cudaMemcpy( dev_k->x, k.x,
+ cudaMemcpy( dev_x, k.x,
memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->xysum, k.xysum,
+ cudaMemcpy( dev_xysum, k.xysum,
sizeof(Float2)*np, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->vel, k.vel,
+ cudaMemcpy( dev_vel, k.vel,
memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->acc, k.acc,
+ cudaMemcpy( dev_acc, k.acc,
memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->force, k.force,
+ cudaMemcpy( dev_force, k.force,
memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->angpos, k.angpos,
+ cudaMemcpy( dev_angpos, k.angpos,
memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->angvel, k.angvel,
+ cudaMemcpy( dev_angvel, k.angvel,
memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->angacc, k.angacc,
+ cudaMemcpy( dev_angacc, k.angacc,
memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->torque, k.torque,
+ cudaMemcpy( dev_torque, k.torque,
memSizeF4, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->contacts, k.contacts,
+ cudaMemcpy( dev_contacts, k.contacts,
sizeof(unsigned int)*np*NC, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->distmod, k.distmod,
+ cudaMemcpy( dev_distmod, k.distmod,
memSizeF4*NC, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_k->delta_t, k.delta_t,
+ cudaMemcpy( dev_delta_t, k.delta_t,
memSizeF4*NC, cudaMemcpyHostToDevice);
// Individual particle energy values
- cudaMemcpy( dev_e->es_dot, e.es_dot,
+ cudaMemcpy( dev_es_dot, e.es_dot,
memSizeF, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_e->es, e.es,
+ cudaMemcpy( dev_es, e.es,
memSizeF, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_e->ev_dot, e.ev_dot,
+ cudaMemcpy( dev_ev_dot, e.ev_dot,
memSizeF, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_e->ev, e.ev,
+ cudaMemcpy( dev_ev, e.ev,
memSizeF, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_e->p, e.p,
+ cudaMemcpy( dev_p, e.p,
memSizeF, cudaMemcpyHostToDevice);
// Wall parameters
- cudaMemcpy( dev_walls->wmode, walls.wmode,
+ cudaMemcpy( dev_walls_wmode, walls.wmode,
sizeof(int)*walls.nw, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_walls->nx, walls.nx,
+ cudaMemcpy( dev_walls_nx, walls.nx,
sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_walls->mvfd, walls.mvfd,
+ cudaMemcpy( dev_walls_mvfd, walls.mvfd,
sizeof(Float4)*walls.nw, cudaMemcpyHostToDevice);
- cudaMemcpy( dev_walls->force, walls.force,
- memSizeF*walls.nw, cudaMemcpyHostToDevice);
checkForCudaErrors("End of transferToGlobalDeviceMemory");
if (verbose == 1)
@@ -385,55 +394,53 @@ __host__ void DEM::transferFromGlobalDeviceMemory()
unsigned int memSizeF4 = sizeof(Float4) * np;
// Copy static-size structure data from host to global device memory
- cudaMemcpy(&time, dev_time, sizeof(Time), cudaMemcpyDeviceToHost);
+ //cudaMemcpy(&time, dev_time, sizeof(Time), cudaMemcpyDeviceToHost);
// Kinematic particle values
- cudaMemcpy( k.x, dev_k->x,
+ cudaMemcpy( k.x, dev_x,
memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.xysum, dev_k->xysum,
+ cudaMemcpy( k.xysum, dev_xysum,
sizeof(Float2)*np, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.vel, dev_k->vel,
+ cudaMemcpy( k.vel, dev_vel,
memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.acc, dev_k->acc,
+ cudaMemcpy( k.acc, dev_acc,
memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.force, dev_k->force,
+ cudaMemcpy( k.force, dev_force,
memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.angpos, dev_k->angpos,
+ cudaMemcpy( k.angpos, dev_angpos,
memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.angvel, dev_k->angvel,
+ cudaMemcpy( k.angvel, dev_angvel,
memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.angacc, dev_k->angacc,
+ cudaMemcpy( k.angacc, dev_angacc,
memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.torque, dev_k->torque,
+ cudaMemcpy( k.torque, dev_torque,
memSizeF4, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.contacts, dev_k->contacts,
+ cudaMemcpy( k.contacts, dev_contacts,
sizeof(unsigned int)*np*NC, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.distmod, dev_k->distmod,
+ cudaMemcpy( k.distmod, dev_distmod,
memSizeF4*NC, cudaMemcpyDeviceToHost);
- cudaMemcpy( k.delta_t, dev_k->delta_t,
+ cudaMemcpy( k.delta_t, dev_delta_t,
memSizeF4*NC, cudaMemcpyDeviceToHost);
// Individual particle energy values
- cudaMemcpy( e.es_dot, dev_e->es_dot,
+ cudaMemcpy( e.es_dot, dev_es_dot,
memSizeF, cudaMemcpyDeviceToHost);
- cudaMemcpy( e.es, dev_e->es,
+ cudaMemcpy( e.es, dev_es,
memSizeF, cudaMemcpyDeviceToHost);
- cudaMemcpy( e.ev_dot, dev_e->ev_dot,
+ cudaMemcpy( e.ev_dot, dev_ev_dot,
memSizeF, cudaMemcpyDeviceToHost);
- cudaMemcpy( e.ev, dev_e->ev,
+ cudaMemcpy( e.ev, dev_ev,
memSizeF, cudaMemcpyDeviceToHost);
- cudaMemcpy( e.p, dev_e->p,
+ cudaMemcpy( e.p, dev_p,
memSizeF, cudaMemcpyDeviceToHost);
// Wall parameters
- cudaMemcpy( walls.wmode, dev_walls->wmode,
+ cudaMemcpy( walls.wmode, dev_walls_wmode,
sizeof(int)*walls.nw, cudaMemcpyDeviceToHost);
- cudaMemcpy( walls.nx, dev_walls->nx,
+ cudaMemcpy( walls.nx, dev_walls_nx,
sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
- cudaMemcpy( walls.mvfd, dev_walls->mvfd,
+ cudaMemcpy( walls.mvfd, dev_walls_mvfd,
sizeof(Float4)*walls.nw, cudaMemcpyDeviceToHost);
- cudaMemcpy( walls.force, dev_walls->force,
- memSizeF*walls.nw, cudaMemcpyDeviceToHost);
checkForCudaErrors("End of transferFromGlobalDeviceMemory");
}
@@ -476,8 +483,8 @@ __host__ void DEM::startTime()
// Shared memory per block
unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
- Float* dev_w_force_partial;
- cudaMalloc((void**)&dev_w_force_partial, sizeof(Float)*dimGrid.x);
+ // Pre-sum of force per wall
+ cudaMalloc((void**)&dev_walls_force_partial, sizeof(Float)*dimGrid.x);
// Report to stdout
if (verbose == 1) {
@@ -562,9 +569,9 @@ __host__ void DEM::startTime()
// in the fine, uniform and homogenous grid.
if (PROFILING == 1)
startTimer(&kernel_tic);
- calcParticleCellID<<<dimGrid, dimBlock>>>(dev_sort->gridParticleCellID,
- dev_sort->gridParticleIndex,
- dev_k->x);
+ calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
+ dev_gridParticleIndex,
+ dev_x);
// Synchronization point
cudaThreadSynchronize();
@@ -576,9 +583,9 @@ __host__ void DEM::startTime()
// Sort particle (key, particle ID) pairs by hash key with Thrust radix sort
if (PROFILING == 1)
startTimer(&kernel_tic);
- thrust::sort_by_key(thrust::device_ptr<uint>(dev_sort->gridParticleCellID),
- thrust::device_ptr<uint>(dev_sort->gridParticleCellID + np),
- thrust::device_ptr<uint>(dev_sort->gridParticleIndex));
+ thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID),
+ thrust::device_ptr<uint>(dev_gridParticleCellID + np),
+ thrust::device_ptr<uint>(dev_gridParticleIndex));
cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implicitly?
if (PROFILING == 1)
stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
@@ -588,7 +595,7 @@ __host__ void DEM::startTime()
// Zero cell array values by setting cellStart to its highest possible value,
// specified with pointer value 0xffffffff, which for a 32 bit unsigned int
// is 4294967295.
- cudaMemset(dev_sort->cellStart, 0xffffffff,
+ cudaMemset(dev_cellStart, 0xffffffff,
grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
cudaThreadSynchronize();
checkForCudaErrors("Post cudaMemset");
@@ -597,15 +604,15 @@ __host__ void DEM::startTime()
// coherent memory access. Save ordered configurations in new arrays (*_sorted).
if (PROFILING == 1)
startTimer(&kernel_tic);
- reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_sort->cellStart,
- dev_sort->cellEnd,
- dev_sort->gridParticleCellID,
- dev_sort->gridParticleIndex,
- dev_k->x, dev_k->vel,
- dev_k->angvel,
- dev_sort->x_sorted,
- dev_sort->vel_sorted,
- dev_sort->angvel_sorted);
+ reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
+ dev_cellEnd,
+ dev_gridParticleCellID,
+ dev_gridParticleIndex,
+ dev_x, dev_vel,
+ dev_angvel,
+ dev_x_sorted,
+ dev_vel_sorted,
+ dev_angvel_sorted);
// Synchronization point
cudaThreadSynchronize();
@@ -620,12 +627,12 @@ __host__ void DEM::startTime()
// For each particle: Search contacts in neighbor cells
if (PROFILING == 1)
startTimer(&kernel_tic);
- topology<<<dimGrid, dimBlock>>>(dev_sort->cellStart,
- dev_sort->cellEnd,
- dev_sort->gridParticleIndex,
- dev_sort->x_sorted,
- dev_k->contacts,
- dev_k->distmod);
+ topology<<<dimGrid, dimBlock>>>(dev_cellStart,
+ dev_cellEnd,
+ dev_gridParticleIndex,
+ dev_x_sorted,
+ dev_contacts,
+ dev_distmod);
// Synchronization point
@@ -639,28 +646,28 @@ __host__ void DEM::startTime()
// For each particle: Process collisions and compute resulting forces.
if (PROFILING == 1)
startTimer(&kernel_tic);
- interact<<<dimGrid, dimBlock>>>(dev_sort->gridParticleIndex,
- dev_sort->cellStart,
- dev_sort->cellEnd,
- dev_k->x,
- dev_sort->x_sorted,
- dev_sort->vel_sorted,
- dev_sort->angvel_sorted,
- dev_k->vel,
- dev_k->angvel,
- dev_k->force,
- dev_k->torque,
- dev_e->es_dot,
- dev_e->ev_dot,
- dev_e->es,
- dev_e->ev,
- dev_e->p,
- dev_walls->nx,
- dev_walls->mvfd,
- dev_walls->force,
- dev_k->contacts,
- dev_k->distmod,
- dev_k->delta_t);
+ interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
+ dev_cellStart,
+ dev_cellEnd,
+ dev_x,
+ dev_x_sorted,
+ dev_vel_sorted,
+ dev_angvel_sorted,
+ dev_vel,
+ dev_angvel,
+ dev_force,
+ dev_torque,
+ dev_es_dot,
+ dev_ev_dot,
+ dev_es,
+ dev_ev,
+ dev_p,
+ dev_walls_nx,
+ dev_walls_mvfd,
+ dev_walls_force_pp,
+ dev_contacts,
+ dev_distmod,
+ dev_delta_t);
// Synchronization point
@@ -672,17 +679,20 @@ __host__ void DEM::startTime()
// Update particle kinematics
if (PROFILING == 1)
startTimer(&kernel_tic);
- integrate<<<dimGrid, dimBlock>>>(dev_sort->x_sorted,
- dev_sort->vel_sorted,
- dev_sort->angvel_sorted,
- dev_k->x,
- dev_k->vel,
- dev_k->angvel,
- dev_k->force,
- dev_k->torque,
- dev_k->angpos,
- dev_k->xysum,
- dev_sort->gridParticleIndex);
+ integrate<<<dimGrid, dimBlock>>>(dev_x_sorted,
+ dev_vel_sorted,
+ dev_angvel_sorted,
+ dev_x,
+ dev_vel,
+ dev_angvel,
+ dev_force,
+ dev_torque,
+ dev_angpos,
+ dev_xysum,
+ dev_gridParticleIndex);
+ cudaThreadSynchronize();
+ checkForCudaErrors("Post integrate");
+
cudaThreadSynchronize();
if (PROFILING == 1)
@@ -691,20 +701,26 @@ __host__ void DEM::startTime()
// Summation of forces on wall
if (PROFILING == 1)
startTimer(&kernel_tic);
- summation<<<dimGrid, dimBlock>>>(dev_walls->force, dev_w_force_partial);
-
+ if (walls.nw > 0) {
+ summation<<<dimGrid, dimBlock>>>(dev_walls_force_pp,
+ dev_walls_force_partial);
+ }
// Synchronization point
cudaThreadSynchronize();
if (PROFILING == 1)
stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_summation);
- checkForCudaErrors("Post integrate & wall force summation");
+ checkForCudaErrors("Post wall force summation");
// Update wall kinematics
if (PROFILING == 1)
startTimer(&kernel_tic);
- integrateWalls<<< 1, walls.nw>>>(dev_walls,
- dev_w_force_partial,
- blocksPerGrid);
+ if (walls.nw > 0) {
+ integrateWalls<<< 1, walls.nw>>>(dev_walls_nx,
+ dev_walls_mvfd,
+ dev_walls_wmode,
+ dev_walls_force_partial,
+ blocksPerGrid);
+ }
// Synchronization point
cudaThreadSynchronize();
diff --git a/src/file_io.cpp b/src/file_io.cpp
@@ -189,7 +189,6 @@ void DEM::readbin(const char *target)
// Wall mass (x), velocity (y), force (z), and deviatoric stress (w)
walls.nx = new Float4[walls.nw];
walls.mvfd = new Float4[walls.nw];
- walls.force = new Float[walls.nw*np];
ifs.read(as_bytes(walls.wmode), sizeof(walls.wmode));
for (i = 0; i<walls.nw; ++i) {
diff --git a/src/integration.cuh b/src/integration.cuh
@@ -175,8 +175,10 @@ __global__ void summation(Float* in, Float *out)
}
// Update wall positions
-__global__ void integrateWalls(Walls* dev_walls,
- Float* dev_w_force_partial,
+__global__ void integrateWalls(Float4* dev_walls_nx,
+ Float4* dev_walls_mvfd,
+ int* dev_walls_wmode,
+ Float* dev_walls_force_partial,
unsigned int blocksPerGrid)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
@@ -185,9 +187,9 @@ __global__ void integrateWalls(Walls* dev_walls,
// Copy data to temporary arrays to avoid any potential read-after-write,
// write-after-read, or write-after-write hazards.
- Float4 w_nx = dev_walls->nx[idx];
- Float4 w_mvfd = dev_walls->mvfd[idx];
- int wmode = dev_walls->wmode[idx]; // Wall BC, 0: fixed, 1: devs, 2: vel
+ Float4 w_nx = dev_walls_nx[idx];
+ Float4 w_mvfd = dev_walls_mvfd[idx];
+ int wmode = dev_walls_wmode[idx]; // Wall BC, 0: fixed, 1: devs, 2: vel
Float acc;
if (wmode == 0) // Wall fixed: do nothing
@@ -196,7 +198,7 @@ __global__ void integrateWalls(Walls* dev_walls,
// Find the final sum of forces on wall
w_mvfd.z = 0.0f;
for (int i=0; i<blocksPerGrid; ++i) {
- w_mvfd.z += dev_w_force_partial[i];
+ w_mvfd.z += dev_walls_force_partial[i];
}
Float dt = devC_dt;
@@ -222,8 +224,8 @@ __global__ void integrateWalls(Walls* dev_walls,
w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;
// Store data in global memory
- dev_walls->nx[idx] = w_nx;
- dev_walls->mvfd[idx] = w_mvfd;
+ dev_walls_nx[idx] = w_nx;
+ dev_walls_mvfd[idx] = w_mvfd;
}
} // End of integrateWalls(...)
diff --git a/src/sphere.cpp b/src/sphere.cpp
@@ -207,16 +207,18 @@ void DEM::reportValues()
else
cout << " - 1st and 2nd dim. boundaries: Visco-frictional walls\n";
- cout << " - Top BC: ";
- if (walls.wmode[0] == 0)
- cout << "Fixed\n";
- else if (walls.wmode[0] == 1)
- cout << "Deviatoric stress\n";
- else if (walls.wmode[0] == 2)
- cout << "Velocity\n";
- else {
- cerr << "Top BC not recognized!\n";
- exit(1);
+ if (walls.nw > 0) {
+ cout << " - Top BC: ";
+ if (walls.wmode[0] == 0)
+ cout << "Fixed\n";
+ else if (walls.wmode[0] == 1)
+ cout << "Deviatoric stress\n";
+ else if (walls.wmode[0] == 2)
+ cout << "Velocity\n";
+ else {
+ cerr << "Top BC not recognized!\n";
+ exit(1);
+ }
}
cout << " - Grid: ";
diff --git a/src/sphere.h b/src/sphere.h
@@ -25,13 +25,14 @@ class DEM {
// Number of particles
unsigned int np;
+ // HOST STRUCTURES
// Structure containing individual particle kinematics
Kinematics k; // host
- Kinematics *dev_k; // device
+ //Kinematics *dev_k; // device
// Structure containing energy values
Energies e; // host
- Energies *dev_e; // device
+ //Energies *dev_e; // device
// Structure of global parameters
Params params; // host
@@ -40,15 +41,46 @@ class DEM {
Grid grid; // host
// Structure containing sorting arrays
- Sorting *dev_sort; // device
+ //Sorting *dev_sort; // device
// Structure of temporal parameters
Time time; // host
- Time *dev_time; // device
+ //Time *dev_time; // device
// Structure of wall parameters
Walls walls; // host
- Walls *dev_walls; // device
+ //Walls *dev_walls; // device
+
+ // DEVICE ARRAYS
+ Float4 *dev_x;
+ Float2 *dev_xysum;
+ Float4 *dev_vel;
+ Float4 *dev_acc;
+ Float4 *dev_force;
+ Float4 *dev_angpos;
+ Float4 *dev_angvel;
+ Float4 *dev_angacc;
+ Float4 *dev_torque;
+ unsigned int *dev_contacts;
+ Float4 *dev_distmod;
+ Float4 *dev_delta_t;
+ Float *dev_es_dot;
+ Float *dev_es;
+ Float *dev_ev_dot;
+ Float *dev_ev;
+ Float *dev_p;
+ Float4 *dev_x_sorted;
+ Float4 *dev_vel_sorted;
+ Float4 *dev_angvel_sorted;
+ unsigned int *dev_gridParticleCellID;
+ unsigned int *dev_gridParticleIndex;
+ unsigned int *dev_cellStart;
+ unsigned int *dev_cellEnd;
+ int *dev_walls_wmode;
+ Float4 *dev_walls_nx; // normal, pos.
+ Float4 *dev_walls_mvfd; // Mass, velocity, force, dev. stress
+ Float *dev_walls_force_partial; // Pre-sum per wall
+ Float *dev_walls_force_pp; // Force per particle per wall
// GPU initialization, must be called before startTime()
void initializeGPU(void);
@@ -101,10 +133,11 @@ class DEM {
void startTime(void);
// Render particles using raytracing
- // render(const char *target,
- // Float3 lookat,
- // Float3 eye,
- // Float focalLength);
+ void render(const char *target,
+ const Float3 lookat,
+ const Float3 eye,
+ const Float focalLength = 1.0,
+ const int method = 1);
};
diff --git a/src/utility.cu b/src/utility.cu
@@ -11,7 +11,7 @@ void checkForCudaErrors(const char* checkpoint_description)
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "\nCuda error detected, checkpoint: " << checkpoint_description
- << "\nError string: " << cudaGetErrorString(err) << "\n";
+ << "\nError string: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
}
@@ -22,7 +22,7 @@ void checkForCudaErrors(const char* checkpoint_description, const unsigned int i
if (err != cudaSuccess) {
std::cerr << "\nCuda error detected, checkpoint: " << checkpoint_description
<< "\nduring iteration " << iteration
- << "\nError string: " << cudaGetErrorString(err) << "\n";
+ << "\nError string: " << cudaGetErrorString(err) << std::endl;
exit(EXIT_FAILURE);
}
}