tadd const and __restrict__ keywords to facilitate constant memory cache usage - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
(HTM) git clone git://src.adamsgaard.dk/sphere
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) LICENSE
---
(DIR) commit 21ed15ef593a831b2124a48bbc662b9a227f85ac
(DIR) parent 7626d19392ecba63c4b2e5f4d419e8423d7a90a0
(HTM) Author: Anders Damsgaard <anders.damsgaard@geo.au.dk>
Date: Mon, 11 Aug 2014 14:37:07 +0200
add const and __restrict__ keywords to facilitate constant memory cache usage
Diffstat:
M src/cohesion.cuh | 143 ++++++++-----------------------
M src/contactmodels.cuh | 164 ++++++++++++++++++-------------
M src/contactsearch.cuh | 246 ++++++++++++++++---------------
M src/integration.cuh | 70 ++++++++++++++++++-------------
M src/navierstokes.cuh | 1213 ++++++++++++++++---------------
M src/raytracer.cuh | 173 +++++++++++++++----------------
M tests/fluid_particle_interaction.py | 3 +--
7 files changed, 996 insertions(+), 1016 deletions(-)
---
(DIR) diff --git a/src/cohesion.cuh b/src/cohesion.cuh
t@@ -6,14 +6,14 @@
// Check bond pair list, apply linear contact model to pairs
__global__ void bondsLinear(
- uint2* dev_bonds,
- Float4* dev_bonds_delta, // Contact displacement
- Float4* dev_bonds_omega, // Contact rotational displacement
- Float4* dev_x,
- Float4* dev_vel,
- Float4* dev_angvel,
- Float4* dev_force,
- Float4* dev_torque)
+ uint2* __restrict__ dev_bonds,
+ Float4* __restrict__ dev_bonds_delta, // Contact displacement
+ Float4* __restrict__ dev_bonds_omega, // Contact rotational displacement
+ const Float4* __restrict__ dev_x,
+ const Float4* __restrict__ dev_vel,
+ const Float4* __restrict__ dev_angvel,
+ Float4* __restrict__ dev_force,
+ Float4* __restrict__ dev_torque)
{
// Find thread index
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
t@@ -35,16 +35,16 @@ __global__ void bondsLinear(
// Convert tangential vectors to Float3's
// Uncorrected tangential component of displacement
Float3 delta0_t = MAKE_FLOAT3(
- delta0_4.x,
- delta0_4.y,
- delta0_4.z);
+ delta0_4.x,
+ delta0_4.y,
+ delta0_4.z);
const Float delta0_n = delta0_4.w;
// Uncorrected tangential component of rotation
Float3 omega0_t = MAKE_FLOAT3(
- omega0_4.x,
- omega0_4.y,
- omega0_4.z);
+ omega0_4.x,
+ omega0_4.y,
+ omega0_4.z);
const Float omega0_n = omega0_4.w;
// Read particle data
t@@ -76,9 +76,9 @@ __global__ void bondsLinear(
// Inter-particle vector
const Float3 x = MAKE_FLOAT3(
- x_i.x - x_j.x,
- x_i.y - x_j.y,
- x_i.z - x_j.z);
+ x_i.x - x_j.x,
+ x_i.y - x_j.y,
+ x_i.z - x_j.z);
const Float x_length = length(x);
// Find overlap (negative value if overlapping)
t@@ -96,13 +96,13 @@ __global__ void bondsLinear(
// Contact displacement, Luding 2008 eq. 10
const Float3 ddelta = (
- MAKE_FLOAT3(
- vel_i.x - vel_j.x,
- vel_i.y - vel_j.y,
- vel_i.z - vel_j.z)
- + (x_i.w + overlap/2.0) * cross(n, angvel_i)
- + (x_j.w + overlap/2.0) * cross(n, angvel_j)
- ) * devC_dt;
+ MAKE_FLOAT3(
+ vel_i.x - vel_j.x,
+ vel_i.y - vel_j.y,
+ vel_i.z - vel_j.z)
+ + (x_i.w + overlap/2.0) * cross(n, angvel_i)
+ + (x_j.w + overlap/2.0) * cross(n, angvel_j)
+ ) * devC_dt;
// Normal component of the displacement increment
//const Float ddelta_n = dot(ddelta, n);
t@@ -141,9 +141,9 @@ __global__ void bondsLinear(
// Contact rotational velocity
Float3 domega = MAKE_FLOAT3(
- angvel_j.x - angvel_i.x,
- angvel_j.y - angvel_i.y,
- angvel_j.z - angvel_i.z) * devC_dt;
+ angvel_j.x - angvel_i.x,
+ angvel_j.y - angvel_i.y,
+ angvel_j.z - angvel_i.z) * devC_dt;
/*const Float3 domega = MAKE_FLOAT3(
angvel_i.x - angvel_j.x,
angvel_i.y - angvel_j.y,
t@@ -232,89 +232,16 @@ __global__ void bondsLinear(
}
}
-// Linear-elastic bond: Attractive force with normal- and shear components
-// acting upon particle A in a bonded particle pair
-__device__ void bondLinear_old(Float3* N, Float3* T, Float* es_dot, Float* p,
- unsigned int idx_a, unsigned int idx_b,
- Float4* dev_x_sorted, Float4* dev_vel_sorted,
- Float4* dev_angvel_sorted,
- Float radius_a, Float radius_b,
- Float3 x_ab, Float x_ab_length,
- Float delta_ab)
-{
-
- // If particles are not overlapping, apply bond force
- if (delta_ab > 0.0f) {
-
- // Allocate variables and fetch missing time=t values for particle A and B
- Float4 vel_a = dev_vel_sorted[idx_a];
- Float4 vel_b = dev_vel_sorted[idx_b];
- Float4 angvel4_a = dev_angvel_sorted[idx_a];
- Float4 angvel4_b = dev_angvel_sorted[idx_b];
-
- // Convert to Float3's
- Float3 angvel_a = MAKE_FLOAT3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
- Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
-
- // Normal vector of contact
- Float3 n_ab = x_ab/x_ab_length;
-
- // Relative contact interface velocity, w/o rolling
- Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
-
- // Relative contact interface velocity of particle surfaces at
- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
- Float3 vel_ab = vel_ab_linear
- + radius_a * cross(n_ab, angvel_a)
- + radius_b * cross(n_ab, angvel_b);
-
- // Relative contact interface rolling velocity
- //Float3 angvel_ab = angvel_a - angvel_b;
- //Float angvel_ab_length = length(angvel_ab);
-
- // Normal component of the relative contact interface velocity
- //Float vel_n_ab = dot(vel_ab_linear, n_ab);
-
- // Tangential component of the relative contact interface velocity
- // Hinrichsen and Wolf 2004, eq. 13.9
- Float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
- //Float vel_t_ab_length = length(vel_t_ab);
-
- Float3 f_n = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
- Float3 f_t = MAKE_FLOAT3(0.0f, 0.0f, 0.0f);
-
- // Mean radius
- Float R_bar = (radius_a + radius_b)/2.0f;
-
- // Normal force component: Elastic
- f_n = devC_params.k_n * delta_ab * n_ab;
-
- if (length(vel_t_ab) > 0.f) {
- // Shear force component: Viscous
- f_t = -1.0f * devC_params.gamma_t * vel_t_ab;
-
- // Shear friction production rate [W]
- //*es_dot += -dot(vel_t_ab, f_t);
- }
-
- // Add force components from this bond to total force for particle
- *N += f_n + f_t;
- *T += -R_bar * cross(n_ab, f_t);
-
- // Pressure excerted onto the particle from this bond
- *p += length(f_n) / (4.0f * PI * radius_a*radius_a);
-
- }
-} // End of bondLinear()
-
// Capillary cohesion after Richefeu et al. (2006)
-__device__ void capillaryCohesion_exp(Float3* N, Float radius_a,
- Float radius_b, Float delta_ab,
- Float3 x_ab, Float x_ab_length,
- Float kappa)
+__device__ void capillaryCohesion_exp(
+ Float3* N,
+ const Float radius_a,
+ const Float radius_b,
+ const Float delta_ab,
+ const Float3 x_ab,
+ const Float x_ab_length,
+ const Float kappa)
{
// Normal vector
(DIR) diff --git a/src/contactmodels.cuh b/src/contactmodels.cuh
t@@ -7,11 +7,19 @@
// Linear viscoelastic contact model for particle-wall interactions
// with tangential friction and rolling resistance
-__device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
- Float* ev_dot, Float* p,
- unsigned int idx_a, Float radius_a,
- Float4* dev_vel_sorted, Float4* dev_angvel_sorted,
- Float3 n, Float delta, Float wvel)
+__device__ Float contactLinear_wall(
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const unsigned int idx_a,
+ const Float radius_a,
+ const Float4* __restrict__ dev_vel_sorted,
+ const Float4* __restrict__ dev_angvel_sorted,
+ const Float3 n,
+ const Float delta,
+ const Float wvel)
{
// Fetch particle velocities from global memory
Float4 vel_tmp = dev_vel_sorted[idx_a];
t@@ -19,13 +27,13 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
// Convert velocities to three-component vectors
Float3 vel_linear = MAKE_FLOAT3(
- vel_tmp.x,
- vel_tmp.y,
- vel_tmp.z);
+ vel_tmp.x,
+ vel_tmp.y,
+ vel_tmp.z);
Float3 angvel = MAKE_FLOAT3(
- angvel_tmp.x,
- angvel_tmp.y,
- angvel_tmp.z);
+ angvel_tmp.x,
+ angvel_tmp.y,
+ angvel_tmp.z);
// Store the length of the angular velocity for later use
Float angvel_length = length(angvel);
t@@ -47,7 +55,7 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
// Normal force component: Elastic - viscous damping
Float3 f_n = fmax(0.0, -devC_params.k_n*delta
- - devC_params.gamma_wn*vel_n) * n;
+ - devC_params.gamma_wn*vel_n) * n;
const Float f_n_length = length(f_n); // Save length for later use
// Store the energy lost by viscous damping. See derivation in
t@@ -103,14 +111,22 @@ __device__ Float contactLinear_wall(Float3* F, Float3* T, Float* es_dot,
// Linear vicoelastic contact model for particle-particle interactions
// with tangential friction and rolling resistance
-__device__ void contactLinearViscous(Float3* F, Float3* T,
- Float* es_dot, Float* ev_dot, Float* p,
- unsigned int idx_a, unsigned int idx_b,
- Float4* dev_vel_sorted,
- Float4* dev_angvel_sorted,
- Float radius_a, Float radius_b,
- Float3 x_ab, Float x_ab_length,
- Float delta_ab, Float kappa)
+__device__ void contactLinearViscous(
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const unsigned int idx_a,
+ const unsigned int idx_b,
+ const Float4* __restrict__ dev_vel_sorted,
+ const Float4* __restrict__ dev_angvel_sorted,
+ const Float radius_a,
+ const Float radius_b,
+ const Float3 x_ab,
+ const Float x_ab_length,
+ const Float delta_ab,
+ const Float kappa)
{
// Allocate variables and fetch missing time=t values for particle A and B
t@@ -131,8 +147,8 @@ __device__ void contactLinearViscous(Float3* F, Float3* T,
// Relative contact interface velocity, w/o rolling
Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
+ vel_a.y - vel_b.y,
+ vel_a.z - vel_b.z);
// Relative contact interface velocity of particle surfaces at
// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
t@@ -209,18 +225,25 @@ __device__ void contactLinearViscous(Float3* F, Float3* T,
// Linear elastic contact model for particle-particle interactions
-__device__ void contactLinear(Float3* F, Float3* T,
- Float* es_dot, Float* ev_dot, Float* p,
- unsigned int idx_a_orig,
- unsigned int idx_b_orig,
- Float4 vel_a,
- Float4* dev_vel,
- Float3 angvel_a,
- Float4* dev_angvel,
- Float radius_a, Float radius_b,
- Float3 x, Float x_length,
- Float delta, Float4* dev_delta_t,
- unsigned int mempos)
+__device__ void contactLinear(
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const unsigned int idx_a_orig,
+ const unsigned int idx_b_orig,
+ const Float4 vel_a,
+ const Float4* __restrict__ dev_vel,
+ const Float3 angvel_a,
+ const Float4* __restrict__ dev_angvel,
+ const Float radius_a,
+ const Float radius_b,
+ const Float3 x,
+ const Float x_length,
+ const Float delta,
+ Float4* __restrict__ dev_delta_t,
+ const unsigned int mempos)
{
// Allocate variables and fetch missing time=t values for particle A and B
t@@ -231,15 +254,15 @@ __device__ void contactLinear(Float3* F, Float3* T,
Float4 delta_t0_4 = dev_delta_t[mempos];
Float3 delta_t0_uncor = MAKE_FLOAT3(
- delta_t0_4.x,
- delta_t0_4.y,
- delta_t0_4.z);
+ delta_t0_4.x,
+ delta_t0_4.y,
+ delta_t0_4.z);
// Convert to Float3
Float3 angvel_b = MAKE_FLOAT3(
- angvel4_b.x,
- angvel4_b.y,
- angvel4_b.z);
+ angvel4_b.x,
+ angvel4_b.y,
+ angvel4_b.z);
// Force between grain pair decomposed into normal- and tangential part
Float3 f_n, f_t, f_c;
t@@ -249,9 +272,9 @@ __device__ void contactLinear(Float3* F, Float3* T,
// Relative contact interface velocity, w/o rolling
Float3 vel_linear = MAKE_FLOAT3(
- vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
+ vel_a.x - vel_b.x,
+ vel_a.y - vel_b.y,
+ vel_a.z - vel_b.z);
// Relative contact interface velocity of particle surfaces at
// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10,
t@@ -335,7 +358,7 @@ __device__ void contactLinear(Float3* F, Float3* T,
// 2008)
delta_t = -1.0/devC_params.k_t
* (devC_params.mu_d * length(f_n-f_c) * t
- + devC_params.gamma_t * vel_t);
+ + devC_params.gamma_t * vel_t);
// Shear friction heat production rate:
// The energy lost from the tangential spring is dissipated as heat
t@@ -357,10 +380,10 @@ __device__ void contactLinear(Float3* F, Float3* T,
// Store sum of tangential displacements
dev_delta_t[mempos] = MAKE_FLOAT4(
- delta_t.x,
- delta_t.y,
- delta_t.z,
- 0.0f);
+ delta_t.x,
+ delta_t.y,
+ delta_t.z,
+ 0.0f);
} // End of contactLinear()
t@@ -368,18 +391,25 @@ __device__ void contactLinear(Float3* F, Float3* T,
// Non-linear contact model for particle-particle interactions
// Based on Hertzian and Mindlin contact theories (e.g. Hertz, 1882, Mindlin and
// Deresiewicz, 1953, Johnson, 1985). See Yohannes et al 2012 for example.
-__device__ void contactHertz(Float3* F, Float3* T,
- Float* es_dot, Float* ev_dot, Float* p,
- unsigned int idx_a_orig,
- unsigned int idx_b_orig,
- Float4 vel_a,
- Float4* dev_vel,
- Float3 angvel_a,
- Float4* dev_angvel,
- Float radius_a, Float radius_b,
- Float3 x_ab, Float x_ab_length,
- Float delta_ab, Float4* dev_delta_t,
- unsigned int mempos)
+__device__ void contactHertz(
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const unsigned int idx_a_orig,
+ const unsigned int idx_b_orig,
+ const Float4 vel_a,
+ const Float4* __restrict__ dev_vel,
+ const Float3 angvel_a,
+ const Float4* __restrict__ dev_angvel,
+ const Float radius_a,
+ const Float radius_b,
+ const Float3 x_ab,
+ const Float x_ab_length,
+ const Float delta_ab,
+ Float4* __restrict__ dev_delta_t,
+ const unsigned int mempos)
{
// Allocate variables and fetch missing time=t values for particle A and B
t@@ -390,8 +420,8 @@ __device__ void contactHertz(Float3* F, Float3* T,
Float4 delta_t0_4 = dev_delta_t[mempos];
Float3 delta_t0_uncor = MAKE_FLOAT3(delta_t0_4.x,
- delta_t0_4.y,
- delta_t0_4.z);
+ delta_t0_4.y,
+ delta_t0_4.z);
// Convert to Float3
Float3 angvel_b = MAKE_FLOAT3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
t@@ -404,8 +434,8 @@ __device__ void contactHertz(Float3* F, Float3* T,
// Relative contact interface velocity, w/o rolling
Float3 vel_ab_linear = MAKE_FLOAT3(vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
+ vel_a.y - vel_b.y,
+ vel_a.z - vel_b.z);
// Relative contact interface velocity of particle surfaces at
// the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
t@@ -435,7 +465,7 @@ __device__ void contactHertz(Float3* F, Float3* T,
// Normal force component
f_n = (-devC_params.k_n * powf(delta_ab, 3.0f/2.0f)
- -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab)
+ -devC_params.gamma_n * powf(delta_ab, 1.0f/4.0f) * vel_n_ab)
* n_ab;
// Store energy dissipated in normal viscous component
t@@ -517,9 +547,9 @@ __device__ void contactHertz(Float3* F, Float3* T,
// New rolling resistance model
/*T_res = -1.0f * fmin(devC_params.gamma_r * R_bar * angvel_ab_length,
devC_params.mu_r * R_bar * f_n_length)
- * angvel_ab/angvel_ab_length;*/
+ * angvel_ab/angvel_ab_length;*/
T_res = -1.0f * fmin(devC_params.gamma_r * radius_a * angvel_ab_length,
- devC_params.mu_r * radius_a * f_n_length)
+ devC_params.mu_r * radius_a * f_n_length)
* angvel_ab/angvel_ab_length;
}
(DIR) diff --git a/src/contactsearch.cuh b/src/contactsearch.cuh
t@@ -85,19 +85,23 @@ __device__ int findDistMod(int3* targetCell, Float3* distmod)
// Used for contactmodel=1, where contact history is not needed.
// Kernel executed on device, and callable from device only.
// Function is called from interact().
-__device__ void findAndProcessContactsInCell(int3 targetCell,
- unsigned int idx_a,
- Float4 x_a, Float radius_a,
- Float3* F, Float3* T,
- Float* es_dot, Float* ev_dot,
- Float* p,
- Float4* dev_x_sorted,
- Float4* dev_vel_sorted,
- Float4* dev_angvel_sorted,
- unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd,
- Float4* dev_walls_nx,
- Float4* dev_walls_mvfd)
+__device__ void findAndProcessContactsInCell(
+ int3 targetCell,
+ const unsigned int idx_a,
+ const Float4 x_a,
+ const Float radius_a,
+ Float3* F,
+ Float3* T,
+ Float* es_dot,
+ Float* ev_dot,
+ Float* p,
+ const Float4* __restrict__ dev_x_sorted,
+ const Float4* __restrict__ dev_vel_sorted,
+ const Float4* __restrict__ dev_angvel_sorted,
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const Float4* __restrict__ dev_walls_nx,
+ Float4* __restrict__ dev_walls_mvfd)
//uint4 bonds)
{
t@@ -133,8 +137,8 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
// Distance between particle centers (Float4 -> Float3)
Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
- x_a.y - x_b.y,
- x_a.z - x_b.z);
+ x_a.y - x_b.y,
+ x_a.z - x_b.z);
// Adjust interparticle vector if periodic boundary/boundaries
// are crossed
t@@ -147,16 +151,16 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
// Check for particle overlap
if (delta_ab < 0.0f) {
contactLinearViscous(F, T, es_dot, ev_dot, p,
- idx_a, idx_b,
- dev_vel_sorted,
- dev_angvel_sorted,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_ab, kappa);
+ idx_a, idx_b,
+ dev_vel_sorted,
+ dev_angvel_sorted,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_ab, kappa);
} else if (delta_ab < devC_params.db) {
// Check wether particle distance satisfies the capillary bond distance
capillaryCohesion_exp(F, radius_a, radius_b, delta_ab,
- x_ab, x_ab_length, kappa);
+ x_ab, x_ab_length, kappa);
}
// Check wether particles are bonded together
t@@ -183,16 +187,18 @@ __device__ void findAndProcessContactsInCell(int3 targetCell,
// Used for contactmodel=2, where bookkeeping of contact history is necessary.
// Kernel executed on device, and callable from device only.
// Function is called from topology().
-__device__ void findContactsInCell(int3 targetCell,
- unsigned int idx_a,
- Float4 x_a, Float radius_a,
- Float4* dev_x_sorted,
- unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd,
- unsigned int* dev_gridParticleIndex,
- int* nc,
- unsigned int* dev_contacts,
- Float4* dev_distmod)
+__device__ void findContactsInCell(
+ int3 targetCell,
+ const unsigned int idx_a,
+ const Float4 x_a,
+ const Float radius_a,
+ const Float4* __restrict__ dev_x_sorted,
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const unsigned int* __restrict__ dev_gridParticleIndex,
+ int* nc,
+ unsigned int* __restrict__ dev_contacts,
+ Float4* __restrict__ dev_distmod)
{
// Get distance modifier for interparticle
// vector, if it crosses a periodic boundary
t@@ -237,8 +243,8 @@ __device__ void findContactsInCell(int3 targetCell,
// Distance between particle centers (Float4 -> Float3)
Float3 x_ab = MAKE_FLOAT3(x_a.x - x_b.x,
- x_a.y - x_b.y,
- x_a.z - x_b.z);
+ x_a.y - x_b.y,
+ x_a.z - x_b.z);
// Adjust interparticle vector if periodic boundary/boundaries
// are crossed
t@@ -316,12 +322,13 @@ __device__ void findContactsInCell(int3 targetCell,
// Search for neighbors to particle 'idx' inside the 27 closest cells,
// and save the contact pairs in global memory.
// Function is called from mainGPU loop.
-__global__ void topology(unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd, // Input: Particles in cell
- unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
- Float4* dev_x_sorted,
- unsigned int* dev_contacts,
- Float4* dev_distmod)
+__global__ void topology(
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const unsigned int* __restrict__ dev_gridParticleIndex,
+ const Float4* __restrict__ dev_x_sorted,
+ unsigned int* __restrict__ dev_contacts,
+ Float4* __restrict__ dev_distmod)
{
// Thread index equals index of particle A
unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
t@@ -349,10 +356,10 @@ __global__ void topology(unsigned int* dev_cellStart,
for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
findContactsInCell(targetPos, idx_a, x_a, radius_a,
- dev_x_sorted,
- dev_cellStart, dev_cellEnd,
- dev_gridParticleIndex,
- &nc, dev_contacts, dev_distmod);
+ dev_x_sorted,
+ dev_cellStart, dev_cellEnd,
+ dev_gridParticleIndex,
+ &nc, dev_contacts, dev_distmod);
}
}
}
t@@ -372,28 +379,28 @@ __global__ void topology(unsigned int* dev_cellStart,
// Kernel is executed on device, and is callable from host only.
// Function is called from mainGPU loop.
__global__ void interact(
- unsigned int* dev_gridParticleIndex, // in
- unsigned int* dev_cellStart, // in
- unsigned int* dev_cellEnd, // in
- Float4* dev_x, // in
- Float4* dev_x_sorted, // in
- Float4* dev_vel_sorted, // in
- Float4* dev_angvel_sorted, // in
- Float4* dev_vel, // in
- Float4* dev_angvel, // in
- Float4* dev_force, // out
- Float4* dev_torque, // out
- Float* dev_es_dot, // out
- Float* dev_ev_dot, // out
- Float* dev_es, // out
- Float* dev_ev, // out
- Float* dev_p, // out
- Float4* dev_walls_nx, // in
- Float4* dev_walls_mvfd, // in
- Float* dev_walls_force_pp, // out
- unsigned int* dev_contacts, // out
- Float4* dev_distmod, // in
- Float4* dev_delta_t) // out
+ const unsigned int* __restrict__ dev_gridParticleIndex, // in
+ const unsigned int* __restrict__ dev_cellStart, // in
+ const unsigned int* __restrict__ dev_cellEnd, // in
+ const Float4* __restrict__ dev_x, // in
+ const Float4* __restrict__ dev_x_sorted, // in
+ const Float4* __restrict__ dev_vel_sorted, // in
+ const Float4* __restrict__ dev_angvel_sorted, // in
+ const Float4* __restrict__ dev_vel, // in
+ const Float4* __restrict__ dev_angvel, // in
+ Float4* __restrict__ dev_force, // out
+ Float4* __restrict__ dev_torque, // out
+ Float* __restrict__ dev_es_dot, // out
+ Float* __restrict__ dev_ev_dot, // out
+ Float* __restrict__ dev_es, // out
+ Float* __restrict__ dev_ev, // out
+ Float* __restrict__ dev_p, // out
+ const Float4* __restrict__ dev_walls_nx, // in
+ Float4* __restrict__ dev_walls_mvfd, // in
+ Float* __restrict__ dev_walls_force_pp, // out
+ unsigned int* __restrict__ dev_contacts, // out
+ const Float4* __restrict__ dev_distmod, // in
+ Float4* __restrict__ dev_delta_t) // out
{
// Thread index equals index of particle A
unsigned int idx_a = blockIdx.x * blockDim.x + threadIdx.x;
t@@ -407,11 +414,11 @@ __global__ void interact(
// Fetch world dimensions in constant memory read
Float3 origo = MAKE_FLOAT3(devC_grid.origo[0],
- devC_grid.origo[1],
- devC_grid.origo[2]);
+ devC_grid.origo[1],
+ devC_grid.origo[2]);
Float3 L = MAKE_FLOAT3(devC_grid.L[0],
- devC_grid.L[1],
- devC_grid.L[2]);
+ devC_grid.L[1],
+ devC_grid.L[2]);
// Fetch wall data in global read
Float4 w_0_nx, w_1_nx, w_2_nx, w_3_nx, w_4_nx;
t@@ -497,8 +504,8 @@ __global__ void interact(
// Inter-particle vector, corrected for periodic boundaries
x_ab = MAKE_FLOAT3(x_a.x - x_b.x + distmod.x,
- x_a.y - x_b.y + distmod.y,
- x_a.z - x_b.z + distmod.z);
+ x_a.y - x_b.y + distmod.y,
+ x_a.z - x_b.z + distmod.z);
x_ab_length = length(x_ab);
delta_n = x_ab_length - (radius_a + radius_b);
t@@ -507,28 +514,28 @@ __global__ void interact(
if (delta_n < 0.0) {
if (devC_params.contactmodel == 2) {
contactLinear(&F, &T, &es_dot, &ev_dot, &p,
- idx_a_orig,
- idx_b_orig,
- vel_a,
- dev_vel,
- angvel_a,
- dev_angvel,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_n, dev_delta_t,
- mempos);
+ idx_a_orig,
+ idx_b_orig,
+ vel_a,
+ dev_vel,
+ angvel_a,
+ dev_angvel,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_n, dev_delta_t,
+ mempos);
} else if (devC_params.contactmodel == 3) {
contactHertz(&F, &T, &es_dot, &ev_dot, &p,
- idx_a_orig,
- idx_b_orig,
- vel_a,
- dev_vel,
- angvel_a,
- dev_angvel,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_n, dev_delta_t,
- mempos);
+ idx_a_orig,
+ idx_b_orig,
+ vel_a,
+ dev_vel,
+ angvel_a,
+ dev_angvel,
+ radius_a, radius_b,
+ x_ab, x_ab_length,
+ delta_n, dev_delta_t,
+ mempos);
}
} else {
__syncthreads();
t@@ -556,11 +563,11 @@ __global__ void interact(
// Calculate address in grid from position
gridPos.x = floor((x_a.x - devC_grid.origo[0])
- / (devC_grid.L[0]/devC_grid.num[0]));
+ / (devC_grid.L[0]/devC_grid.num[0]));
gridPos.y = floor((x_a.y - devC_grid.origo[1])
- / (devC_grid.L[1]/devC_grid.num[1]));
+ / (devC_grid.L[1]/devC_grid.num[1]));
gridPos.z = floor((x_a.z - devC_grid.origo[2])
- / (devC_grid.L[2]/devC_grid.num[2]));
+ / (devC_grid.L[2]/devC_grid.num[2]));
// Find overlaps between particle no. idx and all particles
// from its own cell + 26 neighbor cells.
t@@ -570,12 +577,17 @@ __global__ void interact(
for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
- findAndProcessContactsInCell(targetPos, idx_a, x_a, radius_a,
- &F, &T, &es_dot, &ev_dot, &p,
- dev_x_sorted,
- dev_vel_sorted, dev_angvel_sorted,
- dev_cellStart, dev_cellEnd,
- dev_walls_nx, dev_walls_mvfd);
+ findAndProcessContactsInCell(targetPos, idx_a,
+ x_a, radius_a,
+ &F, &T, &es_dot,
+ &ev_dot, &p,
+ dev_x_sorted,
+ dev_vel_sorted,
+ dev_angvel_sorted,
+ dev_cellStart,
+ dev_cellEnd,
+ dev_walls_nx,
+ dev_walls_mvfd);
}
}
}
t@@ -596,8 +608,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_0_nx.x, w_0_nx.y, w_0_nx.z);
if (delta_w < 0.0f) {
w_0_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a,
- radius_a, dev_vel_sorted, dev_angvel_sorted, w_n, delta_w,
- w_0_mvfd.y);
+ radius_a, dev_vel_sorted, dev_angvel_sorted, w_n, delta_w,
+ w_0_mvfd.y);
}
// Lower wall (force on wall not stored)
t@@ -605,8 +617,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(0.0f, 0.0f, 1.0f);
if (delta_w < 0.0f) {
(void)contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p, idx_a,
- radius_a, dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, 0.0f);
+ radius_a, dev_vel_sorted, dev_angvel_sorted,
+ w_n, delta_w, 0.0f);
}
t@@ -617,8 +629,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_1_nx.x, w_1_nx.y, w_1_nx.z);
if (delta_w < 0.0f) {
w_1_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
- delta_w, w_1_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
+ delta_w, w_1_mvfd.y);
}
// Left wall (idx 2)
t@@ -626,8 +638,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_2_nx.x, w_2_nx.y, w_2_nx.z);
if (delta_w < 0.0f) {
w_2_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
- delta_w, w_2_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
+ delta_w, w_2_mvfd.y);
}
// Back wall (idx 3)
t@@ -635,8 +647,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z);
if (delta_w < 0.0f) {
w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
- delta_w, w_3_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
+ delta_w, w_3_mvfd.y);
}
// Front wall (idx 4)
t@@ -644,8 +656,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z);
if (delta_w < 0.0f) {
w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
- delta_w, w_4_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
+ delta_w, w_4_mvfd.y);
}
} else if (devC_grid.periodic == 2) { // right and left walls periodic
t@@ -655,8 +667,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_3_nx.x, w_3_nx.y, w_3_nx.z);
if (delta_w < 0.0f) {
w_3_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
- delta_w, w_3_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
+ delta_w, w_3_mvfd.y);
}
// Front wall (idx 4)
t@@ -664,8 +676,8 @@ __global__ void interact(
w_n = MAKE_FLOAT3(w_4_nx.x, w_4_nx.y, w_4_nx.z);
if (delta_w < 0.0f) {
w_4_force = contactLinear_wall(&F, &T, &es_dot, &ev_dot, &p,
- idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
- delta_w, w_4_mvfd.y);
+ idx_a, radius_a, dev_vel_sorted, dev_angvel_sorted, w_n,
+ delta_w, w_4_mvfd.y);
}
}
(DIR) diff --git a/src/integration.cuh b/src/integration.cuh
t@@ -11,15 +11,23 @@
// Second order integration scheme based on Taylor expansion of particle kinematics.
// Kernel executed on device, and callable from host only.
-__global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
- Float4* dev_angvel_sorted,
- Float4* dev_x, Float4* dev_vel, Float4* dev_angvel, // Output
- Float4* dev_force, Float4* dev_torque, Float4* dev_angpos, // Input
- Float4* dev_acc, Float4* dev_angacc,
- Float4* dev_vel0, Float4* dev_angvel0,
- Float4* dev_xyzsum,
- unsigned int* dev_gridParticleIndex, // Input: Sorted-Unsorted key
- unsigned int iter)
+__global__ void integrate(
+ const Float4* __restrict__ dev_x_sorted,
+ const Float4* __restrict__ dev_vel_sorted,
+ const Float4* __restrict__ dev_angvel_sorted,
+ Float4* __restrict__ dev_x,
+ Float4* __restrict__ dev_vel,
+ Float4* __restrict__ dev_angvel,
+ const Float4* __restrict__ dev_force,
+ const Float4* __restrict__ dev_torque,
+ Float4* __restrict__ dev_angpos,
+ Float4* __restrict__ dev_acc,
+ Float4* __restrict__ dev_angacc,
+ Float4* __restrict__ dev_vel0,
+ Float4* __restrict__ dev_angvel0,
+ Float4* __restrict__ dev_xyzsum,
+ const unsigned int* __restrict__ dev_gridParticleIndex,
+ const unsigned int iter)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
t@@ -59,13 +67,13 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
// Coherent read from constant memory to registers
const Float dt = devC_dt;
const Float3 origo = MAKE_FLOAT3(
- devC_grid.origo[0],
- devC_grid.origo[1],
- devC_grid.origo[2]);
+ devC_grid.origo[0],
+ devC_grid.origo[1],
+ devC_grid.origo[2]);
const Float3 L = MAKE_FLOAT3(
- devC_grid.L[0],
- devC_grid.L[1],
- devC_grid.L[2]);
+ devC_grid.L[0],
+ devC_grid.L[1],
+ devC_grid.L[2]);
// Particle mass
Float m = 4.0/3.0 * PI * radius*radius*radius * devC_params.rho;
t@@ -179,14 +187,14 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
// Truncation error O(dt^4) for positions, O(dt^3) for velocities
// Approximate acceleration change by backwards difference:
const Float3 dacc_dt = MAKE_FLOAT3(
- (acc.x - acc0.x)/dt,
- (acc.y - acc0.y)/dt,
- (acc.z - acc0.z)/dt);
+ (acc.x - acc0.x)/dt,
+ (acc.y - acc0.y)/dt,
+ (acc.z - acc0.z)/dt);
const Float3 dangacc_dt = MAKE_FLOAT3(
- (angacc.x - angacc0.x)/dt,
- (angacc.y - angacc0.y)/dt,
- (angacc.z - angacc0.z)/dt);
+ (angacc.x - angacc0.x)/dt,
+ (angacc.y - angacc0.y)/dt,
+ (angacc.z - angacc0.z)/dt);
x_new.x = x.x + vel.x*dt + 0.5*acc.x*dt*dt + 1.0/6.0*dacc_dt.x*dt*dt*dt;
x_new.y = x.y + vel.y*dt + 0.5*acc.y*dt*dt + 1.0/6.0*dacc_dt.y*dt*dt*dt;
t@@ -253,7 +261,9 @@ __global__ void integrate(Float4* dev_x_sorted, Float4* dev_vel_sorted, // Input
// Reduce wall force contributions from particles to a single value per wall
-__global__ void summation(Float* in, Float *out)
+__global__ void summation(
+ const Float* __restrict__ in,
+ Float* __restrict__ out)
{
__shared__ Float cache[256];
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -287,14 +297,14 @@ __global__ void summation(Float* in, Float *out)
// Update wall positions
__global__ void integrateWalls(
- Float4* dev_walls_nx,
- Float4* dev_walls_mvfd,
- int* dev_walls_wmode,
- Float* dev_walls_force_partial,
- Float* dev_walls_acc,
- unsigned int blocksPerGrid,
- Float t_current,
- unsigned int iter)
+ Float4* __restrict__ dev_walls_nx,
+ Float4* __restrict__ dev_walls_mvfd,
+ const int* __restrict__ dev_walls_wmode,
+ const Float* __restrict__ dev_walls_force_partial,
+ Float* __restrict__ dev_walls_acc,
+ const unsigned int blocksPerGrid,
+ const Float t_current,
+ const unsigned int iter)
{
unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
(DIR) diff --git a/src/navierstokes.cuh b/src/navierstokes.cuh
t@@ -25,34 +25,34 @@ __inline__ __device__ Float hmean(Float a, Float b) {
// Helper functions for checking whether a value is NaN or Inf
__device__ int checkFiniteFloat(
- const char* desc,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float s)
+ const char* desc,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float s)
{
- __syncthreads();
- if (!isfinite(s)) {
- printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s);
- return 1;
- }
- return 0;
+ __syncthreads();
+ if (!isfinite(s)) {
+ printf("\n[%d,%d,%d]: Error: %s = %f\n", x, y, z, desc, s);
+ return 1;
+ }
+ return 0;
}
__device__ int checkFiniteFloat3(
- const char* desc,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float3 v)
+ const char* desc,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float3 v)
{
- __syncthreads();
- if (!isfinite(v.x) || !isfinite(v.y) || !isfinite(v.z)) {
- printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n",
- x, y, z, desc, v.x, v.y, v.z);
- return 1;
- }
- return 0;
+ __syncthreads();
+ if (!isfinite(v.x) || !isfinite(v.y) || !isfinite(v.z)) {
+ printf("\n[%d,%d,%d]: Error: %s = %f, %f, %f\n",
+ x, y, z, desc, v.x, v.y, v.z);
+ return 1;
+ }
+ return 0;
}
// Initialize memory
t@@ -143,10 +143,10 @@ void DEM::freeNSmemDev()
void DEM::transferNStoGlobalDeviceMemory(int statusmsg)
{
checkForCudaErrors("Before attempting cudaMemcpy in "
- "transferNStoGlobalDeviceMemory");
+ "transferNStoGlobalDeviceMemory");
//if (verbose == 1 && statusmsg == 1)
- //std::cout << " Transfering fluid data to the device: ";
+ //std::cout << " Transfering fluid data to the device: ";
// memory size for a scalar field
unsigned int memSizeF = sizeof(Float)*NScells();
t@@ -162,7 +162,7 @@ void DEM::transferNStoGlobalDeviceMemory(int statusmsg)
checkForCudaErrors("End of transferNStoGlobalDeviceMemory");
//if (verbose == 1 && statusmsg == 1)
- //std::cout << "Done" << std::endl;
+ //std::cout << "Done" << std::endl;
}
// Transfer from device
t@@ -194,7 +194,7 @@ void DEM::transferNSfromGlobalDeviceMemory(int statusmsg)
void DEM::transferNSnormFromGlobalDeviceMemory()
{
cudaMemcpy(ns.norm, dev_ns_norm, sizeof(Float)*NScells(),
- cudaMemcpyDeviceToHost);
+ cudaMemcpyDeviceToHost);
checkForCudaErrors("End of transferNSnormFromGlobalDeviceMemory");
}
t@@ -202,7 +202,7 @@ void DEM::transferNSnormFromGlobalDeviceMemory()
void DEM::transferNSepsilonFromGlobalDeviceMemory()
{
cudaMemcpy(ns.epsilon, dev_ns_epsilon, sizeof(Float)*NScells(),
- cudaMemcpyDeviceToHost);
+ cudaMemcpyDeviceToHost);
checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
}
t@@ -210,13 +210,13 @@ void DEM::transferNSepsilonFromGlobalDeviceMemory()
void DEM::transferNSepsilonNewFromGlobalDeviceMemory()
{
cudaMemcpy(ns.epsilon_new, dev_ns_epsilon_new, sizeof(Float)*NScells(),
- cudaMemcpyDeviceToHost);
+ cudaMemcpyDeviceToHost);
checkForCudaErrors("End of transferNSepsilonFromGlobalDeviceMemory");
}
// Get linear index from 3D grid position
__inline__ __device__ unsigned int idx(
- const int x, const int y, const int z)
+ const int x, const int y, const int z)
{
// without ghost nodes
//return x + dev_grid.num[0]*y + dev_grid.num[0]*dev_grid.num[1]*z;
t@@ -229,11 +229,11 @@ __inline__ __device__ unsigned int idx(
// Get linear index of velocity node from 3D grid position in staggered grid
__inline__ __device__ unsigned int vidx(
- const int x, const int y, const int z)
+ const int x, const int y, const int z)
{
// without ghost nodes
//return x + (devC_grid.num[0]+1)*y
- //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z;
+ //+ (devC_grid.num[0]+1)*(devC_grid.num[1]+1)*z;
// with ghost nodes
// the ghost nodes are placed at x,y,z = -1 and WIDTH+1
t@@ -246,10 +246,10 @@ __inline__ __device__ unsigned int vidx(
// dev_ns_v or dev_ns_v_p array. This function does not set the averaged
// velocity values in the ghost node cells.
__global__ void findNSavgVel(
- Float3* dev_ns_v, // out
- Float* dev_ns_v_x, // in
- Float* dev_ns_v_y, // in
- Float* dev_ns_v_z) // in
+ Float3* __restrict__ dev_ns_v, // out
+ const Float* __restrict__ dev_ns_v_x, // in
+ const Float* __restrict__ dev_ns_v_y, // in
+ const Float* __restrict__ dev_ns_v_z) // in
{
// 3D thread index
t@@ -272,9 +272,9 @@ __global__ void findNSavgVel(
// Find average velocity using arithmetic means
const Float3 v_bar = MAKE_FLOAT3(
- amean(v_xn, v_xp),
- amean(v_yn, v_yp),
- amean(v_zn, v_zp));
+ amean(v_xn, v_xp),
+ amean(v_yn, v_yp),
+ amean(v_zn, v_zp));
// Save value
__syncthreads();
t@@ -287,10 +287,10 @@ __global__ void findNSavgVel(
// or dev_ns_v_p array. Make sure that the averaged velocity ghost nodes are set
// beforehand.
__global__ void findNScellFaceVel(
- Float3* dev_ns_v, // in
- Float* dev_ns_v_x, // out
- Float* dev_ns_v_y, // out
- Float* dev_ns_v_z) // out
+ const Float3* __restrict__ dev_ns_v, // in
+ Float* __restrict__ dev_ns_v_x, // out
+ Float* __restrict__ dev_ns_v_y, // out
+ Float* __restrict__ dev_ns_v_z) // out
{
// 3D thread index
t@@ -340,7 +340,9 @@ __global__ void findNScellFaceVel(
// Set the initial guess of the values of epsilon.
-__global__ void setNSepsilonInterior(Float* dev_ns_epsilon, Float value)
+__global__ void setNSepsilonInterior(
+ Float* __restrict__ dev_ns_epsilon,
+ const Float value)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -349,7 +351,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon, Float value)
// check that we are not outside the fluid grid
if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
- z > 0 && z < devC_grid.num[2]-1) {
+ z > 0 && z < devC_grid.num[2]-1) {
__syncthreads();
const unsigned int cellidx = idx(x,y,z);
dev_ns_epsilon[cellidx] = value;
t@@ -358,7 +360,7 @@ __global__ void setNSepsilonInterior(Float* dev_ns_epsilon, Float value)
// The normalized residuals are given an initial value of 0, since the values at
// the Dirichlet boundaries aren't written during the iterations.
-__global__ void setNSnormZero(Float* dev_ns_norm)
+__global__ void setNSnormZero(Float* __restrict__ dev_ns_norm)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -380,9 +382,9 @@ __global__ void setNSnormZero(Float* dev_ns_norm)
// the Dirichlet boundary condition: the new value should be identical to the
// old value, i.e. the temporal gradient is 0
__global__ void setNSepsilonBottom(
- Float* dev_ns_epsilon,
- Float* dev_ns_epsilon_new,
- const Float value)
+ Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_epsilon_new,
+ const Float value)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -408,9 +410,9 @@ __global__ void setNSepsilonBottom(
// the Dirichlet boundary condition: the new value should be identical to the
// old value, i.e. the temporal gradient is 0
__global__ void setNSepsilonTop(
- Float* dev_ns_epsilon,
- Float* dev_ns_epsilon_new,
- const Float value)
+ Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_epsilon_new,
+ const Float value)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -419,7 +421,7 @@ __global__ void setNSepsilonTop(
// check that we are not outside the fluid grid, and at the upper z boundary
if (x < devC_grid.num[0] && y < devC_grid.num[1] &&
- z == devC_grid.num[2]-1) {
+ z == devC_grid.num[2]-1) {
__syncthreads();
const unsigned int cellidx = idx(x,y,z);
t@@ -428,11 +430,14 @@ __global__ void setNSepsilonTop(
}
}
__device__ void copyNSvalsDev(
- unsigned int read, unsigned int write,
- Float* dev_ns_p,
- Float3* dev_ns_v, Float3* dev_ns_v_p,
- Float* dev_ns_phi, Float* dev_ns_dphi,
- Float* dev_ns_epsilon)
+ const unsigned int read,
+ const unsigned int write,
+ Float* __restrict__ dev_ns_p,
+ Float3* __restrict__ dev_ns_v,
+ Float3* __restrict__ dev_ns_v_p,
+ Float* __restrict__ dev_ns_phi,
+ Float* __restrict__ dev_ns_dphi,
+ Float* __restrict__ dev_ns_epsilon)
{
// Coalesced read
const Float p = dev_ns_p[read];
t@@ -457,10 +462,12 @@ __device__ void copyNSvalsDev(
// are not written since they are not read. Launch this kernel for all cells in
// the grid
__global__ void setNSghostNodesDev(
- Float* dev_ns_p,
- Float3* dev_ns_v, Float3* dev_ns_v_p,
- Float* dev_ns_phi, Float* dev_ns_dphi,
- Float* dev_ns_epsilon)
+ Float* __restrict__ dev_ns_p,
+ Float3* __restrict__ dev_ns_v,
+ Float3* __restrict__ dev_ns_v_p,
+ Float* __restrict__ dev_ns_phi,
+ Float* __restrict__ dev_ns_dphi,
+ Float* __restrict__ dev_ns_epsilon)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -484,72 +491,72 @@ __global__ void setNSghostNodesDev(
if (x == 0) {
writeidx = idx(nx,y,z);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
if (x == nx-1) {
writeidx = idx(-1,y,z);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
if (y == 0) {
writeidx = idx(x,ny,z);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
if (y == ny-1) {
writeidx = idx(x,-1,z);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
// Z boundaries fixed
if (z == 0) {
writeidx = idx(x,y,-1);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
if (z == nz-1) {
writeidx = idx(x,y,nz);
copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
}
// Z boundaries periodic
/*if (z == 0) {
- writeidx = idx(x,y,nz);
- copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
- }
- if (z == nz-1) {
- writeidx = idx(x,y,-1);
- copyNSvalsDev(cellidx, writeidx,
- dev_ns_p,
- dev_ns_v, dev_ns_v_p,
- dev_ns_phi, dev_ns_dphi,
- dev_ns_epsilon);
- }*/
+ writeidx = idx(x,y,nz);
+ copyNSvalsDev(cellidx, writeidx,
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
+ }
+ if (z == nz-1) {
+ writeidx = idx(x,y,-1);
+ copyNSvalsDev(cellidx, writeidx,
+ dev_ns_p,
+ dev_ns_v, dev_ns_v_p,
+ dev_ns_phi, dev_ns_dphi,
+ dev_ns_epsilon);
+ }*/
}
}
t@@ -557,7 +564,7 @@ __global__ void setNSghostNodesDev(
// (diagonal) cells are not written since they are not read. Launch this kernel
// for all cells in the grid usind setNSghostNodes<datatype><<<.. , ..>>>( .. );
template<typename T>
-__global__ void setNSghostNodes(T* dev_scalarfield)
+__global__ void setNSghostNodes(T* __restrict__ dev_scalarfield)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -586,10 +593,10 @@ __global__ void setNSghostNodes(T* dev_scalarfield)
if (z == 0)
dev_scalarfield[idx(x,y,-1)] = val; // Dirichlet
- //dev_scalarfield[idx(x,y,nz)] = val; // Periodic -z
+ //dev_scalarfield[idx(x,y,nz)] = val; // Periodic -z
if (z == nz-1)
dev_scalarfield[idx(x,y,nz)] = val; // Dirichlet
- //dev_scalarfield[idx(x,y,-1)] = val; // Periodic +z
+ //dev_scalarfield[idx(x,y,-1)] = val; // Periodic +z
}
}
t@@ -597,9 +604,9 @@ __global__ void setNSghostNodes(T* dev_scalarfield)
// (diagonal) cells are not written since they are not read.
template<typename T>
__global__ void setNSghostNodes(
- T* dev_scalarfield,
- int bc_bot,
- int bc_top)
+ T* __restrict__ dev_scalarfield,
+ const int bc_bot,
+ const int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -651,11 +658,11 @@ __global__ void setNSghostNodes(
// According to Griebel et al. 1998 "Numerical Simulation in Fluid Dynamics"
template<typename T>
__global__ void setNSghostNodesFace(
- T* dev_scalarfield_x,
- T* dev_scalarfield_y,
- T* dev_scalarfield_z,
- int bc_bot,
- int bc_top)
+ T* __restrict__ dev_scalarfield_x,
+ T* __restrict__ dev_scalarfield_y,
+ T* __restrict__ dev_scalarfield_z,
+ const int bc_bot,
+ const int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -849,9 +856,9 @@ __global__ void setNSghostNodesFace(
// The edge (diagonal) cells are not written since they are not read. Launch
// this kernel for all cells in the grid.
__global__ void setNSghostNodes_tau(
- Float* dev_ns_tau,
- int bc_bot,
- int bc_top)
+ Float* __restrict__ dev_ns_tau,
+ const int bc_bot,
+ const int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -981,108 +988,108 @@ __global__ void setNSghostNodes_tau(
// The edge (diagonal) cells are not written since they are not read. Launch
// this kernel for all cells in the grid.
/*
-__global__ void setNSghostNodesForcing(
- Float* dev_ns_f1,
- Float3* dev_ns_f2,
- Float* dev_ns_f,
- unsigned int nijac)
-
-{
- // 3D thread index
- const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
- const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
- const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
-
- // Grid dimensions
- const unsigned int nx = devC_grid.num[0];
- const unsigned int ny = devC_grid.num[1];
- const unsigned int nz = devC_grid.num[2];
-
- // 1D thread index
- unsigned int cellidx = idx(x,y,z);
-
- // check that we are not outside the fluid grid
- if (x < nx && y < ny && z < nz) {
-
- __syncthreads();
- const Float f = dev_ns_f[cellidx];
- Float f1;
- Float3 f2;
-
- if (nijac == 0) {
- __syncthreads();
- f1 = dev_ns_f1[cellidx];
- f2 = dev_ns_f2[cellidx];
- }
-
- if (x == 0) {
- cellidx = idx(nx,y,z);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
- if (x == nx-1) {
- cellidx = idx(-1,y,z);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
-
- if (y == 0) {
- cellidx = idx(x,ny,z);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
- if (y == ny-1) {
- cellidx = idx(x,-1,z);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
-
- if (z == 0) {
- cellidx = idx(x,y,nz);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
- if (z == nz-1) {
- cellidx = idx(x,y,-1);
- dev_ns_f[cellidx] = f;
- if (nijac == 0) {
- dev_ns_f1[cellidx] = f1;
- dev_ns_f2[cellidx] = f2;
- }
- }
- }
-}
+ __global__ void setNSghostNodesForcing(
+ Float* dev_ns_f1,
+ Float3* dev_ns_f2,
+ Float* dev_ns_f,
+ unsigned int nijac)
+
+ {
+ // 3D thread index
+ const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
+ const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
+ const unsigned int z = blockDim.z * blockIdx.z + threadIdx.z;
+
+ // Grid dimensions
+ const unsigned int nx = devC_grid.num[0];
+ const unsigned int ny = devC_grid.num[1];
+ const unsigned int nz = devC_grid.num[2];
+
+ // 1D thread index
+ unsigned int cellidx = idx(x,y,z);
+
+ // check that we are not outside the fluid grid
+ if (x < nx && y < ny && z < nz) {
+
+ __syncthreads();
+ const Float f = dev_ns_f[cellidx];
+ Float f1;
+ Float3 f2;
+
+ if (nijac == 0) {
+ __syncthreads();
+ f1 = dev_ns_f1[cellidx];
+ f2 = dev_ns_f2[cellidx];
+ }
+
+ if (x == 0) {
+ cellidx = idx(nx,y,z);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+ if (x == nx-1) {
+ cellidx = idx(-1,y,z);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+
+ if (y == 0) {
+ cellidx = idx(x,ny,z);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+ if (y == ny-1) {
+ cellidx = idx(x,-1,z);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+
+ if (z == 0) {
+ cellidx = idx(x,y,nz);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+ if (z == nz-1) {
+ cellidx = idx(x,y,-1);
+ dev_ns_f[cellidx] = f;
+ if (nijac == 0) {
+ dev_ns_f1[cellidx] = f1;
+ dev_ns_f2[cellidx] = f2;
+ }
+ }
+ }
+ }
*/
// Find the porosity in each cell on the base of a sphere, centered at the cell
// center.
__global__ void findPorositiesVelocitiesDiametersSpherical(
- const unsigned int* dev_cellStart,
- const unsigned int* dev_cellEnd,
- const Float4* dev_x_sorted,
- const Float4* dev_vel_sorted,
- Float* dev_ns_phi,
- Float* dev_ns_dphi,
- Float3* dev_ns_vp_avg,
- Float* dev_ns_d_avg,
- const unsigned int iteration,
- const unsigned int np,
- const Float c_phi)
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const Float4* __restrict__ dev_x_sorted,
+ const Float4* __restrict__ dev_vel_sorted,
+ Float* __restrict__ dev_ns_phi,
+ Float* __restrict__ dev_ns_dphi,
+ Float3* __restrict__ dev_ns_vp_avg,
+ Float* __restrict__ dev_ns_d_avg,
+ const unsigned int iteration,
+ const unsigned int np,
+ const Float c_phi)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1114,9 +1121,9 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
// Cell sphere center position
const Float3 X = MAKE_FLOAT3(
- x*dx + 0.5*dx,
- y*dy + 0.5*dy,
- z*dz + 0.5*dz);
+ x*dx + 0.5*dx,
+ y*dy + 0.5*dy,
+ z*dz + 0.5*dz);
Float d, r;
Float phi = 1.00;
t@@ -1143,12 +1150,12 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
// Iterate over 27 neighbor cells, R = cell width
/*for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
- for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
- for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/
+ for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
+ for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis*/
// Iterate over 27 neighbor cells, R = 2*cell width
for (int z_dim=-2; z_dim<3; ++z_dim) { // z-axis
- //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
+ //for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
for (int y_dim=-2; y_dim<3; ++y_dim) { // y-axis
for (int x_dim=-2; x_dim<3; ++x_dim) { // x-axis
t@@ -1186,9 +1193,9 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
// Find center distance
dist = MAKE_FLOAT3(
- X.x - xr.x,
- X.y - xr.y,
- X.z - xr.z);
+ X.x - xr.x,
+ X.y - xr.y,
+ X.z - xr.z);
dist += distmod;
d = length(dist);
t@@ -1196,10 +1203,10 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
if ((R - r) < d && d < (R + r)) {
void_volume -=
1.0/(12.0*d) * (
- M_PI*(R + r - d)*(R + r - d)
- *(d*d + 2.0*d*r - 3.0*r*r
- + 2.0*d*R + 6.0*r*R
- - 3.0*R*R) );
+ M_PI*(R + r - d)*(R + r - d)
+ *(d*d + 2.0*d*r - 3.0*r*r
+ + 2.0*d*R + 6.0*r*R
+ - 3.0*R*R) );
v_avg += MAKE_FLOAT3(v.x, v.y, v.z);
d_avg += 2.0*r;
n++;
t@@ -1259,8 +1266,8 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
//Float phi = 0.5;
//Float dphi = 0.0;
//if (iteration == 20 && x == nx/2 && y == ny/2 && z == nz/2) {
- //phi = 0.4;
- //dphi = 0.1;
+ //phi = 0.4;
+ //dphi = 0.1;
//}
//dev_ns_phi[cellidx] = phi;
//dev_ns_dphi[cellidx] = dphi;
t@@ -1276,17 +1283,17 @@ __global__ void findPorositiesVelocitiesDiametersSpherical(
// Find the porosity in each cell on the base of a sphere, centered at the cell
// center.
__global__ void findPorositiesVelocitiesDiametersSphericalGradient(
- const unsigned int* dev_cellStart,
- const unsigned int* dev_cellEnd,
- const Float4* dev_x_sorted,
- const Float4* dev_vel_sorted,
- Float* dev_ns_phi,
- Float* dev_ns_dphi,
- Float3* dev_ns_vp_avg,
- Float* dev_ns_d_avg,
- const unsigned int iteration,
- const unsigned int ndem,
- const unsigned int np)
+ const unsigned int* __restrict__ dev_cellStart,
+ const unsigned int* __restrict__ dev_cellEnd,
+ const Float4* __restrict__ dev_x_sorted,
+ const Float4* __restrict__ dev_vel_sorted,
+ Float* __restrict__ dev_ns_phi,
+ Float* __restrict__ dev_ns_dphi,
+ Float3* __restrict__ dev_ns_vp_avg,
+ Float* __restrict__ dev_ns_d_avg,
+ const unsigned int iteration,
+ const unsigned int ndem,
+ const unsigned int np)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1304,7 +1311,7 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
const Float dz = devC_grid.L[2]/nz;
// Cell sphere radius
- const Float R = fmin(dx, fmin(dy,dz)); // diameter = 2*cell width
+ const Float R = fmin(dx, fmin(dy,dz)); // diameter = 2*cell width
Float4 xr; // particle pos. and radius
t@@ -1315,9 +1322,9 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
// Cell sphere center position
const Float3 X = MAKE_FLOAT3(
- x*dx + 0.5*dx,
- y*dy + 0.5*dy,
- z*dz + 0.5*dz);
+ x*dx + 0.5*dx,
+ y*dy + 0.5*dy,
+ z*dz + 0.5*dz);
Float d, r;
Float phi = 1.00;
t@@ -1396,9 +1403,9 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
// Find center distance and normal vector
x_p = MAKE_FLOAT3(
- xr.x - X.x,
- xr.y - X.y,
- xr.z - X.z);
+ xr.x - X.x,
+ xr.y - X.y,
+ xr.z - X.z);
d = length(x_p);
n_p = x_p/d;
q = d/R;
t@@ -1438,8 +1445,8 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
phi = phi_0 + dphi/(ndem*devC_dt);
//if (dot_epsilon_kk != 0.0)
- //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n",
- //x,y,z, dot_epsilon_kk, dphi, phi);
+ //printf("%d,%d,%d\tdot_epsilon_kk = %f\tdphi = %f\tphi = %f\n",
+ //x,y,z, dot_epsilon_kk, dphi, phi);
// Make sure that the porosity is in the interval [0.0;1.0]
phi = fmin(1.00, fmax(0.00, phi));
t@@ -1484,11 +1491,11 @@ __global__ void findPorositiesVelocitiesDiametersSphericalGradient(
// Modulate the hydraulic pressure at the upper boundary
__global__ void setUpperPressureNS(
- Float* dev_ns_p,
- Float* dev_ns_epsilon,
- Float* dev_ns_epsilon_new,
- Float beta,
- const Float new_pressure)
+ Float* __restrict__ dev_ns_p,
+ Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_epsilon_new,
+ const Float beta,
+ const Float new_pressure)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1497,8 +1504,8 @@ __global__ void setUpperPressureNS(
// check that the thread is located at the top boundary
if (x < devC_grid.num[0] &&
- y < devC_grid.num[1] &&
- z == devC_grid.num[2]-1) {
+ y < devC_grid.num[1] &&
+ z == devC_grid.num[2]-1) {
const unsigned int cellidx = idx(x,y,z);
t@@ -1524,13 +1531,13 @@ __global__ void setUpperPressureNS(
// Find the gradient in a cell in a homogeneous, cubic 3D scalar field using
// finite central differences
__device__ Float3 gradient(
- const Float* dev_scalarfield,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float dx,
- const Float dy,
- const Float dz)
+ const Float* __restrict__ dev_scalarfield,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float dx,
+ const Float dy,
+ const Float dz)
{
// Read 6 neighbor cells
__syncthreads();
t@@ -1544,26 +1551,26 @@ __device__ Float3 gradient(
//__syncthreads();
//if (p != 0.0)
- //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p);
+ //printf("p[%d,%d,%d] =\t%f\n", x,y,z, p);
// Calculate central-difference gradients
return MAKE_FLOAT3(
- (xp - xn)/(2.0*dx),
- (yp - yn)/(2.0*dy),
- (zp - zn)/(2.0*dz));
+ (xp - xn)/(2.0*dx),
+ (yp - yn)/(2.0*dy),
+ (zp - zn)/(2.0*dz));
}
// Find the divergence in a cell in a homogeneous, cubic, 3D vector field
__device__ Float divergence(
- const Float* dev_vectorfield_x,
- const Float* dev_vectorfield_y,
- const Float* dev_vectorfield_z,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float dx,
- const Float dy,
- const Float dz)
+ const Float* __restrict__ dev_vectorfield_x,
+ const Float* __restrict__ dev_vectorfield_y,
+ const Float* __restrict__ dev_vectorfield_z,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float dx,
+ const Float dy,
+ const Float dz)
{
// Read 6 cell-face values
__syncthreads();
t@@ -1583,13 +1590,13 @@ __device__ Float divergence(
// Find the divergence of a tensor field
__device__ Float3 divergence_tensor(
- Float* dev_tensorfield,
- const unsigned int x,
- const unsigned int y,
- const unsigned int z,
- const Float dx,
- const Float dy,
- const Float dz)
+ const Float* __restrict__ dev_tensorfield,
+ const unsigned int x,
+ const unsigned int y,
+ const unsigned int z,
+ const Float dx,
+ const Float dy,
+ const Float dz)
{
__syncthreads();
t@@ -1638,18 +1645,18 @@ __device__ Float3 divergence_tensor(
// Calculate div(phi*tau)
const Float3 div_tensor = MAKE_FLOAT3(
- // x
- (t_xx_xp - t_xx_xn)/dx +
- (t_xy_yp - t_xy_yn)/dy +
- (t_xz_zp - t_xz_zn)/dz,
- // y
- (t_xy_xp - t_xy_xn)/dx +
- (t_yy_yp - t_yy_yn)/dy +
- (t_yz_zp - t_yz_zn)/dz,
- // z
- (t_xz_xp - t_xz_xn)/dx +
- (t_yz_yp - t_yz_yn)/dy +
- (t_zz_zp - t_zz_zn)/dz);
+ // x
+ (t_xx_xp - t_xx_xn)/dx +
+ (t_xy_yp - t_xy_yn)/dy +
+ (t_xz_zp - t_xz_zn)/dz,
+ // y
+ (t_xy_xp - t_xy_xn)/dx +
+ (t_yy_yp - t_yy_yn)/dy +
+ (t_yz_zp - t_yz_zn)/dz,
+ // z
+ (t_xz_xp - t_xz_xn)/dx +
+ (t_yz_yp - t_yz_yn)/dy +
+ (t_zz_zp - t_zz_zn)/dz);
#ifdef CHECK_NS_FINITE
(void)checkFiniteFloat3("div_tensor", x, y, z, div_tensor);
t@@ -1661,8 +1668,8 @@ __device__ Float3 divergence_tensor(
// Find the spatial gradient in e.g. pressures per cell
// using first order central differences
__global__ void findNSgradientsDev(
- Float* dev_scalarfield, // in
- Float3* dev_vectorfield) // out
+ const Float* __restrict__ dev_scalarfield, // in
+ Float3* __restrict__ dev_vectorfield) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1699,8 +1706,8 @@ __global__ void findNSgradientsDev(
// Find the outer product of v v
__global__ void findvvOuterProdNS(
- Float3* dev_ns_v, // in
- Float* dev_ns_v_prod) // out
+ const Float3* __restrict__ dev_ns_v, // in
+ Float* __restrict__ dev_ns_v_prod) // out
{
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int y = blockDim.y * blockIdx.y + threadIdx.y;
t@@ -1750,8 +1757,8 @@ __global__ void findvvOuterProdNS(
// Find the fluid stress tensor. It is symmetrical, and can thus be saved in 6
// values in 3D.
__global__ void findNSstressTensor(
- Float3* dev_ns_v, // in
- Float* dev_ns_tau) // out
+ const Float3* __restrict__ dev_ns_v, // in
+ Float* __restrict__ dev_ns_tau) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1809,15 +1816,15 @@ __global__ void findNSstressTensor(
devC_params.mu*((zp.y - zn.y)/(2.0*dz) + (yp.z - yn.z)/(2.0*dy));
/*
- if (x == 0 && y == 0 && z == 0)
- printf("mu = %f\n", mu);
- if (tau_xz > 1.0e-6)
- printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx);
- if (tau_yz > 1.0e-6)
- printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy);
- if (tau_zz > 1.0e-6)
- printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz);
- */
+ if (x == 0 && y == 0 && z == 0)
+ printf("mu = %f\n", mu);
+ if (tau_xz > 1.0e-6)
+ printf("%d,%d,%d\ttau_xx = %f\n", x,y,z, tau_xx);
+ if (tau_yz > 1.0e-6)
+ printf("%d,%d,%d\ttau_yy = %f\n", x,y,z, tau_yy);
+ if (tau_zz > 1.0e-6)
+ printf("%d,%d,%d\ttau_zz = %f\n", x,y,z, tau_zz);
+ */
// Store values in global memory
__syncthreads();
t@@ -1842,9 +1849,9 @@ __global__ void findNSstressTensor(
// Find the divergence of phi*v*v
__global__ void findNSdivphiviv(
- Float* dev_ns_phi, // in
- Float3* dev_ns_v, // in
- Float3* dev_ns_div_phi_vi_v) // out
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float3* __restrict__ dev_ns_v, // in
+ Float3* __restrict__ dev_ns_div_phi_vi_v) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -1888,58 +1895,58 @@ __global__ void findNSdivphiviv(
// Calculate upwind coefficients
//*
const Float3 a = MAKE_FLOAT3(
- copysign(1.0, v.x),
- copysign(1.0, v.y),
- copysign(1.0, v.z));
+ copysign(1.0, v.x),
+ copysign(1.0, v.y),
+ copysign(1.0, v.z));
// Calculate the divergence based on the upwind differences (Griebel et
// al. 1998, eq. 3.9)
const Float3 div_uw = MAKE_FLOAT3(
- // x
- ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) +
- (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) +
+ // x
+ ((1.0 + a.x)*(phi*v.x*v.x - phi_xn*v_xn.x*v_xn.x) +
+ (1.0 - a.x)*(phi_xp*v_xp.x*v_xp.x - phi*v.x*v.x))/(2.0*dx) +
- ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) +
- (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) +
+ ((1.0 + a.y)*(phi*v.x*v.y - phi_yn*v_yn.x*v_yn.y) +
+ (1.0 - a.y)*(phi_yp*v_yp.x*v_yp.y - phi*v.x*v.y))/(2.0*dy) +
- ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) +
- (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz),
+ ((1.0 + a.z)*(phi*v.x*v.z - phi_zn*v_zn.x*v_zn.z) +
+ (1.0 - a.z)*(phi_zp*v_zp.x*v_zp.z - phi*v.x*v.z))/(2.0*dz),
- // y
- ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) +
- (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) +
+ // y
+ ((1.0 + a.x)*(phi*v.y*v.x - phi_xn*v_xn.y*v_xn.x) +
+ (1.0 - a.x)*(phi_xp*v_xp.y*v_xp.x - phi*v.y*v.x))/(2.0*dx) +
- ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) +
- (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) +
+ ((1.0 + a.y)*(phi*v.y*v.y - phi_yn*v_yn.y*v_yn.y) +
+ (1.0 - a.y)*(phi_yp*v_yp.y*v_yp.y - phi*v.y*v.y))/(2.0*dy) +
- ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) +
- (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz),
+ ((1.0 + a.z)*(phi*v.y*v.z - phi_zn*v_zn.y*v_zn.z) +
+ (1.0 - a.z)*(phi_zp*v_zp.y*v_zp.z - phi*v.y*v.z))/(2.0*dz),
- // z
- ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) +
- (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) +
+ // z
+ ((1.0 + a.x)*(phi*v.z*v.x - phi_xn*v_xn.z*v_xn.x) +
+ (1.0 - a.x)*(phi_xp*v_xp.z*v_xp.x - phi*v.z*v.x))/(2.0*dx) +
- ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) +
- (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) +
+ ((1.0 + a.y)*(phi*v.z*v.y - phi_yn*v_yn.z*v_yn.y) +
+ (1.0 - a.y)*(phi_yp*v_yp.z*v_yp.y - phi*v.z*v.y))/(2.0*dy) +
- ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) +
- (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz));
+ ((1.0 + a.z)*(phi*v.z*v.z - phi_zn*v_zn.z*v_zn.z) +
+ (1.0 - a.z)*(phi_zp*v_zp.z*v_zp.z - phi*v.z*v.z))/(2.0*dz));
// Calculate the divergence based on the central-difference gradients
const Float3 div_cd = MAKE_FLOAT3(
- // x
- (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
- // y
- (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
- // z
- (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
+ // x
+ (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
+ // y
+ (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
+ // z
+ (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
// Weighting parameter
const Float tau = 0.5;
t@@ -1951,26 +1958,26 @@ __global__ void findNSdivphiviv(
/*
// Calculate the divergence: div(phi*v_i*v)
const Float3 div_phi_vi_v = MAKE_FLOAT3(
- // x
- (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
- // y
- (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
- // z
- (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
- (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
- (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
- // */
+ // x
+ (phi_xp*v_xp.x*v_xp.x - phi_xn*v_xn.x*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.x*v_yp.y - phi_yn*v_yn.x*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.x*v_zp.z - phi_zn*v_zn.x*v_zn.z)/(2.0*dz),
+ // y
+ (phi_xp*v_xp.y*v_xp.x - phi_xn*v_xn.y*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.y*v_yp.y - phi_yn*v_yn.y*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.y*v_zp.z - phi_zn*v_zn.y*v_zn.z)/(2.0*dz),
+ // z
+ (phi_xp*v_xp.z*v_xp.x - phi_xn*v_xn.z*v_xn.x)/(2.0*dx) +
+ (phi_yp*v_yp.z*v_yp.y - phi_yn*v_yn.z*v_yn.y)/(2.0*dy) +
+ (phi_zp*v_zp.z*v_zp.z - phi_zn*v_zn.z*v_zn.z)/(2.0*dz));
+ // */
// Write divergence
__syncthreads();
dev_ns_div_phi_vi_v[cellidx] = div_phi_vi_v;
//printf("div(phi*v*v) [%d,%d,%d] = %f, %f, %f\n", x,y,z,
- //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z);
+ //div_phi_vi_v.x, div_phi_vi_v.y, div_phi_vi_v.z);
#ifdef CHECK_NS_FINITE
(void)checkFiniteFloat3("div_phi_vi_v", x, y, z, div_phi_vi_v);
t@@ -1979,8 +1986,8 @@ __global__ void findNSdivphiviv(
}
__global__ void findNSdivtau(
- Float* dev_ns_tau, // in
- Float3* dev_ns_div_tau) // out
+ const Float* __restrict__ dev_ns_tau, // in
+ Float3* __restrict__ dev_ns_div_tau) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2015,9 +2022,9 @@ __global__ void findNSdivtau(
// Find the divergence of phi*tau
__global__ void findNSdivphitau(
- Float* dev_ns_phi, // in
- Float* dev_ns_tau, // in
- Float3* dev_ns_div_phi_tau) // out
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_tau, // in
+ Float3* __restrict__ dev_ns_div_phi_tau) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2094,18 +2101,18 @@ __global__ void findNSdivphitau(
// Calculate div(phi*tau)
const Float3 div_phi_tau = MAKE_FLOAT3(
- // x
- (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx +
- (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy +
- (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz,
- // y
- (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx +
- (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy +
- (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz,
- // z
- (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx +
- (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy +
- (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz);
+ // x
+ (phi_xp*tau_xx_xp - phi_xn*tau_xx_xn)/dx +
+ (phi_yp*tau_xy_yp - phi_yn*tau_xy_yn)/dy +
+ (phi_zp*tau_xz_zp - phi_zn*tau_xz_zn)/dz,
+ // y
+ (phi_xp*tau_xy_xp - phi_xn*tau_xy_xn)/dx +
+ (phi_yp*tau_yy_yp - phi_yn*tau_yy_yn)/dy +
+ (phi_zp*tau_yz_zp - phi_zn*tau_yz_zn)/dz,
+ // z
+ (phi_xp*tau_xz_xp - phi_xn*tau_xz_xn)/dx +
+ (phi_yp*tau_yz_yp - phi_yn*tau_yz_yn)/dy +
+ (phi_zp*tau_zz_zp - phi_zn*tau_zz_zn)/dz);
// Write divergence
__syncthreads();
t@@ -2120,9 +2127,9 @@ __global__ void findNSdivphitau(
// Find the divergence of phi v v
// Unused
__global__ void findNSdivphivv(
- Float* dev_ns_v_prod, // in
- Float* dev_ns_phi, // in
- Float3* dev_ns_div_phi_v_v) // out
+ const Float* __restrict__ dev_ns_v_prod, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ Float3* __restrict__ dev_ns_div_phi_v_v) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2174,24 +2181,24 @@ __global__ void findNSdivphivv(
// The symmetry described in findvvOuterProdNS is used
__syncthreads();
const Float3 div = MAKE_FLOAT3(
- ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp
- - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) +
- (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp
- - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) +
- (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp
- - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)),
- ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp
- - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) +
- (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp
- - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) +
- (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp
- - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)),
- ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp
- - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) +
- (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp
- - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) +
- (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp
- - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) );
+ ((dev_ns_v_prod[idx(x+1,y,z)*6]*phi_xp
+ - dev_ns_v_prod[idx(x-1,y,z)*6]*phi_xn)/(2.0*dx) +
+ (dev_ns_v_prod[idx(x,y+1,z)*6+1]*phi_yp
+ - dev_ns_v_prod[idx(x,y-1,z)*6+1]*phi_yn)/(2.0*dy) +
+ (dev_ns_v_prod[idx(x,y,z+1)*6+2]*phi_zp
+ - dev_ns_v_prod[idx(x,y,z-1)*6+2]*phi_zn)/(2.0*dz)),
+ ((dev_ns_v_prod[idx(x+1,y,z)*6+1]*phi_xp
+ - dev_ns_v_prod[idx(x-1,y,z)*6+1]*phi_xn)/(2.0*dx) +
+ (dev_ns_v_prod[idx(x,y+1,z)*6+3]*phi_yp
+ - dev_ns_v_prod[idx(x,y-1,z)*6+3]*phi_yn)/(2.0*dy) +
+ (dev_ns_v_prod[idx(x,y,z+1)*6+4]*phi_zp
+ - dev_ns_v_prod[idx(x,y,z-1)*6+4]*phi_zn)/(2.0*dz)),
+ ((dev_ns_v_prod[idx(x+1,y,z)*6+2]*phi_xp
+ - dev_ns_v_prod[idx(x-1,y,z)*6+2]*phi_xn)/(2.0*dx) +
+ (dev_ns_v_prod[idx(x,y+1,z)*6+4]*phi_yp
+ - dev_ns_v_prod[idx(x,y-1,z)*6+4]*phi_yn)/(2.0*dy) +
+ (dev_ns_v_prod[idx(x,y,z+1)*6+5]*phi_zp
+ - dev_ns_v_prod[idx(x,y,z-1)*6+5]*phi_zn)/(2.0*dz)) );
//printf("div[%d,%d,%d] = %f\t%f\t%f\n", x, y, z, div.x, div.y, div.z);
t@@ -2209,25 +2216,25 @@ __global__ void findNSdivphivv(
// Find predicted fluid velocity
// Launch per face.
__global__ void findPredNSvelocities(
- Float* dev_ns_p, // in
- Float* dev_ns_v_x, // in
- Float* dev_ns_v_y, // in
- Float* dev_ns_v_z, // in
- Float* dev_ns_phi, // in
- Float* dev_ns_dphi, // in
- Float* dev_ns_div_tau_x, // in
- Float* dev_ns_div_tau_y, // in
- Float* dev_ns_div_tau_z, // in
- Float3* dev_ns_div_phi_vi_v, // in
- int bc_bot, // in
- int bc_top, // in
- Float beta, // in
- Float3* dev_ns_F_pf, // in
- unsigned int ndem, // in
- Float c_grad_p, // in
- Float* dev_ns_v_p_x, // out
- Float* dev_ns_v_p_y, // out
- Float* dev_ns_v_p_z) // out
+ const Float* __restrict__ dev_ns_p, // in
+ const Float* __restrict__ dev_ns_v_x, // in
+ const Float* __restrict__ dev_ns_v_y, // in
+ const Float* __restrict__ dev_ns_v_z, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_dphi, // in
+ const Float* __restrict__ dev_ns_div_tau_x, // in
+ const Float* __restrict__ dev_ns_div_tau_y, // in
+ const Float* __restrict__ dev_ns_div_tau_z, // in
+ const Float3* __restrict__ dev_ns_div_phi_vi_v, // in
+ const int bc_bot, // in
+ const int bc_top, // in
+ const Float beta, // in
+ const Float3* __restrict__ dev_ns_F_pf, // in
+ const unsigned int ndem, // in
+ const Float __restrict__ c_grad_p, // in
+ Float* __restrict__ dev_ns_v_p_x, // out
+ Float* __restrict__ dev_ns_v_p_y, // out
+ Float* __restrict__ dev_ns_v_p_z) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2255,9 +2262,9 @@ __global__ void findPredNSvelocities(
// Values that are needed for calculating the predicted velocity
__syncthreads();
const Float3 v = MAKE_FLOAT3(
- dev_ns_v_x[fidx],
- dev_ns_v_y[fidx],
- dev_ns_v_z[fidx]);
+ dev_ns_v_x[fidx],
+ dev_ns_v_y[fidx],
+ dev_ns_v_z[fidx]);
Float3 div_tau = MAKE_FLOAT3(0.0, 0.0, 0.0);
if (devC_params.mu > 0.0) {
t@@ -2285,13 +2292,13 @@ __global__ void findPredNSvelocities(
// component-wise average values
const Float3 phi = MAKE_FLOAT3(
- amean(phi_c, phi_xn),
- amean(phi_c, phi_yn),
- amean(phi_c, phi_zn));
+ amean(phi_c, phi_xn),
+ amean(phi_c, phi_yn),
+ amean(phi_c, phi_zn));
const Float3 dphi = MAKE_FLOAT3(
- amean(dphi_c, dphi_xn),
- amean(dphi_c, dphi_yn),
- amean(dphi_c, dphi_zn));
+ amean(dphi_c, dphi_xn),
+ amean(dphi_c, dphi_yn),
+ amean(dphi_c, dphi_zn));
// The particle-fluid interaction force should only be incoorporated if
// there is a fluid viscosity
t@@ -2308,9 +2315,9 @@ __global__ void findPredNSvelocities(
f_i_zn = MAKE_FLOAT3(0.0, 0.0, 0.0);
}
const Float3 f_i = MAKE_FLOAT3(
- amean(f_i_c.x, f_i_xn.x),
- amean(f_i_c.y, f_i_yn.y),
- amean(f_i_c.z, f_i_zn.z));
+ amean(f_i_c.x, f_i_xn.x),
+ amean(f_i_c.y, f_i_yn.y),
+ amean(f_i_c.z, f_i_zn.z));
const Float dt = ndem*devC_dt;
const Float rho = devC_params.rho_f;
t@@ -2326,9 +2333,9 @@ __global__ void findPredNSvelocities(
const Float p_yn = dev_ns_p[idx(x,y-1,z)];
const Float p_zn = dev_ns_p[idx(x,y,z-1)];
const Float3 grad_p = MAKE_FLOAT3(
- (p - p_xn)/dx,
- (p - p_yn)/dy,
- (p - p_zn)/dz) * c_grad_p;
+ (p - p_xn)/dx,
+ (p - p_yn)/dy,
+ (p - p_zn)/dz) * c_grad_p;
#ifdef SET_1
pressure_term = -beta*dt/(rho*phi)*grad_p;
#endif
t@@ -2338,16 +2345,16 @@ __global__ void findPredNSvelocities(
}
const Float3 div_phi_vi_v = MAKE_FLOAT3(
- amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x),
- amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y),
- amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z));
+ amean(div_phi_vi_v_xn.x, div_phi_vi_v_c.x),
+ amean(div_phi_vi_v_yn.x, div_phi_vi_v_c.y),
+ amean(div_phi_vi_v_zn.x, div_phi_vi_v_c.z));
// Determine the predicted velocity
#ifdef SET_1
const Float3 interaction_term = -dt/(rho*phi)*f_i;
const Float3 diffusion_term = dt/(rho*phi)*div_tau;
const Float3 gravity_term = MAKE_FLOAT3(
- devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
+ devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
const Float3 porosity_term = -1.0*v*dphi/phi;
const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
#endif
t@@ -2355,7 +2362,7 @@ __global__ void findPredNSvelocities(
const Float3 interaction_term = -dt/(rho*phi)*f_i;
const Float3 diffusion_term = dt/rho*div_tau;
const Float3 gravity_term = MAKE_FLOAT3(
- devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
+ devC_params.g[0], devC_params.g[1], devC_params.g[2])*dt;
const Float3 porosity_term = -1.0*v*dphi/phi;
const Float3 advection_term = -1.0*div_phi_vi_v*dt/phi;
#endif
t@@ -2376,31 +2383,31 @@ __global__ void findPredNSvelocities(
// No slip
/*if ((z == 0 && bc_bot == 2) || (z == nz-1 && bc_top == 2)) {
- v_p.x = 0.0;
- v_p.y = 0.0;
- v_p.z = 0.0;
- }*/
+ v_p.x = 0.0;
+ v_p.y = 0.0;
+ v_p.z = 0.0;
+ }*/
#ifdef REPORT_V_P_COMPONENTS
// Report velocity components to stdout for debugging
if (z==0)
- printf("\n[%d,%d,%d]"
- "\tv_p = %+e %+e %+e\n"
- "\tpres = %+e %+e %+e\n"
- "\tinteract = %+e %+e %+e\n"
- "\tdiff = %+e %+e %+e\n"
- "\tgrav = %+e %+e %+e\n"
- "\tporos = %+e %+e %+e\n"
- "\tadv = %+e %+e %+e\n",
- x, y, z,
- v_p.x, v_p.y, v_p.z,
- pressure_term.x, pressure_term.y, pressure_term.z,
- interaction_term.x, interaction_term.y, interaction_term.z,
- diffusion_term.x, diffusion_term.y, diffusion_term.z,
- gravity_term.x, gravity_term.y, gravity_term.z,
- porosity_term.x, porosity_term.y, porosity_term.z,
- advection_term.x, advection_term.y, advection_term.z);
+ printf("\n[%d,%d,%d]"
+ "\tv_p = %+e %+e %+e\n"
+ "\tpres = %+e %+e %+e\n"
+ "\tinteract = %+e %+e %+e\n"
+ "\tdiff = %+e %+e %+e\n"
+ "\tgrav = %+e %+e %+e\n"
+ "\tporos = %+e %+e %+e\n"
+ "\tadv = %+e %+e %+e\n",
+ x, y, z,
+ v_p.x, v_p.y, v_p.z,
+ pressure_term.x, pressure_term.y, pressure_term.z,
+ interaction_term.x, interaction_term.y, interaction_term.z,
+ diffusion_term.x, diffusion_term.y, diffusion_term.z,
+ gravity_term.x, gravity_term.y, gravity_term.z,
+ porosity_term.x, porosity_term.y, porosity_term.z,
+ advection_term.x, advection_term.y, advection_term.z);
#endif
// Save the predicted velocity
t@@ -2421,18 +2428,18 @@ __global__ void findPredNSvelocities(
// At each iteration, the value of the forcing function is found as:
// f = f1 - f2 dot grad(epsilon)
__global__ void findNSforcing(
- Float* dev_ns_epsilon, // in
- Float* dev_ns_phi, // in
- Float* dev_ns_dphi, // in
- Float3* dev_ns_v_p, // in
- Float* dev_ns_v_p_x, // in
- Float* dev_ns_v_p_y, // in
- Float* dev_ns_v_p_z, // in
- unsigned int nijac, // in
- unsigned int ndem, // in
- Float* dev_ns_f1, // out
- Float3* dev_ns_f2, // out
- Float* dev_ns_f) // out
+ const Float* __restrict__ dev_ns_epsilon, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_dphi, // in
+ const Float3* __restrict__ dev_ns_v_p, // in
+ const Float* __restrict__ dev_ns_v_p_x, // in
+ const Float* __restrict__ dev_ns_v_p_y, // in
+ const Float* __restrict__ dev_ns_v_p_z, // in
+ const unsigned int nijac, // in
+ const unsigned int ndem, // in
+ Float* __restrict__ dev_ns_f1, // out
+ Float3* __restrict__ dev_ns_f2, // out
+ Float* __restrict__ dev_ns_f) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2472,7 +2479,7 @@ __global__ void findNSforcing(
// Calculate derivatives
const Float div_v_p
= divergence(dev_ns_v_p_x, dev_ns_v_p_y, dev_ns_v_p_z,
- x, y, z, dx, dy, dz);
+ x, y, z, dx, dy, dz);
const Float3 grad_phi
= gradient(dev_ns_phi, x, y, z, dx, dy, dz);
t@@ -2497,7 +2504,7 @@ __global__ void findNSforcing(
#ifdef REPORT_FORCING_TERMS
// Report values terms in the forcing function for debugging
printf("[%d,%d,%d]\tt1 = %f\tt2 = %f\tt4 = %f\n",
- x,y,z, t1, t2, t4);
+ x,y,z, t1, t2, t4);
#endif
// Save values
t@@ -2524,7 +2531,7 @@ __global__ void findNSforcing(
const Float t3 = -dot(f2, grad_epsilon);
if (z >= nz-3)
printf("[%d,%d,%d]\tf = %f\tf1 = %f\tt3 = %f\n",
- x,y,z, f, f1, t3);
+ x,y,z, f, f1, t3);
#endif
// Save forcing function value
t@@ -2542,10 +2549,10 @@ __global__ void findNSforcing(
// non-smoothed and smoothed values.
template<typename T>
__global__ void smoothing(
- T* dev_arr,
- const Float gamma,
- const unsigned int bc_bot,
- const unsigned int bc_top)
+ T* __restrict__ dev_arr,
+ const Float gamma,
+ const unsigned int bc_bot,
+ const unsigned int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2602,13 +2609,13 @@ __global__ void smoothing(
// Perform a single Jacobi iteration
__global__ void jacobiIterationNS(
- const Float* dev_ns_epsilon,
- Float* dev_ns_epsilon_new,
- Float* dev_ns_norm,
- const Float* dev_ns_f,
- const int bc_bot,
- const int bc_top,
- const Float theta)
+ const Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_epsilon_new,
+ Float* __restrict__ dev_ns_norm,
+ const Float* __restrict__ dev_ns_f,
+ const int bc_bot,
+ const int bc_top,
+ const Float theta)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2672,9 +2679,9 @@ __global__ void jacobiIterationNS(
const Float dzdz = dz*dz;
Float e_new
= (-dxdx*dydy*dzdz*f
- + dydy*dzdz*(e_xn + e_xp)
- + dxdx*dzdz*(e_yn + e_yp)
- + dxdx*dydy*(e_zn + e_zp))
+ + dydy*dzdz*(e_xn + e_xp)
+ + dxdx*dzdz*(e_yn + e_yp)
+ + dxdx*dydy*(e_zn + e_zp))
/(2.0*(dxdx*dydy + dxdx*dzdz + dydy*dzdz));
// New value of epsilon in 1D update
t@@ -2682,7 +2689,7 @@ __global__ void jacobiIterationNS(
// Print values for debugging
/*printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\n",
- x,y,z, e, f, e_new);*/
+ x,y,z, e, f, e_new);*/
const Float res_norm = (e_new - e)*(e_new - e)/(e_new*e_new + 1.0e-16);
const Float e_relax = e*(1.0-theta) + e_new*theta;
t@@ -2697,7 +2704,7 @@ __global__ void jacobiIterationNS(
//(void)checkFiniteFloat("res_norm", x, y, z, res_norm);
if (checkFiniteFloat("res_norm", x, y, z, res_norm)) {
printf("[%d,%d,%d]\t e = %f\tf = %f\te_new = %f\tres_norm = %f\n",
- x,y,z, e, f, e_new, res_norm);
+ x,y,z, e, f, e_new, res_norm);
}
#endif
}
t@@ -2706,8 +2713,8 @@ __global__ void jacobiIterationNS(
// Copy all values from one array to the other
template<typename T>
__global__ void copyValues(
- T* dev_read,
- T* dev_write)
+ const T* __restrict__ dev_read,
+ T* __restrict__ dev_write)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2717,10 +2724,10 @@ __global__ void copyValues(
// Internal nodes only
if (x < devC_grid.num[0] && y < devC_grid.num[1] && z < devC_grid.num[2]) {
- // Internal nodes + ghost nodes
- /*if (x <= devC_grid.num[0]+1 &&
- y <= devC_grid.num[1]+1 &&
- z <= devC_grid.num[2]+1) {*/
+ // Internal nodes + ghost nodes
+ /*if (x <= devC_grid.num[0]+1 &&
+ y <= devC_grid.num[1]+1 &&
+ z <= devC_grid.num[2]+1) {*/
const unsigned int cellidx = idx(x,y,z); // without ghost nodes
//const unsigned int cellidx = idx(x-1,y-1,z-1); // with ghost nodes
t@@ -2730,7 +2737,7 @@ __global__ void copyValues(
const T val = dev_read[cellidx];
//if (z == devC_grid.num[2]-1)
- //printf("[%d,%d,%d] = %f\n", x, y, z, val);
+ //printf("[%d,%d,%d] = %f\n", x, y, z, val);
// Write
__syncthreads();
t@@ -2740,11 +2747,11 @@ __global__ void copyValues(
// Find and store the normalized residuals
__global__ void findNormalizedResiduals(
- Float* dev_ns_epsilon_old,
- Float* dev_ns_epsilon,
- Float* dev_ns_norm,
- const unsigned int bc_bot,
- const unsigned int bc_top)
+ const Float* __restrict__ dev_ns_epsilon_old,
+ const Float* __restrict__ dev_ns_epsilon,
+ Float* __restrict__ dev_ns_norm,
+ const unsigned int bc_bot,
+ const unsigned int bc_top)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2791,9 +2798,9 @@ __global__ void findNormalizedResiduals(
// Computes the new velocity and pressure using the corrector
__global__ void updateNSpressure(
- Float* dev_ns_epsilon, // in
- Float beta, // in
- Float* dev_ns_p) // out
+ const Float* __restrict__ dev_ns_epsilon, // in
+ const Float __restrict__ beta, // in
+ Float* __restrict__ dev_ns_p) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2825,19 +2832,19 @@ __global__ void updateNSpressure(
}
__global__ void updateNSvelocity(
- Float* dev_ns_v_p_x, // in
- Float* dev_ns_v_p_y, // in
- Float* dev_ns_v_p_z, // in
- Float* dev_ns_phi, // in
- Float* dev_ns_epsilon, // in
- Float beta, // in
- int bc_bot, // in
- int bc_top, // in
- unsigned int ndem, // in
- Float c_grad_p, // in
- Float* dev_ns_v_x, // out
- Float* dev_ns_v_y, // out
- Float* dev_ns_v_z) // out
+ const Float* __restrict__ dev_ns_v_p_x, // in
+ const Float* __restrict__ dev_ns_v_p_y, // in
+ const Float* __restrict__ dev_ns_v_p_z, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_epsilon, // in
+ const Float beta, // in
+ const int bc_bot, // in
+ const int bc_top, // in
+ const unsigned int ndem, // in
+ const Float c_grad_p, // in
+ Float* __restrict__ dev_ns_v_x, // out
+ Float* __restrict__ dev_ns_v_y, // out
+ Float* __restrict__ dev_ns_v_z) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -2879,15 +2886,15 @@ __global__ void updateNSvelocity(
const Float phi_zn = dev_ns_phi[idx(x,y,z-1)];
const Float3 phi = MAKE_FLOAT3(
- amean(phi_c, phi_xn),
- amean(phi_c, phi_yn),
- amean(phi_c, phi_zn));
+ amean(phi_c, phi_xn),
+ amean(phi_c, phi_yn),
+ amean(phi_c, phi_zn));
// Find corrector gradient
const Float3 grad_epsilon = MAKE_FLOAT3(
- (epsilon_c - epsilon_xn)/dx,
- (epsilon_c - epsilon_yn)/dy,
- (epsilon_c - epsilon_zn)/dz) * c_grad_p;
+ (epsilon_c - epsilon_xn)/dx,
+ (epsilon_c - epsilon_yn)/dy,
+ (epsilon_c - epsilon_zn)/dz) * c_grad_p;
// Find new velocity
#ifdef SET_1
t@@ -2900,16 +2907,16 @@ __global__ void updateNSvelocity(
// Print values for debugging
/* if (z == 0) {
- Float e_up = dev_ns_epsilon[idx(x,y,z+1)];
- Float e_down = dev_ns_epsilon[idx(x,y,z-1)];
- printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n",
- x,y,z,
- grad_epsilon.x,
- grad_epsilon.y,
- grad_epsilon.z,
- e_up,
- e_down);
- }*/
+ Float e_up = dev_ns_epsilon[idx(x,y,z+1)];
+ Float e_down = dev_ns_epsilon[idx(x,y,z-1)];
+ printf("[%d,%d,%d]\tgrad_e = %f,%f,%f\te_up = %f\te_down = %f\n",
+ x,y,z,
+ grad_epsilon.x,
+ grad_epsilon.y,
+ grad_epsilon.z,
+ e_up,
+ e_down);
+ }*/
if ((z == 0 && bc_bot == 1) || (z == nz-1 && bc_top == 1))
v.z = 0.0;
t@@ -2919,11 +2926,11 @@ __global__ void updateNSvelocity(
// Check the advection term using the Courant-Friedrichs-Lewy condition
if (v.x*ndem*devC_dt/dx
- + v.y*ndem*devC_dt/dy
- + v.z*ndem*devC_dt/dz > 1.0) {
+ + v.y*ndem*devC_dt/dy
+ + v.z*ndem*devC_dt/dz > 1.0) {
printf("[%d,%d,%d] Warning: Advection term in fluid may be "
- "unstable (CFL condition), v = %f,%f,%f\n",
- x,y,z, v.x, v.y, v.z);
+ "unstable (CFL condition), v = %f,%f,%f\n",
+ x,y,z, v.x, v.y, v.z);
}
// Write new values
t@@ -2941,12 +2948,12 @@ __global__ void updateNSvelocity(
// Find the average particle diameter and velocity for each CFD cell.
// UNUSED: The values are estimated in the porosity estimation function instead
__global__ void findAvgParticleVelocityDiameter(
- unsigned int* dev_cellStart, // in
- unsigned int* dev_cellEnd, // in
- Float4* dev_vel_sorted, // in
- Float4* dev_x_sorted, // in
- Float3* dev_ns_vp_avg, // out
- Float* dev_ns_d_avg) // out
+ const unsigned int* __restrict__ dev_cellStart, // in
+ const unsigned int* __restrict__ dev_cellEnd, // in
+ const Float4* __restrict__ dev_vel_sorted, // in
+ const Float4* __restrict__ dev_x_sorted, // in
+ Float3* dev_ns_vp_avg, // out
+ Float* dev_ns_d_avg) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3023,18 +3030,18 @@ __device__ Float dragCoefficient(Float re)
// Find particle-fluid interaction force as outlined by Zhou et al. 2010, and
// originally by Gidaspow 1992.
__global__ void findInteractionForce(
- Float4* dev_x, // in
- Float4* dev_vel, // in
- Float* dev_ns_phi, // in
- Float* dev_ns_p, // in
- Float* dev_ns_v_x, // in
- Float* dev_ns_v_y, // in
- Float* dev_ns_v_z, // in
- Float* dev_ns_div_tau_x,// in
- Float* dev_ns_div_tau_y,// in
- Float* dev_ns_div_tau_z,// in
- Float3* dev_ns_f_pf, // out
- Float4* dev_force) // out
+ const Float4* __restrict__ dev_x, // in
+ const Float4* __restrict__ dev_vel, // in
+ const Float* __restrict__ dev_ns_phi, // in
+ const Float* __restrict__ dev_ns_p, // in
+ const Float* __restrict__ dev_ns_v_x, // in
+ const Float* __restrict__ dev_ns_v_y, // in
+ const Float* __restrict__ dev_ns_v_z, // in
+ const Float* __restrict__ dev_ns_div_tau_x,// in
+ const Float* __restrict__ dev_ns_div_tau_y,// in
+ const Float* __restrict__ dev_ns_div_tau_z,// in
+ Float3* __restrict__ dev_ns_f_pf, // out
+ Float4* __restrict__ dev_force) // out
{
unsigned int i = threadIdx.x + blockIdx.x*blockDim.x; // Particle index
t@@ -3080,14 +3087,14 @@ __global__ void findInteractionForce(
const Float div_tau_z_p = dev_ns_div_tau_z[vidx(i_x,i_y,i_z+1)];
const Float3 v_f = MAKE_FLOAT3(
- amean(v_x, v_x_p),
- amean(v_y, v_y_p),
- amean(v_z, v_z_p));
+ amean(v_x, v_x_p),
+ amean(v_y, v_y_p),
+ amean(v_z, v_z_p));
const Float3 div_tau = MAKE_FLOAT3(
- amean(div_tau_x, div_tau_x_p),
- amean(div_tau_y, div_tau_y_p),
- amean(div_tau_z, div_tau_z_p));
+ amean(div_tau_x, div_tau_x_p),
+ amean(div_tau_y, div_tau_y_p),
+ amean(div_tau_z, div_tau_z_p));
const Float3 v_rel = v_f - v_p;
const Float v_rel_length = length(v_rel);
t@@ -3119,15 +3126,15 @@ __global__ void findInteractionForce(
#ifdef CHECK_NS_FINITE
/*
- printf("\nfindInteractionForce %d [%d,%d,%d]\n"
- "\tV_p = %f Re=%f Cd=%f chi=%f\n"
- "\tf_d = %+e %+e %+e\n"
- "\tf_p = %+e %+e %+e\n"
- "\tf_v = %+e %+e %+e\n",
- i, i_x, i_y, i_z, V_p, Re, Cd, chi,
- f_d.x, f_d.y, f_d.z,
- f_p.x, f_p.y, f_p.z,
- f_v.x, f_v.y, f_v.z);// */
+ printf("\nfindInteractionForce %d [%d,%d,%d]\n"
+ "\tV_p = %f Re=%f Cd=%f chi=%f\n"
+ "\tf_d = %+e %+e %+e\n"
+ "\tf_p = %+e %+e %+e\n"
+ "\tf_v = %+e %+e %+e\n",
+ i, i_x, i_y, i_z, V_p, Re, Cd, chi,
+ f_d.x, f_d.y, f_d.z,
+ f_p.x, f_p.y, f_p.z,
+ f_v.x, f_v.y, f_v.z);// */
checkFiniteFloat3("f_d", i_x, i_y, i_z, f_d);
checkFiniteFloat3("f_p", i_x, i_y, i_z, f_p);
checkFiniteFloat3("f_v", i_x, i_y, i_z, f_v);
t@@ -3149,11 +3156,11 @@ __global__ void findInteractionForce(
// Apply the fluid-particle interaction force to the fluid cell based on the
// interaction forces from each particle in it
__global__ void applyInteractionForceToFluid(
- unsigned int* dev_gridParticleIndex, // in
- unsigned int* dev_cellStart, // in
- unsigned int* dev_cellEnd, // in
- Float3* dev_ns_f_pf, // in
- Float3* dev_ns_F_pf) // out
+ const unsigned int* __restrict__ dev_gridParticleIndex, // in
+ const unsigned int* __restrict__ dev_cellStart, // in
+ const unsigned int* __restrict__ dev_cellEnd, // in
+ const Float3* __restrict__ dev_ns_f_pf, // in
+ Float3* __restrict__ dev_ns_F_pf) // out
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3212,10 +3219,10 @@ __global__ void applyInteractionForceToFluid(
// Launch per cell face node.
// Cell center ghost nodes must be set prior to call.
__global__ void interpolateCenterToFace(
- Float3* dev_in,
- Float* dev_out_x,
- Float* dev_out_y,
- Float* dev_out_z)
+ const Float3* __restrict__ dev_in,
+ Float* __restrict__ dev_out_x,
+ Float* __restrict__ dev_out_y,
+ Float* __restrict__ dev_out_z)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3224,8 +3231,8 @@ __global__ void interpolateCenterToFace(
// Check that we are not outside the fluid grid
if (x <= devC_grid.num[0]
- && y <= devC_grid.num[1]
- && z <= devC_grid.num[2]) {
+ && y <= devC_grid.num[1]
+ && z <= devC_grid.num[2]) {
const unsigned int faceidx = vidx(x,y,z);
t@@ -3249,10 +3256,10 @@ __global__ void interpolateCenterToFace(
// Launch per cell center node
__global__ void interpolateFaceToCenter(
- Float* dev_in_x,
- Float* dev_in_y,
- Float* dev_in_z,
- Float3* dev_out)
+ Float* dev_in_x,
+ Float* dev_in_y,
+ Float* dev_in_z,
+ Float3* dev_out)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3277,9 +3284,9 @@ __global__ void interpolateFaceToCenter(
const Float z_p = dev_in_z[vidx(x,y,z+1)];
const Float3 val = MAKE_FLOAT3(
- amean(x_n, x_p),
- amean(y_n, y_p),
- amean(z_n, z_p));
+ amean(x_n, x_p),
+ amean(y_n, y_p),
+ amean(z_n, z_p));
__syncthreads();
//printf("[%d,%d,%d] = %f, %f, %f\n", x,y,z, val.x, val.y, val.z);
t@@ -3292,12 +3299,12 @@ __global__ void interpolateFaceToCenter(
// Warning: The grid-corner values will be invalid, along with the non-normal
// components of the ghost nodes
__global__ void findFaceDivTau(
- Float* dev_ns_v_x,
- Float* dev_ns_v_y,
- Float* dev_ns_v_z,
- Float* dev_ns_div_tau_x,
- Float* dev_ns_div_tau_y,
- Float* dev_ns_div_tau_z)
+ const Float* __restrict__ dev_ns_v_x,
+ const Float* __restrict__ dev_ns_v_y,
+ const Float* __restrict__ dev_ns_v_z,
+ Float* __restrict__ dev_ns_div_tau_x,
+ Float* __restrict__ dev_ns_div_tau_y,
+ Float* __restrict__ dev_ns_div_tau_z)
{
// 3D thread index
const unsigned int x = blockDim.x * blockIdx.x + threadIdx.x;
t@@ -3347,19 +3354,19 @@ __global__ void findFaceDivTau(
const Float div_tau_x =
devC_params.mu*(
- (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
- (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
- (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
+ (v_x_xp - 2.0*v_x + v_x_xn)/(dx*dx) +
+ (v_x_yp - 2.0*v_x + v_x_yn)/(dy*dy) +
+ (v_x_zp - 2.0*v_x + v_x_zn)/(dz*dz));
const Float div_tau_y =
devC_params.mu*(
- (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
- (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
- (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
+ (v_y_xp - 2.0*v_y + v_y_xn)/(dx*dx) +
+ (v_y_yp - 2.0*v_y + v_y_yn)/(dy*dy) +
+ (v_y_zp - 2.0*v_y + v_y_zn)/(dz*dz));
const Float div_tau_z =
devC_params.mu*(
- (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
- (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
- (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));
+ (v_z_xp - 2.0*v_z + v_z_xn)/(dx*dx) +
+ (v_z_yp - 2.0*v_z + v_z_yn)/(dy*dy) +
+ (v_z_zp - 2.0*v_z + v_z_zn)/(dz*dz));
__syncthreads();
//printf("div_tau [%d,%d,%d] = %f, %f, %f\n", x,y,z,
(DIR) diff --git a/src/raytracer.cuh b/src/raytracer.cuh
t@@ -6,12 +6,14 @@
//#include "cuPrintf.cu"
// Template for discarding the last term in four-component vector structs
-__device__ __inline__ float3 f4_to_f3(float4 in) {
+__device__ __inline__ float3 f4_to_f3(const float4 in) {
return make_float3(in.x, in.y, in.z);
}
// Kernel for initializing image data
-__global__ void imageInit(unsigned char* dev_img, unsigned int pixels)
+__global__ void imageInit(
+ unsigned char* __restrict__ dev_img,
+ const unsigned int pixels)
{
// Compute pixel position from threadIdx/blockIdx
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -24,11 +26,12 @@ __global__ void imageInit(unsigned char* dev_img, unsigned int pixels)
}
// Calculate ray origins and directions
-__global__ void rayInitPerspective(float4* dev_ray_origo,
- float4* dev_ray_direction,
- float4 eye,
- unsigned int width,
- unsigned int height)
+__global__ void rayInitPerspective(
+ float4* __restrict__ dev_ray_origo,
+ float4* __restrict__ dev_ray_direction,
+ const float4 eye,
+ const unsigned int width,
+ const unsigned int height)
{
// Compute pixel position from threadIdx/blockIdx
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -47,15 +50,17 @@ __global__ void rayInitPerspective(float4* dev_ray_origo,
// Write ray origo and direction to global memory
dev_ray_origo[mempos] = make_float4(devC_eye, 0.0f);
- dev_ray_direction[mempos] = make_float4(-devC_d*devC_w + p_u*devC_u + p_v*devC_v, 0.0f);
+ dev_ray_direction[mempos] =
+ make_float4(-devC_d*devC_w + p_u*devC_u + p_v*devC_v, 0.0f);
}
// Check wether the pixel's viewing ray intersects with the spheres,
// and shade the pixel correspondingly
-__global__ void rayIntersectSpheres(float4* dev_ray_origo,
- float4* dev_ray_direction,
- Float4* dev_x,
- unsigned char* dev_img)
+__global__ void rayIntersectSpheres(
+ const float4* __restrict__ dev_ray_origo,
+ const float4* __restrict__ dev_ray_direction,
+ const Float4* __restrict__ dev_x,
+ unsigned char* __restrict__ dev_img)
{
// Compute pixel position from threadIdx/blockIdx
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -67,7 +72,8 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
float3 d = f4_to_f3(dev_ray_direction[mempos]);
//float step = length(d);
- // Distance, in ray steps, between object and eye initialized with a large value
+ // Distance, in ray steps, between object and eye initialized with
+ // a large value
float tdist = 1e10f;
// Surface normal at closest sphere intersection
t@@ -86,8 +92,6 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
float3 c = make_float3(x.x, x.y, x.z);
float R = x.w;
- //cuPrintf("particle %d at: %f, %f, %f, radius: %f\n", i, c.x, c.y, c.z, R);
-
// Calculate the discriminant: d = B^2 - 4AC
float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
- 4.0f*dot(d,d) // -4*A
t@@ -98,8 +102,10 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
if (Delta > 0.0f) {
// Calculate roots, Shirley 2009 p. 77
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
+ float t_minus = ((dot(-d,(e-c))
+ - sqrt( dot(d,(e-c))*dot(d,(e-c))
+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)))
+ / dot(d,d));
// Check wether intersection is closer than previous values
if (fabs(t_minus) < tdist) {
t@@ -137,14 +143,15 @@ __global__ void rayIntersectSpheres(float4* dev_ray_origo,
// Check wether the pixel's viewing ray intersects with the spheres,
// and shade the pixel correspondingly using a colormap
-__global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
- float4* dev_ray_direction,
- Float4* dev_x,
- Float4* dev_vel,
- Float* dev_linarr,
- float max_val,
- float lower_cutoff,
- unsigned char* dev_img)
+__global__ void rayIntersectSpheresColormap(
+ const float4* __restrict__ dev_ray_origo,
+ const float4* __restrict__ dev_ray_direction,
+ const Float4* __restrict__ dev_x,
+ const Float4* __restrict__ dev_vel,
+ const Float* __restrict__ dev_linarr,
+ const float max_val,
+ const float lower_cutoff,
+ unsigned char* __restrict__ dev_img)
{
// Compute pixel position from threadIdx/blockIdx
unsigned int mempos = threadIdx.x + blockIdx.x * blockDim.x;
t@@ -155,7 +162,8 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
float3 e = f4_to_f3(dev_ray_origo[mempos]);
float3 d = f4_to_f3(dev_ray_direction[mempos]);
- // Distance, in ray steps, between object and eye initialized with a large value
+ // Distance, in ray steps, between object and eye initialized with
+ // a large value
float tdist = 1e10f;
// Surface normal at closest sphere intersection
t@@ -181,8 +189,6 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
- 4.0f*dot(d,d) // -4*A
* (dot((e-c),(e-c)) - R*R); // C
-
-
// If the determinant is positive, there are two solutions
// One where the line enters the sphere, and one where it exits
if (lower_cutoff > 0.0) {
t@@ -198,8 +204,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
if (Delta > 0.0f && val > lower_cutoff && fixvel == 0.f) {
// Calculate roots, Shirley 2009 p. 77
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
+ float t_minus =
+ ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c))
+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)))
+ / dot(d,d));
// Check wether intersection is closer than previous values
if (fabs(t_minus) < tdist) {
t@@ -217,8 +225,10 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
if (Delta > 0.0f) {
// Calculate roots, Shirley 2009 p. 77
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
- * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
+ float t_minus =
+ ((dot(-d,(e-c)) - sqrt(dot(d,(e-c))*dot(d,(e-c))
+ - dot(d,d)*(dot((e-c),(e-c)) - R*R)))
+ / dot(d,d));
// Check wether intersection is closer than previous values
if (fabs(t_minus) < tdist) {
t@@ -269,25 +279,28 @@ __global__ void rayIntersectSpheresColormap(float4* dev_ray_origo,
// Write shading model values to pixel color channels
dev_img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*redv);
+ + k_a * I_a)*redv);
dev_img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*greenv);
+ + k_a * I_a)*greenv);
dev_img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*bluev);
+ + k_a * I_a)*bluev);
}
}
__host__ void DEM::cameraInit(
- const float3 eye,
- const float3 lookat,
- const float imgw,
- const float focalLength)
+ const float3 eye,
+ const float3 lookat,
+ const float imgw,
+ const float focalLength)
{
float hw_ratio = height/width;
// Image dimensions in world space (l, r, b, t)
- float4 imgplane = make_float4(-0.5f*imgw, 0.5f*imgw, -0.5f*imgw*hw_ratio, 0.5f*imgw*hw_ratio);
+ float4 imgplane = make_float4(
+ -0.5f*imgw, 0.5f*imgw,
+ -0.5f*imgw*hw_ratio,
+ 0.5f*imgw*hw_ratio);
// The view vector
float3 view = eye - lookat;
t@@ -309,15 +322,6 @@ __host__ void DEM::cameraInit(
if (verbose == 1)
std::cout << " Transfering camera values to constant memory: ";
- /* Reference by string removed in cuda 5.0
- cudaMemcpyToSymbol("devC_u", &u, sizeof(u));
- cudaMemcpyToSymbol("devC_v", &v, sizeof(v));
- cudaMemcpyToSymbol("devC_w", &w, sizeof(w));
- cudaMemcpyToSymbol("devC_eye", &eye, sizeof(eye));
- cudaMemcpyToSymbol("devC_imgplane", &imgplane, sizeof(imgplane));
- cudaMemcpyToSymbol("devC_d", &d, sizeof(d));
- cudaMemcpyToSymbol("devC_light", &light, sizeof(light));
- cudaMemcpyToSymbol("devC_pixels", &pixels, sizeof(pixels));*/
cudaMemcpyToSymbol(devC_u, &u, sizeof(u));
cudaMemcpyToSymbol(devC_v, &v, sizeof(v));
cudaMemcpyToSymbol(devC_w, &w, sizeof(w));
t@@ -333,7 +337,7 @@ __host__ void DEM::cameraInit(
}
// Allocate global device memory
-__host__ void DEM::rt_allocateGlobalDeviceMemory(void)
+__host__ void DEM::rt_allocateGlobalDeviceMemory()
{
if (verbose == 1)
std::cout << " Allocating device memory: ";
t@@ -347,7 +351,7 @@ __host__ void DEM::rt_allocateGlobalDeviceMemory(void)
// Free dynamically allocated device memory
-__host__ void DEM::rt_freeGlobalDeviceMemory(void)
+__host__ void DEM::rt_freeGlobalDeviceMemory()
{
if (verbose == 1)
std::cout << " Freeing device memory: ";
t@@ -360,11 +364,12 @@ __host__ void DEM::rt_freeGlobalDeviceMemory(void)
}
// Transfer image data from device to host
-__host__ void DEM::rt_transferFromGlobalDeviceMemory(void)
+__host__ void DEM::rt_transferFromGlobalDeviceMemory()
{
if (verbose == 1)
std::cout << " Transfering image data: device -> host: ";
- cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char), cudaMemcpyDeviceToHost);
+ cudaMemcpy(img, dev_img, width*height*4*sizeof(unsigned char),
+ cudaMemcpyDeviceToHost);
if (verbose == 1)
std::cout << "Done" << std::endl;
checkForCudaErrors("During rt_transferFromGlobalDeviceMemory()");
t@@ -372,24 +377,13 @@ __host__ void DEM::rt_transferFromGlobalDeviceMemory(void)
// Wrapper for the rt kernel
__host__ void DEM::render(
- const int method,
- const float maxval,
- const float lower_cutoff,
- const float focalLength,
- const unsigned int img_width,
- const unsigned int img_height)
- /*float4* p, unsigned int np,
- rgba* img, unsigned int width, unsigned int height,
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
- int visualize, float max_val,
- float* fixvel,
- float* xsum,
- float* pres,
- float* es_dot,
- float* es,
- float* vel)*/
+ const int method,
+ const float maxval,
+ const float lower_cutoff,
+ const float focalLength,
+ const unsigned int img_width,
+ const unsigned int img_height)
{
- // Namespace directives
using std::cout;
using std::cerr;
using std::endl;
t@@ -418,15 +412,15 @@ __host__ void DEM::render(
// Look at the centre of the mean positions
float3 lookat = make_float3(maxpos.x, maxpos.y, maxpos.z) / 2.0f;
float3 eye = make_float3(
- grid.L[0] * 2.3f,
- grid.L[1] * -5.0f,
- grid.L[2] * 1.3f);
+ grid.L[0] * 2.3f,
+ grid.L[1] * -5.0f,
+ grid.L[2] * 1.3f);
cameraInit(eye, lookat, imgw, focalLength);
// Construct rays for perspective projection
rayInitPerspective<<< blocksPerGrid, threadsPerBlock >>>(
- dev_ray_origo, dev_ray_direction,
- make_float4(eye.x, eye.y, eye.z, 0.0f), width, height);
+ dev_ray_origo, dev_ray_direction,
+ make_float4(eye.x, eye.y, eye.z, 0.0f), width, height);
cudaThreadSynchronize();
Float* linarr; // Linear array to use for color visualization
t@@ -440,8 +434,8 @@ __host__ void DEM::render(
// Visualize spheres without color scale overlay
if (method == 0) {
rayIntersectSpheres<<< blocksPerGrid, threadsPerBlock >>>(
- dev_ray_origo, dev_ray_direction,
- dev_x, dev_img);
+ dev_ray_origo, dev_ray_direction,
+ dev_x, dev_img);
} else {
if (method == 1) { // Visualize pressure
t@@ -455,8 +449,8 @@ __host__ void DEM::render(
#pragma omp parallel for if(np>100)
for (i = 0; i<np; ++i) {
linarr[i] = sqrt(k.vel[i].x*k.vel[i].x
- + k.vel[i].y*k.vel[i].y
- + k.vel[i].z*k.vel[i].z);
+ + k.vel[i].y*k.vel[i].y
+ + k.vel[i].z*k.vel[i].z);
}
transfer = 1;
desc = "Linear velocity";
t@@ -468,8 +462,8 @@ __host__ void DEM::render(
#pragma omp parallel for if(np>100)
for (i = 0; i<np; ++i) {
linarr[i] = sqrt(k.angvel[i].x*k.angvel[i].x
- + k.angvel[i].y*k.angvel[i].y
- + k.angvel[i].z*k.angvel[i].z);
+ + k.angvel[i].y*k.angvel[i].y
+ + k.angvel[i].z*k.angvel[i].z);
}
transfer = 1;
desc = "Angular velocity";
t@@ -492,8 +486,8 @@ __host__ void DEM::render(
#pragma omp parallel for if(np>100)
for (i = 0; i<np; ++i) {
linarr[i] = sqrt(k.angpos[i].x*k.angpos[i].x
- + k.angpos[i].y*k.angpos[i].y
- + k.angpos[i].z*k.angpos[i].z);
+ + k.angpos[i].y*k.angpos[i].y
+ + k.angpos[i].z*k.angpos[i].z);
}
transfer = 1;
desc = "Angular positions";
t@@ -504,23 +498,24 @@ __host__ void DEM::render(
// Report color visualization method and color map range
if (verbose == 1) {
cout << " " << desc << " color map range: [0, "
- << maxval << "] " << unit << endl;
+ << maxval << "] " << unit << endl;
}
// Copy linarr to dev_linarr if required
if (transfer == 1) {
cudaMalloc((void**)&dev_linarr, np*sizeof(Float));
checkForCudaErrors("Error during cudaMalloc of linear array");
- cudaMemcpy(dev_linarr, linarr, np*sizeof(Float), cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_linarr, linarr, np*sizeof(Float),
+ cudaMemcpyHostToDevice);
checkForCudaErrors("Error during cudaMemcpy of linear array");
}
// Start raytracing kernel
rayIntersectSpheresColormap<<< blocksPerGrid, threadsPerBlock >>>(
- dev_ray_origo, dev_ray_direction,
- dev_x, dev_vel,
- dev_linarr, maxval, lower_cutoff,
- dev_img);
+ dev_ray_origo, dev_ray_direction,
+ dev_x, dev_vel,
+ dev_linarr, maxval, lower_cutoff,
+ dev_img);
}
(DIR) diff --git a/tests/fluid_particle_interaction.py b/tests/fluid_particle_interaction.py
t@@ -50,5 +50,4 @@ test(sim.vel[0,0] > 0.0, 'Particle 0 velocity:')
test(sim.vel[1,0] > 0.0, 'Particle 1 velocity:')
test(sim.vel[2,0] > 0.0, 'Particle 2 velocity:')
-
-sim.cleanup()
+#sim.cleanup()