tUpdated sources - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
(HTM) git clone git://src.adamsgaard.dk/sphere
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) LICENSE
---
(DIR) commit d9c25987416182f283d170f16d97a4afa5685f41
(DIR) parent 6ff7614d0fd5bbd0306f7ed63a1ea1bfd0a9b3a0
(HTM) Author: Anders Damsgaard <adc@geo.au.dk>
Date: Wed, 22 Aug 2012 13:19:49 +0200
Updated sources
Diffstat:
M raytracer/header.h | 11 ++---------
M raytracer/main.cpp | 158 ++++++++++++++++---------------
A raytracer/o-ppm.h | 7 +++++++
M raytracer/rt-kernel-cpu.cpp | 106 ++++++++++++++++++-------------
M raytracer/rt-kernel.h | 16 +++++++++++-----
M src/Makefile | 26 ++++++++++++++++++++------
M src/constants.cuh | 37 +++++++++++++++++--------------
M src/datatypes.h | 150 +++++++++++++++++++------------
M src/device.cu | 1783 ++++---------------------------
M src/file_io.cpp | 388 +++++++++++++++++++++++--------
M src/main.cpp | 127 +++++++++++++++++--------------
11 files changed, 849 insertions(+), 1960 deletions(-)
---
(DIR) diff --git a/raytracer/header.h b/raytracer/header.h
t@@ -4,6 +4,8 @@
#ifndef HEADER_H_
#define HEADER_H_
+// Type declaration
+typedef unsigned int Inttype;
//// Structure declarations ////
t@@ -21,13 +23,4 @@ struct rgb {
unsigned char alpha;
};
-
-
-//// Prototype functions ////
-
-// io.cpp
-int image_to_ppm(rgb* rgb_img, const char* filename,
- const unsigned int width, const unsigned int height);
-
-
#endif
(DIR) diff --git a/raytracer/main.cpp b/raytracer/main.cpp
t@@ -5,18 +5,19 @@
#include <cmath>
#include <vector_functions.h>
#include "header.h"
-#include "rt_kernel.h"
-#include "rt_kernel_cpu.h"
+#include "rt-kernel.h"
+#include "rt-kernel-cpu.h"
+#include "colorbar.h"
+#include "o-ppm.h"
int main(const int argc, const char* argv[])
{
using std::cout;
if (argc < 6 || argc > 8 ){
- cout << "Usage: " << argv[0] << " <GPU or CPU> <sphere-binary.bin> <width> <height> <output-image.ppm>\n"
+ cout << "Usage: " << argv[0] << " <GPU or CPU> <particle-data.txt> <width> <height> <output-image.ppm>\n"
<< "or\n"
- << "Usage: " << argv[0] << " GPU <sphere-binary.bin | colorbar> <width> <height> <output-image.ppm>"
- << " <pressure | pressure50 | es | es_dot> <max value in color range>\n";
+ << "Usage: " << argv[0] << " GPU <particle-data.txt | colorbar> <width> <height> <output-image.ppm> [pressure,es_dot,es,vel max_val]";
return 1;
}
t@@ -37,22 +38,16 @@ int main(const int argc, const char* argv[])
// Render colorbar image
if (strcmp(argv[2],"colorbar") == 0) {
- for (unsigned int y=0; y<height; y++) {
- for (unsigned int x=0; x<width; x++) {
+ for (unsigned int y=0; y<height; ++y) {
+ for (unsigned int x=0; x<width; ++x) {
// Colormap value is relative to width position
float ratio = (float) (x+1)/width;
- // Determine Blue-White-Red color components
- float red = fmin(1.0f, 0.209f*ratio*ratio*ratio - 2.49f*ratio*ratio + 3.0f*ratio + 0.0109f);
- float green = fmin(1.0f, -2.44f*ratio*ratio + 2.15f*ratio + 0.369f);
- float blue = fmin(1.0f, -2.21f*ratio*ratio + 1.61f*ratio + 0.573f);
-
// Write pixel value to image array
- //img[x + y*width].r = red * 250.f;
- img[x + y*width].r = red * 250.f;
- img[x + y*width].g = green * 250.f;
- img[x + y*width].b = blue * 250.f;
+ img[x + y*width].r = red(ratio) * 250.f;
+ img[x + y*width].g = green(ratio) * 250.f;
+ img[x + y*width].b = blue(ratio) * 250.f;
}
}
t@@ -81,21 +76,17 @@ int main(const int argc, const char* argv[])
// Allocate particle structure array
cout << " Allocating host memory\n";
+
+ // Single precision arrays used for computations
float4* p = new float4[np];
float* fixvel = new float[np];
float* es_dot = new float[np];
float* es = new float[np];
float* pres = new float[np];
- for (unsigned int i=0; i<np; i++) {
- fixvel[i] = 0.0f;
- pres[i] = 0.0f; // Initialize values to zero
- es_dot[i] = 0.0f;
- es[i] = 0.0f;
- }
+ float* vel = new float[np];
// Read temporal information
- float dt, file_dt;
- double current, total;
+ double dt, file_dt, current, total;
unsigned int step_count;
(void)fread(&dt, sizeof(dt), 1, fin);
(void)fread(¤t, sizeof(current), 1, fin);
t@@ -103,46 +94,77 @@ int main(const int argc, const char* argv[])
(void)fread(&file_dt, sizeof(file_dt), 1, fin);
(void)fread(&step_count, sizeof(step_count), 1, fin);
+ double d; // Double precision temporary value holder
+
// Canonical coordinate system origo
f3 origo;
- (void)fread(&origo, sizeof(float)*3, 1, fin);
+ (void)fread(&d, sizeof(d), 1, fin);
+ origo.x = (float)d;
+ (void)fread(&d, sizeof(d), 1, fin);
+ origo.y = (float)d;
+ (void)fread(&d, sizeof(d), 1, fin);
+ origo.z = (float)d;
// Canonical coordinate system dimensions
f3 L;
- (void)fread(&L, sizeof(float)*3, 1, fin);
+ (void)fread(&d, sizeof(d), 1, fin);
+ L.x = (float)d;
+ (void)fread(&d, sizeof(d), 1, fin);
+ L.y = (float)d;
+ (void)fread(&d, sizeof(d), 1, fin);
+ L.z = (float)d;
+
// Skip over irrelevant data
- float blankf;
- //f3 blankf3;
+ double blankd;
+ //f3 blankd3;
unsigned int blankui;
for (int j=0; j<3; j++) // Skip over grid.num data
(void)fread(&blankui, sizeof(blankui), 1, fin);
+ // Velocity vector, later converted to lenght
+ float3 v;
// Load data into particle array
for (unsigned int i=0; i<np; i++) {
- (void)fread(&p[i].x, sizeof(float), 1, fin);
- for (int j=0; j<4; j++)
- (void)fread(&blankf, sizeof(blankf), 1, fin);
-
- (void)fread(&p[i].y, sizeof(float), 1, fin);
- for (int j=0; j<4; j++)
- (void)fread(&blankf, sizeof(blankf), 1, fin);
-
- (void)fread(&p[i].z, sizeof(float), 1, fin);
- for (int j=0; j<4; j++)
- (void)fread(&blankf, sizeof(blankf), 1, fin);
+ (void)fread(&d, sizeof(d), 1, fin);
+ p[i].x = (float)d; // Typecast to single precision
+ (void)fread(&d, sizeof(d), 1, fin);
+ v.x = (float)d;
+ for (int j=0; j<3; j++)
+ (void)fread(&blankd, sizeof(blankd), 1, fin);
+
+ (void)fread(&d, sizeof(d), 1, fin);
+ p[i].y = (float)d;
+ (void)fread(&d, sizeof(d), 1, fin);
+ v.y = (float)d;
+ for (int j=0; j<3; j++)
+ (void)fread(&blankd, sizeof(blankd), 1, fin);
+
+ (void)fread(&d, sizeof(d), 1, fin);
+ p[i].z = (float)d;
+ (void)fread(&d, sizeof(d), 1, fin);
+ v.z = (float)d;
+ for (int j=0; j<3; j++)
+ (void)fread(&blankd, sizeof(blankd), 1, fin);
+
+ // Save velocity vector length
+ vel[i] = sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
}
for (unsigned int i=0; i<np; i++) {
- //(void)fread(&blankf, sizeof(blankf), 1, fin); // fixvel
- (void)fread(&fixvel[i], sizeof(float), 1, fin); // fixvel
- (void)fread(&blankf, sizeof(blankf), 1, fin); // xsum
- (void)fread(&p[i].w, sizeof(float), 1, fin); // radius
+ (void)fread(&d, sizeof(d), 1, fin); // fixvel
+ fixvel[i] = (float)d;
+ (void)fread(&blankd, sizeof(blankd), 1, fin); // xsum
+ (void)fread(&d, sizeof(d), 1, fin); // radius
+ p[i].w = (float)d;
for (int j=0; j<10; j++)
- (void)fread(&blankf, sizeof(blankf), 1, fin);
- (void)fread(&es_dot[i], sizeof(float), 1, fin);
- (void)fread(&es[i], sizeof(float), 1, fin);
- (void)fread(&pres[i], sizeof(float), 1, fin);
+ (void)fread(&blankd, sizeof(blankd), 1, fin);
+ (void)fread(&d, sizeof(d), 1, fin);
+ es_dot[i] = (float)d;
+ (void)fread(&d, sizeof(d), 1, fin);
+ es[i] = (float)d;
+ (void)fread(&d, sizeof(d), 1, fin);
+ pres[i] = (float)d;
}
fclose(fin); // Binary read complete
t@@ -151,12 +173,14 @@ int main(const int argc, const char* argv[])
<< L.x << "*" << L.y << "*" << L.z << " m\n";
// Eye position and focus point
- //f3 eye = {0.5f*L.x, -4.2*L.y, 0.5f*L.z};
- f3 eye = {2.5f*L.x, -4.2*L.y, 2.0f*L.z};
- f3 lookat = {0.45f*L.x, 0.5f*L.y, 0.45f*L.z};
+ //f3 eye = {0.5f*L.x, -4.2f*L.y, 0.5f*L.z};
+ //f3 eye = {2.5f*L.x, -4.2f*L.y, 2.0f*L.z};
+ //f3 eye = {0.5f*L.x, -5.0f*L.y, 0.5f*L.z};
+ f3 eye = {2.5f*L.x, -5.0f*L.y, 1.5f*L.z};
+ f3 lookat = {0.5f*L.x, 0.5f*L.y, 0.5f*L.z};
if (L.z > (L.x + L.y)/1.5f) { // Render only bottom of world (for initsetup's)
- eye.x = 1.1f*L.x;
- eye.y = 15.1*L.y;
+ eye.x = 4.1f*L.x;
+ eye.y = -15.1*L.y;
eye.z = 1.1*L.z;
/*lookat.x = 0.45f*L.x;
lookat.y = 0.5f*L.y;
t@@ -179,40 +203,19 @@ int main(const int argc, const char* argv[])
visualize = 2; // 2: es_dot view
if(strcmp(argv[6],"es") == 0)
visualize = 3; // 3: es view
- if(strcmp(argv[6],"pressure50") == 0)
- visualize = 4; // 4: pressure50 view
+ if(strcmp(argv[6],"vel") == 0)
+ visualize = 4; // 4: velocity view
// Read max. value specified in command args.
max_val = atof(argv[7]);
- }
-
- // Render colorbar image
- if (strcmp(argv[2],"colorbar") == 0) {
-
- for (unsigned int x=0; x<width; x++) {
- for (unsigned int y=0; y<height; y++) {
-
- // Colormap value is relative to width position
- float ratio = (float)x/width;
- // Determine Blue-White-Red color components
- float red = fmin(1.0f, 0.209f*ratio*ratio*ratio - 2.49f*ratio*ratio + 3.0f*ratio + 0.0109f);
- float green = fmin(1.0f, -2.44f*ratio*ratio + 2.15f*ratio + 0.369f);
- float blue = fmin(1.0f, -2.21f*ratio*ratio + 1.61f*ratio + 0.573f);
-
- // Write pixel value to image array
- img[y*height + x].r = (unsigned char) red * 255;
- img[y*height + x].g = (unsigned char) green * 255;
- img[y*height + x].b = (unsigned char) blue * 255;
-
- }
- }
}
+
if (strcmp(argv[1],"GPU") == 0) {
// Call cuda wrapper
- if (rt(p, np, img, width, height, origo, L, eye, lookat, imgw, visualize, max_val, fixvel, pres, es_dot, es) != 0) {
+ if (rt(p, np, img, width, height, origo, L, eye, lookat, imgw, visualize, max_val, fixvel, pres, es_dot, es, vel) != 0) {
cout << "\nError in rt()\n";
return 1;
}
t@@ -234,6 +237,7 @@ int main(const int argc, const char* argv[])
delete [] pres;
delete [] es;
delete [] es_dot;
+ delete [] vel;
}
(DIR) diff --git a/raytracer/o-ppm.h b/raytracer/o-ppm.h
t@@ -0,0 +1,7 @@
+#ifndef O_PPM_H_
+#define O_PPM_H_
+
+int image_to_ppm(rgb* rgb_img, const char* filename,
+ const unsigned int width, const unsigned int height);
+
+#endif
(DIR) diff --git a/raytracer/rt-kernel-cpu.cpp b/raytracer/rt-kernel-cpu.cpp
t@@ -5,8 +5,9 @@
#include <cuda.h>
#include <cutil_math.h>
#include <string.h>
+#include <stdlib.h>
#include "header.h"
-#include "rt_kernel_cpu.h"
+#include "rt-kernel-cpu.h"
// Constants
float3 constc_u;
t@@ -35,10 +36,10 @@ __inline__ float lengthf3(float3 in)
// Kernel for initializing image data
void imageInit_cpu(unsigned char* _img, unsigned int pixels)
{
- for (unsigned int mempos=0; mempos<pixels; mempos++) {
- _img[mempos*4] = 0; // Red channel
- _img[mempos*4 + 1] = 0; // Green channel
- _img[mempos*4 + 2] = 0; // Blue channel
+ for (unsigned int mempos=0; mempos<pixels; ++mempos) {
+ _img[mempos*4] = 255; // Red channel
+ _img[mempos*4 + 1] = 255; // Green channel
+ _img[mempos*4 + 2] = 255; // Blue channel
}
}
t@@ -49,17 +50,19 @@ void rayInitPerspective_cpu(float3* _ray_origo,
unsigned int width,
unsigned int height)
{
- int i;
- #pragma omp parallel for
- for (i=0; i<width; i++) {
- for (unsigned int j=0; j<height; j++) {
+ int i,j;
+ unsigned int mempos;
+ float p_u, p_v;
+ #pragma omp parallel for private(mempos,j,p_u,p_v)
+ for (i=0; i<(int)width; ++i) {
+ for (j=0; j<(int)height; ++j) {
- unsigned int mempos = i + j*height;
+ mempos = i + j*width;
// Calculate pixel coordinates in image plane
- float p_u = constc_imgplane.x + (constc_imgplane.y - constc_imgplane.x)
+ p_u = constc_imgplane.x + (constc_imgplane.y - constc_imgplane.x)
* (i + 0.5f) / width;
- float p_v = constc_imgplane.z + (constc_imgplane.w - constc_imgplane.z)
+ p_v = constc_imgplane.z + (constc_imgplane.w - constc_imgplane.z)
* (j + 0.5f) / height;
// Write ray origo and direction to global memory
t@@ -74,37 +77,34 @@ void rayInitPerspective_cpu(float3* _ray_origo,
void rayIntersectSpheres_cpu(float3* _ray_origo,
float3* _ray_direction,
float4* _p,
+ float* _nuance,
unsigned char* _img,
unsigned int pixels,
unsigned int np)
{
- int mempos;
- #pragma omp parallel for
- for (mempos=0; mempos<pixels; mempos++) {
+ long int mempos;
+ float3 e, d, n, p, c;
+ float tdist, R, Delta, t_minus, dotprod, I_d, k_d, k_a, I_a, nuance;
+ Inttype i, ifinal;
+ #pragma omp parallel for private(e,d,n,p,c,tdist,R,Delta,t_minus,dotprod,I_d,k_d,k_a,I_a,nuance,i,ifinal)
+ for (mempos=0; mempos<pixels; ++mempos) {
// Read ray data from global memory
- float3 e = _ray_origo[mempos];
- float3 d = _ray_direction[mempos];
- //float step = lengthf3(d);
+ e = _ray_origo[mempos];
+ d = _ray_direction[mempos];
// Distance, in ray steps, between object and eye initialized with a large value
- float tdist = 1e10f;
-
- // Surface normal at closest sphere intersection
- float3 n;
-
- // Intersection point coordinates
- float3 p;
+ tdist = 1e10f;
// Iterate through all particles
- for (unsigned int i=0; i<np; i++) {
+ for (i=0; i<np; ++i) {
// Read sphere coordinate and radius
- float3 c = f4_to_f3(_p[i]);
- float R = _p[i].w;
+ c = f4_to_f3(_p[i]);
+ R = _p[i].w;
// Calculate the discriminant: d = B^2 - 4AC
- float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
+ Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c))) // B^2
- 4.0f*dot(d,d) // -4*A
* (dot((e-c),(e-c)) - R*R); // C
t@@ -113,7 +113,7 @@ void rayIntersectSpheres_cpu(float3* _ray_origo,
if (Delta > 0.0f) {
// Calculate roots, Shirley 2009 p. 77
- float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
+ t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
* (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
// Check wether intersection is closer than previous values
t@@ -121,6 +121,7 @@ void rayIntersectSpheres_cpu(float3* _ray_origo,
p = e + t_minus*d;
tdist = fabs(t_minus);
n = normalize(2.0f * (p - c)); // Surface normal
+ ifinal = i;
}
} // End of solution branch
t@@ -131,21 +132,25 @@ void rayIntersectSpheres_cpu(float3* _ray_origo,
if (tdist < 1e10) {
// Lambertian shading parameters
- float dotprod = fabs(dot(n, constc_light));
- float I_d = 40.0f; // Light intensity
- float k_d = 5.0f; // Diffuse coefficient
+ //float dotprod = fabs(dot(n, constc_light));
+ dotprod = fmax(0.0f,dot(n, constc_light));
+ I_d = 70.0f; // Light intensity
+ k_d = 5.0f; // Diffuse coefficient
// Ambient shading
- float k_a = 10.0f;
- float I_a = 5.0f;
+ k_a = 30.0f;
+ I_a = 5.0f;
+
+ // Read color nuance of grain
+ nuance = _nuance[ifinal];
// Write shading model values to pixel color channels
_img[mempos*4] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*0.48f);
+ + k_a * I_a)*0.48f*nuance);
_img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*0.41f);
+ + k_a * I_a)*0.41f*nuance);
_img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
- + k_a * I_a)*0.27f);
+ + k_a * I_a)*0.27f*nuance);
}
}
}
t@@ -160,9 +165,11 @@ void cameraInit_cpu(float3 eye, float3 lookat, float imgw, float hw_ratio)
float3 view = eye - lookat;
// Construct the camera view orthonormal base
- float3 v = make_float3(0.0f, 1.0f, 0.0f); // v: Pointing upward
- float3 w = -view/lengthf3(view); // w: Pointing backwards
- float3 u = cross(make_float3(v.x, v.y, v.z), make_float3(w.x, w.y, w.z)); // u: Pointing right
+ //float3 up = make_float3(0.0f, 1.0f, 0.0f); // Pointing upward along +y
+ float3 up = make_float3(0.0f, 0.0f, 1.0f); // Pointing upward along +z
+ float3 w = -view/length(view); // w: Pointing backwards
+ float3 u = cross(up, w) / length(cross(up, w));
+ float3 v = cross(w, u);
// Focal length 20% of eye vector length
float d = lengthf3(view)*0.8f;
t@@ -179,6 +186,8 @@ void cameraInit_cpu(float3 eye, float3 lookat, float imgw, float hw_ratio)
constc_imgplane = imgplane;
constc_d = d;
constc_light = light;
+
+ std::cout << "Rendering image...";
}
t@@ -197,8 +206,15 @@ int rt_cpu(float4* p, unsigned int np,
// Start timer 1
t1_go = clock();
+ // Generate random nuance values for all grains
+ static float* _nuance; // Values between 0.5 and 1.0
+ _nuance = new float[np];
+ srand(0); // Initialize seed at same value at every run
+ for (unsigned int i=0; i<np; ++i)
+ _nuance[i] = (float)rand()/RAND_MAX * 0.5f + 0.5f;
+
// Allocate memory
- cout << " Allocating device memory\n";
+ cout << " Allocating memory\n";
static unsigned char *_img; // RGBw values in image
static float3* _ray_origo; // Ray origo (x,y,z)
static float3* _ray_direction; // Ray direction (x,y,z)
t@@ -230,7 +246,7 @@ int rt_cpu(float4* p, unsigned int np,
// Find closest intersection between rays and spheres
rayIntersectSpheres_cpu(
_ray_origo, _ray_direction,
- p, _img, pixels, np);
+ p, _nuance, _img, pixels, np);
// Stop timer 2
t2_stop = clock();
t@@ -238,6 +254,7 @@ int rt_cpu(float4* p, unsigned int np,
memcpy(img, _img, sizeof(unsigned char)*pixels*4);
// Free dynamically allocated device memory
+ delete [] _nuance;
delete [] _img;
delete [] _ray_origo;
delete [] _ray_direction;
t@@ -246,7 +263,8 @@ int rt_cpu(float4* p, unsigned int np,
t1_stop = clock();
// Report time spent
- cout << " Time spent on entire CPU raytracing routine: "
+ cout << " done.\n"
+ << " Time spent on entire CPU raytracing routine: "
<< (t1_stop-t1_go)/CLOCKS_PER_SEC*1000.0 << " ms\n";
cout << " - Functions: " << (t2_stop-t2_go)/CLOCKS_PER_SEC*1000.0 << " ms\n";
(DIR) diff --git a/raytracer/rt-kernel.h b/raytracer/rt-kernel.h
t@@ -2,6 +2,8 @@
#define RT_KERNEL_H_
#include <vector_functions.h>
+#include "../src/datatypes.h"
+#include "header.h"
// Constants
__constant__ float4 const_u;
t@@ -11,19 +13,23 @@ __constant__ float4 const_eye;
__constant__ float4 const_imgplane;
__constant__ float const_d;
__constant__ float4 const_light;
+__constant__ unsigned int const_pixels;
+__constant__ Inttype const_np;
// Host prototype functions
extern "C"
-void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio);
+void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio,
+ unsigned int pixels, Inttype np);
extern "C"
void checkForCudaErrors(const char* checkpoint_description);
extern "C"
-int rt(float4* p, const unsigned int np,
- rgb* img, const unsigned int width, const unsigned int height,
- f3 origo, f3 L, f3 eye, f3 lookat, float imgw);
-
+int rt(float4* p, Inttype np,
+ rgb* img, unsigned int width, unsigned int height,
+ f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
+ int visualize, float max_val,
+ float* fixvel, float* pres, float* es_dot, float* es, float* vel);
#endif
(DIR) diff --git a/src/Makefile b/src/Makefile
t@@ -1,6 +1,11 @@
+# Cuda paths
CUDA_INSTALL_PATH=/usr/local/cuda
CUDA_BIN=$(CUDA_INSTALL_PATH)/bin
+# Define the editor and optional arguments
+EDITOR=vim -p
+
+# Define compilers and linker
CC=g++
NVCC=$(CUDA_BIN)/nvcc
LINKER=$(CC)
t@@ -26,7 +31,7 @@ CCFILES=main.cpp file_io.cpp
CUFILES=device.cu utility.cu
CCOBJECTS=$(CCFILES:.cpp=.o)
CUOBJECTS=$(CUFILES:.cu=.o)
- OBJECTS=$(CCOBJECTS) $(CUOBJECTS)
+OBJECTS=$(CCOBJECTS) $(CUOBJECTS)
# Detect OS
OSUPPER=$(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
t@@ -53,7 +58,10 @@ LDFLAGS+=-L$(CUDA_INSTALL_PATH)/lib
LDFLAGS+=-L$(SDKPATH)/../../shared/lib -L$(SDKPATH)/../lib
LDFLAGS+=-lcutil_x86_64 -lcuda -lcudart
-all: $(CCFILES) $(CUFILES) $(EXECUTABLE)
+all: $(CCFILES) $(CUFILES) $(EXECUTABLE) raytracer
+
+raytracer:
+ $(MAKE) -C ../raytracer/
$(EXECUTABLE): $(OBJECTS)
$(LINKER) $(OBJECTS) $(LDFLAGS) -o $@
t@@ -61,20 +69,20 @@ $(EXECUTABLE): $(OBJECTS)
utility.o: utility.cu
$(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@
-file_io.o: file_io.cpp
+file_io.o: file_io.cpp datatypes.h
$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
-device.o: device.cu
+device.o: device.cu datatypes.h *.cuh
$(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@
-main.o: main.cpp
+main.o: main.cpp datatypes.h
$(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
../sphere_status: sphere_status.cpp
$(CC) $(CCFLAGS) sphere_status.cpp -o ../sphere_status
edit:
- vim -p Makefile $(CCFILES) $(CUFILES) *.h *.cuh
+ $(EDITOR) Makefile $(CCFILES) $(CUFILES) *.h *.cuh
backup:
tar cvfz ../../sphere_backup/$(BACKUPNAME) ../../sphere
t@@ -83,4 +91,10 @@ backup:
clean:
$(RM) $(OBJECTS)
$(RM) ../sphere_*
+ $(MAKE) -C ../raytracer clean
+
+clear:
+ # Remove all output data and images
+ $(RM) -rf ../img_out/*.{ppm,png}
+ $(RM) -rf ../output/*.{dat,bin}
(DIR) diff --git a/src/constants.cuh b/src/constants.cuh
t@@ -2,28 +2,31 @@
#define CONSTANTS_CUH_
// Constant memory size: 64 kb
-__constant__ float devC_origo[ND]; // World coordinate system origo
-__constant__ float devC_L[ND]; // World length in dimension ND
+__constant__ Float devC_origo[ND]; // World coordinate system origo
+__constant__ Float devC_L[ND]; // World length in dimension ND
__constant__ unsigned int devC_num[ND]; // Number of cells in dimension ND
-__constant__ float devC_dt; // Time step length
+__constant__ Float devC_dt; // Time step length
__constant__ int devC_global; // Parameter properties, 1: global, 0: individual
-__constant__ float devC_g[ND]; // Gravitational acceleration vector
+__constant__ Float devC_g[ND]; // Gravitational acceleration vector
__constant__ unsigned int devC_np; // Number of particles
__constant__ char devC_nc; // Max. number of contacts a particle can have
__constant__ unsigned int devC_shearmodel; // Shear force model: 1: viscous, frictional, 2: elastic, frictional
-__constant__ float devC_k_n; // Material normal stiffness
-__constant__ float devC_k_s; // Material shear stiffness
-__constant__ float devC_k_r; // Material rolling stiffness
-__constant__ float devC_gamma_n; // Material normal viscosity
-__constant__ float devC_gamma_s; // Material shear viscosity
-__constant__ float devC_gamma_r; // Material rolling viscosity
-__constant__ float devC_mu_s; // Material static shear friction coefficient
-__constant__ float devC_mu_d; // Material dynamic shear friction coefficient
-__constant__ float devC_mu_r; // Material rolling friction coefficient
-__constant__ float devC_rho; // Material density
-__constant__ float devC_kappa; // Capillary bond prefactor
-__constant__ float devC_db; // Debonding distance
-__constant__ float devC_V_b; // Liquid volume of capillary bond
+__constant__ Float devC_k_n; // Material normal stiffness
+__constant__ Float devC_k_s; // Material shear stiffness
+__constant__ Float devC_k_r; // Material rolling stiffness
+__constant__ Float devC_gamma_n; // Material normal viscosity
+__constant__ Float devC_gamma_s; // Material shear viscosity
+__constant__ Float devC_gamma_r; // Material rolling viscosity
+__constant__ Float devC_gamma_wn; // Wall normal viscosity
+__constant__ Float devC_gamma_ws; // Wall shear viscosity
+__constant__ Float devC_gamma_wr; // Wall rolling viscosity
+__constant__ Float devC_mu_s; // Material static shear friction coefficient
+__constant__ Float devC_mu_d; // Material dynamic shear friction coefficient
+__constant__ Float devC_mu_r; // Material rolling friction coefficient
+__constant__ Float devC_rho; // Material density
+__constant__ Float devC_kappa; // Capillary bond prefactor
+__constant__ Float devC_db; // Debonding distance
+__constant__ Float devC_V_b; // Liquid volume of capillary bond
__constant__ unsigned int devC_nw; // Number of walls
__constant__ unsigned int devC_w_n; // Dimension of orthogonal wall surface normal
__constant__ int devC_periodic; // Behavior of x- and y boundaries: 0: walls, 1: periodic
(DIR) diff --git a/src/datatypes.h b/src/datatypes.h
t@@ -4,25 +4,60 @@
#ifndef DATATYPES_H_
#define DATATYPES_H_
-/////////////////////////////////
-// STATIC VARIABLE DECLARATION //
-/////////////////////////////////
+#include <math.h>
+#include "vector_functions.h"
+//#include "vector_arithmetic.h"
-#include "vector_functions.h"
+// Enable profiling of kernel runtimes?
+// 0: No
+// 1: Yes
+#define PROFILING 1
+
+
+//////////////////////
+// TYPE DEFINITIONS //
+//////////////////////
+
+// REMEMBER: When changing the precision below,
+// change values in typedefs.h accordingly.
+
+// Uncomment all five lines below for single precision
+/*
+typedef Float Float;
+typedef Float3 Float3;
+typedef Float4 Float4;
+#define MAKE_FLOAT3(x, y, z) make_Float3(x, y, z)
+#define MAKE_FLOAT4(x, y, z, w) make_Float4(x, y, z, w)
+*/
+
+
+// Uncomment all five lines below for double precision
+///*
+typedef double Float;
+typedef double2 Float2;
+typedef double3 Float3;
+typedef double4 Float4;
+#define MAKE_FLOAT3(x, y, z) make_double3(x, y, z)
+#define MAKE_FLOAT4(x, y, z, w) make_double4(x, y, z, w)
+//*/
+
-//// Symbolic constants
+////////////////////////
+// SYMBOLIC CONSTANTS //
+////////////////////////
-const float PI = 3.14159265358979f;
+const Float PI = 3.14159265358979;
// Number of dimensions (1 and 2 NOT functional)
const unsigned int ND = 3;
// Define source code version
-const float VERS = 0.25;
+const Float VERS = 0.25;
// Max. number of contacts per particle
-const char NC = 16;
+//const char NC = 16;
+const char NC = 32;
///////////////////////////
t@@ -31,39 +66,39 @@ const char NC = 16;
// Structure containing variable particle parameters
struct Particles {
- float *radius;
- float *k_n;
- float *k_s;
- float *k_r;
- float *gamma_n;
- float *gamma_s;
- float *gamma_r;
- float *mu_s;
- float *mu_d;
- float *mu_r;
- float *rho;
- float *es_dot;
- float *es;
- float *p;
- float *m;
- float *I;
+ Float *radius;
+ Float *k_n;
+ Float *k_s;
+ Float *k_r;
+ Float *gamma_n;
+ Float *gamma_s;
+ Float *gamma_r;
+ Float *mu_s;
+ Float *mu_d;
+ Float *mu_r;
+ Float *rho;
+ Float *es_dot;
+ Float *es;
+ Float *p;
+ Float *m;
+ Float *I;
unsigned int np;
};
// Structure containing grid parameters
struct Grid {
unsigned int nd;
- float *origo;
- float *L;
+ Float *origo;
+ Float *L;
unsigned int *num;
};
// Structure containing time parameters
struct Time {
- float dt;
+ Float dt;
double current;
double total;
- float file_dt;
+ Float file_dt;
unsigned int step_count;
};
t@@ -71,23 +106,26 @@ struct Time {
struct Params {
//bool global;
int global;
- float *g;
+ Float *g;
unsigned int np;
unsigned int nw;
- float dt;
- float k_n;
- float k_s;
- float k_r;
- float gamma_n;
- float gamma_s;
- float gamma_r;
- float mu_s;
- float mu_d;
- float mu_r;
- float rho;
- float kappa;
- float db;
- float V_b;
+ Float dt;
+ Float k_n;
+ Float k_s;
+ Float k_r;
+ Float gamma_n;
+ Float gamma_s;
+ Float gamma_r;
+ Float gamma_wn;
+ Float gamma_ws;
+ Float gamma_wr;
+ Float mu_s;
+ Float mu_d;
+ Float mu_r;
+ Float rho;
+ Float kappa;
+ Float db;
+ Float V_b;
int periodic;
unsigned int shearmodel;
};
t@@ -97,29 +135,29 @@ struct Params {
// PROTOTYPE FUNCTIONS //
/////////////////////////
int fwritebin(char *target, Particles *p,
- float4 *host_x, float4 *host_vel,
- float4 *host_angvel, float4 *host_force,
- float4 *host_torque, uint4 *host_bonds,
+ Float4 *host_x, Float4 *host_vel,
+ Float4 *host_angvel, Float4 *host_force,
+ Float4 *host_torque, uint4 *host_bonds,
Grid *grid, Time *time, Params *params,
- float4 *host_w_nx, float4 *host_w_mvfd);
+ Float4 *host_w_nx, Float4 *host_w_mvfd);
// device.cu
//extern "C"
void initializeGPU(void);
//extern "C"
-void gpuMain(float4* host_x,
- float4* host_vel,
- float4* host_acc,
- float4* host_angvel,
- float4* host_angacc,
- float4* host_force,
- float4* host_torque,
+void gpuMain(Float4* host_x,
+ Float4* host_vel,
+ Float4* host_acc,
+ Float4* host_angvel,
+ Float4* host_angacc,
+ Float4* host_force,
+ Float4* host_torque,
uint4* host_bonds,
Particles* p, Grid* grid,
Time* time, Params* params,
- float4* host_w_nx,
- float4* host_w_mvfd,
+ Float4* host_w_nx,
+ Float4* host_w_mvfd,
const char* cwd,
const char* inputbin);
(DIR) diff --git a/src/device.cu b/src/device.cu
t@@ -2,8 +2,8 @@
#include <iostream>
#include <cstdio>
#include <cuda.h>
-#include <cutil_math.h>
+#include "vector_arithmetic.h"
#include "thrust/device_ptr.h"
#include "thrust/sort.h"
t@@ -11,1488 +11,11 @@
#include "datatypes.cuh"
#include "constants.cuh"
-//#include "cuPrintf.cu"
-
-
-// Returns the cellID containing the particle, based cubic grid
-// See Bayraktar et al. 2009
-// Kernel is executed on the device, and is callable from the device only
-__device__ int calcCellID(float3 x)
-{
- unsigned int i_x, i_y, i_z;
-
- // Calculate integral coordinates:
- i_x = floor((x.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
- i_y = floor((x.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
- i_z = floor((x.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
-
- // Integral coordinates are converted to 1D coordinate:
- return __umul24(__umul24(i_z, devC_num[1]),
- devC_num[0])
- + __umul24(i_y, devC_num[0]) + i_x;
-
-} // End of calcCellID(...)
-
-
-// Calculate hash value for each particle, based on position in grid.
-// Kernel executed on device, and callable from host only.
-__global__ void calcParticleCellID(unsigned int* dev_gridParticleCellID,
- unsigned int* dev_gridParticleIndex,
- float4* dev_x)
-{
- //unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
- unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
-
- if (idx < devC_np) { // Condition prevents block size error
-
- //volatile float4 x = dev_x[idx]; // Ensure coalesced read
- float4 x = dev_x[idx]; // Ensure coalesced read
-
- unsigned int cellID = calcCellID(make_float3(x.x, x.y, x.z));
-
- // Store values
- dev_gridParticleCellID[idx] = cellID;
- dev_gridParticleIndex[idx] = idx;
-
- }
-} // End of calcParticleCellID(...)
-
-
-// Reorder particle data into sorted order, and find the start and end particle indexes
-// of each cell in the sorted hash array.
-// Kernel executed on device, and callable from host only.
-__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_cellEnd,
- unsigned int* dev_gridParticleCellID,
- unsigned int* dev_gridParticleIndex,
- float4* dev_x, float4* dev_vel,
- float4* dev_angvel, float* dev_radius,
- //uint4* dev_bonds,
- float4* dev_x_sorted, float4* dev_vel_sorted,
- float4* dev_angvel_sorted, float* dev_radius_sorted)
- //uint4* dev_bonds_sorted)
-{
-
- // Create hash array in shared on-chip memory. The size of the array
- // (threadsPerBlock + 1) is determined at launch time (extern notation).
- extern __shared__ unsigned int shared_data[];
-
- // Thread index in block
- unsigned int tidx = threadIdx.x;
-
- // Thread index in grid
- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
- //unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
-
- // CellID hash value of particle idx
- unsigned int cellID;
-
- // Read cellID data and store it in shared memory (shared_data)
- if (idx < devC_np) { // Condition prevents block size error
- cellID = dev_gridParticleCellID[idx];
-
- // Load hash data into shared memory, allowing access to neighbor particle cellID values
- shared_data[tidx+1] = cellID;
-
- if (idx > 0 && tidx == 0) {
- // First thread in block must load neighbor particle hash
- shared_data[0] = dev_gridParticleCellID[idx-1];
- }
- }
-
- // Pause completed threads in this block, until all
- // threads are done loading data into shared memory
- __syncthreads();
-
- // Find lowest and highest particle index in each cell
- if (idx < devC_np) { // Condition prevents block size error
- // If this particle has a different cell index to the previous particle, it's the first
- // particle in the cell -> Store the index of this particle in the cell.
- // The previous particle must be the last particle in the previous cell.
- if (idx == 0 || cellID != shared_data[tidx]) {
- dev_cellStart[cellID] = idx;
- if (idx > 0)
- dev_cellEnd[shared_data[tidx]] = idx;
- }
-
- // Check wether the thread is the last one
- if (idx == (devC_np - 1))
- dev_cellEnd[cellID] = idx + 1;
-
-
- // Use the sorted index to reorder the position and velocity data
- unsigned int sortedIndex = dev_gridParticleIndex[idx];
-
- // Fetch from global read
- float4 x = dev_x[sortedIndex];
- float4 vel = dev_vel[sortedIndex];
- float4 angvel = dev_angvel[sortedIndex];
- float radius = dev_radius[sortedIndex];
- //uint4 bonds = dev_bonds[sortedIndex];
-
- __syncthreads();
- // Write sorted data to global memory
- dev_x_sorted[idx] = x;
- dev_vel_sorted[idx] = vel;
- dev_angvel_sorted[idx] = angvel;
- dev_radius_sorted[idx] = radius;
- //dev_bonds_sorted[idx] = bonds;
- }
-} // End of reorderArrays(...)
-
-
-
-
-// COLLISION LOCATION AND PROCESSING FUNCTIONS
-
-
-// Linear viscoelastic contact model for particle-wall interactions
-// with tangential friction and rolling resistance
-__device__ float contactLinear_wall(float3* N, float3* T, float* es_dot, float* p,
- unsigned int idx_a, float radius_a,
- float4* dev_vel_sorted, float4* dev_angvel_sorted,
- float3 n, float delta, float wvel)
-{
- // Fetch particle velocities from global memory
- const float4 linvel_tmp = dev_vel_sorted[idx_a];
- const float4 angvel_tmp = dev_angvel_sorted[idx_a];
-
- // Convert velocities to three-component vectors
- const float3 linvel = make_float3(linvel_tmp.x,
- linvel_tmp.y,
- linvel_tmp.z);
- const float3 angvel = make_float3(angvel_tmp.x,
- angvel_tmp.y,
- angvel_tmp.z);
-
- // Store the length of the angular velocity for later use
- const float angvel_length = length(angvel);
-
- // Contact velocity is the sum of the linear and
- // rotational components
- const float3 vel = linvel + radius_a * cross(n, angvel) + wvel;
-
- // Normal component of the contact velocity
- const float vel_n = dot(vel, n);
-
- // The tangential velocity is the contact velocity
- // with the normal component subtracted
- const float3 vel_t = vel - n * (dot(vel, n));
- const float vel_t_length = length(vel_t);
-
- // Calculate elastic normal component
- //float3 f_n = -devC_k_n * delta * n;
-
- // Normal force component: Elastic - viscous damping
- float3 f_n = (-devC_k_n * delta - devC_gamma_n * vel_n) * n;
-
- // Make sure the viscous damping doesn't exceed the elastic component,
- // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
- if (dot(f_n, n) < 0.0f)
- f_n = make_float3(0.0f, 0.0f, 0.0f);
-
- const float f_n_length = length(f_n); // Save length for later use
-
- // Initialize vectors
- float3 f_t = make_float3(0.0f, 0.0f, 0.0f);
- float3 T_res = make_float3(0.0f, 0.0f, 0.0f);
-
- // Check that the tangential velocity is high enough to avoid
- // divide by zero (producing a NaN)
- if (vel_t_length > 0.f) {
-
- const float f_t_visc = devC_gamma_s * vel_t_length; // Tangential force by viscous model
- const float f_t_limit = devC_mu_s * f_n_length; // Max. friction
-
- // If the shear force component exceeds the friction,
- // the particle slips and energy is dissipated
- if (f_t_visc < f_t_limit) {
- f_t = -1.0f * f_t_visc * vel_t/vel_t_length;
-
- } else { // Dynamic friction, friction failure
- f_t = -1.0f * f_t_limit * vel_t/vel_t_length;
-
- // Shear energy production rate [W]
- //*es_dot += -dot(vel_t, f_t);
- }
- }
-
-/* if (angvel_length > 0.f) {
- // Apply rolling resistance (Zhou et al. 1999)
- //T_res = -angvel_a/angvel_length * devC_mu_r * radius_a * f_n_length;
-
- // New rolling resistance model
- T_res = -1.0f * fmin(devC_gamma_r * radius_a * angvel_length,
- devC_mu_r * radius_a * f_n_length)
- * angvel_a/angvel_length;
- }*/
-
- // Total force from wall
- *N += f_n + f_t;
-// *N += f_n;
-
- // Total torque from wall
- *T += -radius_a * cross(n, f_t) + T_res;
-
- // Pressure excerted onto particle from this contact
- *p += f_n_length / (4.0f * PI * radius_a*radius_a);
-
- // Return force excerted onto the wall
- //return -dot(*N, n);
- return dot(f_n, n);
-}
-
-
-// Linear vicoelastic contact model for particle-particle interactions
-// with tangential friction and rolling resistance
-__device__ void contactLinearViscous(float3* N, float3* T, float* es_dot, float* p,
- unsigned int idx_a, unsigned int idx_b,
- float4* dev_vel_sorted,
- float4* dev_angvel_sorted,
- float radius_a, float radius_b,
- float3 x_ab, float x_ab_length,
- float delta_ab, float kappa)
-{
-
- // Allocate variables and fetch missing time=t values for particle A and B
- const float4 vel_a = dev_vel_sorted[idx_a];
- const float4 vel_b = dev_vel_sorted[idx_b];
- const float4 angvel4_a = dev_angvel_sorted[idx_a];
- const float4 angvel4_b = dev_angvel_sorted[idx_b];
-
- // Convert to float3's
- const float3 angvel_a = make_float3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
- const float3 angvel_b = make_float3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
-
- // Force between grain pair decomposed into normal- and tangential part
- float3 f_n, f_t, f_c, T_res;
-
- // Normal vector of contact
- const float3 n_ab = x_ab/x_ab_length;
-
- // Relative contact interface velocity, w/o rolling
- const float3 vel_ab_linear = make_float3(vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
-
- // Relative contact interface velocity of particle surfaces at
- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
- const float3 vel_ab = vel_ab_linear
- + radius_a * cross(n_ab, angvel_a)
- + radius_b * cross(n_ab, angvel_b);
-
- // Relative contact interface rolling velocity
- const float3 angvel_ab = angvel_a - angvel_b;
- const float angvel_ab_length = length(angvel_ab);
-
- // Normal component of the relative contact interface velocity
- const float vel_n_ab = dot(vel_ab_linear, n_ab);
-
- // Tangential component of the relative contact interface velocity
- // Hinrichsen and Wolf 2004, eq. 13.9
- const float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
- const float vel_t_ab_length = length(vel_t_ab);
-
- // Compute the normal stiffness of the contact
- //float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
-
- // Calculate rolling radius
- const float R_bar = (radius_a + radius_b) / 2.0f;
-
- // Normal force component: linear-elastic approximation (Augier 2009, eq. 3)
- // with velocity dependant damping
- // Damping coefficient: alpha = 0.8
- //f_n = (-k_n_ab * delta_ab + 2.0f * 0.8f * sqrtf(m_eff*k_n_ab) * vel_ab) * n_ab;
-
- // Linear spring for normal component (Renzo 2004, eq. 35)
- // Dissipation due to plastic deformation is modelled by using a different
- // unloading spring constant (Walton and Braun 1986)
- // Here the factor in the second term determines the relative strength of the
- // unloading spring relative to the loading spring.
- /* if (vel_n_ab > 0.0f) { // Loading
- f_n = (-k_n_ab * delta_ab) * n_ab;
- } else { // Unloading
- f_n = (-k_n_ab * 0.90f * delta_ab) * n_ab;
- } // f_n is OK! */
-
- // Normal force component: Elastic
- //f_n = -k_n_ab * delta_ab * n_ab;
-
- // Normal force component: Elastic - viscous damping
- f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
-
- // Make sure the viscous damping doesn't exceed the elastic component,
- // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
- if (dot(f_n, n_ab) < 0.0f)
- f_n = make_float3(0.0f, 0.0f, 0.0f);
-
- const float f_n_length = length(f_n);
-
- // Add max. capillary force
- f_c = -kappa * sqrtf(radius_a * radius_b) * n_ab;
-
- // Initialize force vectors to zero
- f_t = make_float3(0.0f, 0.0f, 0.0f);
- T_res = make_float3(0.0f, 0.0f, 0.0f);
-
- // Shear force component: Nonlinear relation
- // Coulomb's law of friction limits the tangential force to less or equal
- // to the normal force
- if (vel_t_ab_length > 0.f) {
-
- // Tangential force by viscous model
- const float f_t_visc = devC_gamma_s * vel_t_ab_length;
-
- // Determine max. friction
- float f_t_limit;
- if (vel_t_ab_length > 0.001f) { // Dynamic
- f_t_limit = devC_mu_d * length(f_n-f_c);
- } else { // Static
- f_t_limit = devC_mu_s * length(f_n-f_c);
- }
-
- // If the shear force component exceeds the friction,
- // the particle slips and energy is dissipated
- if (f_t_visc < f_t_limit) { // Static
- f_t = -1.0f * f_t_visc * vel_t_ab/vel_t_ab_length;
-
- } else { // Dynamic, friction failure
- f_t = -1.0f * f_t_limit * vel_t_ab/vel_t_ab_length;
-
- // Shear friction production rate [W]
- //*es_dot += -dot(vel_t_ab, f_t);
- }
- }
-
-/* if (angvel_ab_length > 0.f) {
- // Apply rolling resistance (Zhou et al. 1999)
- //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
-
- // New rolling resistance model
- T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
- devC_mu_r * R_bar * f_n_length)
- * angvel_ab/angvel_ab_length;
- }
-*/
-
- // Add force components from this collision to total force for particle
- *N += f_n + f_t + f_c;
- *T += -R_bar * cross(n_ab, f_t) + T_res;
-
- // Pressure excerted onto the particle from this contact
- *p += f_n_length / (4.0f * PI * radius_a*radius_a);
-
-} // End of contactLinearViscous()
-
-
-// Linear elastic contact model for particle-particle interactions
-__device__ void contactLinear(float3* N, float3* T,
- float* es_dot, float* p,
- unsigned int idx_a_orig,
- unsigned int idx_b_orig,
- float4 vel_a,
- float4* dev_vel,
- float3 angvel_a,
- float4* dev_angvel,
- float radius_a, float radius_b,
- float3 x_ab, float x_ab_length,
- float delta_ab, float4* dev_delta_t,
- unsigned int mempos)
-{
-
- // Allocate variables and fetch missing time=t values for particle A and B
- const float4 vel_b = dev_vel[idx_b_orig];
- const float4 angvel4_b = dev_angvel[idx_b_orig];
-
- // Fetch previous sum of shear displacement for the contact pair
- const float4 delta_t0_4 = dev_delta_t[mempos];
-
- const float3 delta_t0_uncor = make_float3(delta_t0_4.x,
- delta_t0_4.y,
- delta_t0_4.z);
-
- // Convert to float3
- const float3 angvel_b = make_float3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
-
- // Force between grain pair decomposed into normal- and tangential part
- float3 f_n, f_t, f_c, T_res;
-
- // Normal vector of contact
- const float3 n_ab = x_ab/x_ab_length;
-
- // Relative contact interface velocity, w/o rolling
- const float3 vel_ab_linear = make_float3(vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
-
- // Relative contact interface velocity of particle surfaces at
- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
- const float3 vel_ab = vel_ab_linear
- + radius_a * cross(n_ab, angvel_a)
- + radius_b * cross(n_ab, angvel_b);
-
- // Relative contact interface rolling velocity
- const float3 angvel_ab = angvel_a - angvel_b;
- const float angvel_ab_length = length(angvel_ab);
-
- // Normal component of the relative contact interface velocity
- const float vel_n_ab = dot(vel_ab_linear, n_ab);
-
- // Tangential component of the relative contact interface velocity
- // Hinrichsen and Wolf 2004, eq. 13.9
- const float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
- const float vel_t_ab_length = length(vel_t_ab);
-
- // Correct tangential displacement vector, which is
- // necessary if the tangential plane rotated
- const float3 delta_t0 = delta_t0_uncor - (n_ab * dot(delta_t0_uncor, n_ab));
- const float delta_t0_length = length(delta_t0);
-
- // New tangential displacement vector
- float3 delta_t;
-
- // Compute the normal stiffness of the contact
- //float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
-
- // Calculate rolling radius
- const float R_bar = (radius_a + radius_b) / 2.0f;
-
- // Normal force component: Elastic
- //f_n = -devC_k_n * delta_ab * n_ab;
-
- // Normal force component: Elastic - viscous damping
- f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
-
- // Make sure the viscous damping doesn't exceed the elastic component,
- // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
- if (dot(f_n, n_ab) < 0.0f)
- f_n = make_float3(0.0f, 0.0f, 0.0f);
-
- const float f_n_length = length(f_n);
-
- // Add max. capillary force
- f_c = -devC_kappa * sqrtf(radius_a * radius_b) * n_ab;
-
- // Initialize force vectors to zero
- f_t = make_float3(0.0f, 0.0f, 0.0f);
- T_res = make_float3(0.0f, 0.0f, 0.0f);
-
- // Apply a tangential force if the previous tangential displacement
- // is non-zero, or the current sliding velocity is non-zero.
- if (delta_t0_length > 0.f || vel_t_ab_length > 0.f) {
-
- // Shear force: Visco-Elastic, limited by Coulomb friction
- float3 f_t_elast = -1.0f * devC_k_s * delta_t0;
- float3 f_t_visc = -1.0f * devC_gamma_s * vel_t_ab;
-
- float f_t_limit;
-
- if (vel_t_ab_length > 0.001f) { // Dynamic friciton
- f_t_limit = devC_mu_d * length(f_n-f_c);
- } else { // Static friction
- f_t_limit = devC_mu_s * length(f_n-f_c);
- }
-
- // Tangential force before friction limit correction
- f_t = f_t_elast + f_t_visc;
- float f_t_length = length(f_t);
-
- // If failure criterion is not met, contact is viscous-linear elastic.
- // If failure criterion is met, contact force is limited,
- // resulting in a slip and energy dissipation
- if (f_t_length > f_t_limit) { // Dynamic case
-
- // Frictional force is reduced to equal the limit
- f_t *= f_t_limit/f_t_length;
-
- // A slip event zeros the displacement vector
- //delta_t = make_float3(0.0f, 0.0f, 0.0f);
-
- // In a slip event, the tangential spring is adjusted to a
- // length which is consistent with Coulomb's equation
- // (Hinrichsen and Wolf, 2004)
- delta_t = -1.0f/devC_k_s * f_t + devC_gamma_s * vel_t_ab;
-
- // Shear friction heat production rate:
- // The energy lost from the tangential spring is dissipated as heat
- //*es_dot += -dot(vel_t_ab, f_t);
- *es_dot += length(delta_t0 - delta_t) * devC_k_s / devC_dt; // Seen in EsyS-Particle
-
- } else { // Static case
-
- // No correction of f_t is required
-
- // Add tangential displacement to total tangential displacement
- delta_t = delta_t0 + vel_t_ab * devC_dt;
- }
- }
-
- if (angvel_ab_length > 0.f) {
- // Apply rolling resistance (Zhou et al. 1999)
- //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
-
- // New rolling resistance model
- T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
- devC_mu_r * R_bar * f_n_length)
- * angvel_ab/angvel_ab_length;
- }
-
- // Add force components from this collision to total force for particle
- *N += f_n + f_t + f_c;
- *T += -R_bar * cross(n_ab, f_t) + T_res;
-
- // Pressure excerted onto the particle from this contact
- *p += f_n_length / (4.0f * PI * radius_a*radius_a);
-
- // Store sum of tangential displacements
- dev_delta_t[mempos] = make_float4(delta_t, 0.0f);
-
-} // End of contactLinear()
-
-
-// Linear-elastic bond: Attractive force with normal- and shear components
-// acting upon particle A in a bonded particle pair
-__device__ void bondLinear(float3* N, float3* T, float* es_dot, float* p,
- unsigned int idx_a, unsigned int idx_b,
- float4* dev_x_sorted, float4* dev_vel_sorted,
- float4* dev_angvel_sorted,
- float radius_a, float radius_b,
- float3 x_ab, float x_ab_length,
- float delta_ab)
-{
-
- // If particles are not overlapping, apply bond force
- if (delta_ab > 0.0f) {
-
- // Allocate variables and fetch missing time=t values for particle A and B
- float4 vel_a = dev_vel_sorted[idx_a];
- float4 vel_b = dev_vel_sorted[idx_b];
- float4 angvel4_a = dev_angvel_sorted[idx_a];
- float4 angvel4_b = dev_angvel_sorted[idx_b];
-
- // Convert to float3's
- float3 angvel_a = make_float3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
- float3 angvel_b = make_float3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
-
- // Normal vector of contact
- float3 n_ab = x_ab/x_ab_length;
-
- // Relative contact interface velocity, w/o rolling
- float3 vel_ab_linear = make_float3(vel_a.x - vel_b.x,
- vel_a.y - vel_b.y,
- vel_a.z - vel_b.z);
-
- // Relative contact interface velocity of particle surfaces at
- // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
- float3 vel_ab = vel_ab_linear
- + radius_a * cross(n_ab, angvel_a)
- + radius_b * cross(n_ab, angvel_b);
-
- // Relative contact interface rolling velocity
- //float3 angvel_ab = angvel_a - angvel_b;
- //float angvel_ab_length = length(angvel_ab);
-
- // Normal component of the relative contact interface velocity
- //float vel_n_ab = dot(vel_ab_linear, n_ab);
-
- // Tangential component of the relative contact interface velocity
- // Hinrichsen and Wolf 2004, eq. 13.9
- float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
- //float vel_t_ab_length = length(vel_t_ab);
-
- float3 f_n = make_float3(0.0f, 0.0f, 0.0f);
- float3 f_t = make_float3(0.0f, 0.0f, 0.0f);
-
- // Mean radius
- float R_bar = (radius_a + radius_b)/2.0f;
-
- // Normal force component: Elastic
- f_n = devC_k_n * delta_ab * n_ab;
-
- if (length(vel_t_ab) > 0.f) {
- // Shear force component: Viscous
- f_t = -1.0f * devC_gamma_s * vel_t_ab;
-
- // Shear friction production rate [W]
- //*es_dot += -dot(vel_t_ab, f_t);
- }
-
- // Add force components from this bond to total force for particle
- *N += f_n + f_t;
- *T += -R_bar * cross(n_ab, f_t);
-
- // Pressure excerted onto the particle from this bond
- *p += length(f_n) / (4.0f * PI * radius_a*radius_a);
-
- }
-} // End of bondLinear()
-
-
-// Capillary cohesion after Richefeu et al. (2006)
-__device__ void capillaryCohesion_exp(float3* N, float radius_a,
- float radius_b, float delta_ab,
- float3 x_ab, float x_ab_length,
- float kappa)
-{
-
- // Normal vector
- float3 n_ab = x_ab/x_ab_length;
-
- float3 f_c;
- float lambda, R_geo, R_har, r, h;
-
- // Determine the ratio; r = max{Ri/Rj;Rj/Ri}
- if ((radius_a/radius_b) > (radius_b/radius_a))
- r = radius_a/radius_b;
- else
- r = radius_b/radius_a;
-
- // Exponential decay function
- h = -sqrtf(r);
-
- // The harmonic mean
- R_har = (2.0f * radius_a * radius_b) / (radius_a + radius_b);
-
- // The geometrical mean
- R_geo = sqrtf(radius_a * radius_b);
-
- // The exponential falloff of the capillary force with distance
- lambda = 0.9f * h * sqrtf(devC_V_b/R_har);
-
- // Calculate cohesional force
- f_c = -kappa * R_geo * expf(-delta_ab/lambda) * n_ab;
-
- // Add force components from this collision to total force for particle
- *N += f_c;
-
-} // End of capillaryCohesion_exp
-
-
-
-// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
-// Kernel executed on device, and callable from device only.
-__device__ void overlapsInCell(int3 targetCell,
- unsigned int idx_a,
- float4 x_a, float radius_a,
- float3* N, float3* T,
- float* es_dot, float* p,
- float4* dev_x_sorted,
- float* dev_radius_sorted,
- float4* dev_vel_sorted,
- float4* dev_angvel_sorted,
- unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd,
- float4* dev_w_nx,
- float4* dev_w_mvfd)
- //uint4 bonds)
-{
-
- // Variable containing modifier for interparticle
- // vector, if it crosses a periodic boundary
- float3 distmod = make_float3(0.0f, 0.0f, 0.0f);
-
- // Check whether x- and y boundaries are to be treated as periodic
- // 1: x- and y boundaries periodic
- // 2: x boundaries periodic
- if (devC_periodic == 1) {
-
- // Periodic x-boundary
- if (targetCell.x < 0) {
- targetCell.x = devC_num[0] - 1;
- distmod += make_float3(devC_L[0], 0.0f, 0.0f);
- }
- if (targetCell.x == devC_num[0]) {
- targetCell.x = 0;
- distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
- }
-
- // Periodic y-boundary
- if (targetCell.y < 0) {
- targetCell.y = devC_num[1] - 1;
- distmod += make_float3(0.0f, devC_L[1], 0.0f);
- }
- if (targetCell.y == devC_num[1]) {
- targetCell.y = 0;
- distmod -= make_float3(0.0f, devC_L[1], 0.0f);
- }
-
- } else if (devC_periodic == 2) {
-
- // Periodic x-boundary
- if (targetCell.x < 0) {
- targetCell.x = devC_num[0] - 1;
- distmod += make_float3(devC_L[0], 0.0f, 0.0f);
- }
- if (targetCell.x == devC_num[0]) {
- targetCell.x = 0;
- distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
- }
-
- // Hande out-of grid cases on y-axis
- if (targetCell.y < 0 || targetCell.y == devC_num[1])
- return;
-
- } else {
-
- // Hande out-of grid cases on x- and y-axes
- if (targetCell.x < 0 || targetCell.x == devC_num[0])
- return;
- if (targetCell.y < 0 || targetCell.y == devC_num[1])
- return;
- }
-
- // Handle out-of-grid cases on z-axis
- if (targetCell.z < 0 || targetCell.z == devC_num[2])
- return;
-
-
- //// Check and process particle-particle collisions
-
- // Calculate linear cell ID
- unsigned int cellID = targetCell.x
- + __umul24(targetCell.y, devC_num[0])
- + __umul24(__umul24(devC_num[0],
- devC_num[1]),
- targetCell.z);
-
- // Lowest particle index in cell
- unsigned int startIdx = dev_cellStart[cellID];
-
- // Make sure cell is not empty
- if (startIdx != 0xffffffff) {
-
- // Highest particle index in cell + 1
- unsigned int endIdx = dev_cellEnd[cellID];
-
- // Iterate over cell particles
- for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
- if (idx_b != idx_a) { // Do not collide particle with itself
-
- // Fetch position and velocity of particle B.
- float4 x_b = dev_x_sorted[idx_b];
- float radius_b = dev_radius_sorted[idx_b];
- float kappa = devC_kappa;
-
- // Distance between particle centers (float4 -> float3)
- float3 x_ab = make_float3(x_a.x - x_b.x,
- x_a.y - x_b.y,
- x_a.z - x_b.z);
-
- // Adjust interparticle vector if periodic boundary/boundaries
- // are crossed
- x_ab += distmod;
-
- float x_ab_length = length(x_ab);
-
- // Distance between particle perimeters
- float delta_ab = x_ab_length - (radius_a + radius_b);
-
- // Check for particle overlap
- if (delta_ab < 0.0f) {
- contactLinearViscous(N, T, es_dot, p,
- idx_a, idx_b,
- dev_vel_sorted,
- dev_angvel_sorted,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_ab, kappa);
- } else if (delta_ab < devC_db) {
- // Check wether particle distance satisfies the capillary bond distance
- capillaryCohesion_exp(N, radius_a, radius_b, delta_ab,
- x_ab, x_ab_length, kappa);
- }
-
- // Check wether particles are bonded together
- /*if (bonds.x == idx_b || bonds.y == idx_b ||
- bonds.z == idx_b || bonds.w == idx_b) {
- bondLinear(N, T, es_dot, p,
- idx_a, idx_b,
- dev_x_sorted, dev_vel_sorted,
- dev_angvel_sorted,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_ab);
- }*/
-
- } // Do not collide particle with itself end
- } // Iterate over cell particles end
- } // Check wether cell is empty end
-} // End of overlapsInCell(...)
-
-// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
-// Write the indexes of the overlaps in array contacts[]
-// Kernel executed on device, and callable from device only.
-__device__ void findContactsInCell(int3 targetCell,
- unsigned int idx_a,
- float4 x_a, float radius_a,
- float4* dev_x_sorted,
- float* dev_radius_sorted,
- unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd,
- unsigned int* dev_gridParticleIndex,
- int* nc,
- unsigned int* dev_contacts,
- float4* dev_distmod)
-{
- // Variable containing modifier for interparticle
- // vector, if it crosses a periodic boundary
- float3 distmod = make_float3(0.0f, 0.0f, 0.0f);
-
- // Check whether x- and y boundaries are to be treated as periodic
- // 1: x- and y boundaries periodic
- // 2: x boundaries periodic
- if (devC_periodic == 1) {
-
- // Periodic x-boundary
- if (targetCell.x < 0) {
- targetCell.x = devC_num[0] - 1;
- distmod += make_float3(devC_L[0], 0.0f, 0.0f);
- }
- if (targetCell.x == devC_num[0]) {
- targetCell.x = 0;
- distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
- }
-
- // Periodic y-boundary
- if (targetCell.y < 0) {
- targetCell.y = devC_num[1] - 1;
- distmod += make_float3(0.0f, devC_L[1], 0.0f);
- }
- if (targetCell.y == devC_num[1]) {
- targetCell.y = 0;
- distmod -= make_float3(0.0f, devC_L[1], 0.0f);
- }
-
- } else if (devC_periodic == 2) {
-
- // Periodic x-boundary
- if (targetCell.x < 0) {
- targetCell.x = devC_num[0] - 1;
- distmod += make_float3(devC_L[0], 0.0f, 0.0f);
- }
- if (targetCell.x == devC_num[0]) {
- targetCell.x = 0;
- distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
- }
-
- // Hande out-of grid cases on y-axis
- if (targetCell.y < 0 || targetCell.y == devC_num[1])
- return;
-
- } else {
-
- // Hande out-of grid cases on x- and y-axes
- if (targetCell.x < 0 || targetCell.x == devC_num[0])
- return;
- if (targetCell.y < 0 || targetCell.y == devC_num[1])
- return;
- }
-
- // Handle out-of-grid cases on z-axis
- if (targetCell.z < 0 || targetCell.z == devC_num[2])
- return;
-
-
- //// Check and process particle-particle collisions
-
- // Calculate linear cell ID
- unsigned int cellID = targetCell.x
- + __umul24(targetCell.y, devC_num[0])
- + __umul24(__umul24(devC_num[0],
- devC_num[1]),
- targetCell.z);
-
- // Lowest particle index in cell
- unsigned int startIdx = dev_cellStart[cellID];
-
- // Make sure cell is not empty
- if (startIdx != 0xffffffff) {
-
- // Highest particle index in cell + 1
- unsigned int endIdx = dev_cellEnd[cellID];
-
- // Read the original index of particle A
- unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
-
- // Iterate over cell particles
- for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
- if (idx_b != idx_a) { // Do not collide particle with itself
-
- // Fetch position and radius of particle B.
- float4 x_b = dev_x_sorted[idx_b];
- float radius_b = dev_radius_sorted[idx_b];
-
- // Read the original index of particle B
- unsigned int idx_b_orig = dev_gridParticleIndex[idx_b];
-
- __syncthreads();
-
- // Distance between particle centers (float4 -> float3)
- float3 x_ab = make_float3(x_a.x - x_b.x,
- x_a.y - x_b.y,
- x_a.z - x_b.z);
-
- // Adjust interparticle vector if periodic boundary/boundaries
- // are crossed
- x_ab += distmod;
-
- float x_ab_length = length(x_ab);
-
- // Distance between particle perimeters
- float delta_ab = x_ab_length - (radius_a + radius_b);
-
- // Check for particle overlap
- if (delta_ab < 0.0f) {
-
- // If the particle is not yet registered in the contact list,
- // use the next position in the array
- int cpos = *nc;
- unsigned int cidx;
-
- // Find out, if particle is already registered in contact list
- for (int i=0; i<devC_nc; ++i) {
- __syncthreads();
- cidx = dev_contacts[(unsigned int)(idx_a_orig*devC_nc+i)];
- if (cidx == idx_b_orig)
- cpos = i;
- }
-
- __syncthreads();
-
- // Write the particle index to the relevant position,
- // no matter if it already is there or not (concurrency of write)
- dev_contacts[(unsigned int)(idx_a_orig*devC_nc+cpos)] = idx_b_orig;
-
-
- // Write the interparticle vector and radius of particle B
- //dev_x_ab_r_b[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_float4(x_ab, radius_b);
- dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_float4(distmod, radius_b);
-
- // Increment contact counter
- ++*nc;
- }
-
- // Write the inter-particle position vector correction and radius of particle B
- //dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_float4(distmod, radius_b);
-
- // Check wether particles are bonded together
- /*if (bonds.x == idx_b || bonds.y == idx_b ||
- bonds.z == idx_b || bonds.w == idx_b) {
- bondLinear(N, T, es_dot, p,
- idx_a, idx_b,
- dev_x_sorted, dev_vel_sorted,
- dev_angvel_sorted,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_ab);
- }*/
-
- } // Do not collide particle with itself end
- } // Iterate over cell particles end
- } // Check wether cell is empty end
-} // End of findContactsInCell(...)
-
-
-// For a single particle:
-// Search for neighbors to particle 'idx' inside the 27 closest cells,
-// and save the contact pairs in global memory.
-__global__ void topology(unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd, // Input: Particles in cell
- unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
- float4* dev_x_sorted, float* dev_radius_sorted,
- unsigned int* dev_contacts,
- float4* dev_distmod)
-{
- // Thread index equals index of particle A
- unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
- if (idx_a < devC_np) {
- // Fetch particle data in global read
- float4 x_a = dev_x_sorted[idx_a];
- float radius_a = dev_radius_sorted[idx_a];
-
- // Count the number of contacts in this time step
- int nc = 0;
-
- // Grid position of host and neighbor cells in uniform, cubic grid
- int3 gridPos;
- int3 targetPos;
-
- // Calculate cell address in grid from position of particle
- gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
- gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
- gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
-
- // Find overlaps between particle no. idx and all particles
- // from its own cell + 26 neighbor cells
- for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
- for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
- for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
- targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
- findContactsInCell(targetPos, idx_a, x_a, radius_a,
- dev_x_sorted, dev_radius_sorted,
- dev_cellStart, dev_cellEnd,
- dev_gridParticleIndex,
- &nc, dev_contacts, dev_distmod);
- }
- }
- }
- }
-} // End of topology(...)
-
-
-// For a single particle:
-// Search for neighbors to particle 'idx' inside the 27 closest cells,
-// and compute the resulting normal- and shear force on it.
-// Collide with top- and bottom walls, save resulting force on upper wall.
-// Kernel is executed on device, and is callable from host only.
-__global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
- unsigned int* dev_cellStart,
- unsigned int* dev_cellEnd,
- float4* dev_x, float* dev_radius,
- float4* dev_x_sorted, float* dev_radius_sorted,
- float4* dev_vel_sorted, float4* dev_angvel_sorted,
- float4* dev_vel, float4* dev_angvel,
- float4* dev_force, float4* dev_torque,
- float* dev_es_dot, float* dev_es, float* dev_p,
- float4* dev_w_nx, float4* dev_w_mvfd,
- float* dev_w_force, //uint4* dev_bonds_sorted,
- unsigned int* dev_contacts,
- float4* dev_distmod,
- float4* dev_delta_t)
-{
- // Thread index equals index of particle A
- unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
-
- if (idx_a < devC_np) {
-
- // Fetch particle data in global read
- unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
- float4 x_a = dev_x_sorted[idx_a];
- float radius_a = dev_radius_sorted[idx_a];
-
- // Fetch wall data in global read
- float4 w_up_nx = dev_w_nx[0];
- float4 w_up_mvfd = dev_w_mvfd[0];
-
- // Fetch world dimensions in constant memory read
- float3 origo = make_float3(devC_origo[0],
- devC_origo[1],
- devC_origo[2]);
- float3 L = make_float3(devC_L[0],
- devC_L[1],
- devC_L[2]);
-
- // Index of particle which is bonded to particle A.
- // The index is equal to the particle no (p.np)
- // if particle A is bond-less.
- //uint4 bonds = dev_bonds_sorted[idx_a];
-
- // Initiate shear friction loss rate at 0.0
- float es_dot = 0.0f;
-
- // Initiate pressure on particle at 0.0
- float p = 0.0f;
-
- // Allocate memory for temporal force/torque vector values
- float3 N = make_float3(0.0f, 0.0f, 0.0f);
- float3 T = make_float3(0.0f, 0.0f, 0.0f);
-
- // Apply linear elastic, frictional contact model to registered contacts
- if (devC_shearmodel == 2) {
- unsigned int idx_b_orig, mempos;
- float delta_n, x_ab_length;
- float4 x_b;
- float4 distmod;
- float radius_b;
- float3 x_ab;
- float4 vel_a = dev_vel_sorted[idx_a];
- float4 angvel4_a = dev_angvel_sorted[idx_a];
- float3 angvel_a = make_float3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
-
- // Loop over all possible contacts, and remove contacts
- // that no longer are valid (delta_n > 0.0)
- for (int i = 0; i<(int)devC_nc; ++i) {
- mempos = (unsigned int)(idx_a_orig * devC_nc + i);
- __syncthreads();
- idx_b_orig = dev_contacts[mempos];
- distmod = dev_distmod[mempos];
- x_b = dev_x[idx_b_orig];
- //radius_b = dev_radius[idx_b_orig];
- radius_b = distmod.w;
-
- // Inter-particle vector, corrected for periodic boundaries
- x_ab = make_float3(x_a.x - x_b.x + distmod.x,
- x_a.y - x_b.y + distmod.y,
- x_a.z - x_b.z + distmod.z);
-
- x_ab_length = length(x_ab);
- delta_n = x_ab_length - (radius_a + radius_b);
-
-
- if (idx_b_orig != (unsigned int)devC_np) {
-
- // Process collision if the particles are overlapping
- if (delta_n < 0.0f) {
- //cuPrintf("\nProcessing contact, idx_a_orig = %u, idx_b_orig = %u, contact = %d, delta_n = %f\n",
- // idx_a_orig, idx_b_orig, i, delta_n);
- contactLinear(&N, &T, &es_dot, &p,
- idx_a_orig,
- idx_b_orig,
- vel_a,
- dev_vel,
- angvel_a,
- dev_angvel,
- radius_a, radius_b,
- x_ab, x_ab_length,
- delta_n, dev_delta_t,
- mempos);
- } else {
- __syncthreads();
- // Remove this contact (there is no particle with index=np)
- dev_contacts[mempos] = devC_np;
- // Zero sum of shear displacement in this position
- dev_delta_t[mempos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- } else {
- __syncthreads();
- dev_delta_t[mempos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
- } // Contact loop end
-
- } else if (devC_shearmodel == 1) {
-
- int3 gridPos;
- int3 targetPos;
-
- // Calculate address in grid from position
- gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
- gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
- gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
-
- // Find overlaps between particle no. idx and all particles
- // from its own cell + 26 neighbor cells.
- // Calculate resulting normal- and shear-force components and
- // torque for the particle on the base of contactLinearViscous()
- for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
- for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
- for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
- targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
- overlapsInCell(targetPos, idx_a, x_a, radius_a,
- &N, &T, &es_dot, &p,
- dev_x_sorted, dev_radius_sorted,
- dev_vel_sorted, dev_angvel_sorted,
- dev_cellStart, dev_cellEnd,
- dev_w_nx, dev_w_mvfd);
- }
- }
- }
-
- }
-
- //// Interact with walls
- float delta_w; // Overlap distance
- float3 w_n; // Wall surface normal
- float w_force = 0.0f; // Force on wall from particle A
-
- // Upper wall (idx 0)
- delta_w = w_up_nx.w - (x_a.z + radius_a);
- w_n = make_float3(0.0f, 0.0f, -1.0f);
- if (delta_w < 0.0f) {
- w_force = contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
- dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, w_up_mvfd.y);
- }
-
- // Lower wall (force on wall not stored)
- delta_w = x_a.z - radius_a - origo.z;
- w_n = make_float3(0.0f, 0.0f, 1.0f);
- if (delta_w < 0.0f) {
- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
- dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, 0.0f);
- }
-
-
- if (devC_periodic == 0) {
-
- // Left wall
- delta_w = x_a.x - radius_a - origo.x;
- w_n = make_float3(1.0f, 0.0f, 0.0f);
- if (delta_w < 0.0f) {
- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
- dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, 0.0f);
- }
-
- // Right wall
- delta_w = L.x - (x_a.x + radius_a);
- w_n = make_float3(-1.0f, 0.0f, 0.0f);
- if (delta_w < 0.0f) {
- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
- dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, 0.0f);
- }
-
- // Front wall
- delta_w = x_a.y - radius_a - origo.y;
- w_n = make_float3(0.0f, 1.0f, 0.0f);
- if (delta_w < 0.0f) {
- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
- dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, 0.0f);
- }
-
- // Back wall
- delta_w = L.y - (x_a.y + radius_a);
- w_n = make_float3(0.0f, -1.0f, 0.0f);
- if (delta_w < 0.0f) {
- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
- dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, 0.0f);
- }
-
- } else if (devC_periodic == 2) {
-
- // Front wall
- delta_w = x_a.y - radius_a - origo.y;
- w_n = make_float3(0.0f, 1.0f, 0.0f);
- if (delta_w < 0.0f) {
- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
- dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, 0.0f);
- }
-
- // Back wall
- delta_w = L.y - (x_a.y + radius_a);
- w_n = make_float3(0.0f, -1.0f, 0.0f);
- if (delta_w < 0.0f) {
- (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
- dev_vel_sorted, dev_angvel_sorted,
- w_n, delta_w, 0.0f);
- }
- }
-
-
- // Hold threads for coalesced write
- __syncthreads();
-
- // Write force to unsorted position
- unsigned int orig_idx = dev_gridParticleIndex[idx_a];
- dev_force[orig_idx] = make_float4(N, 0.0f);
- dev_torque[orig_idx] = make_float4(T, 0.0f);
- dev_es_dot[orig_idx] = es_dot;
- dev_es[orig_idx] += es_dot * devC_dt;
- dev_p[orig_idx] = p;
- dev_w_force[orig_idx] = w_force;
- }
-} // End of interact(...)
-
-
-
-
-
-// FUNCTION FOR UPDATING TRAJECTORIES
-
-// Second order integration scheme based on Taylor expansion of particle kinematics.
-// Kernel executed on device, and callable from host only.
-__global__ void integrate(float4* dev_x_sorted, float4* dev_vel_sorted, // Input
- float4* dev_angvel_sorted, float* dev_radius_sorted, // Input
- float4* dev_x, float4* dev_vel, float4* dev_angvel, // Output
- float4* dev_force, float4* dev_torque, // Input
- unsigned int* dev_gridParticleIndex) // Input: Sorted-Unsorted key
-{
- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
-
- if (idx < devC_np) { // Condition prevents block size error
-
- // Copy data to temporary arrays to avoid any potential read-after-write,
- // write-after-read, or write-after-write hazards.
- unsigned int orig_idx = dev_gridParticleIndex[idx];
- float4 force = dev_force[orig_idx];
- float4 torque = dev_torque[orig_idx];
-
- // Initialize acceleration vectors to zero
- float4 acc = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- float4 angacc = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
-
- // Fetch particle position and velocity values from global read
- float4 x = dev_x_sorted[idx];
- float4 vel = dev_vel_sorted[idx];
- float4 angvel = dev_angvel_sorted[idx];
- float radius = dev_radius_sorted[idx];
-
- // Coherent read from constant memory to registers
- float dt = devC_dt;
- float3 origo = make_float3(devC_origo[0], devC_origo[1], devC_origo[2]);
- float3 L = make_float3(devC_L[0], devC_L[1], devC_L[2]);
- float rho = devC_rho;
-
- // Particle mass
- float m = 4.0f/3.0f * PI * radius*radius*radius * rho;
-
- // Update linear acceleration of particle
- acc.x = force.x / m;
- acc.y = force.y / m;
- acc.z = force.z / m;
-
- // Update angular acceleration of particle
- // (angacc = (total moment)/Intertia, intertia = 2/5*m*r^2)
- angacc.x = torque.x * 1.0f / (2.0f/5.0f * m * radius*radius);
- angacc.y = torque.y * 1.0f / (2.0f/5.0f * m * radius*radius);
- angacc.z = torque.z * 1.0f / (2.0f/5.0f * m * radius*radius);
-
- // Add gravity
- acc.x += devC_g[0];
- acc.y += devC_g[1];
- acc.z += devC_g[2];
- //acc.z -= 9.82f;
-
- // Update angular velocity
- angvel.x += angacc.x * dt;
- angvel.y += angacc.y * dt;
- angvel.z += angacc.z * dt;
-
- // Check if particle has a fixed horizontal velocity
- if (vel.w > 0.0f) {
-
- // Zero horizontal acceleration and disable
- // gravity to counteract segregation.
- // Particles may move in the z-dimension,
- // to allow for dilation.
- acc.x = 0.0f;
- acc.y = 0.0f;
- acc.z -= devC_g[2];
-
- // Zero the angular acceleration and -velocity
- angvel = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- }
-
- // Update linear velocity
- vel.x += acc.x * dt;
- vel.y += acc.y * dt;
- vel.z += acc.z * dt;
-
- // Update position. First-order Euler's scheme:
- //x.x += vel.x * dt;
- //x.y += vel.y * dt;
- //x.z += vel.z * dt;
-
- // Update position. Second-order scheme based on Taylor expansion
- // (greater accuracy than the first-order Euler's scheme)
- x.x += vel.x * dt + (acc.x * dt*dt)/2.0f;
- x.y += vel.y * dt + (acc.y * dt*dt)/2.0f;
- x.z += vel.z * dt + (acc.z * dt*dt)/2.0f;
-
-
- // Add x-displacement for this time step to
- // sum of x-displacements
- x.w += vel.x * dt + (acc.x * dt*dt)/2.0f;
-
- // Move particle across boundary if it is periodic
- if (devC_periodic == 1) {
- if (x.x < origo.x)
- x.x += L.x;
- if (x.x > L.x)
- x.x -= L.x;
- if (x.y < origo.y)
- x.y += L.y;
- if (x.y > L.y)
- x.y -= L.y;
- } else if (devC_periodic == 2) {
- if (x.x < origo.x)
- x.x += L.x;
- if (x.x > L.x)
- x.x -= L.x;
- }
-
- // Hold threads for coalesced write
- __syncthreads();
-
- // Store data in global memory at original, pre-sort positions
- dev_angvel[orig_idx] = angvel;
- dev_vel[orig_idx] = vel;
- dev_x[orig_idx] = x;
- }
-} // End of integrate(...)
-
-
-// Reduce wall force contributions from particles to a single value per wall
-__global__ void summation(float* in, float *out)
-{
- __shared__ float cache[256];
- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
- unsigned int cacheIdx = threadIdx.x;
-
- float temp = 0.0f;
- while (idx < devC_np) {
- temp += in[idx];
- idx += blockDim.x * gridDim.x;
- }
-
- // Set the cache values
- cache[cacheIdx] = temp;
-
- __syncthreads();
-
- // For reductions, threadsPerBlock must be a power of two
- // because of the following code
- unsigned int i = blockDim.x/2;
- while (i != 0) {
- if (cacheIdx < i)
- cache[cacheIdx] += cache[cacheIdx + i];
- __syncthreads();
- i /= 2;
- }
-
- // Write sum for block to global memory
- if (cacheIdx == 0)
- out[blockIdx.x] = cache[0];
-}
-
-// Update wall positions
-__global__ void integrateWalls(float4* dev_w_nx,
- float4* dev_w_mvfd,
- float* dev_w_force_partial,
- unsigned int blocksPerGrid)
-{
- unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
-
- if (idx < devC_nw) { // Condition prevents block size error
-
- // Copy data to temporary arrays to avoid any potential read-after-write,
- // write-after-read, or write-after-write hazards.
- float4 w_nx = dev_w_nx[idx];
- float4 w_mvfd = dev_w_mvfd[idx];
-
- // Find the final sum of forces on wall
- w_mvfd.z = 0.0f;
- for (int i=0; i<blocksPerGrid; ++i) {
- w_mvfd.z += dev_w_force_partial[i];
- }
-
- float dt = devC_dt;
-
- // Normal load = Deviatoric stress times wall surface area,
- // directed downwards.
- float N = -w_mvfd.w*devC_L[0]*devC_L[1];
-
- // Calculate resulting acceleration of wall
- // (Wall mass is stored in w component of position float4)
- float acc = (w_mvfd.z+N)/w_mvfd.x;
-
- // Update linear velocity
- w_mvfd.y += acc * dt;
-
- // Update position. Second-order scheme based on Taylor expansion
- // (greater accuracy than the first-order Euler's scheme)
- w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;
-
- // Store data in global memory
- dev_w_nx[idx] = w_nx;
- dev_w_mvfd[idx] = w_mvfd;
- }
-} // End of integrateWalls(...)
-
-
+#include "sorting.cuh"
+#include "contactmodels.cuh"
+#include "cohesion.cuh"
+#include "contactsearch.cuh"
+#include "integration.cuh"
// Wrapper function for initializing the CUDA components.
t@@ -1542,6 +65,24 @@ __host__ void initializeGPU(void)
checkForCudaErrors("After initializing CUDA device");
}
+// Start timer for kernel profiling
+__host__ void startTimer(cudaEvent_t* kernel_tic)
+{
+ cudaEventRecord(*kernel_tic);
+}
+
+// Stop timer for kernel profiling and time to function sum
+__host__ void stopTimer(cudaEvent_t *kernel_tic,
+ cudaEvent_t *kernel_toc,
+ float *kernel_elapsed,
+ double* sum)
+{
+ cudaEventRecord(*kernel_toc, 0);
+ cudaEventSynchronize(*kernel_toc);
+ cudaEventElapsedTime(kernel_elapsed, *kernel_tic, *kernel_toc);
+ *sum += *kernel_elapsed;
+}
+
// Copy selected constant components to constant device memory.
//extern "C"
__host__ void transferToConstantMemory(Particles* p,
t@@ -1555,12 +96,12 @@ __host__ void transferToConstantMemory(Particles* p,
cudaMemcpyToSymbol("devC_np", &p->np, sizeof(p->np));
cudaMemcpyToSymbol("devC_nc", &NC, sizeof(char));
- cudaMemcpyToSymbol("devC_origo", grid->origo, sizeof(float)*ND);
- cudaMemcpyToSymbol("devC_L", grid->L, sizeof(float)*ND);
+ cudaMemcpyToSymbol("devC_origo", grid->origo, sizeof(Float)*ND);
+ cudaMemcpyToSymbol("devC_L", grid->L, sizeof(Float)*ND);
cudaMemcpyToSymbol("devC_num", grid->num, sizeof(unsigned int)*ND);
- cudaMemcpyToSymbol("devC_dt", &time->dt, sizeof(float));
+ cudaMemcpyToSymbol("devC_dt", &time->dt, sizeof(Float));
cudaMemcpyToSymbol("devC_global", ¶ms->global, sizeof(int));
- cudaMemcpyToSymbol("devC_g", params->g, sizeof(float)*ND);
+ cudaMemcpyToSymbol("devC_g", params->g, sizeof(Float)*ND);
cudaMemcpyToSymbol("devC_nw", ¶ms->nw, sizeof(unsigned int));
cudaMemcpyToSymbol("devC_periodic", ¶ms->periodic, sizeof(int));
t@@ -1578,19 +119,22 @@ __host__ void transferToConstantMemory(Particles* p,
params->mu_d = p->mu_d[0];
params->mu_r = p->mu_r[0];
params->rho = p->rho[0];
- cudaMemcpyToSymbol("devC_k_n", ¶ms->k_n, sizeof(float));
- cudaMemcpyToSymbol("devC_k_s", ¶ms->k_s, sizeof(float));
- cudaMemcpyToSymbol("devC_k_r", ¶ms->k_r, sizeof(float));
- cudaMemcpyToSymbol("devC_gamma_n", ¶ms->gamma_n, sizeof(float));
- cudaMemcpyToSymbol("devC_gamma_s", ¶ms->gamma_s, sizeof(float));
- cudaMemcpyToSymbol("devC_gamma_r", ¶ms->gamma_r, sizeof(float));
- cudaMemcpyToSymbol("devC_mu_s", ¶ms->mu_s, sizeof(float));
- cudaMemcpyToSymbol("devC_mu_d", ¶ms->mu_d, sizeof(float));
- cudaMemcpyToSymbol("devC_mu_r", ¶ms->mu_r, sizeof(float));
- cudaMemcpyToSymbol("devC_rho", ¶ms->rho, sizeof(float));
- cudaMemcpyToSymbol("devC_kappa", ¶ms->kappa, sizeof(float));
- cudaMemcpyToSymbol("devC_db", ¶ms->db, sizeof(float));
- cudaMemcpyToSymbol("devC_V_b", ¶ms->V_b, sizeof(float));
+ cudaMemcpyToSymbol("devC_k_n", ¶ms->k_n, sizeof(Float));
+ cudaMemcpyToSymbol("devC_k_s", ¶ms->k_s, sizeof(Float));
+ cudaMemcpyToSymbol("devC_k_r", ¶ms->k_r, sizeof(Float));
+ cudaMemcpyToSymbol("devC_gamma_n", ¶ms->gamma_n, sizeof(Float));
+ cudaMemcpyToSymbol("devC_gamma_s", ¶ms->gamma_s, sizeof(Float));
+ cudaMemcpyToSymbol("devC_gamma_r", ¶ms->gamma_r, sizeof(Float));
+ cudaMemcpyToSymbol("devC_gamma_wn", ¶ms->gamma_wn, sizeof(Float));
+ cudaMemcpyToSymbol("devC_gamma_ws", ¶ms->gamma_ws, sizeof(Float));
+ cudaMemcpyToSymbol("devC_gamma_wr", ¶ms->gamma_wr, sizeof(Float));
+ cudaMemcpyToSymbol("devC_mu_s", ¶ms->mu_s, sizeof(Float));
+ cudaMemcpyToSymbol("devC_mu_d", ¶ms->mu_d, sizeof(Float));
+ cudaMemcpyToSymbol("devC_mu_r", ¶ms->mu_r, sizeof(Float));
+ cudaMemcpyToSymbol("devC_rho", ¶ms->rho, sizeof(Float));
+ cudaMemcpyToSymbol("devC_kappa", ¶ms->kappa, sizeof(Float));
+ cudaMemcpyToSymbol("devC_db", ¶ms->db, sizeof(Float));
+ cudaMemcpyToSymbol("devC_V_b", ¶ms->V_b, sizeof(Float));
cudaMemcpyToSymbol("devC_shearmodel", ¶ms->shearmodel, sizeof(unsigned int));
} else {
//printf("(params.global == %d) ", params.global);
t@@ -1609,20 +153,20 @@ __host__ void transferToConstantMemory(Particles* p,
//extern "C"
-__host__ void gpuMain(float4* host_x,
- float4* host_vel,
- float4* host_acc,
- float4* host_angvel,
- float4* host_angacc,
- float4* host_force,
- float4* host_torque,
+__host__ void gpuMain(Float4* host_x,
+ Float4* host_vel,
+ Float4* host_acc,
+ Float4* host_angvel,
+ Float4* host_angacc,
+ Float4* host_force,
+ Float4* host_torque,
uint4* host_bonds,
Particles* p,
Grid* grid,
Time* time,
Params* params,
- float4* host_w_nx,
- float4* host_w_mvfd,
+ Float4* host_w_nx,
+ Float4* host_w_mvfd,
const char* cwd,
const char* inputbin)
{
t@@ -1633,31 +177,31 @@ __host__ void gpuMain(float4* host_x,
transferToConstantMemory(p, grid, time, params);
// Declare pointers for particle variables on the device
- float4* dev_x; // Particle position
- float4* dev_vel; // Particle linear velocity
- float4* dev_angvel; // Particle angular velocity
- float4* dev_acc; // Particle linear acceleration
- float4* dev_angacc; // Particle angular acceleration
- float4* dev_force; // Sum of forces
- float4* dev_torque; // Sum of torques
- float* dev_radius; // Particle radius
- float* dev_es_dot; // Current shear energy producion rate
- float* dev_es; // Total shear energy excerted on particle
- float* dev_p; // Pressure excerted onto particle
+ Float4* dev_x; // Particle position
+ Float4* dev_vel; // Particle linear velocity
+ Float4* dev_angvel; // Particle angular velocity
+ Float4* dev_acc; // Particle linear acceleration
+ Float4* dev_angacc; // Particle angular acceleration
+ Float4* dev_force; // Sum of forces
+ Float4* dev_torque; // Sum of torques
+ Float* dev_radius; // Particle radius
+ Float* dev_es_dot; // Current shear energy producion rate
+ Float* dev_es; // Total shear energy excerted on particle
+ Float* dev_p; // Pressure excerted onto particle
//uint4* dev_bonds; // Particle bond pairs
// Declare pointers for wall vectors on the device
- float4* dev_w_nx; // Wall normal (x,y,z) and position (w)
- float4* dev_w_mvfd; // Wall mass (x), velocity (y), force (z)
+ Float4* dev_w_nx; // Wall normal (x,y,z) and position (w)
+ Float4* dev_w_mvfd; // Wall mass (x), velocity (y), force (z)
// and deviatoric stress (w)
- float* dev_w_force; // Resulting force on wall per particle
- float* dev_w_force_partial; // Partial sum from block of threads
+ Float* dev_w_force; // Resulting force on wall per particle
+ Float* dev_w_force_partial; // Partial sum from block of threads
// Memory for sorted particle data
- float4* dev_x_sorted;
- float4* dev_vel_sorted;
- float4* dev_angvel_sorted;
- float* dev_radius_sorted;
+ Float4* dev_x_sorted;
+ Float4* dev_vel_sorted;
+ Float4* dev_angvel_sorted;
+ Float* dev_radius_sorted;
//uint4* dev_bonds_sorted;
// Grid-particle array pointers
t@@ -1669,35 +213,35 @@ __host__ void gpuMain(float4* host_x,
// Particle contact bookkeeping
unsigned int* dev_contacts;
// Particle pair distance correction across periodic boundaries
- float4* dev_distmod;
+ Float4* dev_distmod;
// x,y,z contains the interparticle vector, corrected if contact
// is across a periodic boundary.
- float4* dev_delta_t; // Accumulated shear distance of contact
+ Float4* dev_delta_t; // Accumulated shear distance of contact
// Particle memory size
- unsigned int memSizef = sizeof(float) * p->np;
- unsigned int memSizef4 = sizeof(float4) * p->np;
+ unsigned int memSizeF = sizeof(Float) * p->np;
+ unsigned int memSizeF4 = sizeof(Float4) * p->np;
// Allocate device memory for particle variables,
// tie to previously declared pointers
cout << " Allocating device memory: ";
// Particle arrays
- cudaMalloc((void**)&dev_x, memSizef4);
- cudaMalloc((void**)&dev_x_sorted, memSizef4);
- cudaMalloc((void**)&dev_vel, memSizef4);
- cudaMalloc((void**)&dev_vel_sorted, memSizef4);
- cudaMalloc((void**)&dev_angvel, memSizef4);
- cudaMalloc((void**)&dev_angvel_sorted, memSizef4);
- cudaMalloc((void**)&dev_acc, memSizef4);
- cudaMalloc((void**)&dev_angacc, memSizef4);
- cudaMalloc((void**)&dev_force, memSizef4);
- cudaMalloc((void**)&dev_torque, memSizef4);
- cudaMalloc((void**)&dev_radius, memSizef);
- cudaMalloc((void**)&dev_radius_sorted, memSizef);
- cudaMalloc((void**)&dev_es_dot, memSizef);
- cudaMalloc((void**)&dev_es, memSizef);
- cudaMalloc((void**)&dev_p, memSizef);
+ cudaMalloc((void**)&dev_x, memSizeF4);
+ cudaMalloc((void**)&dev_x_sorted, memSizeF4);
+ cudaMalloc((void**)&dev_vel, memSizeF4);
+ cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
+ cudaMalloc((void**)&dev_angvel, memSizeF4);
+ cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
+ cudaMalloc((void**)&dev_acc, memSizeF4);
+ cudaMalloc((void**)&dev_angacc, memSizeF4);
+ cudaMalloc((void**)&dev_force, memSizeF4);
+ cudaMalloc((void**)&dev_torque, memSizeF4);
+ cudaMalloc((void**)&dev_radius, memSizeF);
+ cudaMalloc((void**)&dev_radius_sorted, memSizeF);
+ cudaMalloc((void**)&dev_es_dot, memSizeF);
+ cudaMalloc((void**)&dev_es, memSizeF);
+ cudaMalloc((void**)&dev_p, memSizeF);
//cudaMalloc((void**)&dev_bonds, sizeof(uint4) * p->np);
//cudaMalloc((void**)&dev_bonds_sorted, sizeof(uint4) * p->np);
t@@ -1709,13 +253,13 @@ __host__ void gpuMain(float4* host_x,
// Particle contact bookkeeping arrays
cudaMalloc((void**)&dev_contacts, sizeof(unsigned int)*p->np*NC); // Max NC contacts per particle
- cudaMalloc((void**)&dev_distmod, sizeof(float4)*p->np*NC);
- cudaMalloc((void**)&dev_delta_t, sizeof(float4)*p->np*NC);
+ cudaMalloc((void**)&dev_distmod, sizeof(Float4)*p->np*NC);
+ cudaMalloc((void**)&dev_delta_t, sizeof(Float4)*p->np*NC);
// Wall arrays
- cudaMalloc((void**)&dev_w_nx, sizeof(float)*params->nw*4);
- cudaMalloc((void**)&dev_w_mvfd, sizeof(float)*params->nw*4);
- cudaMalloc((void**)&dev_w_force, sizeof(float)*params->nw*p->np);
+ cudaMalloc((void**)&dev_w_nx, sizeof(Float)*params->nw*4);
+ cudaMalloc((void**)&dev_w_mvfd, sizeof(Float)*params->nw*4);
+ cudaMalloc((void**)&dev_w_force, sizeof(Float)*params->nw*p->np);
checkForCudaErrors("Post device memory allocation");
cout << "Done\n";
t@@ -1724,22 +268,22 @@ __host__ void gpuMain(float4* host_x,
cout << " Transfering data to the device: ";
// Particle data
- cudaMemcpy(dev_x, host_x, memSizef4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_vel, host_vel, memSizef4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_acc, host_acc, memSizef4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_angvel, host_angvel, memSizef4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_angacc, host_angacc, memSizef4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_force, host_force, memSizef4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_torque, host_torque, memSizef4, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_x, host_x, memSizeF4, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_vel, host_vel, memSizeF4, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_acc, host_acc, memSizeF4, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_angvel, host_angvel, memSizeF4, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_angacc, host_angacc, memSizeF4, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_force, host_force, memSizeF4, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_torque, host_torque, memSizeF4, cudaMemcpyHostToDevice);
//cudaMemcpy(dev_bonds, host_bonds, sizeof(uint4) * p->np, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_radius, p->radius, memSizef, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_es_dot, p->es_dot, memSizef, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_es, p->es, memSizef, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_p, p->p, memSizef, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_radius, p->radius, memSizeF, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_es_dot, p->es_dot, memSizeF, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_es, p->es, memSizeF, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_p, p->p, memSizeF, cudaMemcpyHostToDevice);
// Wall data (wall mass and number in constant memory)
- cudaMemcpy(dev_w_nx, host_w_nx, sizeof(float)*params->nw*4, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(float)*params->nw*4, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_w_nx, host_w_nx, sizeof(Float)*params->nw*4, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(Float)*params->nw*4, cudaMemcpyHostToDevice);
// Initialize contacts lists to p.np
unsigned int* npu = new unsigned int[p->np*NC];
t@@ -1750,12 +294,12 @@ __host__ void gpuMain(float4* host_x,
// Create array of 0.0 values on the host and transfer these to the distance
// modifier and shear displacement arrays
- float4* zerosf4 = new float4[p->np*NC];
+ Float4* zerosF4 = new Float4[p->np*NC];
for (unsigned int i=0; i<(p->np*NC); ++i)
- zerosf4[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
- cudaMemcpy(dev_distmod, zerosf4, sizeof(float4)*p->np*NC, cudaMemcpyHostToDevice);
- cudaMemcpy(dev_delta_t, zerosf4, sizeof(float4)*p->np*NC, cudaMemcpyHostToDevice);
- delete[] zerosf4;
+ zerosF4[i] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
+ cudaMemcpy(dev_distmod, zerosF4, sizeof(Float4)*p->np*NC, cudaMemcpyHostToDevice);
+ cudaMemcpy(dev_delta_t, zerosF4, sizeof(Float4)*p->np*NC, cudaMemcpyHostToDevice);
+ delete[] zerosF4;
checkForCudaErrors("Post memcopy");
cout << "Done\n";
t@@ -1783,7 +327,7 @@ __host__ void gpuMain(float4* host_x,
// Shared memory per block
unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
- cudaMalloc((void**)&dev_w_force_partial, sizeof(float)*dimGrid.x);
+ cudaMalloc((void**)&dev_w_force_partial, sizeof(Float)*dimGrid.x);
// Report to stdout
cout << "\n Device memory allocation and transfer complete.\n"
t@@ -1821,9 +365,7 @@ __host__ void gpuMain(float4* host_x,
cout << "\n Entering the main calculation time loop...\n\n"
<< " IMPORTANT: Do not close this terminal, doing so will \n"
<< " terminate this SPHERE process. Follow the \n"
- << " progress in MATLAB using:\n"
- << " >> status('" << inputbin << "')\n"
- << " or in this directory by executing:\n"
+ << " progress by executing:\n"
<< " $ ./sphere_status " << inputbin << "\n\n";
// Enable cuPrintf()
t@@ -1835,6 +377,23 @@ __host__ void gpuMain(float4* host_x,
cudaEventCreate(&dev_toc);
cudaEventRecord(dev_tic, 0);
+ // If profiling is enabled, initialize timers for each kernel
+ cudaEvent_t kernel_tic, kernel_toc;
+ float kernel_elapsed;
+ double t_calcParticleCellID = 0.0;
+ double t_thrustsort = 0.0;
+ double t_reorderArrays = 0.0;
+ double t_topology = 0.0;
+ double t_interact = 0.0;
+ double t_integrate = 0.0;
+ double t_summation = 0.0;
+ double t_integrateWalls = 0.0;
+
+ if (PROFILING == 1) {
+ cudaEventCreate(&kernel_tic);
+ cudaEventCreate(&kernel_toc);
+ }
+
cout << " Current simulation time: " << time->current << " s.";
t@@ -1855,20 +414,28 @@ __host__ void gpuMain(float4* host_x,
// For each particle:
// Compute hash key (cell index) from position
// in the fine, uniform and homogenous grid.
+ if (PROFILING == 1)
+ startTimer(&kernel_tic);
calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
dev_gridParticleIndex,
dev_x);
// Synchronization point
cudaThreadSynchronize();
+ if (PROFILING == 1)
+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_calcParticleCellID);
checkForCudaErrors("Post calcParticleCellID");
// Sort particle (key, particle ID) pairs by hash key with Thrust radix sort
+ if (PROFILING == 1)
+ startTimer(&kernel_tic);
thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID),
thrust::device_ptr<uint>(dev_gridParticleCellID + p->np),
thrust::device_ptr<uint>(dev_gridParticleIndex));
cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implicitly?
+ if (PROFILING == 1)
+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
checkForCudaErrors("Post calcParticleCellID");
t@@ -1882,6 +449,8 @@ __host__ void gpuMain(float4* host_x,
// Use sorted order to reorder particle arrays (position, velocities, radii) to ensure
// coherent memory access. Save ordered configurations in new arrays (*_sorted).
+ if (PROFILING == 1)
+ startTimer(&kernel_tic);
reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
dev_cellEnd,
dev_gridParticleCellID,
t@@ -1897,6 +466,8 @@ __host__ void gpuMain(float4* host_x,
// Synchronization point
cudaThreadSynchronize();
+ if (PROFILING == 1)
+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_reorderArrays);
checkForCudaErrors("Post reorderArrays", iter);
// The contact search in topology() is only necessary for determining
t@@ -1904,6 +475,8 @@ __host__ void gpuMain(float4* host_x,
// shear force model
if (params->shearmodel == 2) {
// For each particle: Search contacts in neighbor cells
+ if (PROFILING == 1)
+ startTimer(&kernel_tic);
topology<<<dimGrid, dimBlock>>>(dev_cellStart,
dev_cellEnd,
dev_gridParticleIndex,
t@@ -1918,11 +491,15 @@ __host__ void gpuMain(float4* host_x,
// Synchronization point
cudaThreadSynchronize();
+ if (PROFILING == 1)
+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_topology);
checkForCudaErrors("Post topology: One or more particles moved outside the grid.\nThis could possibly be caused by a numerical instability.\nIs the computational time step too large?", iter);
}
// For each particle: Process collisions and compute resulting forces.
+ if (PROFILING == 1)
+ startTimer(&kernel_tic);
interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
dev_cellStart,
dev_cellEnd,
t@@ -1944,23 +521,37 @@ __host__ void gpuMain(float4* host_x,
// Synchronization point
cudaThreadSynchronize();
+ if (PROFILING == 1)
+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_interact);
checkForCudaErrors("Post interact - often caused if particles move outside the grid", iter);
// Update particle kinematics
+ if (PROFILING == 1)
+ startTimer(&kernel_tic);
integrate<<<dimGrid, dimBlock>>>(dev_x_sorted, dev_vel_sorted,
dev_angvel_sorted, dev_radius_sorted,
dev_x, dev_vel, dev_angvel,
dev_force, dev_torque,
dev_gridParticleIndex);
+ cudaThreadSynchronize();
+ if (PROFILING == 1)
+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_integrate);
+
// Summation of forces on wall
+ if (PROFILING == 1)
+ startTimer(&kernel_tic);
summation<<<dimGrid, dimBlock>>>(dev_w_force, dev_w_force_partial);
// Synchronization point
cudaThreadSynchronize();
+ if (PROFILING == 1)
+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_summation);
checkForCudaErrors("Post integrate & wall force summation");
// Update wall kinematics
+ if (PROFILING == 1)
+ startTimer(&kernel_tic);
integrateWalls<<< 1, params->nw>>>(dev_w_nx,
dev_w_mvfd,
dev_w_force_partial,
t@@ -1968,6 +559,8 @@ __host__ void gpuMain(float4* host_x,
// Synchronization point
cudaThreadSynchronize();
+ if (PROFILING == 1)
+ stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_integrateWalls);
checkForCudaErrors("Post integrateWalls");
t@@ -1976,7 +569,8 @@ __host__ void gpuMain(float4* host_x,
filetimeclock += time->dt;
// Report time to console
- cout << "\r Current simulation time: " << time->current << " s. ";
+ cout << "\r Current simulation time: "
+ << time->current << " s. ";// << std::flush;
// Produce output binary if the time interval
t@@ -1990,22 +584,22 @@ __host__ void gpuMain(float4* host_x,
//// Copy device data to host memory
// Particle data
- cudaMemcpy(host_x, dev_x, memSizef4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_vel, dev_vel, memSizef4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_acc, dev_acc, memSizef4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_angvel, dev_angvel, memSizef4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_force, dev_force, memSizef4, cudaMemcpyDeviceToHost);
- cudaMemcpy(host_torque, dev_torque, memSizef4, cudaMemcpyDeviceToHost);
+ cudaMemcpy(host_x, dev_x, memSizeF4, cudaMemcpyDeviceToHost);
+ cudaMemcpy(host_vel, dev_vel, memSizeF4, cudaMemcpyDeviceToHost);
+ cudaMemcpy(host_acc, dev_acc, memSizeF4, cudaMemcpyDeviceToHost);
+ cudaMemcpy(host_angvel, dev_angvel, memSizeF4, cudaMemcpyDeviceToHost);
+ cudaMemcpy(host_force, dev_force, memSizeF4, cudaMemcpyDeviceToHost);
+ cudaMemcpy(host_torque, dev_torque, memSizeF4, cudaMemcpyDeviceToHost);
//cudaMemcpy(host_bonds, dev_bonds, sizeof(uint4) * p->np, cudaMemcpyDeviceToHost);
- cudaMemcpy(p->es_dot, dev_es_dot, memSizef, cudaMemcpyDeviceToHost);
- cudaMemcpy(p->es, dev_es, memSizef, cudaMemcpyDeviceToHost);
- cudaMemcpy(p->p, dev_p, memSizef, cudaMemcpyDeviceToHost);
+ cudaMemcpy(p->es_dot, dev_es_dot, memSizeF, cudaMemcpyDeviceToHost);
+ cudaMemcpy(p->es, dev_es, memSizeF, cudaMemcpyDeviceToHost);
+ cudaMemcpy(p->p, dev_p, memSizeF, cudaMemcpyDeviceToHost);
// Wall data
cudaMemcpy(host_w_nx, dev_w_nx,
- sizeof(float)*params->nw*4, cudaMemcpyDeviceToHost);
+ sizeof(Float)*params->nw*4, cudaMemcpyDeviceToHost);
cudaMemcpy(host_w_mvfd, dev_w_mvfd,
- sizeof(float)*params->nw*4, cudaMemcpyDeviceToHost);
+ sizeof(Float)*params->nw*4, cudaMemcpyDeviceToHost);
// Pause the CPU thread until all CUDA calls previously issued are completed
cudaThreadSynchronize();
t@@ -2057,6 +651,23 @@ __host__ void gpuMain(float4* host_x,
cudaEventDestroy(dev_tic);
cudaEventDestroy(dev_toc);
+ cudaEventDestroy(kernel_tic);
+ cudaEventDestroy(kernel_toc);
+
+ // Report time spent on each kernel
+ if (PROFILING == 1) {
+ cout << "\nKernel profiling statistics:\n"
+ << " - calcParticleCellID:\t" << t_calcParticleCellID/1000.0 << " s\n"
+ << " - thrustsort:\t\t" << t_thrustsort/1000.0 << " s\n"
+ << " - reorderArrays:\t" << t_reorderArrays/1000.0 << " s\n"
+ << " - topology:\t\t" << t_topology/1000.0 << " s\n"
+ << " - interact:\t\t" << t_interact/1000.0 << " s\n"
+ << " - integrate:\t\t" << t_integrate/1000.0 << " s\n"
+ << " - summation:\t\t" << t_summation/1000.0 << " s\n"
+ << " - integrateWalls:\t" << t_integrateWalls/1000.0 << " s\n";
+ }
+
+
// Free memory allocated to cudaPrintfInit
//cudaPrintfEnd();
(DIR) diff --git a/src/file_io.cpp b/src/file_io.cpp
t@@ -7,12 +7,22 @@
// Write host variables to target binary file
-int fwritebin(char *target, Particles *p, float4 *host_x, float4 *host_vel, float4 *host_angvel,
- float4 *host_force, float4 *host_torque,
+// The output format should ALWAYS be double precision,
+// so this function will typecast the data before write
+// if it is single precision.
+int fwritebin(char *target,
+ Particles *p,
+ Float4 *host_x,
+ Float4 *host_vel,
+ Float4 *host_angvel,
+ Float4 *host_force,
+ Float4 *host_torque,
uint4 *host_bonds,
- Grid *grid, Time *time,
+ Grid *grid,
+ Time *time,
Params *params,
- float4 *host_w_nx, float4 *host_w_mvfd)
+ Float4 *host_w_nx,
+ Float4 *host_w_mvfd)
{
FILE *fp;
t@@ -24,110 +34,286 @@ int fwritebin(char *target, Particles *p, float4 *host_x, float4 *host_vel, floa
return 1; // Return unsuccessful exit status
}
- // World dimensions
- fwrite(&grid->nd, sizeof(grid->nd), 1, fp);
+ // If double precision: Values can be written directly
+ if (sizeof(Float) == sizeof(double)) {
- // Number of particles
- fwrite(&p->np, sizeof(p->np), 1, fp);
+ // World dimensions
+ fwrite(&grid->nd, sizeof(grid->nd), 1, fp);
- // Temporal parameters
- fwrite(&time->dt, sizeof(time->dt), 1, fp);
- fwrite(&time->current, sizeof(time->current), 1, fp);
- fwrite(&time->total, sizeof(time->total), 1, fp);
- fwrite(&time->file_dt, sizeof(time->file_dt), 1, fp);
- fwrite(&time->step_count, sizeof(time->step_count), 1, fp);
+ // Number of particles
+ fwrite(&p->np, sizeof(p->np), 1, fp);
- // World coordinate system origo
- for (u=0; u<grid->nd; ++u) {
- fwrite(&grid->origo[u], sizeof(grid->origo[u]), 1, fp);
- }
+ // Temporal parameters
+ fwrite(&time->dt, sizeof(time->dt), 1, fp);
+ fwrite(&time->current, sizeof(time->current), 1, fp);
+ fwrite(&time->total, sizeof(time->total), 1, fp);
+ fwrite(&time->file_dt, sizeof(time->file_dt), 1, fp);
+ fwrite(&time->step_count, sizeof(time->step_count), 1, fp);
- // World dimensions
- for (u=0; u<grid->nd; ++u) {
- fwrite(&grid->L[u], sizeof(grid->L[u]), 1, fp);
- }
+ // World coordinate system origo
+ for (u=0; u<grid->nd; ++u) {
+ fwrite(&grid->origo[u], sizeof(grid->origo[u]), 1, fp);
+ }
- // Grid cells along each dimension
- for (u=0; u<grid->nd; ++u) {
- fwrite(&grid->num[u], sizeof(grid->num[u]), 1, fp);
- }
+ // World dimensions
+ for (u=0; u<grid->nd; ++u) {
+ fwrite(&grid->L[u], sizeof(grid->L[u]), 1, fp);
+ }
- // Particle vectors
- for (j=0; j<p->np; ++j) {
- // x-axis
- fwrite(&host_x[j].x, sizeof(float), 1, fp);
- fwrite(&host_vel[j].x, sizeof(float), 1, fp);
- fwrite(&host_angvel[j].x, sizeof(float), 1, fp);
- fwrite(&host_force[j].x, sizeof(float), 1, fp);
- fwrite(&host_torque[j].x, sizeof(float), 1, fp);
-
- // y-axis
- fwrite(&host_x[j].y, sizeof(float), 1, fp);
- fwrite(&host_vel[j].y, sizeof(float), 1, fp);
- fwrite(&host_angvel[j].y, sizeof(float), 1, fp);
- fwrite(&host_force[j].y, sizeof(float), 1, fp);
- fwrite(&host_torque[j].y, sizeof(float), 1, fp);
-
- // z-axis
- fwrite(&host_x[j].z, sizeof(float), 1, fp);
- fwrite(&host_vel[j].z, sizeof(float), 1, fp);
- fwrite(&host_angvel[j].z, sizeof(float), 1, fp);
- fwrite(&host_force[j].z, sizeof(float), 1, fp);
- fwrite(&host_torque[j].z, sizeof(float), 1, fp);
- }
-
- // Individual particle values
- for (j=0; j<p->np; ++j) {
- fwrite(&host_vel[j].w, sizeof(float), 1, fp);
- fwrite(&host_x[j].w, sizeof(float), 1, fp);
- fwrite(&p->radius[j], sizeof(p->radius[j]), 1, fp);
- fwrite(&p->rho[j], sizeof(p->rho[j]), 1, fp);
- fwrite(&p->k_n[j], sizeof(p->k_n[j]), 1, fp);
- fwrite(&p->k_s[j], sizeof(p->k_s[j]), 1, fp);
- fwrite(&p->k_r[j], sizeof(p->k_r[j]), 1, fp);
- fwrite(&p->gamma_n[j], sizeof(p->gamma_n[j]), 1, fp);
- fwrite(&p->gamma_s[j], sizeof(p->gamma_s[j]), 1, fp);
- fwrite(&p->gamma_r[j], sizeof(p->gamma_r[j]), 1, fp);
- fwrite(&p->mu_s[j], sizeof(p->mu_s[j]), 1, fp);
- fwrite(&p->mu_d[j], sizeof(p->mu_d[j]), 1, fp);
- fwrite(&p->mu_r[j], sizeof(p->mu_r[j]), 1, fp);
- fwrite(&p->es_dot[j], sizeof(p->es_dot[j]), 1, fp);
- fwrite(&p->es[j], sizeof(p->es[j]), 1, fp);
- fwrite(&p->p[j], sizeof(p->p[j]), 1, fp);
- }
+ // Grid cells along each dimension
+ for (u=0; u<grid->nd; ++u) {
+ fwrite(&grid->num[u], sizeof(grid->num[u]), 1, fp);
+ }
- // Singular parameters
- fwrite(¶ms->global, sizeof(params->global), 1, fp);
- for (u=0; u<grid->nd; ++u) {
- fwrite(¶ms->g[u], sizeof(params->g[u]), 1, fp);
- }
- fwrite(¶ms->kappa, sizeof(params->kappa), 1, fp);
- fwrite(¶ms->db, sizeof(params->db), 1, fp);
- fwrite(¶ms->V_b, sizeof(params->V_b), 1, fp);
- fwrite(¶ms->shearmodel, sizeof(params->shearmodel), 1, fp);
-
- // Walls
- fwrite(¶ms->nw, sizeof(params->nw), 1, fp); // No. of walls
- for (j=0; j<params->nw; ++j) {
- // Wall normal
- fwrite(&host_w_nx[j].x, sizeof(float), 1, fp);
- fwrite(&host_w_nx[j].y, sizeof(float), 1, fp);
- fwrite(&host_w_nx[j].z, sizeof(float), 1, fp);
-
- fwrite(&host_w_nx[j].w, sizeof(float), 1, fp); // Wall position
- fwrite(&host_w_mvfd[j].x, sizeof(float), 1, fp); // Wall mass
- fwrite(&host_w_mvfd[j].y, sizeof(float), 1, fp); // Wall velocity
- fwrite(&host_w_mvfd[j].z, sizeof(float), 1, fp); // Wall force
- fwrite(&host_w_mvfd[j].w, sizeof(float), 1, fp); // Wall deviatoric stress
- }
- fwrite(¶ms->periodic, sizeof(params->periodic), 1, fp);
-
- // Write bond pair values
- for (j=0; j<p->np; ++j) {
- fwrite(&host_bonds[j].x, sizeof(unsigned int), 1, fp);
- fwrite(&host_bonds[j].y, sizeof(unsigned int), 1, fp);
- fwrite(&host_bonds[j].z, sizeof(unsigned int), 1, fp);
- fwrite(&host_bonds[j].w, sizeof(unsigned int), 1, fp);
+ // Particle vectors
+ for (j=0; j<p->np; ++j) {
+ // x-axis
+ fwrite(&host_x[j].x, sizeof(Float), 1, fp);
+ fwrite(&host_vel[j].x, sizeof(Float), 1, fp);
+ fwrite(&host_angvel[j].x, sizeof(Float), 1, fp);
+ fwrite(&host_force[j].x, sizeof(Float), 1, fp);
+ fwrite(&host_torque[j].x, sizeof(Float), 1, fp);
+
+ // y-axis
+ fwrite(&host_x[j].y, sizeof(Float), 1, fp);
+ fwrite(&host_vel[j].y, sizeof(Float), 1, fp);
+ fwrite(&host_angvel[j].y, sizeof(Float), 1, fp);
+ fwrite(&host_force[j].y, sizeof(Float), 1, fp);
+ fwrite(&host_torque[j].y, sizeof(Float), 1, fp);
+
+ // z-axis
+ fwrite(&host_x[j].z, sizeof(Float), 1, fp);
+ fwrite(&host_vel[j].z, sizeof(Float), 1, fp);
+ fwrite(&host_angvel[j].z, sizeof(Float), 1, fp);
+ fwrite(&host_force[j].z, sizeof(Float), 1, fp);
+ fwrite(&host_torque[j].z, sizeof(Float), 1, fp);
+ }
+
+ // Individual particle values
+ for (j=0; j<p->np; ++j) {
+ fwrite(&host_vel[j].w, sizeof(Float), 1, fp);
+ fwrite(&host_x[j].w, sizeof(Float), 1, fp);
+ fwrite(&p->radius[j], sizeof(p->radius[j]), 1, fp);
+ fwrite(&p->rho[j], sizeof(p->rho[j]), 1, fp);
+ fwrite(&p->k_n[j], sizeof(p->k_n[j]), 1, fp);
+ fwrite(&p->k_s[j], sizeof(p->k_s[j]), 1, fp);
+ fwrite(&p->k_r[j], sizeof(p->k_r[j]), 1, fp);
+ fwrite(&p->gamma_n[j], sizeof(p->gamma_n[j]), 1, fp);
+ fwrite(&p->gamma_s[j], sizeof(p->gamma_s[j]), 1, fp);
+ fwrite(&p->gamma_r[j], sizeof(p->gamma_r[j]), 1, fp);
+ fwrite(&p->mu_s[j], sizeof(p->mu_s[j]), 1, fp);
+ fwrite(&p->mu_d[j], sizeof(p->mu_d[j]), 1, fp);
+ fwrite(&p->mu_r[j], sizeof(p->mu_r[j]), 1, fp);
+ fwrite(&p->es_dot[j], sizeof(p->es_dot[j]), 1, fp);
+ fwrite(&p->es[j], sizeof(p->es[j]), 1, fp);
+ fwrite(&p->p[j], sizeof(p->p[j]), 1, fp);
+ }
+
+ // Singular parameters
+ fwrite(¶ms->global, sizeof(params->global), 1, fp);
+ for (u=0; u<grid->nd; ++u) {
+ fwrite(¶ms->g[u], sizeof(params->g[u]), 1, fp);
+ }
+ fwrite(¶ms->kappa, sizeof(params->kappa), 1, fp);
+ fwrite(¶ms->db, sizeof(params->db), 1, fp);
+ fwrite(¶ms->V_b, sizeof(params->V_b), 1, fp);
+ fwrite(¶ms->shearmodel, sizeof(params->shearmodel), 1, fp);
+
+ // Walls
+ fwrite(¶ms->nw, sizeof(params->nw), 1, fp); // No. of walls
+ for (j=0; j<params->nw; ++j) {
+ // Wall normal
+ fwrite(&host_w_nx[j].x, sizeof(Float), 1, fp);
+ fwrite(&host_w_nx[j].y, sizeof(Float), 1, fp);
+ fwrite(&host_w_nx[j].z, sizeof(Float), 1, fp);
+
+ fwrite(&host_w_nx[j].w, sizeof(Float), 1, fp); // Wall position
+ fwrite(&host_w_mvfd[j].x, sizeof(Float), 1, fp); // Wall mass
+ fwrite(&host_w_mvfd[j].y, sizeof(Float), 1, fp); // Wall velocity
+ fwrite(&host_w_mvfd[j].z, sizeof(Float), 1, fp); // Wall force
+ fwrite(&host_w_mvfd[j].w, sizeof(Float), 1, fp); // Wall deviatoric stress
+ }
+ fwrite(¶ms->periodic, sizeof(params->periodic), 1, fp);
+ fwrite(¶ms->gamma_wn, sizeof(params->gamma_wn), 1, fp);
+ fwrite(¶ms->gamma_ws, sizeof(params->gamma_ws), 1, fp);
+ fwrite(¶ms->gamma_wr, sizeof(params->gamma_wr), 1, fp);
+
+
+ // Write bond pair values
+ for (j=0; j<p->np; ++j) {
+ fwrite(&host_bonds[j].x, sizeof(unsigned int), 1, fp);
+ fwrite(&host_bonds[j].y, sizeof(unsigned int), 1, fp);
+ fwrite(&host_bonds[j].z, sizeof(unsigned int), 1, fp);
+ fwrite(&host_bonds[j].w, sizeof(unsigned int), 1, fp);
+ }
+
+ } else if (sizeof(Float) == sizeof(float)) {
+ // Single precision: Type conversion required
+
+ double d; // Double precision placeholder
+
+ // World dimensions
+ fwrite(&grid->nd, sizeof(grid->nd), 1, fp);
+
+ // Number of particles
+ fwrite(&p->np, sizeof(p->np), 1, fp);
+
+ // Temporal parameters
+ d = (double)time->dt;
+ fwrite(&d, sizeof(d), 1, fp);
+ fwrite(&time->current, sizeof(time->current), 1, fp);
+ fwrite(&time->total, sizeof(time->total), 1, fp);
+ d = (double)time->file_dt;
+ fwrite(&d, sizeof(d), 1, fp);
+ fwrite(&time->step_count, sizeof(time->step_count), 1, fp);
+
+ // World coordinate system origo
+ for (u=0; u<grid->nd; ++u) {
+ d = (double)grid->origo[u];
+ fwrite(&d, sizeof(d), 1, fp);
+ }
+
+ // World dimensions
+ for (u=0; u<grid->nd; ++u) {
+ d = (double)grid->L[u];
+ fwrite(&d, sizeof(d), 1, fp);
+ }
+
+ // Grid cells along each dimension
+ for (u=0; u<grid->nd; ++u) {
+ fwrite(&grid->num[u], sizeof(grid->num[u]), 1, fp);
+ }
+
+ // Particle vectors
+ for (j=0; j<p->np; ++j) {
+ // x-axis
+ d = (double)host_x[j].x;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_vel[j].x;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_angvel[j].x;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_force[j].x;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_torque[j].x;
+ fwrite(&d, sizeof(d), 1, fp);
+
+ // y-axis
+ d = (double)host_x[j].y;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_vel[j].y;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_angvel[j].y;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_force[j].y;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_torque[j].y;
+ fwrite(&d, sizeof(d), 1, fp);
+
+ // z-axis
+ d = (double)host_x[j].z;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_vel[j].z;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_angvel[j].z;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_force[j].z;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_torque[j].z;
+ fwrite(&d, sizeof(d), 1, fp);
+ }
+
+ // Individual particle values
+ for (j=0; j<p->np; ++j) {
+ d = (double)host_vel[j].w;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_x[j].w;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->radius[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->rho[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->k_n[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->k_s[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->k_r[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->gamma_n[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->gamma_s[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->gamma_r[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->mu_s[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->mu_d[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->mu_r[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->es_dot[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->es[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)p->p[j];
+ fwrite(&d, sizeof(d), 1, fp);
+ }
+
+ // Singular parameters
+ fwrite(¶ms->global, sizeof(params->global), 1, fp);
+ for (u=0; u<grid->nd; ++u) {
+ d = (double)params->g[u];
+ fwrite(&d, sizeof(d), 1, fp);
+ }
+ d = (double)params->kappa;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)params->db;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)params->V_b;
+ fwrite(&d, sizeof(d), 1, fp);
+ fwrite(¶ms->shearmodel, sizeof(params->shearmodel), 1, fp);
+
+ // Walls
+ fwrite(¶ms->nw, sizeof(params->nw), 1, fp); // No. of walls
+ for (j=0; j<params->nw; ++j) {
+ // Wall normal
+ d = (double)host_w_nx[j].x;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_w_nx[j].y;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_w_nx[j].z;
+ fwrite(&d, sizeof(d), 1, fp);
+
+ d = (double)host_w_nx[j].w; // Wall position
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_w_mvfd[j].x; // Wall mass
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_w_mvfd[j].y; // Wall velocity
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_w_mvfd[j].z; // Wall force
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)host_w_mvfd[j].w; // Wall deviatoric stress
+ fwrite(&d, sizeof(d), 1, fp);
+ }
+ fwrite(¶ms->periodic, sizeof(params->periodic), 1, fp);
+ d = (double)params->gamma_wn;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)params->gamma_ws;
+ fwrite(&d, sizeof(d), 1, fp);
+ d = (double)params->gamma_wr;
+ fwrite(&d, sizeof(d), 1, fp);
+
+ // Write bond pair values
+ for (j=0; j<p->np; ++j) {
+ fwrite(&host_bonds[j].x, sizeof(unsigned int), 1, fp);
+ fwrite(&host_bonds[j].y, sizeof(unsigned int), 1, fp);
+ fwrite(&host_bonds[j].z, sizeof(unsigned int), 1, fp);
+ fwrite(&host_bonds[j].w, sizeof(unsigned int), 1, fp);
+ }
+
+ } else {
+ fprintf(stderr, "Error: Chosen floating-point precision is incompatible with the data file format.\n");
+ exit(1);
}
fclose(fp);
(DIR) diff --git a/src/main.cpp b/src/main.cpp
t@@ -2,10 +2,10 @@
/* SPHERE source code by Anders Damsgaard Christensen, 2010-12, */
/* a 3D Discrete Element Method algorithm with CUDA GPU acceleration. */
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
-
+// Licence: Gnu Public License (GPL) v. 3. See license.txt.
// See doc/sphere-doc.pdf for full documentation.
// Compile with GNU make by typing 'make' in the src/ directory.
-// SPHERE is called from the command line with e.g. './sphere_<architecture> projectname'
+// SPHERE is called from the command line with './sphere_<architecture> projectname'
// Including library files
t@@ -15,7 +15,7 @@
#include <unistd.h> // UNIX only: For getcwd
#include <string.h> // For strerror and strcmp
#include <errno.h> // For errno
-#include <math.h> // Basic floating point mathematical operations
+#include <math.h> // Basic Floating point mathematical operations
#include <time.h> // Time and date functions for timer
// Including user files
t@@ -139,41 +139,41 @@ int main(int argc, char *argv[])
<< time.step_count << "\n";
- // For spatial vectors an array of float4 vectors is chosen for best fit with
+ // For spatial vectors an array of Float4 vectors is chosen for best fit with
// GPU memory handling. Vector variable structure: ( x, y, z, <empty>).
// Indexing starts from 0.
// Allocate host arrays
cout << "\n Allocating host memory: ";
- grid.origo = new float[ND]; // Coordinate system origo
- grid.L = new float[ND]; // Model world dimensions
+ grid.origo = new Float[ND]; // Coordinate system origo
+ grid.L = new Float[ND]; // Model world dimensions
grid.num = new unsigned int[ND]; // Number of cells in each dimension
- params.g = new float[ND]; // Gravitational acceleration vector
- p.radius = new float[p.np]; // Particle radii
- p.rho = new float[p.np]; // Particle densities
- p.m = new float[p.np]; // Particle masses
- p.I = new float[p.np]; // Particle moment of inertia
- p.k_n = new float[p.np]; // Particle normal stiffnesses
- p.k_s = new float[p.np]; // Particle shear stiffnesses
- p.k_r = new float[p.np]; // Particle rolling stiffnesses
- p.gamma_n = new float[p.np]; // Particle normal viscosity
- p.gamma_s = new float[p.np]; // Particle shear viscosity
- p.gamma_r = new float[p.np]; // Particle rolling viscosity
- p.mu_s = new float[p.np]; // Inter-particle static shear contact friction coefficients
- p.mu_d = new float[p.np]; // Inter-particle dynamic shear contact friction coefficients
- p.mu_r = new float[p.np]; // Inter-particle rolling contact friction coefficients
- p.es_dot = new float[p.np]; // Rate of shear energy dissipation
- p.es = new float[p.np]; // Total shear energy dissipation
- p.p = new float[p.np]; // Pressure excerted onto particle
-
- // Allocate float4 host arrays
- float4 *host_x = new float4[p.np]; // Center coordinates for each particle (x)
- float4 *host_vel = new float4[p.np]; // Particle velocities (dotx = v)
- float4 *host_acc = new float4[p.np]; // Particle accellerations (dotdotx = a)
- float4 *host_angvel = new float4[p.np]; // Particle angular velocity vector (omega)
- float4 *host_angacc = new float4[p.np]; // Particle angular acceleration vector (dotomega)
- float4 *host_force = new float4[p.np]; // Particle summed force
- float4 *host_torque = new float4[p.np]; // Particle summed torque
+ params.g = new Float[ND]; // Gravitational acceleration vector
+ p.radius = new Float[p.np]; // Particle radii
+ p.rho = new Float[p.np]; // Particle densities
+ p.m = new Float[p.np]; // Particle masses
+ p.I = new Float[p.np]; // Particle moment of inertia
+ p.k_n = new Float[p.np]; // Particle normal stiffnesses
+ p.k_s = new Float[p.np]; // Particle shear stiffnesses
+ p.k_r = new Float[p.np]; // Particle rolling stiffnesses
+ p.gamma_n = new Float[p.np]; // Particle normal viscosity
+ p.gamma_s = new Float[p.np]; // Particle shear viscosity
+ p.gamma_r = new Float[p.np]; // Particle rolling viscosity
+ p.mu_s = new Float[p.np]; // Inter-particle static shear contact friction coefficients
+ p.mu_d = new Float[p.np]; // Inter-particle dynamic shear contact friction coefficients
+ p.mu_r = new Float[p.np]; // Inter-particle rolling contact friction coefficients
+ p.es_dot = new Float[p.np]; // Rate of shear energy dissipation
+ p.es = new Float[p.np]; // Total shear energy dissipation
+ p.p = new Float[p.np]; // Pressure excerted onto particle
+
+ // Allocate Float4 host arrays
+ Float4 *host_x = new Float4[p.np]; // Center coordinates for each particle (x)
+ Float4 *host_vel = new Float4[p.np]; // Particle velocities (dotx = v)
+ Float4 *host_acc = new Float4[p.np]; // Particle accellerations (dotdotx = a)
+ Float4 *host_angvel = new Float4[p.np]; // Particle angular velocity vector (omega)
+ Float4 *host_angacc = new Float4[p.np]; // Particle angular acceleration vector (dotomega)
+ Float4 *host_force = new Float4[p.np]; // Particle summed force
+ Float4 *host_torque = new Float4[p.np]; // Particle summed torque
uint4 *host_bonds = new uint4[p.np]; // Bonds from particle [i] to two particles
cout << "Done\n";
t@@ -196,44 +196,44 @@ int main(int argc, char *argv[])
}
for (j=0; j<p.np; ++j) {
- if (fread(&host_x[j].x, sizeof(float), 1, fp) != 1)
+ if (fread(&host_x[j].x, sizeof(Float), 1, fp) != 1)
return 1; // Return unsuccessful exit status
- if (fread(&host_vel[j].x, sizeof(float), 1, fp) != 1)
+ if (fread(&host_vel[j].x, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_angvel[j].x, sizeof(float), 1, fp) != 1)
+ if (fread(&host_angvel[j].x, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_force[j].x, sizeof(float), 1, fp) != 1)
+ if (fread(&host_force[j].x, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_torque[j].x, sizeof(float), 1, fp) != 1)
+ if (fread(&host_torque[j].x, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_x[j].y, sizeof(float), 1, fp) != 1)
+ if (fread(&host_x[j].y, sizeof(Float), 1, fp) != 1)
return 1; // Return unsuccessful exit status
- if (fread(&host_vel[j].y, sizeof(float), 1, fp) != 1)
+ if (fread(&host_vel[j].y, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_angvel[j].y, sizeof(float), 1, fp) != 1)
+ if (fread(&host_angvel[j].y, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_force[j].y, sizeof(float), 1, fp) != 1)
+ if (fread(&host_force[j].y, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_torque[j].y, sizeof(float), 1, fp) != 1)
+ if (fread(&host_torque[j].y, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_x[j].z, sizeof(float), 1, fp) != 1)
+ if (fread(&host_x[j].z, sizeof(Float), 1, fp) != 1)
return 1; // Return unsuccessful exit status
- if (fread(&host_vel[j].z, sizeof(float), 1, fp) != 1)
+ if (fread(&host_vel[j].z, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_angvel[j].z, sizeof(float), 1, fp) != 1)
+ if (fread(&host_angvel[j].z, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_force[j].z, sizeof(float), 1, fp) != 1)
+ if (fread(&host_force[j].z, sizeof(Float), 1, fp) != 1)
return 1;
- if (fread(&host_torque[j].z, sizeof(float), 1, fp) != 1)
+ if (fread(&host_torque[j].z, sizeof(Float), 1, fp) != 1)
return 1;
}
for (j=0; j<p.np; ++j) {
- if (fread(&host_vel[j].w, sizeof(float), 1, fp) != 1) // Fixvel
+ if (fread(&host_vel[j].w, sizeof(Float), 1, fp) != 1) // Fixvel
return 1; // Return unsuccessful exit status
- if (fread(&host_x[j].w, sizeof(float), 1, fp) != 1) // xsum
+ if (fread(&host_x[j].w, sizeof(Float), 1, fp) != 1) // xsum
return 1;
if (fread(&p.radius[j], sizeof(p.radius[j]), 1, fp) != 1)
return 1;
t@@ -288,39 +288,39 @@ int main(int argc, char *argv[])
// Allocate host memory for walls
// Wall normal (x,y,z), w: wall position on axis parallel to wall normal
// Wall mass (x), velocity (y), force (z), and deviatoric stress (w)
- float4 *host_w_nx = new float4[params.nw];
- float4 *host_w_mvfd = new float4[params.nw];
+ Float4 *host_w_nx = new Float4[params.nw];
+ Float4 *host_w_mvfd = new Float4[params.nw];
// Read wall data
for (j=0; j<params.nw; ++j) {
// Wall normal, x-dimension
- if (fread(&host_w_nx[j].x, sizeof(float), 1, fp) != 1) // Read in single precision
+ if (fread(&host_w_nx[j].x, sizeof(Float), 1, fp) != 1) // Read in single precision
return 1;
// Wall normal, y-dimension
- if (fread(&host_w_nx[j].y, sizeof(float), 1, fp) != 1)
+ if (fread(&host_w_nx[j].y, sizeof(Float), 1, fp) != 1)
return 1;
// Wall normal, z-dimension
- if (fread(&host_w_nx[j].z, sizeof(float), 1, fp) != 1)
+ if (fread(&host_w_nx[j].z, sizeof(Float), 1, fp) != 1)
return 1;
// Wall position along axis parallel to wall normal
- if (fread(&host_w_nx[j].w, sizeof(float), 1, fp) != 1)
+ if (fread(&host_w_nx[j].w, sizeof(Float), 1, fp) != 1)
return 1;
// Wall mass
- if (fread(&host_w_mvfd[j].x, sizeof(float), 1, fp) != 1)
+ if (fread(&host_w_mvfd[j].x, sizeof(Float), 1, fp) != 1)
return 1;
// Wall velocity along axis parallel to wall normal
- if (fread(&host_w_mvfd[j].y, sizeof(float), 1, fp) != 1)
+ if (fread(&host_w_mvfd[j].y, sizeof(Float), 1, fp) != 1)
return 1;
// Wall force along axis parallel to wall normal
- if (fread(&host_w_mvfd[j].z, sizeof(float), 1, fp) != 1)
+ if (fread(&host_w_mvfd[j].z, sizeof(Float), 1, fp) != 1)
return 1;
// Wall deviatoric stress
- if (fread(&host_w_mvfd[j].w, sizeof(float), 1, fp) != 1)
+ if (fread(&host_w_mvfd[j].w, sizeof(Float), 1, fp) != 1)
return 1;
}
// x- and y boundary behavior.
t@@ -330,6 +330,15 @@ int main(int argc, char *argv[])
params.periodic = 0;
}
+ // Wall viscosities
+ if (fread(¶ms.gamma_wn, sizeof(params.gamma_wn), 1, fp) != 1)
+ return 1;
+ if (fread(¶ms.gamma_ws, sizeof(params.gamma_ws), 1, fp) != 1)
+ return 1;
+ if (fread(¶ms.gamma_wr, sizeof(params.gamma_wr), 1, fp) != 1)
+ return 1;
+
+
for (i=0; i<p.np; ++i) {
if (fread(&host_bonds[i].x, sizeof(unsigned int), 1, fp) != 1)
return 1;