tUpdated sources - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit d9c25987416182f283d170f16d97a4afa5685f41
 (DIR) parent 6ff7614d0fd5bbd0306f7ed63a1ea1bfd0a9b3a0
 (HTM) Author: Anders Damsgaard <adc@geo.au.dk>
       Date:   Wed, 22 Aug 2012 13:19:49 +0200
       
       Updated sources
       
       Diffstat:
         M raytracer/header.h                  |      11 ++---------
         M raytracer/main.cpp                  |     158 ++++++++++++++++---------------
         A raytracer/o-ppm.h                   |       7 +++++++
         M raytracer/rt-kernel-cpu.cpp         |     106 ++++++++++++++++++-------------
         M raytracer/rt-kernel.h               |      16 +++++++++++-----
         M src/Makefile                        |      26 ++++++++++++++++++++------
         M src/constants.cuh                   |      37 +++++++++++++++++--------------
         M src/datatypes.h                     |     150 +++++++++++++++++++------------
         M src/device.cu                       |    1783 ++++---------------------------
         M src/file_io.cpp                     |     388 +++++++++++++++++++++++--------
         M src/main.cpp                        |     127 +++++++++++++++++--------------
       
       11 files changed, 849 insertions(+), 1960 deletions(-)
       ---
 (DIR) diff --git a/raytracer/header.h b/raytracer/header.h
       t@@ -4,6 +4,8 @@
        #ifndef HEADER_H_
        #define HEADER_H_
        
       +// Type declaration
       +typedef unsigned int Inttype;
        
        //// Structure declarations ////
        
       t@@ -21,13 +23,4 @@ struct rgb {
          unsigned char alpha;
        };
        
       -
       -
       -//// Prototype functions ////
       -
       -// io.cpp
       -int image_to_ppm(rgb* rgb_img, const char* filename,
       -                     const unsigned int width, const unsigned int height);
       -
       -
        #endif
 (DIR) diff --git a/raytracer/main.cpp b/raytracer/main.cpp
       t@@ -5,18 +5,19 @@
        #include <cmath>
        #include <vector_functions.h>
        #include "header.h"
       -#include "rt_kernel.h"
       -#include "rt_kernel_cpu.h"
       +#include "rt-kernel.h"
       +#include "rt-kernel-cpu.h"
       +#include "colorbar.h"
       +#include "o-ppm.h"
        
        int main(const int argc, const char* argv[])
        {
          using std::cout;
        
          if (argc < 6 || argc > 8 ){
       -    cout << "Usage: " << argv[0] << " <GPU or CPU> <sphere-binary.bin> <width> <height> <output-image.ppm>\n"
       +    cout << "Usage: " << argv[0] << " <GPU or CPU> <particle-data.txt> <width> <height> <output-image.ppm>\n"
                 << "or\n"
       -         << "Usage: " << argv[0] << " GPU <sphere-binary.bin | colorbar> <width> <height> <output-image.ppm>"
       -         << " <pressure | pressure50 | es | es_dot> <max value in color range>\n";
       +         << "Usage: " << argv[0] << " GPU <particle-data.txt | colorbar> <width> <height> <output-image.ppm> [pressure,es_dot,es,vel max_val]";
            return 1;
          }
        
       t@@ -37,22 +38,16 @@ int main(const int argc, const char* argv[])
          // Render colorbar image
          if (strcmp(argv[2],"colorbar") == 0) {
        
       -    for (unsigned int y=0; y<height; y++) {
       -      for (unsigned int x=0; x<width; x++) {
       +    for (unsigned int y=0; y<height; ++y) {
       +      for (unsigned int x=0; x<width; ++x) {
        
                // Colormap value is relative to width position
                float ratio = (float) (x+1)/width;
        
       -        // Determine Blue-White-Red color components
       -        float red   = fmin(1.0f, 0.209f*ratio*ratio*ratio - 2.49f*ratio*ratio + 3.0f*ratio + 0.0109f);
       -        float green = fmin(1.0f, -2.44f*ratio*ratio + 2.15f*ratio + 0.369f);
       -        float blue  = fmin(1.0f, -2.21f*ratio*ratio + 1.61f*ratio + 0.573f);
       -
                // Write pixel value to image array
       -        //img[x + y*width].r = red * 250.f;
       -        img[x + y*width].r = red * 250.f;
       -        img[x + y*width].g = green * 250.f;
       -        img[x + y*width].b = blue * 250.f;
       +        img[x + y*width].r = red(ratio) * 250.f;
       +        img[x + y*width].g = green(ratio) * 250.f;
       +        img[x + y*width].b = blue(ratio) * 250.f;
        
              }
            }
       t@@ -81,21 +76,17 @@ int main(const int argc, const char* argv[])
        
            // Allocate particle structure array
            cout << "  Allocating host memory\n";
       +
       +    // Single precision arrays used for computations
            float4* p      = new float4[np];
            float*  fixvel = new float[np];
            float*  es_dot = new float[np];
            float*  es     = new float[np];
            float*  pres   = new float[np];
       -    for (unsigned int i=0; i<np; i++) {
       -      fixvel[i] = 0.0f;
       -      pres[i]   = 0.0f; // Initialize values to zero
       -      es_dot[i] = 0.0f;
       -      es[i]     = 0.0f;
       -    }
       +    float*  vel           = new float[np];
        
            // Read temporal information
       -    float dt, file_dt;
       -    double current, total;
       +    double dt, file_dt, current, total;
            unsigned int step_count;
            (void)fread(&dt, sizeof(dt), 1, fin);
            (void)fread(&current, sizeof(current), 1, fin);
       t@@ -103,46 +94,77 @@ int main(const int argc, const char* argv[])
            (void)fread(&file_dt, sizeof(file_dt), 1, fin);
            (void)fread(&step_count, sizeof(step_count), 1, fin);
        
       +    double d; // Double precision temporary value holder
       +
            // Canonical coordinate system origo
            f3 origo;
       -    (void)fread(&origo, sizeof(float)*3, 1, fin);
       +    (void)fread(&d, sizeof(d), 1, fin);
       +    origo.x = (float)d;
       +    (void)fread(&d, sizeof(d), 1, fin);
       +    origo.y = (float)d;
       +    (void)fread(&d, sizeof(d), 1, fin);
       +    origo.z = (float)d;
        
            // Canonical coordinate system dimensions
            f3 L;
       -    (void)fread(&L, sizeof(float)*3, 1, fin);
       +    (void)fread(&d, sizeof(d), 1, fin);
       +    L.x = (float)d;
       +    (void)fread(&d, sizeof(d), 1, fin);
       +    L.y = (float)d;
       +    (void)fread(&d, sizeof(d), 1, fin);
       +    L.z = (float)d;
       +
        
            // Skip over irrelevant data
       -    float blankf;
       -    //f3 blankf3;
       +    double blankd;
       +    //f3 blankd3;
            unsigned int blankui;
            for (int j=0; j<3; j++)         // Skip over grid.num data
              (void)fread(&blankui, sizeof(blankui), 1, fin);
        
       +    // Velocity vector, later converted to lenght
       +    float3 v;
        
            // Load data into particle array
            for (unsigned int i=0; i<np; i++) {
       -      (void)fread(&p[i].x, sizeof(float), 1, fin);
       -      for (int j=0; j<4; j++)
       -        (void)fread(&blankf, sizeof(blankf), 1, fin);
       -
       -      (void)fread(&p[i].y, sizeof(float), 1, fin);
       -      for (int j=0; j<4; j++)
       -        (void)fread(&blankf, sizeof(blankf), 1, fin);
       -
       -      (void)fread(&p[i].z, sizeof(float), 1, fin);
       -      for (int j=0; j<4; j++)
       -        (void)fread(&blankf, sizeof(blankf), 1, fin);
       +      (void)fread(&d, sizeof(d), 1, fin);
       +      p[i].x = (float)d;        // Typecast to single precision
       +      (void)fread(&d, sizeof(d), 1, fin);
       +      v.x  = (float)d;
       +      for (int j=0; j<3; j++)
       +        (void)fread(&blankd, sizeof(blankd), 1, fin);
       +
       +      (void)fread(&d, sizeof(d), 1, fin);
       +      p[i].y = (float)d;
       +      (void)fread(&d, sizeof(d), 1, fin);
       +      v.y  = (float)d;
       +      for (int j=0; j<3; j++)
       +        (void)fread(&blankd, sizeof(blankd), 1, fin);
       +
       +      (void)fread(&d, sizeof(d), 1, fin);
       +      p[i].z = (float)d;
       +      (void)fread(&d, sizeof(d), 1, fin);
       +      v.z  = (float)d;
       +      for (int j=0; j<3; j++)
       +        (void)fread(&blankd, sizeof(blankd), 1, fin);
       +
       +      // Save velocity vector length
       +      vel[i] = sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
            }
            for (unsigned int i=0; i<np; i++) {
       -      //(void)fread(&blankf, sizeof(blankf), 1, fin); // fixvel
       -      (void)fread(&fixvel[i], sizeof(float), 1, fin); // fixvel
       -      (void)fread(&blankf, sizeof(blankf), 1, fin); // xsum
       -      (void)fread(&p[i].w, sizeof(float), 1, fin);  // radius
       +      (void)fread(&d, sizeof(d), 1, fin); // fixvel
       +      fixvel[i] = (float)d;
       +      (void)fread(&blankd, sizeof(blankd), 1, fin); // xsum
       +      (void)fread(&d, sizeof(d), 1, fin);  // radius
       +      p[i].w = (float)d;
              for (int j=0; j<10; j++)
       -        (void)fread(&blankf, sizeof(blankf), 1, fin);
       -      (void)fread(&es_dot[i], sizeof(float), 1, fin);
       -      (void)fread(&es[i], sizeof(float), 1, fin);
       -      (void)fread(&pres[i], sizeof(float), 1, fin);
       +        (void)fread(&blankd, sizeof(blankd), 1, fin);
       +      (void)fread(&d, sizeof(d), 1, fin);
       +      es_dot[i] = (float)d;
       +      (void)fread(&d, sizeof(d), 1, fin);
       +      es[i] = (float)d;
       +      (void)fread(&d, sizeof(d), 1, fin);
       +      pres[i] = (float)d;
            }
        
            fclose(fin);        // Binary read complete
       t@@ -151,12 +173,14 @@ int main(const int argc, const char* argv[])
              << L.x << "*" << L.y << "*" << L.z << " m\n";
        
            // Eye position and focus point
       -    //f3 eye    = {0.5f*L.x, -4.2*L.y, 0.5f*L.z};
       -    f3 eye    = {2.5f*L.x, -4.2*L.y, 2.0f*L.z};
       -    f3 lookat = {0.45f*L.x, 0.5f*L.y, 0.45f*L.z};
       +    //f3 eye    = {0.5f*L.x, -4.2f*L.y, 0.5f*L.z};
       +    //f3 eye    = {2.5f*L.x, -4.2f*L.y, 2.0f*L.z};
       +    //f3 eye    = {0.5f*L.x, -5.0f*L.y, 0.5f*L.z};
       +    f3 eye    = {2.5f*L.x, -5.0f*L.y, 1.5f*L.z};
       +    f3 lookat = {0.5f*L.x, 0.5f*L.y, 0.5f*L.z};
            if (L.z > (L.x + L.y)/1.5f) { // Render only bottom of world (for initsetup's)
       -      eye.x = 1.1f*L.x;
       -      eye.y = 15.1*L.y;
       +      eye.x = 4.1f*L.x;
       +      eye.y = -15.1*L.y;
              eye.z = 1.1*L.z;
              /*lookat.x = 0.45f*L.x;
                lookat.y = 0.5f*L.y;
       t@@ -179,40 +203,19 @@ int main(const int argc, const char* argv[])
                visualize = 2; // 2: es_dot view
              if(strcmp(argv[6],"es") == 0)
                visualize = 3; // 3: es view
       -      if(strcmp(argv[6],"pressure50") == 0)
       -        visualize = 4; // 4: pressure50 view
       +      if(strcmp(argv[6],"vel") == 0)
       +        visualize = 4; // 4: velocity view
        
              // Read max. value specified in command args.
              max_val = atof(argv[7]);
       -    }
       -
       -    // Render colorbar image
       -    if (strcmp(argv[2],"colorbar") == 0) {
       -
       -      for (unsigned int x=0; x<width; x++) {
       -        for (unsigned int y=0; y<height; y++) {
       -
       -          // Colormap value is relative to width position
       -          float ratio = (float)x/width;
        
       -          // Determine Blue-White-Red color components
       -          float red   = fmin(1.0f, 0.209f*ratio*ratio*ratio - 2.49f*ratio*ratio + 3.0f*ratio + 0.0109f);
       -          float green = fmin(1.0f, -2.44f*ratio*ratio + 2.15f*ratio + 0.369f);
       -          float blue  = fmin(1.0f, -2.21f*ratio*ratio + 1.61f*ratio + 0.573f);
       -
       -          // Write pixel value to image array
       -          img[y*height + x].r = (unsigned char) red * 255;
       -          img[y*height + x].g = (unsigned char) green * 255;
       -          img[y*height + x].b = (unsigned char) blue * 255;
       -
       -        }
       -      }
            }
        
       +
            if (strcmp(argv[1],"GPU") == 0) {
        
              // Call cuda wrapper
       -      if (rt(p, np, img, width, height, origo, L, eye, lookat, imgw, visualize, max_val, fixvel, pres, es_dot, es) != 0) {
       +      if (rt(p, np, img, width, height, origo, L, eye, lookat, imgw, visualize, max_val, fixvel, pres, es_dot, es, vel) != 0) {
                cout << "\nError in rt()\n";
                return 1;
              }
       t@@ -234,6 +237,7 @@ int main(const int argc, const char* argv[])
            delete [] pres;
            delete [] es;
            delete [] es_dot;
       +    delete [] vel;
        
          }
        
 (DIR) diff --git a/raytracer/o-ppm.h b/raytracer/o-ppm.h
       t@@ -0,0 +1,7 @@
       +#ifndef O_PPM_H_
       +#define O_PPM_H_
       +
       +int image_to_ppm(rgb* rgb_img, const char* filename,
       +                     const unsigned int width, const unsigned int height);
       +
       +#endif
 (DIR) diff --git a/raytracer/rt-kernel-cpu.cpp b/raytracer/rt-kernel-cpu.cpp
       t@@ -5,8 +5,9 @@
        #include <cuda.h>
        #include <cutil_math.h>
        #include <string.h>
       +#include <stdlib.h>
        #include "header.h"
       -#include "rt_kernel_cpu.h"
       +#include "rt-kernel-cpu.h"
        
        // Constants
        float3 constc_u;
       t@@ -35,10 +36,10 @@ __inline__ float lengthf3(float3 in)
        // Kernel for initializing image data
        void imageInit_cpu(unsigned char* _img, unsigned int pixels)
        {
       -  for (unsigned int mempos=0; mempos<pixels; mempos++) {
       -    _img[mempos*4]     = 0;        // Red channel
       -    _img[mempos*4 + 1] = 0;        // Green channel
       -    _img[mempos*4 + 2] = 0;        // Blue channel
       +  for (unsigned int mempos=0; mempos<pixels; ++mempos) {
       +    _img[mempos*4]     = 255;        // Red channel
       +    _img[mempos*4 + 1] = 255;        // Green channel
       +    _img[mempos*4 + 2] = 255;        // Blue channel
          }
        }
        
       t@@ -49,17 +50,19 @@ void rayInitPerspective_cpu(float3* _ray_origo,
                                unsigned int width,
                                unsigned int height)
        {
       -  int i;
       -  #pragma omp parallel for
       -  for (i=0; i<width; i++) {
       -    for (unsigned int j=0; j<height; j++) {
       +  int i,j;
       +  unsigned int mempos;
       +  float p_u, p_v;
       +  #pragma omp parallel for private(mempos,j,p_u,p_v)
       +  for (i=0; i<(int)width; ++i) {
       +    for (j=0; j<(int)height; ++j) {
        
       -      unsigned int mempos = i + j*height;
       +      mempos = i + j*width;
        
              // Calculate pixel coordinates in image plane
       -      float p_u = constc_imgplane.x + (constc_imgplane.y - constc_imgplane.x)
       +      p_u = constc_imgplane.x + (constc_imgplane.y - constc_imgplane.x)
                * (i + 0.5f) / width;
       -      float p_v = constc_imgplane.z + (constc_imgplane.w - constc_imgplane.z)
       +      p_v = constc_imgplane.z + (constc_imgplane.w - constc_imgplane.z)
                * (j + 0.5f) / height;
        
              // Write ray origo and direction to global memory
       t@@ -74,37 +77,34 @@ void rayInitPerspective_cpu(float3* _ray_origo,
        void rayIntersectSpheres_cpu(float3* _ray_origo, 
                                 float3* _ray_direction,
                                 float4* _p, 
       +                         float* _nuance,
                                 unsigned char* _img, 
                                 unsigned int pixels,
                                 unsigned int np)
        {
       -  int mempos;
       -  #pragma omp parallel for
       -  for (mempos=0; mempos<pixels; mempos++) {
       +  long int mempos;
       +  float3 e, d, n, p, c;
       +  float tdist, R, Delta, t_minus, dotprod, I_d, k_d, k_a, I_a, nuance;
       +  Inttype i, ifinal;
       +  #pragma omp parallel for private(e,d,n,p,c,tdist,R,Delta,t_minus,dotprod,I_d,k_d,k_a,I_a,nuance,i,ifinal)
       +  for (mempos=0; mempos<pixels; ++mempos) {
            
            // Read ray data from global memory
       -    float3 e = _ray_origo[mempos];
       -    float3 d = _ray_direction[mempos];
       -    //float  step = lengthf3(d);
       +    e = _ray_origo[mempos];
       +    d = _ray_direction[mempos];
        
            // Distance, in ray steps, between object and eye initialized with a large value
       -    float tdist = 1e10f;
       -
       -    // Surface normal at closest sphere intersection
       -    float3 n;
       -
       -    // Intersection point coordinates
       -    float3 p;
       +    tdist = 1e10f;
        
            // Iterate through all particles
       -    for (unsigned int i=0; i<np; i++) {
       +    for (i=0; i<np; ++i) {
        
              // Read sphere coordinate and radius
       -      float3 c = f4_to_f3(_p[i]);
       -      float  R = _p[i].w;
       +      c = f4_to_f3(_p[i]);
       +      R = _p[i].w;
        
              // Calculate the discriminant: d = B^2 - 4AC
       -      float Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c)))  // B^2
       +      Delta = (2.0f*dot(d,(e-c)))*(2.0f*dot(d,(e-c)))  // B^2
                - 4.0f*dot(d,d)        // -4*A
                * (dot((e-c),(e-c)) - R*R);  // C
        
       t@@ -113,7 +113,7 @@ void rayIntersectSpheres_cpu(float3* _ray_origo,
              if (Delta > 0.0f) { 
        
                // Calculate roots, Shirley 2009 p. 77
       -        float t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
       +        t_minus = ((dot(-d,(e-c)) - sqrt( dot(d,(e-c))*dot(d,(e-c)) - dot(d,d)
                        * (dot((e-c),(e-c)) - R*R) ) ) / dot(d,d));
        
                // Check wether intersection is closer than previous values
       t@@ -121,6 +121,7 @@ void rayIntersectSpheres_cpu(float3* _ray_origo,
                  p = e + t_minus*d;
                  tdist = fabs(t_minus);
                  n = normalize(2.0f * (p - c));   // Surface normal
       +          ifinal = i;
                }
        
              } // End of solution branch
       t@@ -131,21 +132,25 @@ void rayIntersectSpheres_cpu(float3* _ray_origo,
            if (tdist < 1e10) {
        
              // Lambertian shading parameters
       -      float dotprod = fabs(dot(n, constc_light));
       -      float I_d = 40.0f;  // Light intensity
       -      float k_d = 5.0f;  // Diffuse coefficient
       +      //float dotprod = fabs(dot(n, constc_light));
       +      dotprod = fmax(0.0f,dot(n, constc_light));
       +      I_d = 70.0f;  // Light intensity
       +      k_d = 5.0f;  // Diffuse coefficient
        
              // Ambient shading
       -      float k_a = 10.0f;
       -      float I_a = 5.0f;
       +      k_a = 30.0f;
       +      I_a = 5.0f;
       +
       +      // Read color nuance of grain
       +      nuance = _nuance[ifinal];
        
              // Write shading model values to pixel color channels
              _img[mempos*4]     = (unsigned char) ((k_d * I_d * dotprod 
       -            + k_a * I_a)*0.48f);
       +            + k_a * I_a)*0.48f*nuance);
              _img[mempos*4 + 1] = (unsigned char) ((k_d * I_d * dotprod
       -            + k_a * I_a)*0.41f);
       +            + k_a * I_a)*0.41f*nuance);
              _img[mempos*4 + 2] = (unsigned char) ((k_d * I_d * dotprod
       -            + k_a * I_a)*0.27f);
       +            + k_a * I_a)*0.27f*nuance);
            }
          }
        }
       t@@ -160,9 +165,11 @@ void cameraInit_cpu(float3 eye, float3 lookat, float imgw, float hw_ratio)
          float3 view = eye - lookat;
        
          // Construct the camera view orthonormal base
       -  float3 v = make_float3(0.0f, 1.0f, 0.0f);  // v: Pointing upward
       -  float3 w = -view/lengthf3(view);                   // w: Pointing backwards
       -  float3 u = cross(make_float3(v.x, v.y, v.z), make_float3(w.x, w.y, w.z)); // u: Pointing right
       +  //float3 up = make_float3(0.0f, 1.0f, 0.0f);  // Pointing upward along +y
       +  float3 up = make_float3(0.0f, 0.0f, 1.0f);  // Pointing upward along +z
       +  float3 w = -view/length(view);                   // w: Pointing backwards
       +  float3 u = cross(up, w) / length(cross(up, w));
       +  float3 v = cross(w, u);
        
          // Focal length 20% of eye vector length
          float d = lengthf3(view)*0.8f;
       t@@ -179,6 +186,8 @@ void cameraInit_cpu(float3 eye, float3 lookat, float imgw, float hw_ratio)
          constc_imgplane = imgplane;
          constc_d = d;
          constc_light = light;
       +
       +  std::cout << "Rendering image...";
        }
        
        
       t@@ -197,8 +206,15 @@ int rt_cpu(float4* p, unsigned int np,
          // Start timer 1
          t1_go = clock();
        
       +  // Generate random nuance values for all grains
       +  static float* _nuance; // Values between 0.5 and 1.0
       +  _nuance = new float[np];
       +  srand(0); // Initialize seed at same value at every run
       +  for (unsigned int i=0; i<np; ++i)
       +    _nuance[i] = (float)rand()/RAND_MAX * 0.5f + 0.5f;
       +
          // Allocate memory
       -  cout << "  Allocating device memory\n";
       +  cout << "  Allocating memory\n";
          static unsigned char *_img;                 // RGBw values in image
          static float3* _ray_origo;                // Ray origo (x,y,z)
          static float3* _ray_direction;        // Ray direction (x,y,z)
       t@@ -230,7 +246,7 @@ int rt_cpu(float4* p, unsigned int np,
          // Find closest intersection between rays and spheres
          rayIntersectSpheres_cpu(
              _ray_origo, _ray_direction,
       -      p, _img, pixels, np);
       +      p, _nuance, _img, pixels, np);
        
          // Stop timer 2
          t2_stop = clock();
       t@@ -238,6 +254,7 @@ int rt_cpu(float4* p, unsigned int np,
          memcpy(img, _img, sizeof(unsigned char)*pixels*4);
        
          // Free dynamically allocated device memory
       +  delete [] _nuance;
          delete [] _img;
          delete [] _ray_origo;
          delete [] _ray_direction;
       t@@ -246,7 +263,8 @@ int rt_cpu(float4* p, unsigned int np,
          t1_stop = clock();
          
          // Report time spent 
       -  cout << "  Time spent on entire CPU raytracing routine: "
       +  cout << " done.\n"
       +       << "  Time spent on entire CPU raytracing routine: "
               << (t1_stop-t1_go)/CLOCKS_PER_SEC*1000.0 << " ms\n";
          cout << "  - Functions: " << (t2_stop-t2_go)/CLOCKS_PER_SEC*1000.0 << " ms\n";
        
 (DIR) diff --git a/raytracer/rt-kernel.h b/raytracer/rt-kernel.h
       t@@ -2,6 +2,8 @@
        #define RT_KERNEL_H_
        
        #include <vector_functions.h>
       +#include "../src/datatypes.h"
       +#include "header.h"
        
        // Constants
        __constant__ float4 const_u;
       t@@ -11,19 +13,23 @@ __constant__ float4 const_eye;
        __constant__ float4 const_imgplane;
        __constant__ float  const_d;
        __constant__ float4 const_light;
       +__constant__ unsigned int const_pixels;
       +__constant__ Inttype const_np;
        
        
        // Host prototype functions
        
        extern "C"
       -void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio);
       +void cameraInit(float4 eye, float4 lookat, float imgw, float hw_ratio,
       +                    unsigned int pixels, Inttype np);
        
        extern "C"
        void checkForCudaErrors(const char* checkpoint_description);
        
        extern "C"
       -int rt(float4* p, const unsigned int np, 
       -       rgb* img, const unsigned int width, const unsigned int height,
       -       f3 origo, f3 L, f3 eye, f3 lookat, float imgw);
       -
       +int rt(float4* p, Inttype np,
       +       rgb* img, unsigned int width, unsigned int height,
       +       f3 origo, f3 L, f3 eye, f3 lookat, float imgw,
       +       int visualize, float max_val,
       +       float* fixvel, float* pres, float* es_dot, float* es, float* vel);
        #endif
 (DIR) diff --git a/src/Makefile b/src/Makefile
       t@@ -1,6 +1,11 @@
       +# Cuda paths
        CUDA_INSTALL_PATH=/usr/local/cuda
        CUDA_BIN=$(CUDA_INSTALL_PATH)/bin
        
       +# Define the editor and optional arguments
       +EDITOR=vim -p
       +
       +# Define compilers and linker
        CC=g++
        NVCC=$(CUDA_BIN)/nvcc
        LINKER=$(CC)
       t@@ -26,7 +31,7 @@ CCFILES=main.cpp file_io.cpp
        CUFILES=device.cu utility.cu
        CCOBJECTS=$(CCFILES:.cpp=.o)
        CUOBJECTS=$(CUFILES:.cu=.o)
       -        OBJECTS=$(CCOBJECTS) $(CUOBJECTS)
       +OBJECTS=$(CCOBJECTS) $(CUOBJECTS)
        
        # Detect OS
        OSUPPER=$(shell uname -s 2>/dev/null | tr [:lower:] [:upper:])
       t@@ -53,7 +58,10 @@ LDFLAGS+=-L$(CUDA_INSTALL_PATH)/lib
        LDFLAGS+=-L$(SDKPATH)/../../shared/lib -L$(SDKPATH)/../lib 
        LDFLAGS+=-lcutil_x86_64 -lcuda -lcudart
        
       -all: $(CCFILES) $(CUFILES) $(EXECUTABLE)
       +all: $(CCFILES) $(CUFILES) $(EXECUTABLE) raytracer
       +
       +raytracer: 
       +        $(MAKE) -C ../raytracer/
        
        $(EXECUTABLE): $(OBJECTS)
                $(LINKER) $(OBJECTS) $(LDFLAGS) -o $@
       t@@ -61,20 +69,20 @@ $(EXECUTABLE): $(OBJECTS)
        utility.o: utility.cu
                $(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@
        
       -file_io.o: file_io.cpp
       +file_io.o: file_io.cpp datatypes.h
                $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
        
       -device.o: device.cu
       +device.o: device.cu datatypes.h *.cuh
                $(NVCC) $(NVCCFLAGS) $(INCLUDES) -c $< -o $@
        
       -main.o: main.cpp
       +main.o: main.cpp datatypes.h
                $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@
        
        ../sphere_status: sphere_status.cpp
                $(CC) $(CCFLAGS) sphere_status.cpp -o ../sphere_status
        
        edit:
       -        vim -p Makefile $(CCFILES) $(CUFILES) *.h *.cuh 
       +        $(EDITOR) Makefile $(CCFILES) $(CUFILES) *.h *.cuh 
        
        backup:
                tar cvfz ../../sphere_backup/$(BACKUPNAME) ../../sphere
       t@@ -83,4 +91,10 @@ backup:
        clean:
                $(RM) $(OBJECTS)
                $(RM) ../sphere_*
       +        $(MAKE) -C ../raytracer clean
       +
       +clear:
       +        # Remove all output data and images
       +        $(RM) -rf ../img_out/*.{ppm,png}
       +        $(RM) -rf ../output/*.{dat,bin}
        
 (DIR) diff --git a/src/constants.cuh b/src/constants.cuh
       t@@ -2,28 +2,31 @@
        #define CONSTANTS_CUH_
        
        // Constant memory size: 64 kb
       -__constant__ float        devC_origo[ND];  // World coordinate system origo
       -__constant__ float        devC_L[ND];           // World length in dimension ND
       +__constant__ Float        devC_origo[ND];  // World coordinate system origo
       +__constant__ Float        devC_L[ND];           // World length in dimension ND
        __constant__ unsigned int devC_num[ND];           // Number of cells in dimension ND
       -__constant__ float        devC_dt;           // Time step length
       +__constant__ Float        devC_dt;           // Time step length
        __constant__ int          devC_global;           // Parameter properties, 1: global, 0: individual
       -__constant__ float        devC_g[ND];           // Gravitational acceleration vector
       +__constant__ Float        devC_g[ND];           // Gravitational acceleration vector
        __constant__ unsigned int devC_np;           // Number of particles
        __constant__ char          devC_nc;           // Max. number of contacts a particle can have
        __constant__ unsigned int devC_shearmodel; // Shear force model: 1: viscous, frictional, 2: elastic, frictional
       -__constant__ float        devC_k_n;           // Material normal stiffness
       -__constant__ float        devC_k_s;           // Material shear stiffness
       -__constant__ float        devC_k_r;           // Material rolling stiffness
       -__constant__ float        devC_gamma_n;           // Material normal viscosity
       -__constant__ float        devC_gamma_s;           // Material shear viscosity
       -__constant__ float          devC_gamma_r;           // Material rolling viscosity
       -__constant__ float        devC_mu_s;           // Material static shear friction coefficient
       -__constant__ float        devC_mu_d;           // Material dynamic shear friction coefficient
       -__constant__ float          devC_mu_r;           // Material rolling friction coefficient
       -__constant__ float        devC_rho;           // Material density
       -__constant__ float          devC_kappa;           // Capillary bond prefactor
       -__constant__ float          devC_db;           // Debonding distance
       -__constant__ float          devC_V_b;           // Liquid volume of capillary bond
       +__constant__ Float        devC_k_n;           // Material normal stiffness
       +__constant__ Float        devC_k_s;           // Material shear stiffness
       +__constant__ Float        devC_k_r;           // Material rolling stiffness
       +__constant__ Float        devC_gamma_n;           // Material normal viscosity
       +__constant__ Float        devC_gamma_s;           // Material shear viscosity
       +__constant__ Float          devC_gamma_r;           // Material rolling viscosity
       +__constant__ Float        devC_gamma_wn;   // Wall normal viscosity
       +__constant__ Float        devC_gamma_ws;   // Wall shear viscosity
       +__constant__ Float          devC_gamma_wr;   // Wall rolling viscosity
       +__constant__ Float        devC_mu_s;           // Material static shear friction coefficient
       +__constant__ Float        devC_mu_d;           // Material dynamic shear friction coefficient
       +__constant__ Float          devC_mu_r;           // Material rolling friction coefficient
       +__constant__ Float        devC_rho;           // Material density
       +__constant__ Float          devC_kappa;           // Capillary bond prefactor
       +__constant__ Float          devC_db;           // Debonding distance
       +__constant__ Float          devC_V_b;           // Liquid volume of capillary bond
        __constant__ unsigned int devC_nw;           // Number of walls
        __constant__ unsigned int devC_w_n;           // Dimension of orthogonal wall surface normal
        __constant__ int          devC_periodic;   // Behavior of x- and y boundaries: 0: walls, 1: periodic
 (DIR) diff --git a/src/datatypes.h b/src/datatypes.h
       t@@ -4,25 +4,60 @@
        #ifndef DATATYPES_H_
        #define DATATYPES_H_
        
       -/////////////////////////////////
       -// STATIC VARIABLE DECLARATION //
       -/////////////////////////////////
       +#include <math.h>
       +#include "vector_functions.h"
       +//#include "vector_arithmetic.h"
        
        
       -#include "vector_functions.h"
       +// Enable profiling of kernel runtimes?
       +// 0: No
       +// 1: Yes
       +#define PROFILING 1
       +
       +
       +//////////////////////
       +// TYPE DEFINITIONS //
       +//////////////////////
       +
       +// REMEMBER: When changing the precision below,
       +// change values in typedefs.h accordingly.
       +
       +// Uncomment all five lines below for single precision
       +/*
       +typedef Float Float;
       +typedef Float3 Float3;
       +typedef Float4 Float4;
       +#define MAKE_FLOAT3(x, y, z) make_Float3(x, y, z)
       +#define MAKE_FLOAT4(x, y, z, w) make_Float4(x, y, z, w)
       +*/
       +
       +
       +// Uncomment all five lines below for double precision
       +///*
       +typedef double Float;
       +typedef double2 Float2;
       +typedef double3 Float3;
       +typedef double4 Float4;
       +#define MAKE_FLOAT3(x, y, z) make_double3(x, y, z)
       +#define MAKE_FLOAT4(x, y, z, w) make_double4(x, y, z, w)
       +//*/
       +
        
       -//// Symbolic constants
       +////////////////////////
       +// SYMBOLIC CONSTANTS //
       +////////////////////////
        
       -const float PI = 3.14159265358979f;
       +const Float PI = 3.14159265358979;
        
        // Number of dimensions (1 and 2 NOT functional)
        const unsigned int ND = 3;
        
        // Define source code version
       -const float VERS = 0.25;
       +const Float VERS = 0.25;
        
        // Max. number of contacts per particle
       -const char NC = 16;
       +//const char NC = 16;
       +const char NC = 32;
        
        
        ///////////////////////////
       t@@ -31,39 +66,39 @@ const char NC = 16;
        
        // Structure containing variable particle parameters
        struct Particles {
       -  float *radius;
       -  float *k_n;
       -  float *k_s;
       -  float *k_r;
       -  float *gamma_n;
       -  float *gamma_s;
       -  float *gamma_r;
       -  float *mu_s;
       -  float *mu_d;
       -  float *mu_r;
       -  float *rho;
       -  float *es_dot;
       -  float *es;
       -  float *p;
       -  float *m;
       -  float *I;
       +  Float *radius;
       +  Float *k_n;
       +  Float *k_s;
       +  Float *k_r;
       +  Float *gamma_n;
       +  Float *gamma_s;
       +  Float *gamma_r;
       +  Float *mu_s;
       +  Float *mu_d;
       +  Float *mu_r;
       +  Float *rho;
       +  Float *es_dot;
       +  Float *es;
       +  Float *p;
       +  Float *m;
       +  Float *I;
          unsigned int np;
        };
        
        // Structure containing grid parameters
        struct Grid {
          unsigned int nd;
       -  float *origo;
       -  float *L;
       +  Float *origo;
       +  Float *L;
          unsigned int *num;
        };
        
        // Structure containing time parameters
        struct Time {
       -  float dt;
       +  Float dt;
          double current;
          double total;
       -  float file_dt;
       +  Float file_dt;
          unsigned int step_count;
        };
        
       t@@ -71,23 +106,26 @@ struct Time {
        struct Params {
          //bool global;
          int global;
       -  float *g;
       +  Float *g;
          unsigned int np;
          unsigned int nw;
       -  float dt; 
       -  float k_n;
       -  float k_s;
       -  float k_r;
       -  float gamma_n;
       -  float gamma_s;
       -  float gamma_r;
       -  float mu_s; 
       -  float mu_d;
       -  float mu_r;
       -  float rho;
       -  float kappa;
       -  float db;
       -  float V_b;
       +  Float dt; 
       +  Float k_n;
       +  Float k_s;
       +  Float k_r;
       +  Float gamma_n;
       +  Float gamma_s;
       +  Float gamma_r;
       +  Float gamma_wn;
       +  Float gamma_ws;
       +  Float gamma_wr;
       +  Float mu_s; 
       +  Float mu_d;
       +  Float mu_r;
       +  Float rho;
       +  Float kappa;
       +  Float db;
       +  Float V_b;
          int periodic;
          unsigned int shearmodel;
        };
       t@@ -97,29 +135,29 @@ struct Params {
        // PROTOTYPE FUNCTIONS //
        /////////////////////////
        int fwritebin(char *target, Particles *p, 
       -                  float4 *host_x, float4 *host_vel, 
       -              float4 *host_angvel, float4 *host_force, 
       -              float4 *host_torque, uint4 *host_bonds,
       +                  Float4 *host_x, Float4 *host_vel, 
       +              Float4 *host_angvel, Float4 *host_force, 
       +              Float4 *host_torque, uint4 *host_bonds,
                      Grid *grid, Time *time, Params *params,
       -              float4 *host_w_nx, float4 *host_w_mvfd);
       +              Float4 *host_w_nx, Float4 *host_w_mvfd);
        
        // device.cu
        //extern "C"
        void initializeGPU(void);
        
        //extern "C"
       -void gpuMain(float4* host_x,
       -                 float4* host_vel,
       -             float4* host_acc,
       -             float4* host_angvel,
       -             float4* host_angacc,
       -             float4* host_force,
       -             float4* host_torque,
       +void gpuMain(Float4* host_x,
       +                 Float4* host_vel,
       +             Float4* host_acc,
       +             Float4* host_angvel,
       +             Float4* host_angacc,
       +             Float4* host_force,
       +             Float4* host_torque,
                     uint4*  host_bonds,
                     Particles* p, Grid* grid,
                     Time* time, Params* params,
       -             float4* host_w_nx,
       -             float4* host_w_mvfd,
       +             Float4* host_w_nx,
       +             Float4* host_w_mvfd,
                     const char* cwd,
                     const char* inputbin);
        
 (DIR) diff --git a/src/device.cu b/src/device.cu
       t@@ -2,8 +2,8 @@
        #include <iostream>
        #include <cstdio>
        #include <cuda.h>
       -#include <cutil_math.h>
        
       +#include "vector_arithmetic.h"
        #include "thrust/device_ptr.h"
        #include "thrust/sort.h"
        
       t@@ -11,1488 +11,11 @@
        #include "datatypes.cuh"
        #include "constants.cuh"
        
       -//#include "cuPrintf.cu"
       -
       -
       -// Returns the cellID containing the particle, based cubic grid
       -// See Bayraktar et al. 2009
       -// Kernel is executed on the device, and is callable from the device only
       -__device__ int calcCellID(float3 x) 
       -{ 
       -  unsigned int i_x, i_y, i_z;
       -
       -  // Calculate integral coordinates:
       -  i_x = floor((x.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
       -  i_y = floor((x.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
       -  i_z = floor((x.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
       -
       -  // Integral coordinates are converted to 1D coordinate:
       -  return __umul24(__umul24(i_z, devC_num[1]),
       -                        devC_num[0]) 
       -             + __umul24(i_y, devC_num[0]) + i_x;
       -
       -} // End of calcCellID(...)
       -
       -
       -// Calculate hash value for each particle, based on position in grid.
       -// Kernel executed on device, and callable from host only.
       -__global__ void calcParticleCellID(unsigned int* dev_gridParticleCellID, 
       -                                       unsigned int* dev_gridParticleIndex, 
       -                                   float4* dev_x) 
       -{
       -  //unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
       -  unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
       -
       -  if (idx < devC_np) { // Condition prevents block size error
       -
       -    //volatile float4 x = dev_x[idx]; // Ensure coalesced read
       -    float4 x = dev_x[idx]; // Ensure coalesced read
       -
       -    unsigned int cellID = calcCellID(make_float3(x.x, x.y, x.z));
       -
       -    // Store values    
       -    dev_gridParticleCellID[idx] = cellID;
       -    dev_gridParticleIndex[idx]  = idx;
       -
       -  }
       -} // End of calcParticleCellID(...)
       -
       -
       -// Reorder particle data into sorted order, and find the start and end particle indexes
       -// of each cell in the sorted hash array.
       -// Kernel executed on device, and callable from host only.
       -__global__ void reorderArrays(unsigned int* dev_cellStart, unsigned int* dev_cellEnd,
       -                              unsigned int* dev_gridParticleCellID, 
       -                              unsigned int* dev_gridParticleIndex,
       -                              float4* dev_x, float4* dev_vel, 
       -                              float4* dev_angvel, float* dev_radius,
       -                              //uint4* dev_bonds,
       -                              float4* dev_x_sorted, float4* dev_vel_sorted,
       -                              float4* dev_angvel_sorted, float* dev_radius_sorted)
       -                              //uint4* dev_bonds_sorted)
       -{ 
       -
       -  // Create hash array in shared on-chip memory. The size of the array 
       -  // (threadsPerBlock + 1) is determined at launch time (extern notation).
       -  extern __shared__ unsigned int shared_data[]; 
       -
       -  // Thread index in block
       -  unsigned int tidx = threadIdx.x;
       -
       -  // Thread index in grid
       -  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; 
       -  //unsigned int idx = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
       -
       -  // CellID hash value of particle idx
       -  unsigned int cellID;
       -
       -  // Read cellID data and store it in shared memory (shared_data)
       -  if (idx < devC_np) { // Condition prevents block size error
       -    cellID = dev_gridParticleCellID[idx];
       -
       -    // Load hash data into shared memory, allowing access to neighbor particle cellID values
       -    shared_data[tidx+1] = cellID; 
       -
       -    if (idx > 0 && tidx == 0) {
       -      // First thread in block must load neighbor particle hash
       -      shared_data[0] = dev_gridParticleCellID[idx-1];
       -    }
       -  }
       -
       -  // Pause completed threads in this block, until all 
       -  // threads are done loading data into shared memory
       -  __syncthreads();
       -
       -  // Find lowest and highest particle index in each cell
       -  if (idx < devC_np) { // Condition prevents block size error
       -    // If this particle has a different cell index to the previous particle, it's the first
       -    // particle in the cell -> Store the index of this particle in the cell.
       -    // The previous particle must be the last particle in the previous cell.
       -    if (idx == 0 || cellID != shared_data[tidx]) {
       -      dev_cellStart[cellID] = idx;
       -      if (idx > 0)
       -        dev_cellEnd[shared_data[tidx]] = idx;
       -    }
       -
       -    // Check wether the thread is the last one
       -    if (idx == (devC_np - 1)) 
       -      dev_cellEnd[cellID] = idx + 1;
       -
       -
       -    // Use the sorted index to reorder the position and velocity data
       -    unsigned int sortedIndex = dev_gridParticleIndex[idx];
       -
       -    // Fetch from global read
       -    float4 x      = dev_x[sortedIndex];
       -    float4 vel    = dev_vel[sortedIndex];
       -    float4 angvel = dev_angvel[sortedIndex];
       -    float  radius = dev_radius[sortedIndex];
       -    //uint4  bonds  = dev_bonds[sortedIndex];
       -
       -    __syncthreads();
       -    // Write sorted data to global memory
       -    dev_x_sorted[idx]      = x;
       -    dev_vel_sorted[idx]    = vel;
       -    dev_angvel_sorted[idx] = angvel;
       -    dev_radius_sorted[idx] = radius;
       -    //dev_bonds_sorted[idx]  = bonds;
       -  }
       -} // End of reorderArrays(...)
       -
       -
       -
       -
       -// COLLISION LOCATION AND PROCESSING FUNCTIONS
       -
       -
       -// Linear viscoelastic contact model for particle-wall interactions
       -// with tangential friction and rolling resistance
       -__device__ float contactLinear_wall(float3* N, float3* T, float* es_dot, float* p,
       -                                    unsigned int idx_a, float radius_a,
       -                                    float4* dev_vel_sorted, float4* dev_angvel_sorted,
       -                                    float3 n, float delta, float wvel)
       -{
       -  // Fetch particle velocities from global memory
       -  const float4 linvel_tmp = dev_vel_sorted[idx_a];
       -  const float4 angvel_tmp = dev_angvel_sorted[idx_a];
       -
       -  // Convert velocities to three-component vectors
       -  const float3 linvel = make_float3(linvel_tmp.x,
       -                                          linvel_tmp.y,
       -                                    linvel_tmp.z);
       -  const float3 angvel = make_float3(angvel_tmp.x,
       -                                          angvel_tmp.y,
       -                                    angvel_tmp.z);
       -
       -  // Store the length of the angular velocity for later use
       -  const float angvel_length = length(angvel);
       -
       -  // Contact velocity is the sum of the linear and
       -  // rotational components
       -  const float3 vel = linvel + radius_a * cross(n, angvel) + wvel;
       -
       -  // Normal component of the contact velocity
       -  const float vel_n = dot(vel, n);
       -
       -  // The tangential velocity is the contact velocity
       -  // with the normal component subtracted
       -  const float3 vel_t = vel - n * (dot(vel, n));
       -  const float  vel_t_length = length(vel_t);
       -
       -  // Calculate elastic normal component
       -  //float3 f_n = -devC_k_n * delta * n;
       -
       -  // Normal force component: Elastic - viscous damping
       -  float3 f_n = (-devC_k_n * delta - devC_gamma_n * vel_n) * n;
       -
       -  // Make sure the viscous damping doesn't exceed the elastic component,
       -  // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
       -  if (dot(f_n, n) < 0.0f)
       -    f_n = make_float3(0.0f, 0.0f, 0.0f);
       -
       -  const float  f_n_length = length(f_n); // Save length for later use
       -
       -  // Initialize vectors
       -  float3 f_t   = make_float3(0.0f, 0.0f, 0.0f);
       -  float3 T_res = make_float3(0.0f, 0.0f, 0.0f);
       -
       -  // Check that the tangential velocity is high enough to avoid
       -  // divide by zero (producing a NaN)
       -  if (vel_t_length > 0.f) {
       -
       -    const float f_t_visc  = devC_gamma_s * vel_t_length; // Tangential force by viscous model
       -    const float f_t_limit = devC_mu_s * f_n_length;      // Max. friction
       -
       -    // If the shear force component exceeds the friction,
       -    // the particle slips and energy is dissipated
       -    if (f_t_visc < f_t_limit) {
       -      f_t = -1.0f * f_t_visc * vel_t/vel_t_length;
       -
       -    } else { // Dynamic friction, friction failure
       -      f_t = -1.0f * f_t_limit * vel_t/vel_t_length;
       -      
       -      // Shear energy production rate [W]
       -      //*es_dot += -dot(vel_t, f_t);
       -    }
       -  }
       -
       -/*  if (angvel_length > 0.f) {
       -    // Apply rolling resistance (Zhou et al. 1999)
       -    //T_res = -angvel_a/angvel_length * devC_mu_r * radius_a * f_n_length;
       -
       -    // New rolling resistance model
       -    T_res = -1.0f * fmin(devC_gamma_r * radius_a * angvel_length,
       -                         devC_mu_r * radius_a * f_n_length)
       -            * angvel_a/angvel_length;
       -  }*/
       -
       -  // Total force from wall
       -  *N += f_n + f_t;
       -//  *N += f_n;
       -
       -  // Total torque from wall
       -  *T += -radius_a * cross(n, f_t) + T_res;
       -
       -  // Pressure excerted onto particle from this contact
       -  *p += f_n_length / (4.0f * PI * radius_a*radius_a);
       -
       -  // Return force excerted onto the wall
       -  //return -dot(*N, n);
       -  return dot(f_n, n);
       -}
       -
       -
       -// Linear vicoelastic contact model for particle-particle interactions
       -// with tangential friction and rolling resistance
       -__device__ void contactLinearViscous(float3* N, float3* T, float* es_dot, float* p,
       -                                               unsigned int idx_a, unsigned int idx_b, 
       -                                     float4* dev_vel_sorted, 
       -                                     float4* dev_angvel_sorted,
       -                                     float radius_a, float radius_b, 
       -                                     float3 x_ab, float x_ab_length, 
       -                                     float delta_ab, float kappa) 
       -{
       -
       -  // Allocate variables and fetch missing time=t values for particle A and B
       -  const float4 vel_a     = dev_vel_sorted[idx_a];
       -  const float4 vel_b     = dev_vel_sorted[idx_b];
       -  const float4 angvel4_a = dev_angvel_sorted[idx_a];
       -  const float4 angvel4_b = dev_angvel_sorted[idx_b];
       -
       -  // Convert to float3's
       -  const float3 angvel_a = make_float3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
       -  const float3 angvel_b = make_float3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
       -
       -  // Force between grain pair decomposed into normal- and tangential part
       -  float3 f_n, f_t, f_c, T_res;
       -
       -  // Normal vector of contact
       -  const float3 n_ab = x_ab/x_ab_length;
       -
       -  // Relative contact interface velocity, w/o rolling
       -  const float3 vel_ab_linear = make_float3(vel_a.x - vel_b.x, 
       -                                                 vel_a.y - vel_b.y, 
       -                                           vel_a.z - vel_b.z);
       -
       -  // Relative contact interface velocity of particle surfaces at
       -  // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
       -  const float3 vel_ab = vel_ab_linear
       -                        + radius_a * cross(n_ab, angvel_a)
       -                        + radius_b * cross(n_ab, angvel_b);
       -
       -  // Relative contact interface rolling velocity
       -  const float3 angvel_ab = angvel_a - angvel_b;
       -  const float  angvel_ab_length = length(angvel_ab);
       -
       -  // Normal component of the relative contact interface velocity
       -  const float vel_n_ab = dot(vel_ab_linear, n_ab);
       -
       -  // Tangential component of the relative contact interface velocity
       -  // Hinrichsen and Wolf 2004, eq. 13.9
       -  const float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
       -  const float  vel_t_ab_length = length(vel_t_ab);
       -
       -  // Compute the normal stiffness of the contact
       -  //float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
       -
       -  // Calculate rolling radius
       -  const float R_bar = (radius_a + radius_b) / 2.0f;
       -
       -  // Normal force component: linear-elastic approximation (Augier 2009, eq. 3)
       -  // with velocity dependant damping
       -  //   Damping coefficient: alpha = 0.8
       -  //f_n = (-k_n_ab * delta_ab + 2.0f * 0.8f * sqrtf(m_eff*k_n_ab) * vel_ab) * n_ab;
       -
       -  // Linear spring for normal component (Renzo 2004, eq. 35)
       -  // Dissipation due to  plastic deformation is modelled by using a different
       -  // unloading spring constant (Walton and Braun 1986)
       -  // Here the factor in the second term determines the relative strength of the
       -  // unloading spring relative to the loading spring.
       -  /*  if (vel_n_ab > 0.0f) {        // Loading
       -      f_n = (-k_n_ab * delta_ab) * n_ab;
       -      } else {                        // Unloading
       -      f_n = (-k_n_ab * 0.90f * delta_ab) * n_ab;
       -      } // f_n is OK! */
       -
       -  // Normal force component: Elastic
       -  //f_n = -k_n_ab * delta_ab * n_ab;
       -
       -  // Normal force component: Elastic - viscous damping
       -  f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
       -
       -  // Make sure the viscous damping doesn't exceed the elastic component,
       -  // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
       -  if (dot(f_n, n_ab) < 0.0f)
       -    f_n = make_float3(0.0f, 0.0f, 0.0f);
       -
       -  const float f_n_length = length(f_n);
       -
       -  // Add max. capillary force
       -  f_c = -kappa * sqrtf(radius_a * radius_b) * n_ab;
       -
       -  // Initialize force vectors to zero
       -  f_t   = make_float3(0.0f, 0.0f, 0.0f);
       -  T_res = make_float3(0.0f, 0.0f, 0.0f);
       -
       -  // Shear force component: Nonlinear relation
       -  // Coulomb's law of friction limits the tangential force to less or equal
       -  // to the normal force
       -  if (vel_t_ab_length > 0.f) {
       -
       -    // Tangential force by viscous model
       -    const float f_t_visc  = devC_gamma_s * vel_t_ab_length;
       -
       -    // Determine max. friction
       -    float f_t_limit;
       -    if (vel_t_ab_length > 0.001f) { // Dynamic
       -      f_t_limit = devC_mu_d * length(f_n-f_c);
       -    } else { // Static
       -      f_t_limit = devC_mu_s * length(f_n-f_c);
       -    }
       -
       -    // If the shear force component exceeds the friction,
       -    // the particle slips and energy is dissipated
       -    if (f_t_visc < f_t_limit) { // Static
       -      f_t = -1.0f * f_t_visc * vel_t_ab/vel_t_ab_length;
       -
       -    } else { // Dynamic, friction failure
       -      f_t = -1.0f * f_t_limit * vel_t_ab/vel_t_ab_length;
       -
       -      // Shear friction production rate [W]
       -      //*es_dot += -dot(vel_t_ab, f_t);
       -    }
       -  }
       -
       -/*  if (angvel_ab_length > 0.f) {
       -    // Apply rolling resistance (Zhou et al. 1999)
       -    //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
       -
       -    // New rolling resistance model
       -    T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
       -                         devC_mu_r * R_bar * f_n_length)
       -            * angvel_ab/angvel_ab_length;
       -  }
       -*/
       -
       -  // Add force components from this collision to total force for particle
       -  *N += f_n + f_t + f_c; 
       -  *T += -R_bar * cross(n_ab, f_t) + T_res;
       -
       -  // Pressure excerted onto the particle from this contact
       -  *p += f_n_length / (4.0f * PI * radius_a*radius_a);
       -
       -} // End of contactLinearViscous()
       -
       -
       -// Linear elastic contact model for particle-particle interactions
       -__device__ void contactLinear(float3* N, float3* T, 
       -                                  float* es_dot, float* p,
       -                              unsigned int idx_a_orig,
       -                              unsigned int idx_b_orig, 
       -                              float4  vel_a, 
       -                              float4* dev_vel,
       -                              float3  angvel_a,
       -                              float4* dev_angvel,
       -                              float radius_a, float radius_b, 
       -                              float3 x_ab, float x_ab_length, 
       -                              float delta_ab, float4* dev_delta_t,
       -                              unsigned int mempos) 
       -{
       -
       -  // Allocate variables and fetch missing time=t values for particle A and B
       -  const float4 vel_b     = dev_vel[idx_b_orig];
       -  const float4 angvel4_b = dev_angvel[idx_b_orig];
       -
       -  // Fetch previous sum of shear displacement for the contact pair
       -  const float4 delta_t0_4 = dev_delta_t[mempos];
       -
       -  const float3 delta_t0_uncor = make_float3(delta_t0_4.x,
       -                                                  delta_t0_4.y,
       -                                            delta_t0_4.z);
       -
       -  // Convert to float3
       -  const float3 angvel_b = make_float3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
       -
       -  // Force between grain pair decomposed into normal- and tangential part
       -  float3 f_n, f_t, f_c, T_res;
       -
       -  // Normal vector of contact
       -  const float3 n_ab = x_ab/x_ab_length;
       -
       -  // Relative contact interface velocity, w/o rolling
       -  const float3 vel_ab_linear = make_float3(vel_a.x - vel_b.x, 
       -                                                 vel_a.y - vel_b.y, 
       -                                           vel_a.z - vel_b.z);
       -
       -  // Relative contact interface velocity of particle surfaces at
       -  // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
       -  const float3 vel_ab = vel_ab_linear
       -                        + radius_a * cross(n_ab, angvel_a)
       -                        + radius_b * cross(n_ab, angvel_b);
       -
       -  // Relative contact interface rolling velocity
       -  const float3 angvel_ab = angvel_a - angvel_b;
       -  const float  angvel_ab_length = length(angvel_ab);
       -
       -  // Normal component of the relative contact interface velocity
       -  const float vel_n_ab = dot(vel_ab_linear, n_ab);
       -
       -  // Tangential component of the relative contact interface velocity
       -  // Hinrichsen and Wolf 2004, eq. 13.9
       -  const float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
       -  const float  vel_t_ab_length = length(vel_t_ab);
       -
       -  // Correct tangential displacement vector, which is
       -  // necessary if the tangential plane rotated
       -  const float3 delta_t0 = delta_t0_uncor - (n_ab * dot(delta_t0_uncor, n_ab));
       -  const float  delta_t0_length = length(delta_t0);
       -
       -  // New tangential displacement vector
       -  float3 delta_t;
       -
       -  // Compute the normal stiffness of the contact
       -  //float k_n_ab = k_n_a * k_n_b / (k_n_a + k_n_b);
       -
       -  // Calculate rolling radius
       -  const float R_bar = (radius_a + radius_b) / 2.0f;
       -
       -  // Normal force component: Elastic
       -  //f_n = -devC_k_n * delta_ab * n_ab;
       -
       -  // Normal force component: Elastic - viscous damping
       -  f_n = (-devC_k_n * delta_ab - devC_gamma_n * vel_n_ab) * n_ab;
       -
       -  // Make sure the viscous damping doesn't exceed the elastic component,
       -  // i.e. the damping factor doesn't exceed the critical damping, 2*sqrt(m*k_n)
       -  if (dot(f_n, n_ab) < 0.0f)
       -    f_n = make_float3(0.0f, 0.0f, 0.0f);
       -
       -  const float f_n_length = length(f_n);
       -
       -  // Add max. capillary force
       -  f_c = -devC_kappa * sqrtf(radius_a * radius_b) * n_ab;
       -
       -  // Initialize force vectors to zero
       -  f_t   = make_float3(0.0f, 0.0f, 0.0f);
       -  T_res = make_float3(0.0f, 0.0f, 0.0f);
       -
       -  // Apply a tangential force if the previous tangential displacement
       -  // is non-zero, or the current sliding velocity is non-zero.
       -  if (delta_t0_length > 0.f || vel_t_ab_length > 0.f) {
       -
       -    // Shear force: Visco-Elastic, limited by Coulomb friction
       -    float3 f_t_elast = -1.0f * devC_k_s * delta_t0;
       -    float3 f_t_visc  = -1.0f * devC_gamma_s * vel_t_ab;
       -
       -    float f_t_limit;
       -    
       -    if (vel_t_ab_length > 0.001f) { // Dynamic friciton
       -      f_t_limit = devC_mu_d * length(f_n-f_c);
       -    } else { // Static friction
       -      f_t_limit = devC_mu_s * length(f_n-f_c);
       -    }
       -
       -    // Tangential force before friction limit correction
       -    f_t = f_t_elast + f_t_visc;
       -    float f_t_length = length(f_t);
       -
       -    // If failure criterion is not met, contact is viscous-linear elastic.
       -    // If failure criterion is met, contact force is limited, 
       -    // resulting in a slip and energy dissipation
       -    if (f_t_length > f_t_limit) { // Dynamic case
       -      
       -      // Frictional force is reduced to equal the limit
       -      f_t *= f_t_limit/f_t_length;
       -
       -      // A slip event zeros the displacement vector
       -      //delta_t = make_float3(0.0f, 0.0f, 0.0f);
       -
       -      // In a slip event, the tangential spring is adjusted to a 
       -      // length which is consistent with Coulomb's equation
       -      // (Hinrichsen and Wolf, 2004)
       -      delta_t = -1.0f/devC_k_s * f_t + devC_gamma_s * vel_t_ab;
       -
       -      // Shear friction heat production rate: 
       -      // The energy lost from the tangential spring is dissipated as heat
       -      //*es_dot += -dot(vel_t_ab, f_t);
       -      *es_dot += length(delta_t0 - delta_t) * devC_k_s / devC_dt; // Seen in EsyS-Particle
       -
       -    } else { // Static case
       -
       -      // No correction of f_t is required
       -
       -      // Add tangential displacement to total tangential displacement
       -      delta_t = delta_t0 + vel_t_ab * devC_dt;
       -    }
       -  }
       -
       -  if (angvel_ab_length > 0.f) {
       -    // Apply rolling resistance (Zhou et al. 1999)
       -    //T_res = -angvel_ab/angvel_ab_length * devC_mu_r * R_bar * length(f_n);
       -
       -    // New rolling resistance model
       -    T_res = -1.0f * fmin(devC_gamma_r * R_bar * angvel_ab_length,
       -                         devC_mu_r * R_bar * f_n_length)
       -            * angvel_ab/angvel_ab_length;
       -  }
       -
       -  // Add force components from this collision to total force for particle
       -  *N += f_n + f_t + f_c; 
       -  *T += -R_bar * cross(n_ab, f_t) + T_res;
       -
       -  // Pressure excerted onto the particle from this contact
       -  *p += f_n_length / (4.0f * PI * radius_a*radius_a);
       -
       -  // Store sum of tangential displacements
       -  dev_delta_t[mempos] = make_float4(delta_t, 0.0f);
       -
       -} // End of contactLinear()
       -
       -
       -// Linear-elastic bond: Attractive force with normal- and shear components
       -// acting upon particle A in a bonded particle pair
       -__device__ void bondLinear(float3* N, float3* T, float* es_dot, float* p,
       -                           unsigned int idx_a, unsigned int idx_b, 
       -                           float4* dev_x_sorted, float4* dev_vel_sorted, 
       -                           float4* dev_angvel_sorted,
       -                           float radius_a, float radius_b, 
       -                           float3 x_ab, float x_ab_length, 
       -                           float delta_ab) 
       -{
       -
       -  // If particles are not overlapping, apply bond force
       -  if (delta_ab > 0.0f) {
       -
       -    // Allocate variables and fetch missing time=t values for particle A and B
       -    float4 vel_a     = dev_vel_sorted[idx_a];
       -    float4 vel_b     = dev_vel_sorted[idx_b];
       -    float4 angvel4_a = dev_angvel_sorted[idx_a];
       -    float4 angvel4_b = dev_angvel_sorted[idx_b];
       -
       -    // Convert to float3's
       -    float3 angvel_a = make_float3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
       -    float3 angvel_b = make_float3(angvel4_b.x, angvel4_b.y, angvel4_b.z);
       -
       -    // Normal vector of contact
       -    float3 n_ab = x_ab/x_ab_length;
       -
       -    // Relative contact interface velocity, w/o rolling
       -    float3 vel_ab_linear = make_float3(vel_a.x - vel_b.x, 
       -                                       vel_a.y - vel_b.y, 
       -                                       vel_a.z - vel_b.z);
       -
       -    // Relative contact interface velocity of particle surfaces at
       -    // the contact, with rolling (Hinrichsen and Wolf 2004, eq. 13.10)
       -    float3 vel_ab = vel_ab_linear
       -                    + radius_a * cross(n_ab, angvel_a)
       -                    + radius_b * cross(n_ab, angvel_b);
       -
       -    // Relative contact interface rolling velocity
       -    //float3 angvel_ab = angvel_a - angvel_b;
       -    //float  angvel_ab_length = length(angvel_ab);
       -
       -    // Normal component of the relative contact interface velocity
       -    //float vel_n_ab = dot(vel_ab_linear, n_ab);
       -
       -    // Tangential component of the relative contact interface velocity
       -    // Hinrichsen and Wolf 2004, eq. 13.9
       -    float3 vel_t_ab = vel_ab - (n_ab * dot(vel_ab, n_ab));
       -    //float  vel_t_ab_length = length(vel_t_ab);
       -
       -    float3 f_n = make_float3(0.0f, 0.0f, 0.0f);
       -    float3 f_t = make_float3(0.0f, 0.0f, 0.0f);
       -
       -    // Mean radius
       -    float R_bar = (radius_a + radius_b)/2.0f;
       -
       -    // Normal force component: Elastic
       -    f_n = devC_k_n * delta_ab * n_ab;
       -
       -    if (length(vel_t_ab) > 0.f) {
       -      // Shear force component: Viscous
       -      f_t = -1.0f * devC_gamma_s * vel_t_ab;
       -
       -      // Shear friction production rate [W]
       -      //*es_dot += -dot(vel_t_ab, f_t);
       -    }
       -
       -    // Add force components from this bond to total force for particle
       -    *N += f_n + f_t;
       -    *T += -R_bar * cross(n_ab, f_t);
       -
       -    // Pressure excerted onto the particle from this bond
       -    *p += length(f_n) / (4.0f * PI * radius_a*radius_a);
       -
       -  }
       -} // End of bondLinear()
       -
       -
       -// Capillary cohesion after Richefeu et al. (2006)
       -__device__ void capillaryCohesion_exp(float3* N, float radius_a, 
       -                                          float radius_b, float delta_ab,
       -                                          float3 x_ab, float x_ab_length, 
       -                                      float kappa)
       -{
       -
       -  // Normal vector 
       -  float3 n_ab = x_ab/x_ab_length;
       -
       -  float3 f_c;
       -  float lambda, R_geo, R_har, r, h;
       -
       -  // Determine the ratio; r = max{Ri/Rj;Rj/Ri}
       -  if ((radius_a/radius_b) > (radius_b/radius_a))
       -    r = radius_a/radius_b;
       -  else
       -    r = radius_b/radius_a;
       -
       -  // Exponential decay function
       -  h = -sqrtf(r);
       -
       -  // The harmonic mean
       -  R_har = (2.0f * radius_a * radius_b) / (radius_a + radius_b);
       -
       -  // The geometrical mean
       -  R_geo = sqrtf(radius_a * radius_b);
       -
       -  // The exponential falloff of the capillary force with distance
       -  lambda = 0.9f * h * sqrtf(devC_V_b/R_har);
       -
       -  // Calculate cohesional force
       -  f_c = -kappa * R_geo * expf(-delta_ab/lambda) * n_ab;
       -
       -  // Add force components from this collision to total force for particle
       -  *N += f_c;
       -
       -} // End of capillaryCohesion_exp
       -
       -
       -
       -// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
       -// Kernel executed on device, and callable from device only.
       -__device__ void overlapsInCell(int3 targetCell, 
       -                                   unsigned int idx_a, 
       -                               float4 x_a, float radius_a,
       -                               float3* N, float3* T, 
       -                               float* es_dot, float* p,
       -                               float4* dev_x_sorted, 
       -                               float* dev_radius_sorted,
       -                               float4* dev_vel_sorted, 
       -                               float4* dev_angvel_sorted,
       -                               unsigned int* dev_cellStart, 
       -                               unsigned int* dev_cellEnd,
       -                               float4* dev_w_nx, 
       -                               float4* dev_w_mvfd)
       -                               //uint4 bonds)
       -{
       -
       -  // Variable containing modifier for interparticle
       -  // vector, if it crosses a periodic boundary
       -  float3 distmod = make_float3(0.0f, 0.0f, 0.0f);
       -
       -  // Check whether x- and y boundaries are to be treated as periodic
       -  // 1: x- and y boundaries periodic
       -  // 2: x boundaries periodic
       -  if (devC_periodic == 1) {
       -
       -    // Periodic x-boundary
       -    if (targetCell.x < 0) {
       -      targetCell.x = devC_num[0] - 1;
       -      distmod += make_float3(devC_L[0], 0.0f, 0.0f);
       -    }
       -    if (targetCell.x == devC_num[0]) {
       -      targetCell.x = 0;
       -      distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
       -    }
       -
       -    // Periodic y-boundary
       -    if (targetCell.y < 0) {
       -      targetCell.y = devC_num[1] - 1;
       -      distmod += make_float3(0.0f, devC_L[1], 0.0f);
       -    }
       -    if (targetCell.y == devC_num[1]) {
       -      targetCell.y = 0;
       -      distmod -= make_float3(0.0f, devC_L[1], 0.0f);
       -    }
       -
       -  } else if (devC_periodic == 2) {
       -
       -    // Periodic x-boundary
       -    if (targetCell.x < 0) {
       -      targetCell.x = devC_num[0] - 1;
       -      distmod += make_float3(devC_L[0], 0.0f, 0.0f);
       -    }
       -    if (targetCell.x == devC_num[0]) {
       -      targetCell.x = 0;
       -      distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
       -    }
       -
       -    // Hande out-of grid cases on y-axis
       -    if (targetCell.y < 0 || targetCell.y == devC_num[1])
       -      return;
       -
       -  } else {
       -
       -    // Hande out-of grid cases on x- and y-axes
       -    if (targetCell.x < 0 || targetCell.x == devC_num[0])
       -      return;
       -    if (targetCell.y < 0 || targetCell.y == devC_num[1])
       -      return;
       -  }
       -
       -  // Handle out-of-grid cases on z-axis
       -  if (targetCell.z < 0 || targetCell.z == devC_num[2])
       -    return;
       -
       -
       -  //// Check and process particle-particle collisions
       -
       -  // Calculate linear cell ID
       -  unsigned int cellID = targetCell.x  
       -                            + __umul24(targetCell.y, devC_num[0]) 
       -                        + __umul24(__umul24(devC_num[0], 
       -                                                  devC_num[1]), 
       -                                    targetCell.z); 
       -
       -  // Lowest particle index in cell
       -  unsigned int startIdx = dev_cellStart[cellID];
       -
       -  // Make sure cell is not empty
       -  if (startIdx != 0xffffffff) {
       -
       -    // Highest particle index in cell + 1
       -    unsigned int endIdx = dev_cellEnd[cellID];
       -
       -    // Iterate over cell particles
       -    for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
       -      if (idx_b != idx_a) { // Do not collide particle with itself
       -
       -        // Fetch position and velocity of particle B.
       -        float4 x_b      = dev_x_sorted[idx_b];
       -        float  radius_b = dev_radius_sorted[idx_b];
       -        float  kappa         = devC_kappa;
       -
       -        // Distance between particle centers (float4 -> float3)
       -        float3 x_ab = make_float3(x_a.x - x_b.x, 
       -                                      x_a.y - x_b.y, 
       -                                  x_a.z - x_b.z);
       -
       -        // Adjust interparticle vector if periodic boundary/boundaries
       -        // are crossed
       -        x_ab += distmod;
       -
       -        float x_ab_length = length(x_ab);
       -
       -        // Distance between particle perimeters
       -        float delta_ab = x_ab_length - (radius_a + radius_b); 
       -
       -        // Check for particle overlap
       -        if (delta_ab < 0.0f) {
       -                  contactLinearViscous(N, T, es_dot, p, 
       -                                     idx_a, idx_b,
       -                               dev_vel_sorted, 
       -                               dev_angvel_sorted,
       -                               radius_a, radius_b, 
       -                               x_ab, x_ab_length,
       -                               delta_ab, kappa);
       -        } else if (delta_ab < devC_db) { 
       -          // Check wether particle distance satisfies the capillary bond distance
       -          capillaryCohesion_exp(N, radius_a, radius_b, delta_ab, 
       -                                      x_ab, x_ab_length, kappa);
       -        }
       -
       -        // Check wether particles are bonded together
       -        /*if (bonds.x == idx_b || bonds.y == idx_b ||
       -            bonds.z == idx_b || bonds.w == idx_b) {
       -          bondLinear(N, T, es_dot, p, 
       -                           idx_a, idx_b,
       -                        dev_x_sorted, dev_vel_sorted,
       -                     dev_angvel_sorted,
       -                     radius_a, radius_b,
       -                     x_ab, x_ab_length,
       -                     delta_ab);
       -        }*/
       -
       -      } // Do not collide particle with itself end
       -    } // Iterate over cell particles end
       -  } // Check wether cell is empty end
       -} // End of overlapsInCell(...)
       -
       -// Find overlaps between particle no. 'idx' and particles in cell 'gridpos'
       -// Write the indexes of the overlaps in array contacts[]
       -// Kernel executed on device, and callable from device only.
       -__device__ void findContactsInCell(int3 targetCell, 
       -                                       unsigned int idx_a, 
       -                                   float4 x_a, float radius_a,
       -                                   float4* dev_x_sorted, 
       -                                   float* dev_radius_sorted,
       -                                   unsigned int* dev_cellStart, 
       -                                   unsigned int* dev_cellEnd,
       -                                   unsigned int* dev_gridParticleIndex,
       -                                   int* nc,
       -                                   unsigned int* dev_contacts,
       -                                   float4* dev_distmod)
       -{
       -  // Variable containing modifier for interparticle
       -  // vector, if it crosses a periodic boundary
       -  float3 distmod = make_float3(0.0f, 0.0f, 0.0f);
       -
       -  // Check whether x- and y boundaries are to be treated as periodic
       -  // 1: x- and y boundaries periodic
       -  // 2: x boundaries periodic
       -  if (devC_periodic == 1) {
       -
       -    // Periodic x-boundary
       -    if (targetCell.x < 0) {
       -      targetCell.x = devC_num[0] - 1;
       -      distmod += make_float3(devC_L[0], 0.0f, 0.0f);
       -    }
       -    if (targetCell.x == devC_num[0]) {
       -      targetCell.x = 0;
       -      distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
       -    }
       -
       -    // Periodic y-boundary
       -    if (targetCell.y < 0) {
       -      targetCell.y = devC_num[1] - 1;
       -      distmod += make_float3(0.0f, devC_L[1], 0.0f);
       -    }
       -    if (targetCell.y == devC_num[1]) {
       -      targetCell.y = 0;
       -      distmod -= make_float3(0.0f, devC_L[1], 0.0f);
       -    }
       -
       -  } else if (devC_periodic == 2) {
       -
       -    // Periodic x-boundary
       -    if (targetCell.x < 0) {
       -      targetCell.x = devC_num[0] - 1;
       -      distmod += make_float3(devC_L[0], 0.0f, 0.0f);
       -    }
       -    if (targetCell.x == devC_num[0]) {
       -      targetCell.x = 0;
       -      distmod -= make_float3(devC_L[0], 0.0f, 0.0f);
       -    }
       -
       -    // Hande out-of grid cases on y-axis
       -    if (targetCell.y < 0 || targetCell.y == devC_num[1])
       -      return;
       -
       -  } else {
       -
       -    // Hande out-of grid cases on x- and y-axes
       -    if (targetCell.x < 0 || targetCell.x == devC_num[0])
       -      return;
       -    if (targetCell.y < 0 || targetCell.y == devC_num[1])
       -      return;
       -  }
       -
       -  // Handle out-of-grid cases on z-axis
       -  if (targetCell.z < 0 || targetCell.z == devC_num[2])
       -    return;
       -
       -
       -  //// Check and process particle-particle collisions
       -
       -  // Calculate linear cell ID
       -  unsigned int cellID = targetCell.x  
       -                            + __umul24(targetCell.y, devC_num[0]) 
       -                        + __umul24(__umul24(devC_num[0], 
       -                                                  devC_num[1]), 
       -                                    targetCell.z); 
       -
       -  // Lowest particle index in cell
       -  unsigned int startIdx = dev_cellStart[cellID];
       -
       -  // Make sure cell is not empty
       -  if (startIdx != 0xffffffff) {
       -
       -    // Highest particle index in cell + 1
       -    unsigned int endIdx = dev_cellEnd[cellID];
       -
       -    // Read the original index of particle A
       -    unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
       -
       -    // Iterate over cell particles
       -    for (unsigned int idx_b = startIdx; idx_b<endIdx; ++idx_b) {
       -      if (idx_b != idx_a) { // Do not collide particle with itself
       -
       -        // Fetch position and radius of particle B.
       -        float4 x_b      = dev_x_sorted[idx_b];
       -        float  radius_b = dev_radius_sorted[idx_b];
       -
       -        // Read the original index of particle B
       -        unsigned int idx_b_orig = dev_gridParticleIndex[idx_b];
       -
       -        __syncthreads();
       -
       -        // Distance between particle centers (float4 -> float3)
       -        float3 x_ab = make_float3(x_a.x - x_b.x, 
       -                                      x_a.y - x_b.y, 
       -                                  x_a.z - x_b.z);
       -
       -        // Adjust interparticle vector if periodic boundary/boundaries
       -        // are crossed
       -        x_ab += distmod;
       -
       -        float x_ab_length = length(x_ab);
       -
       -        // Distance between particle perimeters
       -        float delta_ab = x_ab_length - (radius_a + radius_b); 
       -
       -        // Check for particle overlap
       -        if (delta_ab < 0.0f) {
       -          
       -          // If the particle is not yet registered in the contact list,
       -          // use the next position in the array
       -          int cpos = *nc;
       -          unsigned int cidx;
       -
       -          // Find out, if particle is already registered in contact list
       -          for (int i=0; i<devC_nc; ++i) {
       -            __syncthreads();
       -            cidx = dev_contacts[(unsigned int)(idx_a_orig*devC_nc+i)];
       -            if (cidx == idx_b_orig)
       -              cpos = i;
       -          }
       -
       -          __syncthreads();
       -
       -          // Write the particle index to the relevant position,
       -          // no matter if it already is there or not (concurrency of write)
       -          dev_contacts[(unsigned int)(idx_a_orig*devC_nc+cpos)] = idx_b_orig;
       -
       -
       -          // Write the interparticle vector and radius of particle B
       -         //dev_x_ab_r_b[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_float4(x_ab, radius_b);
       -          dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_float4(distmod, radius_b);
       -          
       -          // Increment contact counter
       -          ++*nc;
       -        }
       -
       -        // Write the inter-particle position vector correction and radius of particle B
       -        //dev_distmod[(unsigned int)(idx_a_orig*devC_nc+cpos)] = make_float4(distmod, radius_b);
       -
       -        // Check wether particles are bonded together
       -        /*if (bonds.x == idx_b || bonds.y == idx_b ||
       -            bonds.z == idx_b || bonds.w == idx_b) {
       -          bondLinear(N, T, es_dot, p, 
       -                           idx_a, idx_b,
       -                        dev_x_sorted, dev_vel_sorted,
       -                     dev_angvel_sorted,
       -                     radius_a, radius_b,
       -                     x_ab, x_ab_length,
       -                     delta_ab);
       -        }*/
       -
       -      } // Do not collide particle with itself end
       -    } // Iterate over cell particles end
       -  } // Check wether cell is empty end
       -} // End of findContactsInCell(...)
       -
       -
       -// For a single particle:
       -// Search for neighbors to particle 'idx' inside the 27 closest cells, 
       -// and save the contact pairs in global memory.
       -__global__ void topology(unsigned int* dev_cellStart, 
       -                             unsigned int* dev_cellEnd, // Input: Particles in cell 
       -                         unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
       -                         float4* dev_x_sorted, float* dev_radius_sorted, 
       -                         unsigned int* dev_contacts,
       -                         float4* dev_distmod)
       -{
       -  // Thread index equals index of particle A
       -  unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
       -  if (idx_a < devC_np) {
       -    // Fetch particle data in global read
       -    float4 x_a      = dev_x_sorted[idx_a];
       -    float  radius_a = dev_radius_sorted[idx_a];
       -
       -    // Count the number of contacts in this time step
       -    int nc = 0;
       -
       -    // Grid position of host and neighbor cells in uniform, cubic grid
       -    int3 gridPos;
       -    int3 targetPos;
       -
       -    // Calculate cell address in grid from position of particle
       -    gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
       -    gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
       -    gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
       -
       -    // Find overlaps between particle no. idx and all particles
       -    // from its own cell + 26 neighbor cells
       -    for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
       -      for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
       -        for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
       -          targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
       -          findContactsInCell(targetPos, idx_a, x_a, radius_a,
       -                                    dev_x_sorted, dev_radius_sorted,
       -                             dev_cellStart, dev_cellEnd,
       -                             dev_gridParticleIndex,
       -                                 &nc, dev_contacts, dev_distmod);
       -        }
       -      }
       -    }
       -  }
       -} // End of topology(...)
       -
       -
       -// For a single particle:
       -// Search for neighbors to particle 'idx' inside the 27 closest cells, 
       -// and compute the resulting normal- and shear force on it.
       -// Collide with top- and bottom walls, save resulting force on upper wall.
       -// Kernel is executed on device, and is callable from host only.
       -__global__ void interact(unsigned int* dev_gridParticleIndex, // Input: Unsorted-sorted key
       -                         unsigned int* dev_cellStart,
       -                         unsigned int* dev_cellEnd,
       -                         float4* dev_x, float* dev_radius,
       -                             float4* dev_x_sorted, float* dev_radius_sorted, 
       -                         float4* dev_vel_sorted, float4* dev_angvel_sorted,
       -                         float4* dev_vel, float4* dev_angvel,
       -                         float4* dev_force, float4* dev_torque,
       -                         float* dev_es_dot, float* dev_es, float* dev_p,
       -                         float4* dev_w_nx, float4* dev_w_mvfd, 
       -                         float* dev_w_force, //uint4* dev_bonds_sorted,
       -                         unsigned int* dev_contacts, 
       -                         float4* dev_distmod,
       -                         float4* dev_delta_t)
       -{
       -  // Thread index equals index of particle A
       -  unsigned int idx_a = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
       -
       -  if (idx_a < devC_np) {
       -
       -    // Fetch particle data in global read
       -    unsigned int idx_a_orig = dev_gridParticleIndex[idx_a];
       -    float4 x_a      = dev_x_sorted[idx_a];
       -    float  radius_a = dev_radius_sorted[idx_a];
       -
       -    // Fetch wall data in global read
       -    float4 w_up_nx   = dev_w_nx[0];
       -    float4 w_up_mvfd = dev_w_mvfd[0];
       -
       -    // Fetch world dimensions in constant memory read
       -    float3 origo = make_float3(devC_origo[0], 
       -                               devC_origo[1], 
       -                               devC_origo[2]); 
       -    float3 L = make_float3(devC_L[0], 
       -                           devC_L[1], 
       -                           devC_L[2]);
       -
       -    // Index of particle which is bonded to particle A.
       -    // The index is equal to the particle no (p.np)
       -    // if particle A is bond-less.
       -    //uint4 bonds = dev_bonds_sorted[idx_a];
       -
       -    // Initiate shear friction loss rate at 0.0
       -    float es_dot = 0.0f; 
       -
       -    // Initiate pressure on particle at 0.0
       -    float p = 0.0f;
       -
       -    // Allocate memory for temporal force/torque vector values
       -    float3 N = make_float3(0.0f, 0.0f, 0.0f);
       -    float3 T = make_float3(0.0f, 0.0f, 0.0f);
       -
       -    // Apply linear elastic, frictional contact model to registered contacts
       -    if (devC_shearmodel == 2) {
       -      unsigned int idx_b_orig, mempos;
       -      float delta_n, x_ab_length;
       -      float4 x_b;
       -      float4 distmod;
       -      float  radius_b;
       -      float3 x_ab;
       -      float4 vel_a     = dev_vel_sorted[idx_a];
       -      float4 angvel4_a = dev_angvel_sorted[idx_a];
       -      float3 angvel_a  = make_float3(angvel4_a.x, angvel4_a.y, angvel4_a.z);
       -
       -      // Loop over all possible contacts, and remove contacts
       -      // that no longer are valid (delta_n > 0.0)
       -      for (int i = 0; i<(int)devC_nc; ++i) {
       -        mempos = (unsigned int)(idx_a_orig * devC_nc + i);
       -        __syncthreads();
       -        idx_b_orig = dev_contacts[mempos];
       -        distmod    = dev_distmod[mempos];
       -        x_b        = dev_x[idx_b_orig];
       -        //radius_b   = dev_radius[idx_b_orig];
       -        radius_b   = distmod.w;
       -
       -        // Inter-particle vector, corrected for periodic boundaries
       -        x_ab = make_float3(x_a.x - x_b.x + distmod.x,
       -                               x_a.y - x_b.y + distmod.y,
       -                           x_a.z - x_b.z + distmod.z);
       -
       -        x_ab_length = length(x_ab);
       -        delta_n = x_ab_length - (radius_a + radius_b);
       -
       -
       -        if (idx_b_orig != (unsigned int)devC_np) {
       -
       -          // Process collision if the particles are overlapping
       -          if (delta_n < 0.0f) {
       -            //cuPrintf("\nProcessing contact, idx_a_orig = %u, idx_b_orig = %u, contact = %d, delta_n = %f\n",
       -            //  idx_a_orig, idx_b_orig, i, delta_n);
       -            contactLinear(&N, &T, &es_dot, &p, 
       -                          idx_a_orig,
       -                          idx_b_orig,
       -                          vel_a,
       -                          dev_vel,
       -                          angvel_a,
       -                          dev_angvel,
       -                          radius_a, radius_b, 
       -                          x_ab, x_ab_length,
       -                          delta_n, dev_delta_t, 
       -                          mempos);
       -          } else {
       -            __syncthreads();
       -            // Remove this contact (there is no particle with index=np)
       -            dev_contacts[mempos] = devC_np; 
       -            // Zero sum of shear displacement in this position
       -            dev_delta_t[mempos]  = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
       -          }
       -        } else {
       -          __syncthreads();
       -          dev_delta_t[mempos] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
       -        }
       -      } // Contact loop end
       -
       -    } else if (devC_shearmodel == 1) {
       -
       -      int3 gridPos;
       -      int3 targetPos;
       -
       -      // Calculate address in grid from position
       -      gridPos.x = floor((x_a.x - devC_origo[0]) / (devC_L[0]/devC_num[0]));
       -      gridPos.y = floor((x_a.y - devC_origo[1]) / (devC_L[1]/devC_num[1]));
       -      gridPos.z = floor((x_a.z - devC_origo[2]) / (devC_L[2]/devC_num[2]));
       -
       -      // Find overlaps between particle no. idx and all particles
       -      // from its own cell + 26 neighbor cells.
       -      // Calculate resulting normal- and shear-force components and
       -      // torque for the particle on the base of contactLinearViscous()
       -      for (int z_dim=-1; z_dim<2; ++z_dim) { // z-axis
       -        for (int y_dim=-1; y_dim<2; ++y_dim) { // y-axis
       -          for (int x_dim=-1; x_dim<2; ++x_dim) { // x-axis
       -            targetPos = gridPos + make_int3(x_dim, y_dim, z_dim);
       -            overlapsInCell(targetPos, idx_a, x_a, radius_a,
       -                           &N, &T, &es_dot, &p,
       -                           dev_x_sorted, dev_radius_sorted, 
       -                           dev_vel_sorted, dev_angvel_sorted,
       -                           dev_cellStart, dev_cellEnd,
       -                           dev_w_nx, dev_w_mvfd);
       -          }
       -        }
       -      }
       -
       -    }
       -
       -    //// Interact with walls
       -    float delta_w; // Overlap distance
       -    float3 w_n;    // Wall surface normal
       -    float w_force = 0.0f; // Force on wall from particle A
       -
       -    // Upper wall (idx 0)
       -    delta_w = w_up_nx.w - (x_a.z + radius_a);
       -    w_n = make_float3(0.0f, 0.0f, -1.0f);
       -    if (delta_w < 0.0f) {
       -      w_force = contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       -                                     dev_vel_sorted, dev_angvel_sorted,
       -                                   w_n, delta_w, w_up_mvfd.y);
       -    }
       -
       -    // Lower wall (force on wall not stored)
       -    delta_w = x_a.z - radius_a - origo.z;
       -    w_n = make_float3(0.0f, 0.0f, 1.0f);
       -    if (delta_w < 0.0f) {
       -      (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       -                                 dev_vel_sorted, dev_angvel_sorted,
       -                                 w_n, delta_w, 0.0f);
       -    }
       -
       -
       -    if (devC_periodic == 0) {
       -
       -      // Left wall
       -      delta_w = x_a.x - radius_a - origo.x;
       -      w_n = make_float3(1.0f, 0.0f, 0.0f);
       -      if (delta_w < 0.0f) {
       -        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       -                                     dev_vel_sorted, dev_angvel_sorted,
       -                                 w_n, delta_w, 0.0f);
       -      }
       -
       -      // Right wall
       -      delta_w = L.x - (x_a.x + radius_a);
       -      w_n = make_float3(-1.0f, 0.0f, 0.0f);
       -      if (delta_w < 0.0f) {
       -        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       -                                     dev_vel_sorted, dev_angvel_sorted,
       -                                 w_n, delta_w, 0.0f);
       -      }
       -
       -      // Front wall
       -      delta_w = x_a.y - radius_a - origo.y;
       -      w_n = make_float3(0.0f, 1.0f, 0.0f);
       -      if (delta_w < 0.0f) {
       -        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       -                                     dev_vel_sorted, dev_angvel_sorted,
       -                                 w_n, delta_w, 0.0f);
       -      }
       -
       -      // Back wall
       -      delta_w = L.y - (x_a.y + radius_a);
       -      w_n = make_float3(0.0f, -1.0f, 0.0f);
       -      if (delta_w < 0.0f) {
       -        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       -                                     dev_vel_sorted, dev_angvel_sorted,
       -                                 w_n, delta_w, 0.0f);
       -      }
       -
       -    } else if (devC_periodic == 2) {
       -
       -      // Front wall
       -      delta_w = x_a.y - radius_a - origo.y;
       -      w_n = make_float3(0.0f, 1.0f, 0.0f);
       -      if (delta_w < 0.0f) {
       -        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       -                                     dev_vel_sorted, dev_angvel_sorted,
       -                                 w_n, delta_w, 0.0f);
       -      }
       -
       -      // Back wall
       -      delta_w = L.y - (x_a.y + radius_a);
       -      w_n = make_float3(0.0f, -1.0f, 0.0f);
       -      if (delta_w < 0.0f) {
       -        (void)contactLinear_wall(&N, &T, &es_dot, &p, idx_a, radius_a,
       -                                     dev_vel_sorted, dev_angvel_sorted,
       -                                 w_n, delta_w, 0.0f);
       -      }
       -    }
       -
       -
       -    // Hold threads for coalesced write
       -    __syncthreads();
       -
       -    // Write force to unsorted position
       -    unsigned int orig_idx = dev_gridParticleIndex[idx_a];
       -    dev_force[orig_idx]   = make_float4(N, 0.0f);
       -    dev_torque[orig_idx]  = make_float4(T, 0.0f);
       -    dev_es_dot[orig_idx]  = es_dot;
       -    dev_es[orig_idx]     += es_dot * devC_dt;
       -    dev_p[orig_idx]       = p;
       -    dev_w_force[orig_idx] = w_force;
       -  }
       -} // End of interact(...)
       -
       -
       -
       -
       -
       -// FUNCTION FOR UPDATING TRAJECTORIES
       -
       -// Second order integration scheme based on Taylor expansion of particle kinematics. 
       -// Kernel executed on device, and callable from host only.
       -__global__ void integrate(float4* dev_x_sorted, float4* dev_vel_sorted, // Input
       -                          float4* dev_angvel_sorted, float* dev_radius_sorted, // Input
       -                          float4* dev_x, float4* dev_vel, float4* dev_angvel, // Output
       -                          float4* dev_force, float4* dev_torque, // Input
       -                          unsigned int* dev_gridParticleIndex) // Input: Sorted-Unsorted key
       -{
       -  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
       -
       -  if (idx < devC_np) { // Condition prevents block size error
       -
       -    // Copy data to temporary arrays to avoid any potential read-after-write, 
       -    // write-after-read, or write-after-write hazards. 
       -    unsigned int orig_idx = dev_gridParticleIndex[idx];
       -    float4 force  = dev_force[orig_idx];
       -    float4 torque = dev_torque[orig_idx];
       -
       -    // Initialize acceleration vectors to zero
       -    float4 acc    = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
       -    float4 angacc = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
       -
       -    // Fetch particle position and velocity values from global read
       -    float4 x      = dev_x_sorted[idx];
       -    float4 vel    = dev_vel_sorted[idx];
       -    float4 angvel = dev_angvel_sorted[idx];
       -    float  radius = dev_radius_sorted[idx];
       -
       -    // Coherent read from constant memory to registers
       -    float  dt    = devC_dt;
       -    float3 origo = make_float3(devC_origo[0], devC_origo[1], devC_origo[2]); 
       -    float3 L     = make_float3(devC_L[0], devC_L[1], devC_L[2]);
       -    float  rho   = devC_rho;
       -
       -    // Particle mass
       -    float m = 4.0f/3.0f * PI * radius*radius*radius * rho;
       -
       -    // Update linear acceleration of particle
       -    acc.x = force.x / m;
       -    acc.y = force.y / m;
       -    acc.z = force.z / m;
       -
       -    // Update angular acceleration of particle 
       -    // (angacc = (total moment)/Intertia, intertia = 2/5*m*r^2)
       -    angacc.x = torque.x * 1.0f / (2.0f/5.0f * m * radius*radius);
       -    angacc.y = torque.y * 1.0f / (2.0f/5.0f * m * radius*radius);
       -    angacc.z = torque.z * 1.0f / (2.0f/5.0f * m * radius*radius);
       -
       -    // Add gravity
       -    acc.x += devC_g[0];
       -    acc.y += devC_g[1];
       -    acc.z += devC_g[2];
       -    //acc.z -= 9.82f;
       -
       -    // Update angular velocity
       -    angvel.x += angacc.x * dt;
       -    angvel.y += angacc.y * dt;
       -    angvel.z += angacc.z * dt;
       -
       -    // Check if particle has a fixed horizontal velocity
       -    if (vel.w > 0.0f) {
       -
       -      // Zero horizontal acceleration and disable
       -      // gravity to counteract segregation.
       -      // Particles may move in the z-dimension,
       -      // to allow for dilation.
       -      acc.x = 0.0f;
       -      acc.y = 0.0f;
       -      acc.z -= devC_g[2];
       -
       -      // Zero the angular acceleration and -velocity
       -      angvel = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
       -    } 
       -
       -    // Update linear velocity
       -    vel.x += acc.x * dt;
       -    vel.y += acc.y * dt;
       -    vel.z += acc.z * dt;
       -
       -    // Update position. First-order Euler's scheme:
       -    //x.x += vel.x * dt;
       -    //x.y += vel.y * dt;
       -    //x.z += vel.z * dt;
       -
       -    // Update position. Second-order scheme based on Taylor expansion 
       -    // (greater accuracy than the first-order Euler's scheme)
       -    x.x += vel.x * dt + (acc.x * dt*dt)/2.0f;
       -    x.y += vel.y * dt + (acc.y * dt*dt)/2.0f;
       -    x.z += vel.z * dt + (acc.z * dt*dt)/2.0f;
       -
       -
       -    // Add x-displacement for this time step to 
       -    // sum of x-displacements
       -    x.w += vel.x * dt + (acc.x * dt*dt)/2.0f;
       -
       -    // Move particle across boundary if it is periodic
       -    if (devC_periodic == 1) {
       -      if (x.x < origo.x)
       -        x.x += L.x;
       -      if (x.x > L.x)
       -        x.x -= L.x;
       -      if (x.y < origo.y)
       -        x.y += L.y;
       -      if (x.y > L.y)
       -        x.y -= L.y;
       -    } else if (devC_periodic == 2) {
       -      if (x.x < origo.x)
       -        x.x += L.x;
       -      if (x.x > L.x)
       -        x.x -= L.x;
       -    }
       -
       -    // Hold threads for coalesced write
       -    __syncthreads();
       -
       -    // Store data in global memory at original, pre-sort positions
       -    dev_angvel[orig_idx] = angvel;
       -    dev_vel[orig_idx]    = vel;
       -    dev_x[orig_idx]      = x;
       -  } 
       -} // End of integrate(...)
       -
       -
       -// Reduce wall force contributions from particles to a single value per wall
       -__global__ void summation(float* in, float *out)
       -{
       -  __shared__ float cache[256];
       -  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
       -  unsigned int cacheIdx = threadIdx.x;
       -
       -  float temp = 0.0f;
       -  while (idx < devC_np) {
       -    temp += in[idx];
       -    idx += blockDim.x * gridDim.x;
       -  }
       -
       -  // Set the cache values
       -  cache[cacheIdx] = temp;
       -
       -  __syncthreads();
       -
       -  // For reductions, threadsPerBlock must be a power of two
       -  // because of the following code
       -  unsigned int i = blockDim.x/2;
       -  while (i != 0) {
       -    if (cacheIdx < i)
       -      cache[cacheIdx] += cache[cacheIdx + i];
       -    __syncthreads();
       -    i /= 2;
       -  }
       -
       -  // Write sum for block to global memory
       -  if (cacheIdx == 0)
       -    out[blockIdx.x] = cache[0];
       -}
       -
       -// Update wall positions
       -__global__ void integrateWalls(float4* dev_w_nx, 
       -                                   float4* dev_w_mvfd,
       -                               float* dev_w_force_partial,
       -                               unsigned int blocksPerGrid)
       -{
       -  unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; // Thread id
       -
       -  if (idx < devC_nw) { // Condition prevents block size error
       -
       -    // Copy data to temporary arrays to avoid any potential read-after-write, 
       -    // write-after-read, or write-after-write hazards. 
       -    float4 w_nx   = dev_w_nx[idx];
       -    float4 w_mvfd = dev_w_mvfd[idx];
       -
       -    // Find the final sum of forces on wall
       -    w_mvfd.z = 0.0f;
       -    for (int i=0; i<blocksPerGrid; ++i) {
       -      w_mvfd.z += dev_w_force_partial[i];
       -    }
       -
       -    float dt = devC_dt;
       -
       -    // Normal load = Deviatoric stress times wall surface area,
       -    // directed downwards.
       -    float N = -w_mvfd.w*devC_L[0]*devC_L[1];
       -
       -    // Calculate resulting acceleration of wall
       -    // (Wall mass is stored in w component of position float4)
       -    float acc = (w_mvfd.z+N)/w_mvfd.x;
       -
       -    // Update linear velocity
       -    w_mvfd.y += acc * dt;
       -
       -    // Update position. Second-order scheme based on Taylor expansion 
       -    // (greater accuracy than the first-order Euler's scheme)
       -    w_nx.w += w_mvfd.y * dt + (acc * dt*dt)/2.0f;
       -
       -    // Store data in global memory
       -    dev_w_nx[idx]   = w_nx;
       -    dev_w_mvfd[idx] = w_mvfd;
       -  }
       -} // End of integrateWalls(...)
       -
       -
       +#include "sorting.cuh"        
       +#include "contactmodels.cuh"
       +#include "cohesion.cuh"
       +#include "contactsearch.cuh"
       +#include "integration.cuh"
        
        
        // Wrapper function for initializing the CUDA components.
       t@@ -1542,6 +65,24 @@ __host__ void initializeGPU(void)
          checkForCudaErrors("After initializing CUDA device");
        }
        
       +// Start timer for kernel profiling
       +__host__ void startTimer(cudaEvent_t* kernel_tic)
       +{
       +  cudaEventRecord(*kernel_tic);
       +}
       +
       +// Stop timer for kernel profiling and time to function sum
       +__host__ void stopTimer(cudaEvent_t *kernel_tic,
       +                            cudaEvent_t *kernel_toc,
       +                        float *kernel_elapsed,
       +                        double* sum)
       +{
       +    cudaEventRecord(*kernel_toc, 0);
       +    cudaEventSynchronize(*kernel_toc);
       +    cudaEventElapsedTime(kernel_elapsed, *kernel_tic, *kernel_toc);
       +    *sum += *kernel_elapsed;
       +}
       +
        // Copy selected constant components to constant device memory.
        //extern "C"
        __host__ void transferToConstantMemory(Particles* p,
       t@@ -1555,12 +96,12 @@ __host__ void transferToConstantMemory(Particles* p,
        
          cudaMemcpyToSymbol("devC_np", &p->np, sizeof(p->np));
          cudaMemcpyToSymbol("devC_nc", &NC, sizeof(char));
       -  cudaMemcpyToSymbol("devC_origo", grid->origo, sizeof(float)*ND);
       -  cudaMemcpyToSymbol("devC_L", grid->L, sizeof(float)*ND);
       +  cudaMemcpyToSymbol("devC_origo", grid->origo, sizeof(Float)*ND);
       +  cudaMemcpyToSymbol("devC_L", grid->L, sizeof(Float)*ND);
          cudaMemcpyToSymbol("devC_num", grid->num, sizeof(unsigned int)*ND);
       -  cudaMemcpyToSymbol("devC_dt", &time->dt, sizeof(float));
       +  cudaMemcpyToSymbol("devC_dt", &time->dt, sizeof(Float));
          cudaMemcpyToSymbol("devC_global", &params->global, sizeof(int));
       -  cudaMemcpyToSymbol("devC_g", params->g, sizeof(float)*ND);
       +  cudaMemcpyToSymbol("devC_g", params->g, sizeof(Float)*ND);
          cudaMemcpyToSymbol("devC_nw", &params->nw, sizeof(unsigned int));
          cudaMemcpyToSymbol("devC_periodic", &params->periodic, sizeof(int));
        
       t@@ -1578,19 +119,22 @@ __host__ void transferToConstantMemory(Particles* p,
            params->mu_d    = p->mu_d[0];
            params->mu_r    = p->mu_r[0];
            params->rho     = p->rho[0];
       -    cudaMemcpyToSymbol("devC_k_n", &params->k_n, sizeof(float));
       -    cudaMemcpyToSymbol("devC_k_s", &params->k_s, sizeof(float));
       -    cudaMemcpyToSymbol("devC_k_r", &params->k_r, sizeof(float));
       -    cudaMemcpyToSymbol("devC_gamma_n", &params->gamma_n, sizeof(float));
       -    cudaMemcpyToSymbol("devC_gamma_s", &params->gamma_s, sizeof(float));
       -    cudaMemcpyToSymbol("devC_gamma_r", &params->gamma_r, sizeof(float));
       -    cudaMemcpyToSymbol("devC_mu_s", &params->mu_s, sizeof(float));
       -    cudaMemcpyToSymbol("devC_mu_d", &params->mu_d, sizeof(float));
       -    cudaMemcpyToSymbol("devC_mu_r", &params->mu_r, sizeof(float));
       -    cudaMemcpyToSymbol("devC_rho", &params->rho, sizeof(float));
       -    cudaMemcpyToSymbol("devC_kappa", &params->kappa, sizeof(float));
       -    cudaMemcpyToSymbol("devC_db", &params->db, sizeof(float));
       -    cudaMemcpyToSymbol("devC_V_b", &params->V_b, sizeof(float));
       +    cudaMemcpyToSymbol("devC_k_n", &params->k_n, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_k_s", &params->k_s, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_k_r", &params->k_r, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_gamma_n", &params->gamma_n, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_gamma_s", &params->gamma_s, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_gamma_r", &params->gamma_r, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_gamma_wn", &params->gamma_wn, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_gamma_ws", &params->gamma_ws, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_gamma_wr", &params->gamma_wr, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_mu_s", &params->mu_s, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_mu_d", &params->mu_d, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_mu_r", &params->mu_r, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_rho", &params->rho, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_kappa", &params->kappa, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_db", &params->db, sizeof(Float));
       +    cudaMemcpyToSymbol("devC_V_b", &params->V_b, sizeof(Float));
            cudaMemcpyToSymbol("devC_shearmodel", &params->shearmodel, sizeof(unsigned int));
          } else {
            //printf("(params.global == %d) ", params.global);
       t@@ -1609,20 +153,20 @@ __host__ void transferToConstantMemory(Particles* p,
        
        
        //extern "C"
       -__host__ void gpuMain(float4* host_x,
       -                      float4* host_vel,
       -                      float4* host_acc,
       -                      float4* host_angvel,
       -                      float4* host_angacc,
       -                      float4* host_force,
       -                      float4* host_torque,
       +__host__ void gpuMain(Float4* host_x,
       +                      Float4* host_vel,
       +                      Float4* host_acc,
       +                      Float4* host_angvel,
       +                      Float4* host_angacc,
       +                      Float4* host_force,
       +                      Float4* host_torque,
                              uint4*  host_bonds,
                              Particles* p, 
                              Grid* grid, 
                              Time* time, 
                              Params* params,
       -                      float4* host_w_nx,
       -                      float4* host_w_mvfd,
       +                      Float4* host_w_nx,
       +                      Float4* host_w_mvfd,
                              const char* cwd, 
                              const char* inputbin)
        {
       t@@ -1633,31 +177,31 @@ __host__ void gpuMain(float4* host_x,
          transferToConstantMemory(p, grid, time, params);
        
          // Declare pointers for particle variables on the device
       -  float4* dev_x;        // Particle position
       -  float4* dev_vel;        // Particle linear velocity
       -  float4* dev_angvel;        // Particle angular velocity
       -  float4* dev_acc;        // Particle linear acceleration
       -  float4* dev_angacc;        // Particle angular acceleration
       -  float4* dev_force;        // Sum of forces
       -  float4* dev_torque;        // Sum of torques
       -  float*  dev_radius;        // Particle radius
       -  float*  dev_es_dot;        // Current shear energy producion rate
       -  float*  dev_es;        // Total shear energy excerted on particle
       -  float*  dev_p;        // Pressure excerted onto particle
       +  Float4* dev_x;        // Particle position
       +  Float4* dev_vel;        // Particle linear velocity
       +  Float4* dev_angvel;        // Particle angular velocity
       +  Float4* dev_acc;        // Particle linear acceleration
       +  Float4* dev_angacc;        // Particle angular acceleration
       +  Float4* dev_force;        // Sum of forces
       +  Float4* dev_torque;        // Sum of torques
       +  Float*  dev_radius;        // Particle radius
       +  Float*  dev_es_dot;        // Current shear energy producion rate
       +  Float*  dev_es;        // Total shear energy excerted on particle
       +  Float*  dev_p;        // Pressure excerted onto particle
          //uint4*  dev_bonds;        // Particle bond pairs
        
          // Declare pointers for wall vectors on the device
       -  float4* dev_w_nx;            // Wall normal (x,y,z) and position (w)
       -  float4* dev_w_mvfd;          // Wall mass (x), velocity (y), force (z) 
       +  Float4* dev_w_nx;            // Wall normal (x,y,z) and position (w)
       +  Float4* dev_w_mvfd;          // Wall mass (x), velocity (y), force (z) 
                                         // and deviatoric stress (w)
       -  float*  dev_w_force;               // Resulting force on wall per particle
       -  float*  dev_w_force_partial; // Partial sum from block of threads
       +  Float*  dev_w_force;               // Resulting force on wall per particle
       +  Float*  dev_w_force_partial; // Partial sum from block of threads
        
          // Memory for sorted particle data
       -  float4* dev_x_sorted;
       -  float4* dev_vel_sorted;
       -  float4* dev_angvel_sorted;
       -  float*  dev_radius_sorted; 
       +  Float4* dev_x_sorted;
       +  Float4* dev_vel_sorted;
       +  Float4* dev_angvel_sorted;
       +  Float*  dev_radius_sorted; 
          //uint4*  dev_bonds_sorted;
        
          // Grid-particle array pointers
       t@@ -1669,35 +213,35 @@ __host__ void gpuMain(float4* host_x,
          // Particle contact bookkeeping
          unsigned int* dev_contacts;
          // Particle pair distance correction across periodic boundaries
       -  float4* dev_distmod;
       +  Float4* dev_distmod;
          // x,y,z contains the interparticle vector, corrected if contact 
          // is across a periodic boundary. 
       -  float4* dev_delta_t; // Accumulated shear distance of contact
       +  Float4* dev_delta_t; // Accumulated shear distance of contact
        
          // Particle memory size
       -  unsigned int memSizef  = sizeof(float) * p->np;
       -  unsigned int memSizef4 = sizeof(float4) * p->np;
       +  unsigned int memSizeF  = sizeof(Float) * p->np;
       +  unsigned int memSizeF4 = sizeof(Float4) * p->np;
        
          // Allocate device memory for particle variables,
          // tie to previously declared pointers
          cout << "  Allocating device memory:                       ";
        
          // Particle arrays
       -  cudaMalloc((void**)&dev_x, memSizef4);
       -  cudaMalloc((void**)&dev_x_sorted, memSizef4);
       -  cudaMalloc((void**)&dev_vel, memSizef4);
       -  cudaMalloc((void**)&dev_vel_sorted, memSizef4);
       -  cudaMalloc((void**)&dev_angvel, memSizef4);
       -  cudaMalloc((void**)&dev_angvel_sorted, memSizef4);
       -  cudaMalloc((void**)&dev_acc, memSizef4);
       -  cudaMalloc((void**)&dev_angacc, memSizef4);
       -  cudaMalloc((void**)&dev_force, memSizef4);
       -  cudaMalloc((void**)&dev_torque, memSizef4);
       -  cudaMalloc((void**)&dev_radius, memSizef);
       -  cudaMalloc((void**)&dev_radius_sorted, memSizef);
       -  cudaMalloc((void**)&dev_es_dot, memSizef);
       -  cudaMalloc((void**)&dev_es, memSizef);
       -  cudaMalloc((void**)&dev_p, memSizef);
       +  cudaMalloc((void**)&dev_x, memSizeF4);
       +  cudaMalloc((void**)&dev_x_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_vel, memSizeF4);
       +  cudaMalloc((void**)&dev_vel_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_angvel, memSizeF4);
       +  cudaMalloc((void**)&dev_angvel_sorted, memSizeF4);
       +  cudaMalloc((void**)&dev_acc, memSizeF4);
       +  cudaMalloc((void**)&dev_angacc, memSizeF4);
       +  cudaMalloc((void**)&dev_force, memSizeF4);
       +  cudaMalloc((void**)&dev_torque, memSizeF4);
       +  cudaMalloc((void**)&dev_radius, memSizeF);
       +  cudaMalloc((void**)&dev_radius_sorted, memSizeF);
       +  cudaMalloc((void**)&dev_es_dot, memSizeF);
       +  cudaMalloc((void**)&dev_es, memSizeF);
       +  cudaMalloc((void**)&dev_p, memSizeF);
          //cudaMalloc((void**)&dev_bonds, sizeof(uint4) * p->np);
          //cudaMalloc((void**)&dev_bonds_sorted, sizeof(uint4) * p->np);
        
       t@@ -1709,13 +253,13 @@ __host__ void gpuMain(float4* host_x,
        
          // Particle contact bookkeeping arrays
          cudaMalloc((void**)&dev_contacts, sizeof(unsigned int)*p->np*NC); // Max NC contacts per particle
       -  cudaMalloc((void**)&dev_distmod, sizeof(float4)*p->np*NC);
       -  cudaMalloc((void**)&dev_delta_t, sizeof(float4)*p->np*NC);
       +  cudaMalloc((void**)&dev_distmod, sizeof(Float4)*p->np*NC);
       +  cudaMalloc((void**)&dev_delta_t, sizeof(Float4)*p->np*NC);
        
          // Wall arrays
       -  cudaMalloc((void**)&dev_w_nx, sizeof(float)*params->nw*4);
       -  cudaMalloc((void**)&dev_w_mvfd, sizeof(float)*params->nw*4);
       -  cudaMalloc((void**)&dev_w_force, sizeof(float)*params->nw*p->np);
       +  cudaMalloc((void**)&dev_w_nx, sizeof(Float)*params->nw*4);
       +  cudaMalloc((void**)&dev_w_mvfd, sizeof(Float)*params->nw*4);
       +  cudaMalloc((void**)&dev_w_force, sizeof(Float)*params->nw*p->np);
        
          checkForCudaErrors("Post device memory allocation");
          cout << "Done\n";
       t@@ -1724,22 +268,22 @@ __host__ void gpuMain(float4* host_x,
          cout << "  Transfering data to the device:                 ";
        
          // Particle data
       -  cudaMemcpy(dev_x, host_x, memSizef4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_vel, host_vel, memSizef4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_acc, host_acc, memSizef4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_angvel, host_angvel, memSizef4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_angacc, host_angacc, memSizef4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_force, host_force, memSizef4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_torque, host_torque, memSizef4, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_x, host_x, memSizeF4, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_vel, host_vel, memSizeF4, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_acc, host_acc, memSizeF4, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_angvel, host_angvel, memSizeF4, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_angacc, host_angacc, memSizeF4, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_force, host_force, memSizeF4, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_torque, host_torque, memSizeF4, cudaMemcpyHostToDevice);
          //cudaMemcpy(dev_bonds, host_bonds, sizeof(uint4) * p->np, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_radius, p->radius, memSizef, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_es_dot, p->es_dot, memSizef, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_es, p->es, memSizef, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_p, p->p, memSizef, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_radius, p->radius, memSizeF, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_es_dot, p->es_dot, memSizeF, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_es, p->es, memSizeF, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_p, p->p, memSizeF, cudaMemcpyHostToDevice);
        
          // Wall data (wall mass and number in constant memory)
       -  cudaMemcpy(dev_w_nx, host_w_nx, sizeof(float)*params->nw*4, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(float)*params->nw*4, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_w_nx, host_w_nx, sizeof(Float)*params->nw*4, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_w_mvfd, host_w_mvfd, sizeof(Float)*params->nw*4, cudaMemcpyHostToDevice);
          
          // Initialize contacts lists to p.np
          unsigned int* npu = new unsigned int[p->np*NC];
       t@@ -1750,12 +294,12 @@ __host__ void gpuMain(float4* host_x,
        
          // Create array of 0.0 values on the host and transfer these to the distance 
          // modifier and shear displacement arrays
       -  float4* zerosf4 = new float4[p->np*NC];
       +  Float4* zerosF4 = new Float4[p->np*NC];
          for (unsigned int i=0; i<(p->np*NC); ++i)
       -    zerosf4[i] = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
       -  cudaMemcpy(dev_distmod, zerosf4, sizeof(float4)*p->np*NC, cudaMemcpyHostToDevice);
       -  cudaMemcpy(dev_delta_t, zerosf4, sizeof(float4)*p->np*NC, cudaMemcpyHostToDevice);
       -  delete[] zerosf4;
       +    zerosF4[i] = MAKE_FLOAT4(0.0f, 0.0f, 0.0f, 0.0f);
       +  cudaMemcpy(dev_distmod, zerosF4, sizeof(Float4)*p->np*NC, cudaMemcpyHostToDevice);
       +  cudaMemcpy(dev_delta_t, zerosF4, sizeof(Float4)*p->np*NC, cudaMemcpyHostToDevice);
       +  delete[] zerosF4;
        
          checkForCudaErrors("Post memcopy");
          cout << "Done\n";
       t@@ -1783,7 +327,7 @@ __host__ void gpuMain(float4* host_x,
          // Shared memory per block
          unsigned int smemSize = sizeof(unsigned int)*(threadsPerBlock+1);
        
       -  cudaMalloc((void**)&dev_w_force_partial, sizeof(float)*dimGrid.x);
       +  cudaMalloc((void**)&dev_w_force_partial, sizeof(Float)*dimGrid.x);
        
          // Report to stdout
          cout << "\n  Device memory allocation and transfer complete.\n"
       t@@ -1821,9 +365,7 @@ __host__ void gpuMain(float4* host_x,
          cout << "\n  Entering the main calculation time loop...\n\n"
               << "  IMPORTANT: Do not close this terminal, doing so will \n"
               << "             terminate this SPHERE process. Follow the \n"
       -       << "             progress in MATLAB using:\n"
       -       << "                >> status('" << inputbin << "')\n"
       -       << "             or in this directory by executing:\n"
       +       << "             progress by executing:\n"
               << "                $ ./sphere_status " << inputbin << "\n\n";
        
          // Enable cuPrintf()
       t@@ -1835,6 +377,23 @@ __host__ void gpuMain(float4* host_x,
          cudaEventCreate(&dev_toc);
          cudaEventRecord(dev_tic, 0);
        
       +  // If profiling is enabled, initialize timers for each kernel
       +  cudaEvent_t kernel_tic, kernel_toc;
       +  float kernel_elapsed;
       +  double t_calcParticleCellID = 0.0;
       +  double t_thrustsort = 0.0;
       +  double t_reorderArrays = 0.0;
       +  double t_topology = 0.0;
       +  double t_interact = 0.0;
       +  double t_integrate = 0.0;
       +  double t_summation = 0.0;
       +  double t_integrateWalls = 0.0;
       +
       +  if (PROFILING == 1) {
       +    cudaEventCreate(&kernel_tic);
       +    cudaEventCreate(&kernel_toc);
       +  }
       +
          cout << "  Current simulation time: " << time->current << " s.";
        
        
       t@@ -1855,20 +414,28 @@ __host__ void gpuMain(float4* host_x,
            // For each particle: 
            // Compute hash key (cell index) from position 
            // in the fine, uniform and homogenous grid.
       +    if (PROFILING == 1)
       +      startTimer(&kernel_tic);
            calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID, 
                                                      dev_gridParticleIndex, 
                                                      dev_x);
        
            // Synchronization point
            cudaThreadSynchronize();
       +    if (PROFILING == 1)
       +      stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_calcParticleCellID);
            checkForCudaErrors("Post calcParticleCellID");
        
        
            // Sort particle (key, particle ID) pairs by hash key with Thrust radix sort
       +    if (PROFILING == 1)
       +      startTimer(&kernel_tic);
            thrust::sort_by_key(thrust::device_ptr<uint>(dev_gridParticleCellID),
                                thrust::device_ptr<uint>(dev_gridParticleCellID + p->np),
                                thrust::device_ptr<uint>(dev_gridParticleIndex));
            cudaThreadSynchronize(); // Needed? Does thrust synchronize threads implicitly?
       +    if (PROFILING == 1)
       +      stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_thrustsort);
            checkForCudaErrors("Post calcParticleCellID");
        
        
       t@@ -1882,6 +449,8 @@ __host__ void gpuMain(float4* host_x,
        
            // Use sorted order to reorder particle arrays (position, velocities, radii) to ensure
            // coherent memory access. Save ordered configurations in new arrays (*_sorted).
       +    if (PROFILING == 1)
       +      startTimer(&kernel_tic);
            reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart, 
                                                           dev_cellEnd,
                                                           dev_gridParticleCellID, 
       t@@ -1897,6 +466,8 @@ __host__ void gpuMain(float4* host_x,
        
            // Synchronization point
            cudaThreadSynchronize();
       +    if (PROFILING == 1)
       +      stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_reorderArrays);
            checkForCudaErrors("Post reorderArrays", iter);
        
            // The contact search in topology() is only necessary for determining
       t@@ -1904,6 +475,8 @@ __host__ void gpuMain(float4* host_x,
            // shear force model
            if (params->shearmodel == 2) {
              // For each particle: Search contacts in neighbor cells
       +      if (PROFILING == 1)
       +        startTimer(&kernel_tic);
              topology<<<dimGrid, dimBlock>>>(dev_cellStart, 
                                                dev_cellEnd,
                                              dev_gridParticleIndex,
       t@@ -1918,11 +491,15 @@ __host__ void gpuMain(float4* host_x,
        
              // Synchronization point
              cudaThreadSynchronize();
       +      if (PROFILING == 1)
       +        stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_topology);
              checkForCudaErrors("Post topology: One or more particles moved outside the grid.\nThis could possibly be caused by a numerical instability.\nIs the computational time step too large?", iter);
            }
        
        
            // For each particle: Process collisions and compute resulting forces.
       +    if (PROFILING == 1)
       +      startTimer(&kernel_tic);
            interact<<<dimGrid, dimBlock>>>(dev_gridParticleIndex,
                                            dev_cellStart,
                                            dev_cellEnd,
       t@@ -1944,23 +521,37 @@ __host__ void gpuMain(float4* host_x,
        
            // Synchronization point
            cudaThreadSynchronize();
       +    if (PROFILING == 1)
       +      stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_interact);
            checkForCudaErrors("Post interact - often caused if particles move outside the grid", iter);
        
            // Update particle kinematics
       +    if (PROFILING == 1)
       +      startTimer(&kernel_tic);
            integrate<<<dimGrid, dimBlock>>>(dev_x_sorted, dev_vel_sorted, 
                                             dev_angvel_sorted, dev_radius_sorted,
                                             dev_x, dev_vel, dev_angvel,
                                             dev_force, dev_torque, 
                                             dev_gridParticleIndex);
        
       +    cudaThreadSynchronize();
       +    if (PROFILING == 1)
       +      stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_integrate);
       +
            // Summation of forces on wall
       +    if (PROFILING == 1)
       +      startTimer(&kernel_tic);
            summation<<<dimGrid, dimBlock>>>(dev_w_force, dev_w_force_partial);
        
            // Synchronization point
            cudaThreadSynchronize();
       +    if (PROFILING == 1)
       +      stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_summation);
            checkForCudaErrors("Post integrate & wall force summation");
        
            // Update wall kinematics
       +    if (PROFILING == 1)
       +      startTimer(&kernel_tic);
            integrateWalls<<< 1, params->nw>>>(dev_w_nx, 
                                               dev_w_mvfd,
                                               dev_w_force_partial,
       t@@ -1968,6 +559,8 @@ __host__ void gpuMain(float4* host_x,
        
            // Synchronization point
            cudaThreadSynchronize();
       +    if (PROFILING == 1)
       +      stopTimer(&kernel_tic, &kernel_toc, &kernel_elapsed, &t_integrateWalls);
            checkForCudaErrors("Post integrateWalls");
        
        
       t@@ -1976,7 +569,8 @@ __host__ void gpuMain(float4* host_x,
            filetimeclock    += time->dt;
        
            // Report time to console
       -    cout << "\r  Current simulation time: " << time->current << " s.        ";
       +    cout << "\r  Current simulation time: " 
       +         << time->current << " s.        ";// << std::flush;
        
        
            // Produce output binary if the time interval 
       t@@ -1990,22 +584,22 @@ __host__ void gpuMain(float4* host_x,
              //// Copy device data to host memory
        
              // Particle data
       -      cudaMemcpy(host_x, dev_x, memSizef4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_vel, dev_vel, memSizef4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_acc, dev_acc, memSizef4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_angvel, dev_angvel, memSizef4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_force, dev_force, memSizef4, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(host_torque, dev_torque, memSizef4, cudaMemcpyDeviceToHost);
       +      cudaMemcpy(host_x, dev_x, memSizeF4, cudaMemcpyDeviceToHost);
       +      cudaMemcpy(host_vel, dev_vel, memSizeF4, cudaMemcpyDeviceToHost);
       +      cudaMemcpy(host_acc, dev_acc, memSizeF4, cudaMemcpyDeviceToHost);
       +      cudaMemcpy(host_angvel, dev_angvel, memSizeF4, cudaMemcpyDeviceToHost);
       +      cudaMemcpy(host_force, dev_force, memSizeF4, cudaMemcpyDeviceToHost);
       +      cudaMemcpy(host_torque, dev_torque, memSizeF4, cudaMemcpyDeviceToHost);
              //cudaMemcpy(host_bonds, dev_bonds, sizeof(uint4) * p->np, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(p->es_dot, dev_es_dot, memSizef, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(p->es, dev_es, memSizef, cudaMemcpyDeviceToHost);
       -      cudaMemcpy(p->p, dev_p, memSizef, cudaMemcpyDeviceToHost);
       +      cudaMemcpy(p->es_dot, dev_es_dot, memSizeF, cudaMemcpyDeviceToHost);
       +      cudaMemcpy(p->es, dev_es, memSizeF, cudaMemcpyDeviceToHost);
       +      cudaMemcpy(p->p, dev_p, memSizeF, cudaMemcpyDeviceToHost);
        
              // Wall data
              cudaMemcpy(host_w_nx, dev_w_nx, 
       -                   sizeof(float)*params->nw*4, cudaMemcpyDeviceToHost);
       +                   sizeof(Float)*params->nw*4, cudaMemcpyDeviceToHost);
              cudaMemcpy(host_w_mvfd, dev_w_mvfd, 
       -                   sizeof(float)*params->nw*4, cudaMemcpyDeviceToHost);
       +                   sizeof(Float)*params->nw*4, cudaMemcpyDeviceToHost);
        
              // Pause the CPU thread until all CUDA calls previously issued are completed
              cudaThreadSynchronize();
       t@@ -2057,6 +651,23 @@ __host__ void gpuMain(float4* host_x,
          cudaEventDestroy(dev_tic);
          cudaEventDestroy(dev_toc);
        
       +  cudaEventDestroy(kernel_tic);
       +  cudaEventDestroy(kernel_toc);
       +
       +  // Report time spent on each kernel
       +  if (PROFILING == 1) {
       +    cout << "\nKernel profiling statistics:\n"
       +         << "  - calcParticleCellID:\t" << t_calcParticleCellID/1000.0 << " s\n"
       +         << "  - thrustsort:\t\t" << t_thrustsort/1000.0 << " s\n"
       +         << "  - reorderArrays:\t" << t_reorderArrays/1000.0 << " s\n"
       +         << "  - topology:\t\t" << t_topology/1000.0 << " s\n"
       +         << "  - interact:\t\t" << t_interact/1000.0 << " s\n"
       +         << "  - integrate:\t\t" << t_integrate/1000.0 << " s\n"
       +         << "  - summation:\t\t" << t_summation/1000.0 << " s\n"
       +         << "  - integrateWalls:\t" << t_integrateWalls/1000.0 << " s\n";
       +  }
       +
       +
          // Free memory allocated to cudaPrintfInit
          //cudaPrintfEnd();
        
 (DIR) diff --git a/src/file_io.cpp b/src/file_io.cpp
       t@@ -7,12 +7,22 @@
        
        
        // Write host variables to target binary file
       -int fwritebin(char *target, Particles *p, float4 *host_x, float4 *host_vel, float4 *host_angvel, 
       -    float4 *host_force, float4 *host_torque, 
       +// The output format should ALWAYS be double precision,
       +// so this function will typecast the data before write
       +// if it is single precision.
       +int fwritebin(char *target, 
       +    Particles *p, 
       +    Float4 *host_x, 
       +    Float4 *host_vel, 
       +    Float4 *host_angvel, 
       +    Float4 *host_force, 
       +    Float4 *host_torque, 
            uint4 *host_bonds,
       -    Grid *grid, Time *time, 
       +    Grid *grid, 
       +    Time *time, 
            Params *params,
       -    float4 *host_w_nx, float4 *host_w_mvfd)
       +    Float4 *host_w_nx, 
       +    Float4 *host_w_mvfd)
        {
        
          FILE *fp;
       t@@ -24,110 +34,286 @@ int fwritebin(char *target, Particles *p, float4 *host_x, float4 *host_vel, floa
            return 1; // Return unsuccessful exit status
          }
        
       -  // World dimensions
       -  fwrite(&grid->nd, sizeof(grid->nd), 1, fp);
       +  // If double precision: Values can be written directly
       +  if (sizeof(Float) == sizeof(double)) {
        
       -  // Number of particles
       -  fwrite(&p->np, sizeof(p->np), 1, fp);
       +    // World dimensions
       +    fwrite(&grid->nd, sizeof(grid->nd), 1, fp);
        
       -  // Temporal parameters
       -  fwrite(&time->dt, sizeof(time->dt), 1, fp);
       -  fwrite(&time->current, sizeof(time->current), 1, fp);
       -  fwrite(&time->total, sizeof(time->total), 1, fp);
       -  fwrite(&time->file_dt, sizeof(time->file_dt), 1, fp);
       -  fwrite(&time->step_count, sizeof(time->step_count), 1, fp);
       +    // Number of particles
       +    fwrite(&p->np, sizeof(p->np), 1, fp);
        
       -  // World coordinate system origo
       -  for (u=0; u<grid->nd; ++u) {
       -    fwrite(&grid->origo[u], sizeof(grid->origo[u]), 1, fp);
       -  }
       +    // Temporal parameters
       +    fwrite(&time->dt, sizeof(time->dt), 1, fp);
       +    fwrite(&time->current, sizeof(time->current), 1, fp);
       +    fwrite(&time->total, sizeof(time->total), 1, fp);
       +    fwrite(&time->file_dt, sizeof(time->file_dt), 1, fp);
       +    fwrite(&time->step_count, sizeof(time->step_count), 1, fp);
        
       -  // World dimensions
       -  for (u=0; u<grid->nd; ++u) {
       -    fwrite(&grid->L[u], sizeof(grid->L[u]), 1, fp);
       -  }
       +    // World coordinate system origo
       +    for (u=0; u<grid->nd; ++u) {
       +      fwrite(&grid->origo[u], sizeof(grid->origo[u]), 1, fp);
       +    }
        
       -  // Grid cells along each dimension
       -  for (u=0; u<grid->nd; ++u) {
       -    fwrite(&grid->num[u], sizeof(grid->num[u]), 1, fp);
       -  }
       +    // World dimensions
       +    for (u=0; u<grid->nd; ++u) {
       +      fwrite(&grid->L[u], sizeof(grid->L[u]), 1, fp);
       +    }
        
       -  // Particle vectors
       -  for (j=0; j<p->np; ++j) {
       -    // x-axis
       -    fwrite(&host_x[j].x, sizeof(float), 1, fp);
       -    fwrite(&host_vel[j].x, sizeof(float), 1, fp);
       -    fwrite(&host_angvel[j].x, sizeof(float), 1, fp);
       -    fwrite(&host_force[j].x, sizeof(float), 1, fp);
       -    fwrite(&host_torque[j].x, sizeof(float), 1, fp);
       -
       -    // y-axis
       -    fwrite(&host_x[j].y, sizeof(float), 1, fp);
       -    fwrite(&host_vel[j].y, sizeof(float), 1, fp);
       -    fwrite(&host_angvel[j].y, sizeof(float), 1, fp);
       -    fwrite(&host_force[j].y, sizeof(float), 1, fp);
       -    fwrite(&host_torque[j].y, sizeof(float), 1, fp);
       -
       -    // z-axis
       -    fwrite(&host_x[j].z, sizeof(float), 1, fp);
       -    fwrite(&host_vel[j].z, sizeof(float), 1, fp);
       -    fwrite(&host_angvel[j].z, sizeof(float), 1, fp);
       -    fwrite(&host_force[j].z, sizeof(float), 1, fp);
       -    fwrite(&host_torque[j].z, sizeof(float), 1, fp);
       -  } 
       -
       -  // Individual particle values
       -  for (j=0; j<p->np; ++j) {
       -    fwrite(&host_vel[j].w, sizeof(float), 1, fp);
       -    fwrite(&host_x[j].w, sizeof(float), 1, fp);
       -    fwrite(&p->radius[j], sizeof(p->radius[j]), 1, fp);
       -    fwrite(&p->rho[j], sizeof(p->rho[j]), 1, fp);
       -    fwrite(&p->k_n[j], sizeof(p->k_n[j]), 1, fp);
       -    fwrite(&p->k_s[j], sizeof(p->k_s[j]), 1, fp);
       -    fwrite(&p->k_r[j], sizeof(p->k_r[j]), 1, fp);
       -    fwrite(&p->gamma_n[j], sizeof(p->gamma_n[j]), 1, fp);
       -    fwrite(&p->gamma_s[j], sizeof(p->gamma_s[j]), 1, fp);
       -    fwrite(&p->gamma_r[j], sizeof(p->gamma_r[j]), 1, fp);
       -    fwrite(&p->mu_s[j], sizeof(p->mu_s[j]), 1, fp);
       -    fwrite(&p->mu_d[j], sizeof(p->mu_d[j]), 1, fp);
       -    fwrite(&p->mu_r[j], sizeof(p->mu_r[j]), 1, fp);
       -    fwrite(&p->es_dot[j], sizeof(p->es_dot[j]), 1, fp);
       -    fwrite(&p->es[j], sizeof(p->es[j]), 1, fp);
       -    fwrite(&p->p[j], sizeof(p->p[j]), 1, fp);
       -  }
       +    // Grid cells along each dimension
       +    for (u=0; u<grid->nd; ++u) {
       +      fwrite(&grid->num[u], sizeof(grid->num[u]), 1, fp);
       +    }
        
       -  // Singular parameters
       -  fwrite(&params->global, sizeof(params->global), 1, fp);
       -  for (u=0; u<grid->nd; ++u) {
       -    fwrite(&params->g[u], sizeof(params->g[u]), 1, fp);
       -  }
       -  fwrite(&params->kappa, sizeof(params->kappa), 1, fp);
       -  fwrite(&params->db, sizeof(params->db), 1, fp);
       -  fwrite(&params->V_b, sizeof(params->V_b), 1, fp);
       -  fwrite(&params->shearmodel, sizeof(params->shearmodel), 1, fp);
       -
       -  // Walls
       -  fwrite(&params->nw, sizeof(params->nw), 1, fp); // No. of walls
       -  for (j=0; j<params->nw; ++j) {
       -    // Wall normal
       -    fwrite(&host_w_nx[j].x, sizeof(float), 1, fp);
       -    fwrite(&host_w_nx[j].y, sizeof(float), 1, fp);
       -    fwrite(&host_w_nx[j].z, sizeof(float), 1, fp);
       -
       -    fwrite(&host_w_nx[j].w, sizeof(float), 1, fp);   // Wall position
       -    fwrite(&host_w_mvfd[j].x, sizeof(float), 1, fp); // Wall mass
       -    fwrite(&host_w_mvfd[j].y, sizeof(float), 1, fp); // Wall velocity
       -    fwrite(&host_w_mvfd[j].z, sizeof(float), 1, fp); // Wall force
       -    fwrite(&host_w_mvfd[j].w, sizeof(float), 1, fp); // Wall deviatoric stress
       -  }
       -  fwrite(&params->periodic, sizeof(params->periodic), 1, fp);
       -
       -  // Write bond pair values
       -  for (j=0; j<p->np; ++j) {
       -    fwrite(&host_bonds[j].x, sizeof(unsigned int), 1, fp);
       -    fwrite(&host_bonds[j].y, sizeof(unsigned int), 1, fp);
       -    fwrite(&host_bonds[j].z, sizeof(unsigned int), 1, fp);
       -    fwrite(&host_bonds[j].w, sizeof(unsigned int), 1, fp);
       +    // Particle vectors
       +    for (j=0; j<p->np; ++j) {
       +      // x-axis
       +      fwrite(&host_x[j].x, sizeof(Float), 1, fp);
       +      fwrite(&host_vel[j].x, sizeof(Float), 1, fp);
       +      fwrite(&host_angvel[j].x, sizeof(Float), 1, fp);
       +      fwrite(&host_force[j].x, sizeof(Float), 1, fp);
       +      fwrite(&host_torque[j].x, sizeof(Float), 1, fp);
       +
       +      // y-axis
       +      fwrite(&host_x[j].y, sizeof(Float), 1, fp);
       +      fwrite(&host_vel[j].y, sizeof(Float), 1, fp);
       +      fwrite(&host_angvel[j].y, sizeof(Float), 1, fp);
       +      fwrite(&host_force[j].y, sizeof(Float), 1, fp);
       +      fwrite(&host_torque[j].y, sizeof(Float), 1, fp);
       +
       +      // z-axis
       +      fwrite(&host_x[j].z, sizeof(Float), 1, fp);
       +      fwrite(&host_vel[j].z, sizeof(Float), 1, fp);
       +      fwrite(&host_angvel[j].z, sizeof(Float), 1, fp);
       +      fwrite(&host_force[j].z, sizeof(Float), 1, fp);
       +      fwrite(&host_torque[j].z, sizeof(Float), 1, fp);
       +    } 
       +
       +    // Individual particle values
       +    for (j=0; j<p->np; ++j) {
       +      fwrite(&host_vel[j].w, sizeof(Float), 1, fp);
       +      fwrite(&host_x[j].w, sizeof(Float), 1, fp);
       +      fwrite(&p->radius[j], sizeof(p->radius[j]), 1, fp);
       +      fwrite(&p->rho[j], sizeof(p->rho[j]), 1, fp);
       +      fwrite(&p->k_n[j], sizeof(p->k_n[j]), 1, fp);
       +      fwrite(&p->k_s[j], sizeof(p->k_s[j]), 1, fp);
       +      fwrite(&p->k_r[j], sizeof(p->k_r[j]), 1, fp);
       +      fwrite(&p->gamma_n[j], sizeof(p->gamma_n[j]), 1, fp);
       +      fwrite(&p->gamma_s[j], sizeof(p->gamma_s[j]), 1, fp);
       +      fwrite(&p->gamma_r[j], sizeof(p->gamma_r[j]), 1, fp);
       +      fwrite(&p->mu_s[j], sizeof(p->mu_s[j]), 1, fp);
       +      fwrite(&p->mu_d[j], sizeof(p->mu_d[j]), 1, fp);
       +      fwrite(&p->mu_r[j], sizeof(p->mu_r[j]), 1, fp);
       +      fwrite(&p->es_dot[j], sizeof(p->es_dot[j]), 1, fp);
       +      fwrite(&p->es[j], sizeof(p->es[j]), 1, fp);
       +      fwrite(&p->p[j], sizeof(p->p[j]), 1, fp);
       +    }
       +
       +    // Singular parameters
       +    fwrite(&params->global, sizeof(params->global), 1, fp);
       +    for (u=0; u<grid->nd; ++u) {
       +      fwrite(&params->g[u], sizeof(params->g[u]), 1, fp);
       +    }
       +    fwrite(&params->kappa, sizeof(params->kappa), 1, fp);
       +    fwrite(&params->db, sizeof(params->db), 1, fp);
       +    fwrite(&params->V_b, sizeof(params->V_b), 1, fp);
       +    fwrite(&params->shearmodel, sizeof(params->shearmodel), 1, fp);
       +
       +    // Walls
       +    fwrite(&params->nw, sizeof(params->nw), 1, fp); // No. of walls
       +    for (j=0; j<params->nw; ++j) {
       +      // Wall normal
       +      fwrite(&host_w_nx[j].x, sizeof(Float), 1, fp);
       +      fwrite(&host_w_nx[j].y, sizeof(Float), 1, fp);
       +      fwrite(&host_w_nx[j].z, sizeof(Float), 1, fp);
       +
       +      fwrite(&host_w_nx[j].w, sizeof(Float), 1, fp);   // Wall position
       +      fwrite(&host_w_mvfd[j].x, sizeof(Float), 1, fp); // Wall mass
       +      fwrite(&host_w_mvfd[j].y, sizeof(Float), 1, fp); // Wall velocity
       +      fwrite(&host_w_mvfd[j].z, sizeof(Float), 1, fp); // Wall force
       +      fwrite(&host_w_mvfd[j].w, sizeof(Float), 1, fp); // Wall deviatoric stress
       +    }
       +    fwrite(&params->periodic, sizeof(params->periodic), 1, fp);
       +    fwrite(&params->gamma_wn, sizeof(params->gamma_wn), 1, fp);
       +    fwrite(&params->gamma_ws, sizeof(params->gamma_ws), 1, fp);
       +    fwrite(&params->gamma_wr, sizeof(params->gamma_wr), 1, fp);
       +
       +
       +    // Write bond pair values
       +    for (j=0; j<p->np; ++j) {
       +      fwrite(&host_bonds[j].x, sizeof(unsigned int), 1, fp);
       +      fwrite(&host_bonds[j].y, sizeof(unsigned int), 1, fp);
       +      fwrite(&host_bonds[j].z, sizeof(unsigned int), 1, fp);
       +      fwrite(&host_bonds[j].w, sizeof(unsigned int), 1, fp);
       +    }
       +
       +  } else if (sizeof(Float) == sizeof(float)) {
       +    // Single precision: Type conversion required
       +
       +    double d; // Double precision placeholder
       +
       +    // World dimensions
       +    fwrite(&grid->nd, sizeof(grid->nd), 1, fp);
       +
       +    // Number of particles
       +    fwrite(&p->np, sizeof(p->np), 1, fp);
       +
       +    // Temporal parameters
       +    d = (double)time->dt;
       +    fwrite(&d, sizeof(d), 1, fp);
       +    fwrite(&time->current, sizeof(time->current), 1, fp);
       +    fwrite(&time->total, sizeof(time->total), 1, fp);
       +    d = (double)time->file_dt;
       +    fwrite(&d, sizeof(d), 1, fp);
       +    fwrite(&time->step_count, sizeof(time->step_count), 1, fp);
       +
       +    // World coordinate system origo
       +    for (u=0; u<grid->nd; ++u) {
       +      d = (double)grid->origo[u];
       +      fwrite(&d, sizeof(d), 1, fp);
       +    }
       +
       +    // World dimensions
       +    for (u=0; u<grid->nd; ++u) {
       +      d = (double)grid->L[u];
       +      fwrite(&d, sizeof(d), 1, fp);
       +    }
       +
       +    // Grid cells along each dimension
       +    for (u=0; u<grid->nd; ++u) {
       +      fwrite(&grid->num[u], sizeof(grid->num[u]), 1, fp);
       +    }
       +
       +    // Particle vectors
       +    for (j=0; j<p->np; ++j) {
       +      // x-axis
       +      d = (double)host_x[j].x;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_vel[j].x;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_angvel[j].x;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_force[j].x;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_torque[j].x;
       +      fwrite(&d, sizeof(d), 1, fp);
       +
       +      // y-axis
       +      d = (double)host_x[j].y;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_vel[j].y;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_angvel[j].y;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_force[j].y;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_torque[j].y;
       +      fwrite(&d, sizeof(d), 1, fp);
       +
       +      // z-axis
       +      d = (double)host_x[j].z;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_vel[j].z;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_angvel[j].z;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_force[j].z;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_torque[j].z;
       +      fwrite(&d, sizeof(d), 1, fp);
       +    } 
       +
       +    // Individual particle values
       +    for (j=0; j<p->np; ++j) {
       +      d = (double)host_vel[j].w;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_x[j].w;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->radius[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->rho[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->k_n[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->k_s[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->k_r[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->gamma_n[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->gamma_s[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->gamma_r[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->mu_s[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->mu_d[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->mu_r[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->es_dot[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->es[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)p->p[j];
       +      fwrite(&d, sizeof(d), 1, fp);
       +    }
       +
       +    // Singular parameters
       +    fwrite(&params->global, sizeof(params->global), 1, fp);
       +    for (u=0; u<grid->nd; ++u) {
       +      d = (double)params->g[u];
       +      fwrite(&d, sizeof(d), 1, fp);
       +    }
       +    d = (double)params->kappa;
       +    fwrite(&d, sizeof(d), 1, fp);
       +    d = (double)params->db;
       +    fwrite(&d, sizeof(d), 1, fp);
       +    d = (double)params->V_b;
       +    fwrite(&d, sizeof(d), 1, fp);
       +    fwrite(&params->shearmodel, sizeof(params->shearmodel), 1, fp);
       +
       +    // Walls
       +    fwrite(&params->nw, sizeof(params->nw), 1, fp); // No. of walls
       +    for (j=0; j<params->nw; ++j) {
       +      // Wall normal
       +      d = (double)host_w_nx[j].x;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_w_nx[j].y;
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_w_nx[j].z;
       +      fwrite(&d, sizeof(d), 1, fp);
       +
       +      d = (double)host_w_nx[j].w;         // Wall position
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_w_mvfd[j].x;        // Wall mass
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_w_mvfd[j].y;        // Wall velocity
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_w_mvfd[j].z;        // Wall force
       +      fwrite(&d, sizeof(d), 1, fp);
       +      d = (double)host_w_mvfd[j].w;        // Wall deviatoric stress
       +      fwrite(&d, sizeof(d), 1, fp);
       +    }
       +    fwrite(&params->periodic, sizeof(params->periodic), 1, fp);
       +    d = (double)params->gamma_wn;
       +    fwrite(&d, sizeof(d), 1, fp);
       +    d = (double)params->gamma_ws;
       +    fwrite(&d, sizeof(d), 1, fp);
       +    d = (double)params->gamma_wr;
       +    fwrite(&d, sizeof(d), 1, fp);
       +
       +    // Write bond pair values
       +    for (j=0; j<p->np; ++j) {
       +      fwrite(&host_bonds[j].x, sizeof(unsigned int), 1, fp);
       +      fwrite(&host_bonds[j].y, sizeof(unsigned int), 1, fp);
       +      fwrite(&host_bonds[j].z, sizeof(unsigned int), 1, fp);
       +      fwrite(&host_bonds[j].w, sizeof(unsigned int), 1, fp);
       +    }
       +
       +  } else {
       +    fprintf(stderr, "Error: Chosen floating-point precision is incompatible with the data file format.\n");
       +    exit(1);
          }
        
          fclose(fp);
 (DIR) diff --git a/src/main.cpp b/src/main.cpp
       t@@ -2,10 +2,10 @@
        /*  SPHERE source code by Anders Damsgaard Christensen, 2010-12,       */
        /*  a 3D Discrete Element Method algorithm with CUDA GPU acceleration. */
        /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
       -
       +// Licence: Gnu Public License (GPL) v. 3. See license.txt.
        // See doc/sphere-doc.pdf for full documentation.
        // Compile with GNU make by typing 'make' in the src/ directory.               
       -// SPHERE is called from the command line with e.g. './sphere_<architecture> projectname' 
       +// SPHERE is called from the command line with './sphere_<architecture> projectname' 
        
        
        // Including library files
       t@@ -15,7 +15,7 @@
        #include <unistd.h> // UNIX only: For getcwd
        #include <string.h> // For strerror and strcmp
        #include <errno.h>  // For errno
       -#include <math.h>   // Basic floating point mathematical operations
       +#include <math.h>   // Basic Floating point mathematical operations
        #include <time.h>   // Time and date functions for timer
        
        // Including user files
       t@@ -139,41 +139,41 @@ int main(int argc, char *argv[])
               << time.step_count << "\n";
        
        
       -  // For spatial vectors an array of float4 vectors is chosen for best fit with 
       +  // For spatial vectors an array of Float4 vectors is chosen for best fit with 
          // GPU memory handling. Vector variable structure: ( x, y, z, <empty>).
          // Indexing starts from 0.
        
          // Allocate host arrays
          cout << "\n  Allocating host memory:                         ";
       -  grid.origo = new float[ND];        // Coordinate system origo
       -  grid.L     = new float[ND];        // Model world dimensions
       +  grid.origo = new Float[ND];        // Coordinate system origo
       +  grid.L     = new Float[ND];        // Model world dimensions
          grid.num   = new unsigned int[ND]; // Number of cells in each dimension
       -  params.g   = new float[ND];             // Gravitational acceleration vector
       -  p.radius   = new float[p.np];      // Particle radii
       -  p.rho      = new float[p.np];      // Particle densities
       -  p.m        = new float[p.np];      // Particle masses
       -  p.I        = new float[p.np];      // Particle moment of inertia
       -  p.k_n      = new float[p.np];      // Particle normal stiffnesses
       -  p.k_s             = new float[p.np];             // Particle shear stiffnesses
       -  p.k_r             = new float[p.np];             // Particle rolling stiffnesses
       -  p.gamma_n  = new float[p.np];             // Particle normal viscosity
       -  p.gamma_s  = new float[p.np];      // Particle shear viscosity
       -  p.gamma_r  = new float[p.np];             // Particle rolling viscosity
       -  p.mu_s     = new float[p.np];      // Inter-particle static shear contact friction coefficients
       -  p.mu_d     = new float[p.np];             // Inter-particle dynamic shear contact friction coefficients
       -  p.mu_r     = new float[p.np];             // Inter-particle rolling contact friction coefficients
       -  p.es_dot   = new float[p.np];             // Rate of shear energy dissipation
       -  p.es       = new float[p.np];             // Total shear energy dissipation
       -  p.p             = new float[p.np];             // Pressure excerted onto particle
       -
       -  // Allocate float4 host arrays
       -  float4 *host_x      = new float4[p.np];  // Center coordinates for each particle (x)
       -  float4 *host_vel    = new float4[p.np];  // Particle velocities (dotx = v)
       -  float4 *host_acc    = new float4[p.np];  // Particle accellerations (dotdotx = a)
       -  float4 *host_angvel = new float4[p.np];  // Particle angular velocity vector (omega)
       -  float4 *host_angacc = new float4[p.np];  // Particle angular acceleration vector (dotomega)
       -  float4 *host_force  = new float4[p.np];  // Particle summed force
       -  float4 *host_torque = new float4[p.np];  // Particle summed torque
       +  params.g   = new Float[ND];             // Gravitational acceleration vector
       +  p.radius   = new Float[p.np];      // Particle radii
       +  p.rho      = new Float[p.np];      // Particle densities
       +  p.m        = new Float[p.np];      // Particle masses
       +  p.I        = new Float[p.np];      // Particle moment of inertia
       +  p.k_n      = new Float[p.np];      // Particle normal stiffnesses
       +  p.k_s             = new Float[p.np];             // Particle shear stiffnesses
       +  p.k_r             = new Float[p.np];             // Particle rolling stiffnesses
       +  p.gamma_n  = new Float[p.np];             // Particle normal viscosity
       +  p.gamma_s  = new Float[p.np];      // Particle shear viscosity
       +  p.gamma_r  = new Float[p.np];             // Particle rolling viscosity
       +  p.mu_s     = new Float[p.np];      // Inter-particle static shear contact friction coefficients
       +  p.mu_d     = new Float[p.np];             // Inter-particle dynamic shear contact friction coefficients
       +  p.mu_r     = new Float[p.np];             // Inter-particle rolling contact friction coefficients
       +  p.es_dot   = new Float[p.np];             // Rate of shear energy dissipation
       +  p.es       = new Float[p.np];             // Total shear energy dissipation
       +  p.p             = new Float[p.np];             // Pressure excerted onto particle
       +
       +  // Allocate Float4 host arrays
       +  Float4 *host_x      = new Float4[p.np];  // Center coordinates for each particle (x)
       +  Float4 *host_vel    = new Float4[p.np];  // Particle velocities (dotx = v)
       +  Float4 *host_acc    = new Float4[p.np];  // Particle accellerations (dotdotx = a)
       +  Float4 *host_angvel = new Float4[p.np];  // Particle angular velocity vector (omega)
       +  Float4 *host_angacc = new Float4[p.np];  // Particle angular acceleration vector (dotomega)
       +  Float4 *host_force  = new Float4[p.np];  // Particle summed force
       +  Float4 *host_torque = new Float4[p.np];  // Particle summed torque
        
          uint4  *host_bonds  = new uint4[p.np];   // Bonds from particle [i] to two particles
          cout << "Done\n";
       t@@ -196,44 +196,44 @@ int main(int argc, char *argv[])
          }
        
          for (j=0; j<p.np; ++j) {
       -    if (fread(&host_x[j].x, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_x[j].x, sizeof(Float), 1, fp) != 1)
              return 1; // Return unsuccessful exit status
       -    if (fread(&host_vel[j].x, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_vel[j].x, sizeof(Float), 1, fp) != 1)
              return 1;
       -    if (fread(&host_angvel[j].x, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_angvel[j].x, sizeof(Float), 1, fp) != 1)
              return 1;
       -    if (fread(&host_force[j].x, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_force[j].x, sizeof(Float), 1, fp) != 1)
              return 1;
       -    if (fread(&host_torque[j].x, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_torque[j].x, sizeof(Float), 1, fp) != 1)
              return 1;
        
       -    if (fread(&host_x[j].y, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_x[j].y, sizeof(Float), 1, fp) != 1)
              return 1; // Return unsuccessful exit status
       -    if (fread(&host_vel[j].y, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_vel[j].y, sizeof(Float), 1, fp) != 1)
              return 1;
       -    if (fread(&host_angvel[j].y, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_angvel[j].y, sizeof(Float), 1, fp) != 1)
              return 1;
       -    if (fread(&host_force[j].y, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_force[j].y, sizeof(Float), 1, fp) != 1)
              return 1;
       -    if (fread(&host_torque[j].y, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_torque[j].y, sizeof(Float), 1, fp) != 1)
              return 1;
        
       -    if (fread(&host_x[j].z, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_x[j].z, sizeof(Float), 1, fp) != 1)
              return 1; // Return unsuccessful exit status
       -    if (fread(&host_vel[j].z, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_vel[j].z, sizeof(Float), 1, fp) != 1)
              return 1;
       -    if (fread(&host_angvel[j].z, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_angvel[j].z, sizeof(Float), 1, fp) != 1)
              return 1;
       -    if (fread(&host_force[j].z, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_force[j].z, sizeof(Float), 1, fp) != 1)
              return 1;
       -    if (fread(&host_torque[j].z, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_torque[j].z, sizeof(Float), 1, fp) != 1)
              return 1;
          }
        
          for (j=0; j<p.np; ++j) {
       -    if (fread(&host_vel[j].w, sizeof(float), 1, fp) != 1) // Fixvel
       +    if (fread(&host_vel[j].w, sizeof(Float), 1, fp) != 1) // Fixvel
              return 1; // Return unsuccessful exit status
       -    if (fread(&host_x[j].w, sizeof(float), 1, fp) != 1) // xsum
       +    if (fread(&host_x[j].w, sizeof(Float), 1, fp) != 1) // xsum
              return 1;
            if (fread(&p.radius[j], sizeof(p.radius[j]), 1, fp) != 1)
              return 1;
       t@@ -288,39 +288,39 @@ int main(int argc, char *argv[])
          // Allocate host memory for walls
          // Wall normal (x,y,z), w: wall position on axis parallel to wall normal
          // Wall mass (x), velocity (y), force (z), and deviatoric stress (w)
       -  float4 *host_w_nx   = new float4[params.nw];
       -  float4 *host_w_mvfd = new float4[params.nw]; 
       +  Float4 *host_w_nx   = new Float4[params.nw];
       +  Float4 *host_w_mvfd = new Float4[params.nw]; 
        
          // Read wall data
          for (j=0; j<params.nw; ++j) {
            // Wall normal, x-dimension
       -    if (fread(&host_w_nx[j].x, sizeof(float), 1, fp) != 1) // Read in single precision
       +    if (fread(&host_w_nx[j].x, sizeof(Float), 1, fp) != 1) // Read in single precision
              return 1;
            // Wall normal, y-dimension
       -    if (fread(&host_w_nx[j].y, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_w_nx[j].y, sizeof(Float), 1, fp) != 1)
              return 1;
            // Wall normal, z-dimension
       -    if (fread(&host_w_nx[j].z, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_w_nx[j].z, sizeof(Float), 1, fp) != 1)
              return 1;
        
            // Wall position along axis parallel to wall normal
       -    if (fread(&host_w_nx[j].w, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_w_nx[j].w, sizeof(Float), 1, fp) != 1)
              return 1;
        
            // Wall mass
       -    if (fread(&host_w_mvfd[j].x, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_w_mvfd[j].x, sizeof(Float), 1, fp) != 1)
              return 1;
        
            // Wall velocity along axis parallel to wall normal
       -    if (fread(&host_w_mvfd[j].y, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_w_mvfd[j].y, sizeof(Float), 1, fp) != 1)
              return 1;
        
            // Wall force along axis parallel to wall normal
       -    if (fread(&host_w_mvfd[j].z, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_w_mvfd[j].z, sizeof(Float), 1, fp) != 1)
              return 1;
        
            // Wall deviatoric stress
       -    if (fread(&host_w_mvfd[j].w, sizeof(float), 1, fp) != 1)
       +    if (fread(&host_w_mvfd[j].w, sizeof(Float), 1, fp) != 1)
              return 1;
          }
          // x- and y boundary behavior.
       t@@ -330,6 +330,15 @@ int main(int argc, char *argv[])
            params.periodic = 0;
          }
        
       +  // Wall viscosities
       +  if (fread(&params.gamma_wn, sizeof(params.gamma_wn), 1, fp) != 1)
       +    return 1;
       +  if (fread(&params.gamma_ws, sizeof(params.gamma_ws), 1, fp) != 1)
       +    return 1;
       +  if (fread(&params.gamma_wr, sizeof(params.gamma_wr), 1, fp) != 1)
       +    return 1;
       +
       +
          for (i=0; i<p.np; ++i) {
            if (fread(&host_bonds[i].x, sizeof(unsigned int), 1, fp) != 1)
              return 1;