Fixed threadsPerBlock value - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit 5c10dece10e2b7eb6aebc93258c423f586a83dd6
 (DIR) parent 18da38f2435da18e108b208f2054eb4c11107207
 (HTM) Author: Anders Damsgaard <adc@geo.au.dk>
       Date:   Thu,  2 May 2013 09:58:15 +0200
       
       Fixed threadsPerBlock value
       
       Diffstat:
         M CMakeLists.txt                      |      10 +++++-----
         M python/sphere.py                    |      37 ++++++++++++++++++++++---------
         M src/device.cu                       |       4 ++--
         M src/latticeboltzmann.cuh            |       1 -
       
       4 files changed, 33 insertions(+), 19 deletions(-)
       ---
 (DIR) diff --git a/CMakeLists.txt b/CMakeLists.txt
       @@ -24,13 +24,13 @@ find_package(OpenMP)
        enable_testing()
        
        # Set build type = Debug
       -#set(CMAKE_BUILD_TYPE Debug)
       -#if (CUDA_FOUND)
       -#    set(CUDA_NVCC_FLAGS -g;-G)
       -#endif()
       +set(CMAKE_BUILD_TYPE Debug)
       +if (CUDA_FOUND)
       +    set(CUDA_NVCC_FLAGS -g;-G)
       +endif()
        
        # Set build type = Release
       -set(CMAKE_BUILD_TYPE Release)
       +#set(CMAKE_BUILD_TYPE Release)
        
        # Add source directory to project.
        add_subdirectory(src)
 (DIR) diff --git a/python/sphere.py b/python/sphere.py
       @@ -116,9 +116,9 @@ class Spherebin:
        
                self.nu = numpy.zeros(1, dtype=numpy.float64)
                self.f_v = numpy.zeros(
       -            (self.num[0] * self.num[1] * self.num[2], self.nd),
       +            (self.num[0], self.num[1], self.num[2], self.nd),
                    dtype=numpy.float64)
       -        self.f_rho = numpy.zeros(self.num[0] * self.num[1] * self.num[2],
       +        self.f_rho = numpy.zeros((self.num[0], self.num[1], self.num[2]),
                                       dtype=numpy.float64)
        
            def __cmp__(self, other):
       @@ -308,22 +308,34 @@ class Spherebin:
                        self.tau_b = numpy.fromfile(fh, dtype=numpy.float64, count=1)
                        self.bonds = numpy.empty((self.nb0, 2), dtype=numpy.uint32)
                        for i in range(self.nb0):
       -                    self.bonds[i,0] = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
       -                    self.bonds[i,1] = numpy.fromfile(fh, dtype=numpy.uint32, count=1)
       -                self.bonds_delta_n = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0)
       -                self.bonds_delta_t = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0*self.nd).reshape(self.nb0, self.nd)
       -                self.bonds_omega_n = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0)
       -                self.bonds_omega_t = numpy.fromfile(fh, dtype=numpy.float64, count=self.nb0*self.nd).reshape(self.nb0, self.nd)
       +                    self.bonds[i,0] = numpy.fromfile(fh, dtype=numpy.uint32,
       +                            count=1)
       +                    self.bonds[i,1] = numpy.fromfile(fh, dtype=numpy.uint32,
       +                            count=1)
       +                self.bonds_delta_n = numpy.fromfile(fh, dtype=numpy.float64,
       +                        count=self.nb0)
       +                self.bonds_delta_t = numpy.fromfile(fh, dtype=numpy.float64,
       +                        count=self.nb0*self.nd).reshape(self.nb0, self.nd)
       +                self.bonds_omega_n = numpy.fromfile(fh, dtype=numpy.float64,
       +                        count=self.nb0)
       +                self.bonds_omega_t = numpy.fromfile(fh, dtype=numpy.float64,
       +                        count=self.nb0*self.nd).reshape(self.nb0, self.nd)
                    else:
                        self.nb0 = numpy.zeros(1, dtype=numpy.uint32)
        
                    if (fluid == True):
                        ncells = self.num[0]*self.num[1]*self.num[2]
                        self.nu = numpy.fromfile(fh, dtype=numpy.float64, count=1)
       -                self.f_v = numpy.empty(ncells*self.nd, dtype=numpy.float64)
       +                self.f_v = numpy.empty(
       +                        (self.num[0], self.num[1], self.num[2], self.nd),
       +                        dtype=numpy.float64)
                        self.f_rho = numpy.empty(ncells, dtype=numpy.float64)
       -                self.f_v = numpy.fromfile(fh, dtype=numpy.float64, count=ncells*self.nd)
       -                self.f_rho = numpy.fromfile(fh, dtype=numpy.float64, count=ncells)
       +                self.f_v = numpy.fromfile(fh, dtype=numpy.float64,
       +                        count=ncells*self.nd).reshape(
       +                                self.num[0], self.num[1], self.num[2], self.nd)
       +                self.f_rho = numpy.fromfile(fh, dtype=numpy.float64,
       +                        count=ncells).reshape(
       +                                self.num[0], self.num[1], self.num[2])
        
                finally:
                    if fh is not None:
       @@ -1771,6 +1783,9 @@ class Spherebin:
                fig.savefig('../img_out/' + self.sid + '-ts-x1x3-slipangles.png')
                fig.clf()
        
       +    def plotRho(self):
       +        x=2
       +
        
        def convert(graphicsformat = "png",
                folder = "../img_out"):
 (DIR) diff --git a/src/device.cu b/src/device.cu
       @@ -562,8 +562,8 @@ __host__ void DEM::startTime()
            tic = clock();
        
            //// GPU workload configuration
       -    //unsigned int threadsPerBlock = 256; 
       -    unsigned int threadsPerBlock = 512; 
       +    unsigned int threadsPerBlock = 256; 
       +    //unsigned int threadsPerBlock = 512; 
        
            // Create enough blocks to accomodate the particles
            unsigned int blocksPerGrid = iDivUp(np, threadsPerBlock); 
 (DIR) diff --git a/src/latticeboltzmann.cuh b/src/latticeboltzmann.cuh
       @@ -124,7 +124,6 @@ __global__ void latticeBoltzmannD3Q19(
        
                //printf("(x,y,x) = (%d,%d,%d), tidx = %d\n", x, y, z, tidx);
        
       -
                // Load the fluid distribution into local registers
                __syncthreads();
                Float f_0  = dev_f[grid2index(x,y,z,0)];