The number of memory chunks increased to 95% of total memory - cuda-memscrub - scrubs the global device memory of CUDA GPUs
 (HTM) git clone git://src.adamsgaard.dk/cuda-memscrub
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 4bd9658bceab5d4dc01e41b8410802fd3fef3859
 (DIR) parent 1cf91a1710beee16200b48487cacf9bc45bb4938
 (HTM) Author: Anders Damsgaard <anders.damsgaard@geo.au.dk>
       Date:   Mon, 13 Jan 2014 14:21:07 +0100
       
       The number of memory chunks increased to 95% of total memory
       
       Diffstat:
         M scrub.cu                            |      43 +++++++++++++++++++------------
         M utility.cu                          |      11 +++++++++++
         M utility.cuh                         |       1 +
       
       3 files changed, 38 insertions(+), 17 deletions(-)
       ---
 (DIR) diff --git a/scrub.cu b/scrub.cu
       @@ -6,6 +6,7 @@
        
        #define VERSION "0.1"
        #define VALUETOWRITE 1234
       +#define MEMCHUNKS 19
        
        __global__ void write_value(int* d_mem, long unsigned int n_ints,
                unsigned int nx, unsigned int ny)
       @@ -78,19 +79,24 @@ int main(int argc, char** argv)
            size_t mem_size = prop.totalGlobalMem;
            printf("global memory size: %lu bytes\n", mem_size);
        
       -    long unsigned int n_ints = mem_size/sizeof(int)/20;
       +    long unsigned int n_ints = mem_size/sizeof(int)/(MEMCHUNKS+1);
            printf("overwriting the first %ld bytes, corresponding to %ld int values "
                    "or the first %.1f%% of the global device memory.\n",
       -            n_ints*sizeof(int), n_ints,
       -            (float)100*n_ints*sizeof(int)/prop.totalGlobalMem);
       -    int* d_mem;
       -    if (cudaMalloc((void**)&d_mem, n_ints*sizeof(int))
       -            == cudaErrorMemoryAllocation) {
       -        fprintf(stderr, "Error: Could not allocate the requested amount of "
       -                "global memory on the device.\n");
       -        cudaDeviceReset();
       -        exit(EXIT_FAILURE);
       +            n_ints*sizeof(int)*MEMCHUNKS, n_ints*MEMCHUNKS,
       +            (float)100*n_ints*sizeof(int)*MEMCHUNKS/prop.totalGlobalMem);
       +
       +    int* d[MEMCHUNKS];  // array of device pointers
       +    int i;
       +    for (i=0; i<MEMCHUNKS; i++) {
       +        if (cudaMalloc((void**)&d[i], n_ints*sizeof(int))
       +                    == cudaErrorMemoryAllocation) {
       +            fprintf(stderr, "Error: Could not allocate the requested amount of "
       +                    "global memory on the device.\n");
       +            cudaDeviceReset();
       +            exit(EXIT_FAILURE);
       +        }
            }
       +    checkForCudaErrors("After memory allocation");
        
            dim3 dimBlock(prop.maxThreadsPerBlock, 1, 1);
            unsigned int grid_size = iDivUp(n_ints, prop.maxThreadsPerBlock);
       @@ -101,15 +107,18 @@ int main(int argc, char** argv)
                exit(EXIT_FAILURE);
            }
            dim3 dimGrid(grid_size, 1, 1);
       -    //printf("dimBlock = %d,%d,%d\n", dimBlock.x, dimBlock.y, dimBlock.z);
       -    //printf("dimGrid = %d,%d,%d\n", dimGrid.x, dimGrid.y, dimGrid.z);
        
       -    write_value<<<dimGrid, dimBlock>>>(d_mem, n_ints, 1, 1);
       -    cudaThreadSynchronize();
       -    checkForCudaErrors("After write_value");
       +    for (i=0; i<MEMCHUNKS; i++) {
       +        write_value<<<dimGrid, dimBlock>>>(d[i], n_ints, 1, 1);
       +        cudaThreadSynchronize();
       +        checkForCudaErrors("After write_value", i);
       +    }
        
       -    cudaFree(d_mem);
       -    checkForCudaErrors("After cudaFree(d_mem)");
       +
       +    for (i=0; i<MEMCHUNKS; i++) {
       +        cudaFree(d[i]);
       +        checkForCudaErrors("After cudaFree(d[i])", i);
       +    }
        
            cudaDeviceReset();
        
 (DIR) diff --git a/utility.cu b/utility.cu
       @@ -17,6 +17,17 @@ void checkForCudaErrors(const char* checkpoint_description)
            }
        }
        
       +void checkForCudaErrors(const char* checkpoint_description, int iteration)
       +{
       +    cudaError_t err = cudaGetLastError();
       +    if (err != cudaSuccess) {
       +        fprintf(stderr, "CUDA error detected at: %s at iteration %d.\n"
       +                "System error string: %s\n", checkpoint_description, iteration,
       +                cudaGetErrorString(err));
       +        exit(EXIT_FAILURE);
       +    }
       +}
       +
        //Round a / b to nearest higher integer value
        unsigned int iDivUp(unsigned int a, unsigned int b)
        {
 (DIR) diff --git a/utility.cuh b/utility.cuh
       @@ -3,6 +3,7 @@
        #define UTILITY_CUH_
        
        void checkForCudaErrors(const char* checkpoint_description);
       +void checkForCudaErrors(const char* checkpoint_description, int iteration);
        unsigned int iDivUp(unsigned int a, unsigned int b);
        
        #endif