tworking on multi-GPU call for interact routine - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit 983734d17bb48ec0d5191c2e18e66808584afc95
 (DIR) parent b620d26c6032c277e46b2e68fc566449bcdc015b
 (HTM) Author: Anders Damsgaard <anders.damsgaard@geo.au.dk>
       Date:   Mon, 30 Jun 2014 12:43:30 +0200
       
       working on multi-GPU call for interact routine
       
       Diffstat:
         M src/device.cu                       |     113 +++++++++++++++++++++++++++++++
         M src/sphere.h                        |      37 +++++++++++++++++++++++++++++--
       
       2 files changed, 148 insertions(+), 2 deletions(-)
       ---
 (DIR) diff --git a/src/device.cu b/src/device.cu
       t@@ -340,6 +340,119 @@ __host__ void DEM::allocateGlobalDeviceMemory(void)
                std::cout << "Done" << std::endl;
        }
        
       +// Allocate global memory on other devices required for "interact" function.
       +// The values of domain_size[ndevices] must be set beforehand.
       +__host__ void DEM::allocateHelperDeviceMemory(void)
       +{
       +    // Particle memory size
       +    unsigned int memSizeF4 = sizeof(Float4) * np;
       +
       +    // Initialize pointers to per-GPU arrays
       +    hdev_gridParticleIndex = (unsigned**)malloc(ndevices*sizeof(unsigned*));
       +    hdev_gridCellStart     = (unsigned**)malloc(ndevices*sizeof(unsigned*));
       +    hdev_gridCellEnd       = (unsigned**)malloc(ndevices*sizeof(unsigned*));
       +    hdev_x                 = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_x_sorted          = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_vel               = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_vel_sorted        = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_angvel            = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_angvel_sorted     = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_walls_nx          = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_walls_mvfd        = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_distmod           = (Float4**)malloc(ndevices*sizeof(Float4*));
       +
       +    hdev_force             = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_torque            = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_delta_t           = (Float4**)malloc(ndevices*sizeof(Float4*));
       +    hdev_es_dot            = (Float**)malloc(ndevices*sizeof(Float*));
       +    hdev_es                = (Float**)malloc(ndevices*sizeof(Float*));
       +    hdev_ev_dot            = (Float**)malloc(ndevices*sizeof(Float*));
       +    hdev_ev                = (Float**)malloc(ndevices*sizeof(Float*));
       +    hdev_p                 = (Float**)malloc(ndevices*sizeof(Float*));
       +    hdev_walls_force_pp    = (Float**)malloc(ndevices*sizeof(Float*));
       +    hdev_contacts          = (unsigned**)malloc(ndevices*sizeof(unsigned*));
       +
       +    for (int d=0; d<ndevices; d++) {
       +
       +        // do not allocate memory on primary GPU
       +        if (d == device)
       +            continue;
       +
       +        cudaSetDevice(d);
       +
       +        // allocate space for full input arrays for interact()
       +        cudaMalloc((void**)&hdev_gridParticleIndex[d], sizeof(unsigned int)*np);
       +        cudaMalloc((void**)&hdev_gridCellStart[d], sizeof(unsigned int)
       +                   *grid.num[0]*grid.num[1]*grid.num[2]);
       +        cudaMalloc((void**)&hdev_gridCellEnd[d], sizeof(unsigned int)
       +                   *grid.num[0]*grid.num[1]*grid.num[2]);
       +        cudaMalloc((void**)&hdev_x[d], memSizeF4);
       +        cudaMalloc((void**)&hdev_x_sorted[d], memSizeF4);
       +        cudaMalloc((void**)&hdev_vel[d], memSizeF4);
       +        cudaMalloc((void**)&hdev_vel_sorted[d], memSizeF4);
       +        cudaMalloc((void**)&hdev_angvel[d], memSizeF4);
       +        cudaMalloc((void**)&hdev_angvel_sorted[d], memSizeF4);
       +        cudaMalloc((void**)&hdev_walls_nx[d], sizeof(Float4)*walls.nw);
       +        cudaMalloc((void**)&hdev_walls_mvfd[d], sizeof(Float4)*walls.nw);
       +        cudaMalloc((void**)&hdev_distmod[d], memSizeF4*NC);
       +
       +        // allocate space for partial output arrays for interact()
       +        cudaMalloc((void**)&hdev_force[d], sizeof(Float4)*domain_size[d]);
       +        cudaMalloc((void**)&hdev_torque[d], sizeof(Float4)*domain_size[d]);
       +        cudaMalloc((void**)&hdev_es_dot[d], sizeof(Float)*domain_size[d]);
       +        cudaMalloc((void**)&hdev_ev_dot[d], sizeof(Float)*domain_size[d]);
       +        cudaMalloc((void**)&hdev_es[d], sizeof(Float)*domain_size[d]);
       +        cudaMalloc((void**)&hdev_ev[d], sizeof(Float)*domain_size[d]);
       +        cudaMalloc((void**)&hdev_p[d], sizeof(Float)*domain_size[d]);
       +        cudaMalloc((void**)&hdev_walls_force_pp[d],
       +                   sizeof(Float)*domain_size[d]*walls.nw);
       +        cudaMalloc((void**)&hdev_contacts[d],
       +                   sizeof(unsigned)*domain_size[d]*NC);
       +        cudaMalloc((void**)&hdev_delta_t[d], sizeof(Float4)*domain_size[d]*NC);
       +
       +        checkForCudaErrors("During allocateGlobalDeviceMemoryOtherDevices");
       +    }
       +    cudaSetDevice(device); // select main GPU
       +}
       +
       +__host__ void DEM::freeHelperDeviceMemory()
       +{
       +    for (int d=0; d<ndevices; d++) {
       +
       +        // do not allocate memory on primary GPU
       +        if (d == device)
       +            continue;
       +
       +        cudaSetDevice(d);
       +
       +        cudaFree(hdev_gridParticleIndex[d]);
       +        cudaFree(hdev_gridCellStart[d]);
       +        cudaFree(hdev_gridCellEnd[d]);
       +        cudaFree(hdev_x[d]);
       +        cudaFree(hdev_vel[d]);
       +        cudaFree(hdev_vel_sorted[d]);
       +        cudaFree(hdev_angvel[d]);
       +        cudaFree(hdev_angvel_sorted[d]);
       +        cudaFree(hdev_walls_nx[d]);
       +        cudaFree(hdev_walls_mvfd[d]);
       +        cudaFree(hdev_distmod[d]);
       +
       +        cudaFree(hdev_force[d]);
       +        cudaFree(hdev_torque[d]);
       +        cudaFree(hdev_es_dot[d]);
       +        cudaFree(hdev_ev_dot[d]);
       +        cudaFree(hdev_es[d]);
       +        cudaFree(hdev_ev[d]);
       +        cudaFree(hdev_p[d]);
       +        cudaFree(hdev_walls_force_pp[d]);
       +        cudaFree(hdev_contacts[d]);
       +        cudaFree(hdev_delta_t[d]);
       +
       +        checkForCudaErrors("During helper device cudaFree calls");
       +    }
       +    cudaSetDevice(device); // select primary GPU
       +}
       +
        __host__ void DEM::freeGlobalDeviceMemory()
        {
            if (verbose == 1)
 (DIR) diff --git a/src/sphere.h b/src/sphere.h
       t@@ -53,8 +53,10 @@ class DEM {
                unsigned int width;
                unsigned int height;
        
       -        int ndevices;  // number of CUDA GPUs
       -        int device;    // primary GPU
       +        // Device management
       +        int ndevices;     // number of CUDA GPUs
       +        int device;       // primary GPU
       +        int* domain_size; // elements per GPU
        
        
                // DEVICE ARRAYS
       t@@ -137,6 +139,10 @@ class DEM {
                void allocateGlobalDeviceMemory();
                void rt_allocateGlobalDeviceMemory();
        
       +        // Allocate global memory on helper devices
       +        void allocateHelperDeviceMemory();
       +        void freeHelperDeviceMemory();
       +
                // Free dynamically allocated global device memory
                void freeGlobalDeviceMemory();
                void rt_freeGlobalDeviceMemory();
       t@@ -208,6 +214,33 @@ class DEM {
                Float*  dev_ns_div_tau_z;    // div(tau) on z-face
                Float3* dev_ns_f_pf;         // Interaction force on particles
        
       +        // Helper device arrays, input
       +        unsigned int** hdev_gridParticleIndex;
       +        unsigned int** hdev_gridCellStart;
       +        unsigned int** hdev_gridCellEnd;
       +        Float4** hdev_x;
       +        Float4** hdev_x_sorted;
       +        Float4** hdev_vel;
       +        Float4** hdev_vel_sorted;
       +        Float4** hdev_angvel;
       +        Float4** hdev_angvel_sorted;
       +        Float4** hdev_walls_nx;
       +        Float4** hdev_walls_mvfd;
       +        Float4** hdev_distmod;
       +
       +        // Helper device arrays, output
       +        Float4** hdev_force;
       +        Float4** hdev_torque;
       +        Float4** hdev_delta_t;
       +        Float** hdev_es_dot;
       +        Float** hdev_es;
       +        Float** hdev_ev_dot;
       +        Float** hdev_ev;
       +        Float** hdev_p;
       +        Float** hdev_walls_force_pp;
       +        unsigned int** hdev_contacts;
       +
       +
                //// Navier Stokes functions
        
                // Memory allocation