mx1.adamsgaard.dk

       device.cu: estimate number of cudaCoresPerSM for CC version 8 - sphere - GPU-based 3D discrete element method algorithm with optional fluid coupling
 (HTM) git clone git://src.adamsgaard.dk/sphere
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) LICENSE
       ---
 (DIR) commit 04f7aa6ff1ec5eb78167e8b40fb399ed3f63a093
 (DIR) parent 401f418676345ca4b80e2725106c40fc37f28e31
 (HTM) Author: Anders Damsgaard <anders@adamsgaard.dk>
       Date:   Thu,  9 Feb 2023 12:43:46 +0100
       
       device.cu: estimate number of cudaCoresPerSM for CC version 8
       
       Diffstat:
         M src/device.cu                       |     104 ++++++++++++++++---------------
       
       1 file changed, 53 insertions(+), 51 deletions(-)
       ---
 (DIR) diff --git a/src/device.cu b/src/device.cu
       @@ -54,6 +54,8 @@ int cudaCoresPerSM(int major, int minor)
                return 128;
            else if (major == 7)
                return 32;
       +    else if (major == 8)
       +        return 64;
            else
                printf("Error in cudaCoresPerSM Device compute capability value "
                        "(%d.%d) not recognized.", major, minor);
       @@ -112,11 +114,11 @@ void DEM::initializeGPU(void)
        
                    if (verbose == 1) {
                        cout << "  CUDA device ID: " << d << "\n";
       -                cout << "  - Name: " <<  prop.name << ", compute capability: " 
       +                cout << "  - Name: " <<  prop.name << ", compute capability: "
                             << prop.major << "." << prop.minor << ".\n";
       -                cout << "  - CUDA Driver version: " << cudaDriverVersion/1000 
       -                     << "." <<  cudaDriverVersion%100 
       -                     << ", runtime version " << cudaRuntimeVersion/1000 << "." 
       +                cout << "  - CUDA Driver version: " << cudaDriverVersion/1000
       +                     << "." <<  cudaDriverVersion%100
       +                     << ", runtime version " << cudaRuntimeVersion/1000 << "."
                             << cudaRuntimeVersion%100 << std::endl;
                    }
                }
       @@ -138,11 +140,11 @@ void DEM::initializeGPU(void)
        
                if (verbose == 1) {
                    cout << "  CUDA device ID: " << device << "\n";
       -            cout << "  - Name: " <<  prop.name << ", compute capability: " 
       +            cout << "  - Name: " <<  prop.name << ", compute capability: "
                         << prop.major << "." << prop.minor << ".\n";
       -            cout << "  - CUDA Driver version: " << cudaDriverVersion/1000 
       -                 << "." <<  cudaDriverVersion%100 
       -                 << ", runtime version " << cudaRuntimeVersion/1000 << "." 
       +            cout << "  - CUDA Driver version: " << cudaDriverVersion/1000
       +                 << "." <<  cudaDriverVersion%100
       +                 << ", runtime version " << cudaRuntimeVersion/1000 << "."
                         << cudaRuntimeVersion%100
                         << "\n  - " << ncudacores << " CUDA cores" << std::endl;
                }
       @@ -284,7 +286,7 @@ __global__ void checkParticlePositions(
        
        
        // Copy the constant data components to device memory,
       -// and check whether the values correspond to the 
       +// and check whether the values correspond to the
        // values in constant memory.
        void DEM::checkConstantMemory()
        {
       @@ -386,7 +388,7 @@ void DEM::updateGridSize()
            grid.L[2] = *Lz;
        
            // Write value to devC_grid.L[2]
       -    //cudaMemcpyToSymbol(devC_grid.L[2], &Lz, sizeof(Float)); 
       +    //cudaMemcpyToSymbol(devC_grid.L[2], &Lz, sizeof(Float));
            cudaMemcpyToSymbol(devC_grid, &grid, sizeof(Grid));
        
            checkForCudaErrors("DEM::updateGridSize(): write to devC_grid.L[2]");
       @@ -857,15 +859,15 @@ void DEM::startTime()
            tic = clock();
        
            //// GPU workload configuration
       -    unsigned int threadsPerBlock = 256; 
       -    //unsigned int threadsPerBlock = 512; 
       +    unsigned int threadsPerBlock = 256;
       +    //unsigned int threadsPerBlock = 512;
        
            // Create enough blocks to accomodate the particles
       -    unsigned int blocksPerGrid = iDivUp(np, threadsPerBlock); 
       +    unsigned int blocksPerGrid = iDivUp(np, threadsPerBlock);
            dim3 dimGrid(blocksPerGrid, 1, 1); // Blocks arranged in 1D grid
            dim3 dimBlock(threadsPerBlock, 1, 1); // Threads arranged in 1D block
        
       -    unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock); 
       +    unsigned int blocksPerGridBonds = iDivUp(params.nb0, threadsPerBlock);
            dim3 dimGridBonds(blocksPerGridBonds, 1, 1); // Blocks arranged in 1D grid
        
            // Use 3D block and grid layout for cell-centered fluid calculations
       @@ -930,9 +932,9 @@ void DEM::startTime()
            //sprintf(file,"output/%s.status.dat", sid);
            outfile = "output/" + sid + ".status.dat";
            fp = fopen(outfile.c_str(), "w");
       -    fprintf(fp,"%2.4e %2.4e %d\n", 
       -            time.current, 
       -            100.0*time.current/time.total, 
       +    fprintf(fp,"%2.4e %2.4e %d\n",
       +            time.current,
       +            100.0*time.current/time.total,
                    time.step_count);
            fclose(fp);
        
       @@ -1052,19 +1054,19 @@ void DEM::startTime()
                    checkForCudaErrorsIter("Post checkParticlePositions", iter);
        #endif
        
       -            // If the grid is adaptive, readjust the grid height to equal the 
       +            // If the grid is adaptive, readjust the grid height to equal the
                    // positions of the dynamic walls
                    if (grid.adaptive == 1 && walls.nw > 0) {
                        updateGridSize();
                    }
        
       -            // For each particle: 
       -            // Compute hash key (cell index) from position 
       +            // For each particle:
       +            // Compute hash key (cell index) from position
                    // in the fine, uniform and homogenous grid.
                    if (PROFILING == 1)
                        startTimer(&kernel_tic);
                    calcParticleCellID<<<dimGrid, dimBlock>>>(dev_gridParticleCellID,
       -                    dev_gridParticleIndex, 
       +                    dev_gridParticleIndex,
                            dev_x);
        
                    // Synchronization point
       @@ -1093,7 +1095,7 @@ void DEM::startTime()
                    // Zero cell array values by setting cellStart to its highest
                    // possible value, specified with pointer value 0xffffffff, which
                    // for a 32 bit unsigned int is 4294967295.
       -            cudaMemset(dev_cellStart, 0xffffffff, 
       +            cudaMemset(dev_cellStart, 0xffffffff,
                            grid.num[0]*grid.num[1]*grid.num[2]*sizeof(unsigned int));
                    cudaThreadSynchronize();
                    checkForCudaErrorsIter("Post cudaMemset", iter);
       @@ -1103,14 +1105,14 @@ void DEM::startTime()
                    // configurations in new arrays (*_sorted).
                    if (PROFILING == 1)
                        startTimer(&kernel_tic);
       -            reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart, 
       +            reorderArrays<<<dimGrid, dimBlock, smemSize>>>(dev_cellStart,
                            dev_cellEnd,
       -                    dev_gridParticleCellID, 
       +                    dev_gridParticleCellID,
                            dev_gridParticleIndex,
       -                    dev_x, dev_vel, 
       +                    dev_x, dev_vel,
                            dev_angvel,
       -                    dev_x_sorted, 
       -                    dev_vel_sorted, 
       +                    dev_x_sorted,
       +                    dev_vel_sorted,
                            dev_angvel_sorted);
        
                    // Synchronization point
       @@ -1127,10 +1129,10 @@ void DEM::startTime()
                        // For each particle: Search contacts in neighbor cells
                        if (PROFILING == 1)
                            startTimer(&kernel_tic);
       -                topology<<<dimGrid, dimBlock>>>(dev_cellStart, 
       +                topology<<<dimGrid, dimBlock>>>(dev_cellStart,
                                dev_cellEnd,
                                dev_gridParticleIndex,
       -                        dev_x_sorted, 
       +                        dev_x_sorted,
                                dev_contacts,
                                dev_distmod);
        
       @@ -1159,10 +1161,10 @@ void DEM::startTime()
                            dev_angvel_sorted,
                            dev_vel,
                            dev_angvel,
       -                    dev_force, 
       -                    dev_torque, 
       +                    dev_force,
       +                    dev_torque,
                            dev_es_dot,
       -                    dev_ev_dot, 
       +                    dev_ev_dot,
                            dev_es,
                            dev_ev,
                            dev_p,
       @@ -1828,7 +1830,7 @@ void DEM::startTime()
                                    if (write_conv_log == 1)
                                        convlog << iter+1 << '\t' << nijac << std::endl;
        
       -                            std::cerr << "\nIteration " << iter << ", time " 
       +                            std::cerr << "\nIteration " << iter << ", time "
                                        << iter*time.dt << " s: "
                                        "Error, the epsilon solution in the fluid "
                                        "calculations did not converge. Try increasing "
       @@ -1898,7 +1900,7 @@ void DEM::startTime()
                    } // end cfd_solver == 0
        
                    // Darcy solution
       -            else if (cfd_solver == 1) { 
       +            else if (cfd_solver == 1) {
        
        #if defined(REPORT_EPSILON) || defined(REPORT_FORCING_TERMS)
                        std::cout << "\n\n@@@@@@ TIME STEP " << iter << " @@@"
       @@ -2027,7 +2029,7 @@ void DEM::startTime()
        
                            // copy porosities to the frictionless Y boundaries
                            if (grid.periodic == 2) {
       -                        copyDarcyPorositiesToEdges<<<dimGridFluid, 
       +                        copyDarcyPorositiesToEdges<<<dimGridFluid,
                                    dimBlockFluid>>>(
                                        dev_darcy_phi,
                                        dev_darcy_dphi,
       @@ -2038,7 +2040,7 @@ void DEM::startTime()
        
                            // copy porosities to the frictionless lower Z boundary
                            if (grid.periodic == 2) {
       -                        copyDarcyPorositiesToBottom<<<dimGridFluid, 
       +                        copyDarcyPorositiesToBottom<<<dimGridFluid,
                                        dimBlockFluid>>>(
                                        dev_darcy_phi,
                                        dev_darcy_dphi,
       @@ -2369,7 +2371,7 @@ void DEM::startTime()
                                    if (write_conv_log == 1)
                                        convlog << iter+1 << '\t' << nijac << std::endl;
        
       -                            std::cerr << "\nIteration " << iter << ", time " 
       +                            std::cerr << "\nIteration " << iter << ", time "
                                        << iter*time.dt << " s: "
                                        "Error, the pressure solution in the fluid "
                                        "calculations did not converge. Try increasing "
       @@ -2471,14 +2473,14 @@ void DEM::startTime()
                    // Update particle kinematics
                    if (PROFILING == 1)
                        startTimer(&kernel_tic);
       -            integrate<<<dimGrid, dimBlock>>>(dev_x_sorted, 
       -                    dev_vel_sorted, 
       +            integrate<<<dimGrid, dimBlock>>>(dev_x_sorted,
       +                    dev_vel_sorted,
                            dev_angvel_sorted,
       -                    dev_x, 
       -                    dev_vel, 
       +                    dev_x,
       +                    dev_vel,
                            dev_angvel,
                            dev_force,
       -                    dev_torque, 
       +                    dev_torque,
                            dev_angpos,
                            dev_acc,
                            dev_angacc,
       @@ -2571,7 +2573,7 @@ void DEM::startTime()
                }
        
        
       -        // Produce output binary if the time interval 
       +        // Produce output binary if the time interval
                // between output files has been reached
                if (filetimeclock >= time.file_dt) {
        
       @@ -2646,19 +2648,19 @@ void DEM::startTime()
                            cout << "\n## Particle " << n << " ##\n";
        
                            cout  << "- contacts:\n";
       -                    for (int nc = 0; nc < NC; ++nc) 
       +                    for (int nc = 0; nc < NC; ++nc)
                                cout << "[" << nc << "]=" << k.contacts[nc+NC*n] <<
                                    '\n';
        
                            cout << "\n- delta_t:\n";
       -                    for (int nc = 0; nc < NC; ++nc) 
       +                    for (int nc = 0; nc < NC; ++nc)
                                cout << k.delta_t[nc+NC*n].x << '\t'
                                    << k.delta_t[nc+NC*n].y << '\t'
                                    << k.delta_t[nc+NC*n].z << '\t'
                                    << k.delta_t[nc+NC*n].w << '\n';
        
                            cout << "\n- distmod:\n";
       -                    for (int nc = 0; nc < NC; ++nc) 
       +                    for (int nc = 0; nc < NC; ++nc)
                                cout << k.distmod[nc+NC*n].x << '\t'
                                    << k.distmod[nc+NC*n].y << '\t'
                                    << k.distmod[nc+NC*n].z << '\t'
       @@ -2667,11 +2669,11 @@ void DEM::startTime()
                        cout << '\n';
                    }
        
       -            // Update status.dat at the interval of filetime 
       +            // Update status.dat at the interval of filetime
                    outfile = "output/" + sid + ".status.dat";
                    fp = fopen(outfile.c_str(), "w");
       -            fprintf(fp,"%2.4e %2.4e %d\n", 
       -                    time.current, 
       +            fprintf(fp,"%2.4e %2.4e %d\n",
       +                    time.current,
                            100.0*time.current/time.total,
                            time.step_count);
                    fclose(fp);
       @@ -2697,7 +2699,7 @@ void DEM::startTime()
        
            if (verbose == 1) {
                cout << "\nSimulation ended. Statistics:\n"
       -            << "  - Last output file number: " 
       +            << "  - Last output file number: "
                    << time.step_count << "\n"
                    << "  - GPU time spent: "
                    << dev_time_spent/1000.0f << " s\n"
       @@ -2705,7 +2707,7 @@ void DEM::startTime()
                    << time_spent << " s\n"
                    << "  - Mean duration of iteration:\n"
                    << "      " << dev_time_spent/((double)iter*1000.0f) << " s"
       -            << std::endl; 
       +            << std::endl;
            }
        
            cudaEventDestroy(dev_tic);