Things to do before you start programming in CUDA
Please also check this article:
Updated the CPU vs. GPU speed comparison code for CUDA 6.0 - 株式会社CFlatの明後日スタイルのブログ
Comparing GPU and CPU speed
A while back, when I was writing a program in CUDA, the entirely predictable request came up to compare its speed against the CPU. I wrote code that switches between the GPU and the CPU, but in hindsight I should have built it that way from the start. In this post I'm publishing the GPU vs. CPU speed comparison code I wrote at the time.
Setting up OpenMP
When comparing GPU and CPU speed, the comparison is unfair unless the CPU side is parallelized as well. For CPU parallelization we use OpenMP, which is easy to set up. To enable OpenMP in a CUDA project, open the project properties and add -Xcompiler "/openmp" to the additional options under Configuration Properties / CUDA C/C++ / Command Line. After that, all you have to do is put #pragma omp parallel for in front of each for loop you want to parallelize, as in the sketch below. If you are new to CUDA programming, please refer to this article.
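As a rough illustration, here is a self-contained toy example of that pragma (hypothetical, not part of the comparison code in this article):

#include <stdio.h>
#include <omp.h>

int main()
{
    const int N = 1 << 20;
    static float data[1 << 20];

    // The iterations are independent, so OpenMP can distribute
    // them across CPU threads with the single pragma below.
    #pragma omp parallel for
    for ( int i = 0; i < N; ++i )
    {
        data[i] = (float)i * 2.0f;
    }

    printf( "max threads: %d, data[%d] = %f\n",
            omp_get_max_threads(), N - 1, data[N - 1] );
    return 0;
}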
Wrappers that switch between GPU and CPU
By calling the macros and functions declared in SwitchableCPUGPU.cuh and Timer.cuh below, you can switch between the GPU and CPU paths just by toggling USE_GPU.
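The toggle itself is just the USE_GPU define at the top of SwitchableCPUGPU.cuh:

#define USE_GPU    // leave this in for the GPU build; comment it out for the CPU build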
SwitchableCPUGPU.cuh
#pragma once #include "Timer.cuh" #define USE_GPU #ifdef USE_GPU #define SWITCHABLE_DEVICE __device__ #define SWITCHABLE_GLOBAL __global__ #define SWITCHABLE_TIMER CudaEventTimer #else #define SWITCHABLE_DEVICE #define SWITCHABLE_GLOBAL #define SWITCHABLE_TIMER Timer #endif template <typename T> void SwitchableCudaMalloc( T& val, int size ) { #ifdef USE_GPU cudaMalloc( (void**)&val, sizeof(T)*size ); #endif } template <typename T> void SwitchableCudaFree( T* val ) { #ifdef USE_GPU cudaFree( val ); #endif } template <typename T> void SwitchableCudaMemcpyHostToDevice( const T* const host, T* const device, int size ) { #ifdef USE_GPU cudaMemcpy( device, host, sizeof(T)*size, cudaMemcpyHostToDevice ); #endif } template <typename T> void SwitchableCudaMemcpyDeviceToHost( const T* const device, T* const host, int size ) { #ifdef USE_GPU cudaMemcpy( host, device, sizeof(T)*size, cudaMemcpyDeviceToHost); #endif }
Timer.cuh
#pragma once #include <string> #include <time.h> class CudaEventTimer { public : CudaEventTimer( const std::string& message ) : m_message( message ) { cudaEventCreate(&m_start); cudaEventCreate(&m_end); cudaEventRecord( m_start, 0 ); } ~CudaEventTimer() { cudaEventRecord( m_end, 0 ); cudaEventSynchronize( m_end ); float time; cudaEventElapsedTime( &time, m_start, m_end ); printf("%s = %f sec.\n",m_message.c_str(), time*0.001); cudaEventDestroy( m_start ); cudaEventDestroy( m_end ); } private: cudaEvent_t m_start; cudaEvent_t m_end; std::string m_message; }; class Timer { public : Timer( const std::string& message ) : m_message( message ) { m_start = clock(); } ~Timer() { m_end = clock(); printf("%s = %f sec.\n",m_message.c_str(), (double)(m_end - m_start)/CLOCKS_PER_SEC); } private: clock_t m_start; clock_t m_end; std::string m_message; };
Comparing GPU and CPU on matrix multiplication
Below is the test code, which multiplies two matrices. By routing all CUDA-related processing through the wrapper functions, switching between GPU and CPU stays easy.
#include <stdio.h> #include "SwitchableCPUGPU.cuh" SWITCHABLE_GLOBAL void Calculate( float* matrixA, float* matrixB, float* matrixC, int iLength, int col = 0, int row = 0 ) { #ifdef USE_GPU row = blockIdx.x * blockDim.x + threadIdx.x; col = blockIdx.y * blockDim.y + threadIdx.y; if ( row > iLength || col > iLength ) return; #endif float target = 0.0f; for ( int i = 0 ; i < iLength ; ++i ) { target += matrixA[row*iLength + i] * matrixB[i*iLength + col]; } matrixC[row*iLength + col] = target; } int main() { // è¡åã®ãµã¤ãºæ±ºå® const int iLength = 1024; const int iSize = iLength * iLength; // CPUå´ã®å¤æ°åæå float* matrixA = (float*)malloc(sizeof(float)*iSize); float* matrixB = (float*)malloc(sizeof(float)*iSize); float* matrixC = (float*)malloc(sizeof(float)*iSize); for ( int col = 0; col < iLength ; ++col ){ for ( int row = 0; row < iLength ; ++row ){ matrixA[col*iLength + row] = rand() % (1000); matrixB[col*iLength + row] = rand() % (1000); matrixC[col*iLength + row] = 0.0f; } } // ããããæéè¨æ¸¬ SWITCHABLE_TIMER t("time"); // GPUå´ã®å¤æ°åæå float* d_matrixA; float* d_matrixB; float* d_matrixC; SwitchableCudaMalloc( d_matrixA, iSize ); SwitchableCudaMalloc( d_matrixB, iSize ); SwitchableCudaMalloc( d_matrixC, iSize ); SwitchableCudaMemcpyHostToDevice( matrixA, d_matrixA, iSize ); SwitchableCudaMemcpyHostToDevice( matrixB, d_matrixB, iSize ); // è¡åè¨ç® #ifdef USE_GPU const int iThread = 16; dim3 thread( iThread, iThread ); const int iBlock = ( iLength + iThread - 1 )/iThread; dim3 block( iBlock, iBlock ); Calculate<<<block, thread>>>( d_matrixA, d_matrixB, d_matrixC, iLength ); cudaThreadSynchronize(); #else #pragma omp parallel for for ( int i = 0 ; i < iLength ; ++i ) { for ( int j = 0 ; j < iLength ; ++j ) { Calculate( matrixA, matrixB, matrixC, iLength, i, j ); } } #endif // å¾å¦ç SwitchableCudaMemcpyDeviceToHost( d_matrixC, matrixC, iSize ); free( matrixA ); free( matrixB ); free( matrixC ); SwitchableCudaFree( d_matrixA ); SwitchableCudaFree( d_matrixB ); SwitchableCudaFree( d_matrixC ); return 0; }
Results
The results are shown below. For reference, I've also included the result for the CPU without parallelization. There are computations that GPUs are not good at, so if the GPU turns out to be clearly slower than OpenMP, it may be worth reconsidering whether to use it.
GPU: 0.152510 s
CPU (OpenMP): 1.220000 s
CPU (no parallelization): 7.254000 s
Speeding up GPU computation
Although none of them are used in this program, there are many techniques for speeding up GPU computation. If you are interested, please refer to the books listed below.
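To give one concrete flavor of such techniques (a minimal sketch, not the kernel used in this article): staging 16x16 tiles of the input matrices in shared memory reduces the number of global-memory reads per thread by roughly the tile width. This version assumes iLength is a multiple of TILE, which holds for the 1024x1024 matrices above, and would be launched with the same block/thread configuration as the original kernel.

#define TILE 16

__global__ void CalculateTiled( const float* matrixA, const float* matrixB,
                                float* matrixC, int iLength )
{
    // One tile of A and one of B per block, shared by all 16x16 threads.
    __shared__ float tileA[TILE][TILE];
    __shared__ float tileB[TILE][TILE];

    const int row = blockIdx.x * TILE + threadIdx.x;
    const int col = blockIdx.y * TILE + threadIdx.y;

    float target = 0.0f;
    for ( int t = 0; t < iLength / TILE; ++t )
    {
        // Each thread loads one element of each tile from global memory.
        tileA[threadIdx.x][threadIdx.y] = matrixA[row*iLength + t*TILE + threadIdx.y];
        tileB[threadIdx.x][threadIdx.y] = matrixB[( t*TILE + threadIdx.x )*iLength + col];
        __syncthreads();

        // Partial dot product, now served from fast shared memory.
        for ( int i = 0; i < TILE; ++i )
        {
            target += tileA[threadIdx.x][i] * tileB[i][threadIdx.y];
        }
        __syncthreads();
    }
    matrixC[row*iLength + col] = target;
}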