SSE.浮動小数点演算手動最適化は本当に効果的なのか
ちょっと試したので、結果をコピペ。
概要としてまず
SSEは、x86のSIMD命令セットに含まれる浮動小数点演算の複数同時実行を行う命令セットです。単精度(float)で4つ、倍精度(double)で2つの演算を1命令で実行できるため、うまく使うと繰り返し同じ演算のパフォーマンスアップが期待できます。
Streaming SIMD Extensions - Wikipedia
ç»åå¦çãã確çã®è¨ç®ãªã©ã§ã¯æµ®åå°æ°ç¹æ°ã®è¨ç®ãæ°ç¾ä¸åããåæ°ã¨ãããã¬ãã«ã§ã¯ãªã3æ¥éãããã®è¦æ¨¡ã§è¡ããã¨ãããã®ã§ã
- 少しでも速くなると時間的にとてもうれしい！
- è¨ç®ãã©ãã§å¦¥åãããã¿ãããªé¨åãããããéãè¨ç®ãçµãããããã¨ã§ããããã¨ã®å¯è½æ§ãåºããï¼
という感じがあります。
CUDAのほうが云々
CUDAは、NVIDIAのGPUが対応しているGPGPUの環境です。GPGPUというのは、グラフィックボードの並列コアを汎用コンピューティングに使う技術で、繰り返し同じ演算を行うような処理で驚きのパフォーマンスアップが期待できます。その実力20〜100倍というウワサです！
CUDA - Wikipedia
これまた夢が広がりますが、
- NVIDIAのグラフィックボードが必要（しかも値段の高いやつがもっといい）
- CUDAドライバのインストールが必要
と実行環境が制限されるため、これがないと動かないソフトは敷居が高くなるため、できればオプション要素にしたいという感じがあります。なので、まずSSE。
コンパイラの最適化で云々
ã³ã³ãã¤ã©ãèªåã§ãã£ã¦ãããã¨ããããã§ããããã£ã¦ããããä¸å®ãªã¨ãããããã®ã¨ãã¯ããããæã§æ¸ãæ°ãããã¨ã«ã¼ããå±éãããSSEãå¹æçã«ä½¿ãããããªæ§é ã§ããã°ã©ã ãæ¸ããã¨ãã§ãã¾ããã¨ããããã¯ãæã§æ¸ãã¦ã¿ãã®ãã®ã¨ãCè¨èªç(ã³ã³ãã¤ã©ã®æé©åã ãã)ãæ¯è¼ãã¾ãã
かけ算の結果をひたすら足してみる
/*
 * Scalar reference implementation: multiplies input[k] by weight[k] for
 * k = 0 .. n-1 and returns the running float sum of those products.
 * Returns 0 when n <= 0.
 */
float fsum_c(float *input, float *weight, int n)
{
    float total = 0;
    int k = 0;

    /* Accumulate strictly in index order so rounding matches a plain loop. */
    while (k < n) {
        total += input[k] * weight[k];
        ++k;
    }
    return total;
}
2つの配列と要素数を受け取って、両配列の同じ添え字要素の乗算結果の総和を返す関数を試しました。
これをSSE命令セットを使って書くと次のようになります。インラインアセンブラではなく、イントリンシック命令を使っています。イントリンシックは、gccとVC++で共通なので、インラインアセンブリより互換性に優れています。詳しくは、
Compiler Intrinsics (C++)
ã
/*
 * SSE dot product: returns the float sum of input[k] * weight[k] for
 * k = 0 .. n-1, processing four elements per iteration.
 *
 * Neither pointer needs to be 16-byte aligned: unaligned loads/stores are
 * used, so arbitrary float pointers (e.g. an offset into a larger buffer)
 * are accepted. Returns 0 when n <= 0.
 *
 * The four lanes are accumulated independently and combined at the end, so
 * the rounding can differ from a strictly sequential scalar loop.
 */
float fsum_sse(float *input, float *weight, int n)
{
    __m128 w, x, u;
    float mm[4];
    float sum;
    int packed_end = n - n % 4; /* largest multiple of 4 not exceeding n (for n >= 0) */
    int j;

    u = _mm_setzero_ps();
    for (j = 0; j < packed_end; j += 4) {
        w = _mm_loadu_ps(&input[j]);  /* load 4 inputs */
        x = _mm_loadu_ps(&weight[j]); /* load 4 weights */
        x = _mm_mul_ps(w, x);         /* 4 products */
        u = _mm_add_ps(u, x);         /* add into the 4 lane accumulators */
    }

    _mm_storeu_ps(mm, u);                 /* write the lanes back to memory */
    sum = mm[0] + mm[1] + mm[2] + mm[3];  /* horizontal sum of the lanes */

    for (j = packed_end; j < n; ++j) {
        sum += input[j] * weight[j];      /* remaining 0..3 elements */
    }
    return sum;
}
単精度では、4つ同時に計算できるため、4の倍数単位でループをまわして、4つずつ計算を行います。余り分は最後に計算して加算します。
結果
上の単精度(float)バージョンと、倍精度(double)バージョンを配列要素数10,003個で、100,000回行う際の時間を計測して結果としました。また、OpenMPによる並列化との組み合わせも行いました。実行環境のCPUは、Intel Core 2 Quad Q9550(2.83GHz x 4コア)です。
// モード: 経過時間(ms), テスト用の値
// CバージョンとSSEバージョン比較 - float C: 3594, 2.490360E+003 SSE: 562, 2.490360E+003 6.40 倍高速化! - double C: 1344, 2.473939E+003 SSE: 1078, 2.473939E+003 1.25 倍高速化!
// CバージョンとSSE+OpenMPバージョン比較 - float C: 3578, 2.490360E+003 SSE: 156, 2.491235E+003 22.94 倍高速化! - double C: 1344, 2.473939E+003 SSE: 281, 2.473939E+003 4.78 倍高速化!
結果だけ見ると、単精度すごい、倍精度あまり変わらないといった印象です。
kousatsu
å精度ã§ã¯ãããªãã®å¹æãããããã«è¦ãã¾ãããVCã®å精度è¨ç®ã¯ããã£ã¹ããã¦ããã¨ãã§é ãã¨ãã話ãããã®ã§ãSSEãã¼ã¸ã§ã³ãéãã¨ããããã¯ãCãã¼ã¸ã§ã³ãé ãã¨ããçããããã¾ãï¼ã¤ã¾ãSSEéãï¼ï¼ãdoubleã®ã»ãã¯ãæé©åãªãã ã¨ããªãé ãã£ãã®ã§ã³ã³ãã¤ã©ã®æé©åã§SSEã使ããã¦ãããã§ããOpenMPãå«ãããã¼ã¸ã§ã³ã§ã¯ããªãå¹æãåºã¦ãã¾ãããå精度ã¯ã»ã¼OpenMPã«ããå¹æã ã¨ãããã¨ã¨ãå精度ã¯æ¼ç®æé ãå¤ããããã丸ã誤差ãå¢å¹ ããã¦Cãã¼ã¸ã§ã³ã¨ããªãéãå¤ã«ãªã£ã¦ãã¾ãã¾ãã(doubleãã¼ã¸ã§ã³ã¯ãºã¬ãªãã®ã§ã並ååããã°ã£ã¦ããããã§ã¯ãªãã¨æãï¼ã
コメント
ãã£ã¹ã¢ã»ã³ãã«ããªãã¨æ£ç¢ºãªç¶æ³ãªã©åãããããªããã§ãï¼(ï¼¾oï¼¾)ï¼¼ã£ã¦æãã¯ãã¦ããã
丸め誤差はFPUのモードによると思うので、調べる。
ã³ã³ãã¤ã©ãæé©åããªããããªããã£ã¨è¤éãªæ¼ç®ã®æåã§ã®å±éãããã¯ããã¤ãã§ã®æååå¦çãããããã»ããå¹æãããããã
今回試したコード
// cl sum.c vcomp.lib /arch:SSE2 /O2 /openmp #include <math.h> #include <emmintrin.h> #include <xmmintrin.h> #include <windows.h> // GetTickCount #include <omp.h> #ifdef _linux // gcc #define ALIGN16 __attribute__((aligned(16))) #else // cl #define ALIGN16 _declspec(align(16)) #endif float fsum_c(float *input, float *weight, int n) { int i; float sum = 0; for (i = 0; i < n; ++i) { sum += input[i] * weight[i]; } return sum; } float fsum_sse(float *input, float *weight, int n) { __m128 w, x, u; ALIGN16 float mm[4] = {0}; div_t lp = div(n, 4); int i, j, pk_lp = lp.quot, nm_lp = lp.rem; float sum = 0; u = _mm_load_ps(mm); for (i = 0, j = 0; i < pk_lp; ++i, j += 4) { w = _mm_load_ps(&input[j]); x = _mm_load_ps(&weight[j]); x = _mm_mul_ps(w, x); u = _mm_add_ps(u, x); } _mm_store_ps(mm, u); sum = mm[0] + mm[1] + mm[2] + mm[3]; for (i = 0, j = pk_lp * 4; i < nm_lp; ++i, ++j) { sum += input[j] * weight[j]; } return sum; } double dsum_c(double *input, double *weight, int n) { int i; double sum = 0; for (i = 0; i < n; ++i) { sum += input[i] * weight[i]; } return sum; } double dsum_sse(double *input, double *weight, int n) { __m128d w, x, u; ALIGN16 double mm[2] = {0}; div_t lp = div(n, 2); int i, j, pk_lp = lp.quot, nm_lp = lp.rem; double sum = 0; u = _mm_load_pd(mm); for (i = 0, j = 0; i < pk_lp; ++i, j += 2) { w = _mm_load_pd(&input[j]); x = _mm_load_pd(&weight[j]); x = _mm_mul_pd(w, x); u = _mm_add_pd(u, x); } _mm_store_pd(mm, u); sum = mm[0] + mm[1]; for (i = 0, j = pk_lp * 2; i < nm_lp; ++i, ++j) { sum += input[j] * weight[j]; } return sum; } #define TC 100000 #define DC 10003 int main(void) { ALIGN16 float fdata1[DC], fdata2[DC]; ALIGN16 double ddata1[DC], ddata2[DC]; float fsum = 0; double dsum = 0; int i, j; long st, t1, t2; #ifdef _OPENMP omp_set_num_threads(omp_get_num_procs() * 8); #endif //srand(time()); for (j = 0; j < DC; ++j) { fdata1[j] = (float)(rand() % 1000) / 1000; fdata2[j] = (float)(rand() % 1000) / 1000; ddata1[j] = (double)(rand() % 1000) / 1000; 
ddata2[j] = (double)(rand() % 1000) / 1000; } /////// å精度 puts("- float"); // normal C fsum = 0.0; st = GetTickCount(); for (i = 0; i < TC; ++i) { fdata1[0] = 1.0; fsum += fsum_c(fdata1, fdata2, DC) / TC; } t1 = GetTickCount() - st; printf("%3s: %ld, %E\n", "C", t1, fsum); // SSE fsum = 0.0; st = GetTickCount(); #ifdef _OPENMP #pragma omp parallel for reduction(+:fsum) #endif for (i = 0; i < TC; ++i) { fdata1[0] = 1.0; fsum += fsum_sse(fdata1, fdata2, DC) / TC; } t2 = GetTickCount() - st; printf("%3s: %ld, %E\n", "SSE", t2, fsum); printf("%0.2f åé«éå!\n\n", (double)t1 / t2); /////// å精度 puts("- double"); // normal C dsum = 0.0; st = GetTickCount(); for (i = 0; i < TC; ++i) { ddata1[0] = 1.0; dsum += dsum_c(ddata1, ddata2, DC) / TC; } t1 = GetTickCount() - st; printf("%3s: %ld, %E\n", "C", t1, dsum); // SSE dsum = 0.0; st = GetTickCount(); #ifdef _OPENMP #pragma omp parallel for reduction(+:dsum) #endif for (i = 0; i < TC; ++i) { ddata1[0] = 1.0; dsum += dsum_sse(ddata1, ddata2, DC) / TC; } t2 = GetTickCount() - st; printf("%3s: %ld, %E\n", "SSE", t2, dsum); printf("%0.2f åé«éå!\n\n", (double)t1 / t2); return 0; }