ç¯å²(-0.005, -0.005, 0.005, 0.005)ã®ãã³ãã«ããéåãæç»ãµã¤ãº4,800x4,800、ç¹°ãè¿ãã®ä¸é10,000ã¨ãã¦è¨ç®ããã¦ã¿ã。C++ã§ã³ã¼ããæ¸ãã¦Xeon X5650 1ã³ã¢ã§èµ°ãããã¨ãã942.97ç§ããã£ã(SIMDæé©åã¯ãã¦ããªã)。TBBãå©ç¨ãã12è«çã³ã¢ã®ä¸¦åè¨ç®ã§ã¯88.68ç§ã ã£ã。次ãã§CUDAã使ã£ã¦Teslaã§èµ°ããã¦ã¿ãã7.45ç§ã¾ã§ç縮ããã。Teslaããªã。ããã¦è´
æ²¢ã«ã2æã®Teslaãã¹ã¬ããã§ä¸¦ååãã¦ä½¿ã£ã¦ã¿ãã4.26ç§ã§è¨ç®ã§ãã。å®ã«Xeon 1ã³ã¢ã®220åã®é度ãåºãããã 。ã«ãªã«ãªãã¥ã¼ã³ãããªãã¦ããã®ç¨åº¦ã®é«éåãã§ããã¨ãããã¨ãéè¦ã 。
å ã¿ã«ãã³ãã«ããéåã®ä¸è¨ã®ç¯å²ã¯æ°å¤ãçºæ£ããªãéåé¨åã§ãã、æç»ããã¦ã¿ã¦ãçã£é»ãªã®ã§é¢ç½ãã¯ãªã。ããã¯ä¸¦ååããéã«è¨ç®ãåããªãããã«ãããã£ãã®ã§ãã®ãããªç¯å²ãæå®ãã¦ãã。ããã°ã©ã ã§åºåããããã¡ã¤ã«ã¯è²ã®çãã¼ã¿ãªã®ã§ãã®ã¾ã¾ã§ã¯è¡¨ç¤ºã§ããªã。ä¸å¿、ç»å表示ããããã®Pythonã³ã¼ããä»ãå ãã¦ãã。
以ä¸ã«、GPU2æãå©ç¨ãã¦ãã³ãã«ããéåãè¨ç®ããCUDAã³ã¼ãã示ãã¦ãã。
mandelbrot_thread.cu
// Mandelbrot set using GPGPU by nox, 2011.02.12 // nvcc -lcutil_x86_64 -arch sm_13 -use_fast_math -prec-sqrt=false -keep -L ~/NVIDIA_GPU_Computing_SDK/C/lib -I ~/NVIDIA_GPU_Computing_SDK/C/common/inc -g mandelbrot_thread.cu -o mandelbrot_thread #include <iostream> #include <fstream> #include <cutil_inline.h> #include <multithreading.h> using namespace std; const int BLOCK_SIZE_X = 16; const int BLOCK_SIZE_Y = 16; struct MandelbrotParam { int dev; // GPU device id uchar4* h_color; // image buffer float t, l, w, h; // top, left, width, height int sw, sh; // screen width, screen height int max_iter; // max number of iterators float th; // threshold }; __device__ uchar4 coloring(int n, int b) { int d = n % b; d *= 256 / b; int m = static_cast<int>(d / 42.667f); switch (m) { case 0: return make_uchar4(0, 6 * d, 255, 0); case 1: return make_uchar4(0, 255, 255 - 6 * (d - 43), 0); case 2: return make_uchar4(6 * (d - 86), 255, 0, 0); case 3: return make_uchar4(255, 255 - 6 * (d - 129), 0, 0); case 4: return make_uchar4(255, 0, 6 * (d - 171), 0); case 5: return make_uchar4(255 - 6 * (d - 214), 0, 255, 0); default: return make_uchar4(0, 0, 0, 0); } } __global__ void mandelbrot(float t, float l, float w, float h, int sw, int sh, int max_iter, float th, uchar4* d_color) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if (ix >= sw || iy >= sh) return; float ci = t + (static_cast<float>(iy) / sh) * h; float cr = l + (static_cast<float>(ix) / sw) * w; float zi = 0.0f; float zr = 0.0f; float zrzi, zr2, zi2; for (int i = 0; i < max_iter; i++) { zrzi = zr * zi; zr2 = zr * zr; zi2 = zi * zi; zr = zr2 - zi2 + cr; zi = zrzi + zrzi + ci; if (zi2 + zr2 >= th) { d_color[ix*sh+iy] = coloring(i, 256); return; } } d_color[ix*sh+iy] = make_uchar4(0, 0, 0, 0); } CUT_THREADPROC mandelbrot_thread(void* args) { MandelbrotParam* param = (MandelbrotParam*)args; cudaSetDevice(param->dev); uchar4* h_color = param->h_color; float t = param->t; float l = param->l; float w = param->w; float h = param->h; int sw = param->sw; int sh = param->sh; int max_iter = param->max_iter; float th = param->th; dim3 num_blocks((sw - 1) / BLOCK_SIZE_X + 1, (sh - 1) / BLOCK_SIZE_Y + 1, 1); dim3 num_threads(BLOCK_SIZE_X, BLOCK_SIZE_Y, 1); uchar4* d_color; cudaMalloc((void**)&d_color, sizeof(uchar4) * sw * sh); mandelbrot<<<num_blocks, num_threads>>>(t, l, w, h, sw, sh, max_iter, th, d_color); cudaThreadSynchronize(); cudaMemcpy(h_color, d_color, sizeof(uchar4) * sw * sh, cudaMemcpyDeviceToHost); cudaFree(d_color); CUT_THREADEND; } // outfile : output filename // t : top position // l : left position // w : width // h : height // sw : screen width // sh : screen height // max_iter : max number of iterations [256] // th : threshold [2.0] void write_mandelbrot(const string outfile, float t, float l, float w, float h, int sw, int sh, int max_iter=256, float th=2.0f) { MandelbrotParam param[2]; uchar4* h_color; cudaMallocHost((void**)&h_color, sizeof(uchar4) * sw * sh); // using 2 GPUs for (int i = 0; i < 2; i++) { param[i].dev = i; param[i].h_color = h_color; param[i].t = t; param[i].l = l; param[i].w = w / 2.0f; param[i].h = h; param[i].sw = sw / 2; param[i].sh = sh; param[i].max_iter = max_iter; param[i].th = th; } param[1].h_color = h_color + sw * sh / 2; param[1].l = l + w / 2.0f; unsigned int timer; cutCreateTimer(&timer); cutStartTimer(timer); CUTThread threads[2]; for (int i = 0; i < 2; i++) threads[i] = cutStartThread((CUT_THREADROUTINE)mandelbrot_thread, (void*)¶m[i]); cutWaitForThreads(threads, 2); cutStopTimer(timer); cout << "Time: " << cutGetTimerValue(timer) / 1000 << endl; fstream fs(outfile.c_str(), ios_base::out); fs << sw << endl; fs << sh << endl; for (int i = 0; i < sw * sh; i++) fs << h_color[i].x << h_color[i].y << h_color[i].z; fs.close(); cudaFreeHost(h_color); } int main(int argc, char* argv[]) { string outfile("mandelbrot.dat"); if (argc >= 2) outfile = argv[1]; // write_mandelbrot(output filename, // top position, // left position, // width, // height, // screen width, // screen height, // max number of iterations [256] // threshold [2.0]); //write_mandelbrot(outfile, -1.0f, -1.5f, 2.0f, 2.0f, 480, 480, 100); //write_mandelbrot(outfile, 0.57f, 0.34f, 0.036f, 0.027f, 6400, 4800, 1000); write_mandelbrot(outfile, -0.005f, -0.005f, 0.01f, 0.01f, 4800, 4800, 10000); return 0; }
表示ç¨ã®Pythonã³ã¼ã。
read_mandelbrot.py
#!/usr/bin/env python import sys, os, Image def main(args): if len(args) < 2: print >>sys.stderr, "%s datafile [imagefile]" % os.path.basename(args[0]) sys.exit() datafile = args[1] imagefile = args[2] if len(args) > 2 else "" f = file(datafile) sw = int(f.readline()) sh = int(f.readline()) data = map(ord, f.read()) im = Image.new("RGB", (sw, sh)) for x in range(sw): for y in range(sh): pos = (x * sh + y) * 3; im.putpixel((x, sh - y - 1), tuple(data[pos:pos+3])) if imagefile: im.save(imagefile) im.show() if __name__ == "__main__": main(sys.argv)
å ã¿ã«ãã³ãã«ããéåã®ä¸è¨ã®ç¯å²ã¯æ°å¤ãçºæ£ããªãéåé¨åã§ãã、æç»ããã¦ã¿ã¦ãçã£é»ãªã®ã§é¢ç½ãã¯ãªã。ããã¯ä¸¦ååããéã«è¨ç®ãåããªãããã«ãããã£ãã®ã§ãã®ãããªç¯å²ãæå®ãã¦ãã。ããã°ã©ã ã§åºåããããã¡ã¤ã«ã¯è²ã®çãã¼ã¿ãªã®ã§ãã®ã¾ã¾ã§ã¯è¡¨ç¤ºã§ããªã。ä¸å¿、ç»å表示ããããã®Pythonã³ã¼ããä»ãå ãã¦ãã。
以ä¸ã«、GPU2æãå©ç¨ãã¦ãã³ãã«ããéåãè¨ç®ããCUDAã³ã¼ãã示ãã¦ãã。
mandelbrot_thread.cu
// Mandelbrot set using GPGPU by nox, 2011.02.12 // nvcc -lcutil_x86_64 -arch sm_13 -use_fast_math -prec-sqrt=false -keep -L ~/NVIDIA_GPU_Computing_SDK/C/lib -I ~/NVIDIA_GPU_Computing_SDK/C/common/inc -g mandelbrot_thread.cu -o mandelbrot_thread #include <iostream> #include <fstream> #include <cutil_inline.h> #include <multithreading.h> using namespace std; const int BLOCK_SIZE_X = 16; const int BLOCK_SIZE_Y = 16; struct MandelbrotParam { int dev; // GPU device id uchar4* h_color; // image buffer float t, l, w, h; // top, left, width, height int sw, sh; // screen width, screen height int max_iter; // max number of iterators float th; // threshold }; __device__ uchar4 coloring(int n, int b) { int d = n % b; d *= 256 / b; int m = static_cast<int>(d / 42.667f); switch (m) { case 0: return make_uchar4(0, 6 * d, 255, 0); case 1: return make_uchar4(0, 255, 255 - 6 * (d - 43), 0); case 2: return make_uchar4(6 * (d - 86), 255, 0, 0); case 3: return make_uchar4(255, 255 - 6 * (d - 129), 0, 0); case 4: return make_uchar4(255, 0, 6 * (d - 171), 0); case 5: return make_uchar4(255 - 6 * (d - 214), 0, 255, 0); default: return make_uchar4(0, 0, 0, 0); } } __global__ void mandelbrot(float t, float l, float w, float h, int sw, int sh, int max_iter, float th, uchar4* d_color) { int ix = blockIdx.x * blockDim.x + threadIdx.x; int iy = blockIdx.y * blockDim.y + threadIdx.y; if (ix >= sw || iy >= sh) return; float ci = t + (static_cast<float>(iy) / sh) * h; float cr = l + (static_cast<float>(ix) / sw) * w; float zi = 0.0f; float zr = 0.0f; float zrzi, zr2, zi2; for (int i = 0; i < max_iter; i++) { zrzi = zr * zi; zr2 = zr * zr; zi2 = zi * zi; zr = zr2 - zi2 + cr; zi = zrzi + zrzi + ci; if (zi2 + zr2 >= th) { d_color[ix*sh+iy] = coloring(i, 256); return; } } d_color[ix*sh+iy] = make_uchar4(0, 0, 0, 0); } CUT_THREADPROC mandelbrot_thread(void* args) { MandelbrotParam* param = (MandelbrotParam*)args; cudaSetDevice(param->dev); uchar4* h_color = param->h_color; float t = param->t; float l = param->l; float w = param->w; float h = param->h; int sw = param->sw; int sh = param->sh; int max_iter = param->max_iter; float th = param->th; dim3 num_blocks((sw - 1) / BLOCK_SIZE_X + 1, (sh - 1) / BLOCK_SIZE_Y + 1, 1); dim3 num_threads(BLOCK_SIZE_X, BLOCK_SIZE_Y, 1); uchar4* d_color; cudaMalloc((void**)&d_color, sizeof(uchar4) * sw * sh); mandelbrot<<<num_blocks, num_threads>>>(t, l, w, h, sw, sh, max_iter, th, d_color); cudaThreadSynchronize(); cudaMemcpy(h_color, d_color, sizeof(uchar4) * sw * sh, cudaMemcpyDeviceToHost); cudaFree(d_color); CUT_THREADEND; } // outfile : output filename // t : top position // l : left position // w : width // h : height // sw : screen width // sh : screen height // max_iter : max number of iterations [256] // th : threshold [2.0] void write_mandelbrot(const string outfile, float t, float l, float w, float h, int sw, int sh, int max_iter=256, float th=2.0f) { MandelbrotParam param[2]; uchar4* h_color; cudaMallocHost((void**)&h_color, sizeof(uchar4) * sw * sh); // using 2 GPUs for (int i = 0; i < 2; i++) { param[i].dev = i; param[i].h_color = h_color; param[i].t = t; param[i].l = l; param[i].w = w / 2.0f; param[i].h = h; param[i].sw = sw / 2; param[i].sh = sh; param[i].max_iter = max_iter; param[i].th = th; } param[1].h_color = h_color + sw * sh / 2; param[1].l = l + w / 2.0f; unsigned int timer; cutCreateTimer(&timer); cutStartTimer(timer); CUTThread threads[2]; for (int i = 0; i < 2; i++) threads[i] = cutStartThread((CUT_THREADROUTINE)mandelbrot_thread, (void*)¶m[i]); cutWaitForThreads(threads, 2); cutStopTimer(timer); cout << "Time: " << cutGetTimerValue(timer) / 1000 << endl; fstream fs(outfile.c_str(), ios_base::out); fs << sw << endl; fs << sh << endl; for (int i = 0; i < sw * sh; i++) fs << h_color[i].x << h_color[i].y << h_color[i].z; fs.close(); cudaFreeHost(h_color); } int main(int argc, char* argv[]) { string outfile("mandelbrot.dat"); if (argc >= 2) outfile = argv[1]; // write_mandelbrot(output filename, // top position, // left position, // width, // height, // screen width, // screen height, // max number of iterations [256] // threshold [2.0]); //write_mandelbrot(outfile, -1.0f, -1.5f, 2.0f, 2.0f, 480, 480, 100); //write_mandelbrot(outfile, 0.57f, 0.34f, 0.036f, 0.027f, 6400, 4800, 1000); write_mandelbrot(outfile, -0.005f, -0.005f, 0.01f, 0.01f, 4800, 4800, 10000); return 0; }
表示ç¨ã®Pythonã³ã¼ã。
read_mandelbrot.py
#!/usr/bin/env python import sys, os, Image def main(args): if len(args) < 2: print >>sys.stderr, "%s datafile [imagefile]" % os.path.basename(args[0]) sys.exit() datafile = args[1] imagefile = args[2] if len(args) > 2 else "" f = file(datafile) sw = int(f.readline()) sh = int(f.readline()) data = map(ord, f.read()) im = Image.new("RGB", (sw, sh)) for x in range(sw): for y in range(sh): pos = (x * sh + y) * 3; im.putpixel((x, sh - y - 1), tuple(data[pos:pos+3])) if imagefile: im.save(imagefile) im.show() if __name__ == "__main__": main(sys.argv)
ã³ã¡ã³ã