__shfl_upや__shfl_downでは，端の値を循環させることが出来ない．
そこで，__shflを使えば，端の値を循環させることができる．
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>

/* Number of lanes in a warp; also the number of elements this demo rotates. */
static const int kWarpSize = 32;

/*
 * Abort with a readable message if a CUDA runtime call fails.
 * Without this, an earlier failure only shows up as garbage output later.
 */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Rotate the values held by the lanes of a warp, wrapping around at the ends.
 *
 * Each thread loads in_dev[threadIdx.x], then reads the value held by lane
 * (lane + offset) mod 32 of its own warp and stores it to out_dev[threadIdx.x].
 * Unlike __shfl_up / __shfl_down, an indexed shuffle lets the lanes at the
 * edge of the warp fetch from the opposite edge.
 *
 * Parameters:
 *   out_dev - device buffer receiving one int per thread of the block.
 *   in_dev  - device buffer providing one int per thread of the block.
 *   offset  - lane distance to rotate by; negative values rotate the other way.
 *
 * Launch requirements: 1D block whose size is a multiple of 32, because the
 * shuffle names all 32 lanes in its participation mask. Indexing uses
 * threadIdx.x only, so every block reads and writes the same elements.
 * Requires CUDA 9 or newer (__shfl_sync).
 */
__global__ void shfl_wrap(int *out_dev, int *in_dev, int const offset)
{
    int value = in_dev[threadIdx.x];

    /* Keep only the low 5 bits so the source lane wraps inside the warp.
     * The legacy __shfl did this implicitly; here it is spelled out. */
    const int src_lane = (int)((threadIdx.x + offset) & (kWarpSize - 1));

    /* The mask-less __shfl is deprecated and unavailable on Volta and newer;
     * the _sync variant needs the set of participating lanes (all 32 here). */
    value = __shfl_sync(0xffffffffu, value, src_lane);

    out_dev[threadIdx.x] = value;
}

/*
 * Demo driver: fills a 32-element array with 0..31, rotates it by two lanes
 * on the GPU with shfl_wrap, and prints the result one value per line
 * (2, 3, ..., 31, 0, 1).
 */
int main()
{
    const size_t bytes = kWarpSize * sizeof(int);

    int *in = (int *)malloc(bytes);
    int *out = (int *)malloc(bytes);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    int *in_dev = NULL;
    int *out_dev = NULL;
    CUDA_CHECK(cudaMalloc(&in_dev, bytes));
    CUDA_CHECK(cudaMalloc(&out_dev, bytes));

    for (int i = 0; i < kWarpSize; i++) {
        in[i] = i;
    }

    CUDA_CHECK(cudaMemcpy(in_dev, in, bytes, cudaMemcpyHostToDevice));

    shfl_wrap<<<1, kWarpSize>>>(out_dev, in_dev, 2);
    /* A launch does not return a status; fetch it explicitly. */
    CUDA_CHECK(cudaGetLastError());

    /* Blocking copy: also waits for the kernel and reports execution faults. */
    CUDA_CHECK(cudaMemcpy(out, out_dev, bytes, cudaMemcpyDeviceToHost));

    for (int i = 0; i < kWarpSize; i++) {
        printf("%d\n", out[i]);
    }

    CUDA_CHECK(cudaFree(in_dev));
    CUDA_CHECK(cudaFree(out_dev));
    free(in);
    free(out);

    return 0;
}