CUDAを使って、GPU上で配列の平均を求めるプログラムを書いてみました。
#include <iostream>
#include <vector>
#include <stdio.h>
#include <numeric>
#include <windows.h>
#include <cuda.h>
/**
 * Final stage of a shared-memory sum reduction, executed by a single warp.
 *
 * Folds sdata[0..63] down so that sdata[0] ends up holding the sum of those
 * 64 entries. The other entries are left holding intermediate partial sums.
 *
 * Preconditions:
 *   - must be called by all 32 lanes of one warp with tid = 0..31
 *     (__syncwarp() is used with its default full-warp mask);
 *   - sdata must have at least 64 valid elements (lane 31 reads sdata[63]).
 *
 * Requires CUDA 9+ for __syncwarp().
 *
 * @param sdata shared-memory buffer of partial sums
 * @param tid   index of the calling lane within the buffer
 */
__device__
void warpReduce(volatile float* sdata, unsigned int tid)
{
    // Offsets 32, 16, 8, 4, 2, 1: each step adds the element `offset` slots
    // above onto sdata[tid].
    for (unsigned int offset = 32; offset > 0; offset >>= 1)
    {
        // Read into a register first. Another lane owns sdata[tid + offset]
        // and will overwrite it in this same step, so the read and the write
        // have to be separated by explicit warp barriers; lanes are not
        // guaranteed to run in lockstep under independent thread scheduling.
        const float v = sdata[tid] + sdata[tid + offset];
        __syncwarp();
        sdata[tid] = v;
        __syncwarp();
    }
}
/**
 * Computes the arithmetic mean of d_val[0..n) and stores it in d_out[0].
 *
 * Indexing uses only threadIdx.x and blockDim.x, so every block of the launch
 * performs the complete reduction on its own; the kernel is meant to be
 * launched with a single block.
 *
 * Launch requirements:
 *   - blockDim.x must be a power of two and at least 64 (the final warp stage
 *     reads shared-memory entries up to index 63);
 *   - dynamic shared memory of blockDim.x * sizeof(float) bytes.
 *
 * The sum is accumulated in single precision. The result is sdata[0] / n, so
 * n is expected to be positive.
 *
 * @param d_val device pointer to the n input values
 * @param d_out device pointer receiving the mean in element 0
 * @param n     number of input values
 */
__global__
void reduction_avg(float* d_val, float* d_out, int n)
{
    extern __shared__ float sdata[];
    const unsigned int tid = threadIdx.x;

    // Each thread sums the elements tid, tid + blockDim.x, tid + 2*blockDim.x, ...
    float f = 0.0f;
    for (int i = tid; i < n; i += blockDim.x)
    {
        f += d_val[i];
    }
    sdata[tid] = f;
    __syncthreads();

    // Tree reduction in shared memory, halving the live range until 64
    // partial sums remain. For blockDim.x == 1024 this performs the steps
    // 512, 256, 128, 64. The barrier sits outside the `if` so that every
    // thread of the block reaches it.
    for (unsigned int half = blockDim.x / 2; half > 32; half >>= 1)
    {
        if (tid < half) { sdata[tid] += sdata[tid + half]; }
        __syncthreads();
    }

    // The first warp folds the remaining 64 partial sums into sdata[0].
    if (tid < 32) warpReduce(sdata, tid);

    // Thread 0 is the one that produced sdata[0], so it can publish the mean.
    if (tid == 0) d_out[0] = sdata[0] / n;
}
/**
 * Driver: for several input sizes, fills a host array with 0, 1, ..., n-1,
 * computes its mean on the GPU with reduction_avg and prints it next to a
 * host-side reference computed in double precision.
 *
 * Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
 * reports an error.
 */
int main()
{
    // Reports a failed CUDA runtime call on stderr; returns true on success.
    auto ok = [](cudaError_t err, const char* what) -> bool {
        if (err == cudaSuccess) return true;
        std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
        return false;
    };

    // Launch configuration: a single block of 1024 threads with one float of
    // dynamic shared memory per thread.
    const int block = 1024;
    const int grid = 1;
    const size_t smem_bytes = sizeof(float) * block;

    const std::vector<int> N = { 1024, 524288, 1048576 };
    for (auto n : N)
    {
        std::cout << "---N : " << n << std::endl;

        // Host input: 0.0f, 1.0f, ..., n-1 (all exactly representable in
        // float for the sizes used here).
        std::vector<float> h_val(n);
        std::iota(h_val.begin(), h_val.end(), 0.0f);

        const size_t bytes = sizeof(float) * h_val.size();
        float* d_val = nullptr;
        float* d_out = nullptr;
        float h_avg = 0.0f;

        bool good = ok(cudaMalloc(&d_val, bytes), "cudaMalloc(d_val)")
                 && ok(cudaMalloc(&d_out, sizeof(float)), "cudaMalloc(d_out)")
                 && ok(cudaMemcpy(d_val, h_val.data(), bytes, cudaMemcpyHostToDevice),
                       "cudaMemcpy(host -> device)");
        if (good)
        {
            reduction_avg<<<grid, block, smem_bytes>>>(d_val, d_out, n);
            // Launch errors are only visible through cudaGetLastError(); the
            // blocking copy afterwards surfaces errors raised while the
            // kernel was running.
            good = ok(cudaGetLastError(), "reduction_avg launch")
                && ok(cudaMemcpy(&h_avg, d_out, sizeof(float), cudaMemcpyDeviceToHost),
                      "cudaMemcpy(device -> host)");
        }

        // Release the device buffers on every iteration, including the
        // failure path. cudaFree(nullptr) is a no-op.
        cudaFree(d_val);
        cudaFree(d_out);
        if (!good) return 1;

        std::cout << "devicel Avg = " << h_avg << std::endl;
        std::cout << "host Avg = " << std::accumulate(h_val.begin(), h_val.end(), 0.0) / h_val.size() << std::endl;
    }
    return 0;
}
出力結果
---N : 1024
devicel Avg = 511.5
host Avg = 511.5
---N : 524288
devicel Avg = 262144
host Avg = 262144
---N : 1048576
devicel Avg = 524288
host Avg = 524288
コメント