CUDAで最小値を求めるプログラムを作成しました。
#include <iostream>
#include <vector>
#include <stdio.h>
#include <numeric>
#include <windows.h>
#include <cuda.h>
#include <algorithm>
#pragma comment(lib, "winmm.lib")
// Warp-wide minimum using a butterfly (XOR) shuffle reduction.
//
// Preconditions:
//   - Must be executed by all 32 lanes of the warp together: the shuffle uses the
//     full participation mask 0xffffffff, so calling it from a branch or loop
//     that only some lanes of the warp execute is undefined behaviour.
//   - Requires the *_sync warp intrinsics (CUDA 9 or newer).
//
// Parameters:
//   val - this lane's candidate value.
//   idx - lane index of the caller. Kept so existing call sites keep compiling;
//         unused, because the XOR shuffle addresses the partner lane relative
//         to the calling lane.
//
// Returns the minimum of `val` over all lanes of the warp. The butterfly
// pattern leaves the same result in every lane, not only in lane 0.
__device__
float reduction_min(float val, int idx)
{
    (void)idx;

    // Offsets 16, 8, 4, 2, 1. The running minimum is carried from one step to
    // the next; comparing each shuffled value against the original input
    // instead would discard all but the last step.
    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
    {
        const float partner = __shfl_xor_sync(0xffffffffu, val, offset);
        val = fminf(val, partner);
    }
    return val;
}
// Single-block minimum reduction: writes the minimum of d_val[0 .. n-1] to
// d_out[0].
//
// Launch requirements:
//   - One block. Every block scans the whole input with a blockDim.x stride
//     and thread 0 of each block writes d_out[0], so extra blocks only repeat
//     the same work.
//   - blockDim.x must be a multiple of warpSize and at most warpSize * warpSize,
//     because the per-warp partial results are combined by a single warp.
//   - Dynamic shared memory: at least (blockDim.x / warpSize) floats.
//
// Parameters:
//   d_val - device pointer to n input values (read only).
//   d_out - device pointer to one float receiving the result.
//   n     - number of input values; for n <= 0 the result is +infinity.
//
// Relies on reduction_min() returning the warp-wide minimum at least in lane 0.
__global__
void kernel(float* d_val, float* d_out, int n)
{
    extern __shared__ float sdata[];

    const unsigned int tid = threadIdx.x;
    const int warpId = tid / warpSize;
    const int lanId = tid % warpSize;
    const int numWarps = (blockDim.x + warpSize - 1) / warpSize;

    // Stage 1: thread-local minimum over a block-strided slice of the input.
    // Starting from +infinity (the identity of min) keeps threads that own no
    // element from injecting a bogus 0 into the result.
    float localMin = INFINITY;
    for (int i = tid; i < n; i += blockDim.x)
    {
        localMin = fminf(localMin, d_val[i]);
    }

    // Stage 2: one warp reduction per warp, done outside the data loop so that
    // every lane of the warp takes part in the shuffle regardless of n.
    localMin = reduction_min(localMin, lanId);
    if (lanId == 0) sdata[warpId] = localMin;
    __syncthreads();

    // Stage 3: the first warp combines the per-warp partial minima. Lanes
    // without a corresponding warp contribute +infinity instead of reading an
    // unwritten shared-memory slot.
    if (warpId == 0)
    {
        localMin = (lanId < numWarps) ? sdata[lanId] : INFINITY;
        localMin = reduction_min(localMin, lanId);
        if (tid == 0) d_out[0] = localMin;
    }
}
// Host driver: for several input sizes, fills an array with 0, 1, ..., n-1,
// runs the single-block min kernel on it and prints the device result next to
// the host reference computed with std::min_element.
//
// Every CUDA runtime call is checked; on failure the error is printed to
// stderr and the process exits with EXIT_FAILURE.
int main()
{
    // Report a failed CUDA runtime call and stop. Continuing after a failure
    // would only produce misleading results from later calls.
    auto check = [](cudaError_t err, const char* what)
    {
        if (err != cudaSuccess)
        {
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            std::exit(EXIT_FAILURE);
        }
    };

    const std::vector<int> N = { 1024, 524288, 1048576 };
    for (auto n : N)
    {
        std::cout << "---N : " << n << std::endl;

        // Input values 0, 1, ..., n-1 (all exactly representable as float).
        // NOTE(review): the expected minimum is therefore always 0, so this
        // check cannot expose a reduction that erroneously yields 0; consider
        // input whose minimum is non-zero.
        std::vector<float> h_val(n);
        std::iota(h_val.begin(), h_val.end(), 0.0f);

        float* d_val = nullptr;
        float* d_out = nullptr;
        check(cudaMalloc(&d_val, sizeof(float) * n), "cudaMalloc(d_val)");
        check(cudaMalloc(&d_out, sizeof(float)), "cudaMalloc(d_out)");
        check(cudaMemcpy(d_val, h_val.data(), sizeof(float) * n, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

        const int block = 1024;
        const int grid = 1;
        const int sdata_size = 32;  // one float per warp: 1024 threads / 32 lanes
        kernel<<<grid, block, sizeof(float) * sdata_size>>>(d_val, d_out, n);
        // A kernel launch returns no status itself; launch-configuration
        // errors are only visible through cudaGetLastError().
        check(cudaGetLastError(), "kernel launch");

        // The blocking copy also waits for the kernel to finish and reports
        // any error raised while it was running.
        float h_min = 0.0f;
        check(cudaMemcpy(&h_min, d_out, sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");

        std::cout << "devicel min = " << h_min << std::endl;
        std::cout << "host min = " << *std::min_element(h_val.begin(), h_val.end()) << std::endl;

        // Release the device buffers before the next size so nothing leaks
        // across iterations.
        check(cudaFree(d_val), "cudaFree(d_val)");
        check(cudaFree(d_out), "cudaFree(d_out)");
    }
    return 0;
}
出力結果
---N : 1024
devicel min = 0
host min = 0
---N : 524288
devicel min = 0
host min = 0
---N : 1048576
devicel min = 0
host min = 0
コメント