【初心者向け】GPUでデータ分析 ~最小値~【CUDA】

Uncategorized

CUDAで最小値を求めるプログラムを作成しました。


#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <vector>
#include <stdio.h>
#include <windows.h>
#include <cuda.h>

#pragma comment(lib, "winmm.lib")


/**
 * Warp-wide minimum using a butterfly (XOR) shuffle.
 *
 * Every lane exchanges its running minimum with the lane whose index differs
 * in one bit (16, 8, 4, 2, 1). After the five steps every lane of the warp
 * holds the minimum of all 32 input values.
 *
 * Preconditions:
 *   - All 32 lanes of the warp must call this function together (the shuffle
 *     uses the full participant mask 0xffffffff). Do not call it from code
 *     in which lanes of the same warp may have diverged.
 *   - Requires the *_sync warp intrinsics (CUDA 9 or newer).
 *
 * @param val  This lane's input value.
 * @param idx  Lane index of the caller. Unused by the XOR exchange; kept so
 *             that existing call sites remain valid.
 * @return     Minimum of `val` over all lanes of the warp. NaN inputs are
 *             ignored by fminf unless every input is NaN.
 */
__device__
float reduction_min(float val, int idx)
{
    (void)idx;  // lane index is implicit in __shfl_xor_sync

    for (int offset = warpSize / 2; offset > 0; offset >>= 1)
    {
        // Fetch the partner lane's *running* minimum and fold it in, so the
        // result of each step feeds the next one.
        const float other = __shfl_xor_sync(0xffffffffu, val, offset, warpSize);
        val = fminf(val, other);
    }

    return val;
}


/**
 * Computes the minimum of d_val[0 .. n-1] and stores it in d_out[0].
 *
 * Launch requirements:
 *   - A single block (blockIdx is not used; every block would scan the whole
 *     array and write the same output element).
 *   - blockDim.x must be a multiple of warpSize (32) and at most 1024, so that
 *     every warp is complete when it enters the warp shuffle.
 *   - Dynamic shared memory: at least (blockDim.x / warpSize) floats; 32 floats
 *     are enough for any legal block size.
 *
 * @param d_val  Device pointer to the n input values.
 * @param d_out  Device pointer to one float that receives the minimum.
 *               Receives +infinity when n <= 0.
 * @param n      Number of input elements.
 */
__global__
void kernel(float* d_val, float* d_out, int n)
{
    extern __shared__ float sdata[];

    const unsigned int tid = threadIdx.x;
    const int warpId = tid / warpSize;
    const int lanId = tid % warpSize;
    const int numWarps = (blockDim.x + warpSize - 1) / warpSize;

    // +infinity is the identity element of min: threads that read no element
    // (tid >= n) must not influence the result.
    const float identity = INFINITY;

    // Phase 1: each thread privately scans its strided share of the input.
    // No warp communication happens here, so lanes may leave the loop at
    // different iterations without any hazard.
    float threadMin = identity;
    for (int i = tid; i < n; i += blockDim.x)
    {
        threadMin = fminf(threadMin, d_val[i]);
    }

    // Phase 2: all lanes have reconverged; reduce within each warp and let
    // lane 0 publish the warp's partial result.
    const float warpMin = reduction_min(threadMin, lanId);
    if (lanId == 0) sdata[warpId] = warpMin;
    __syncthreads();

    // Phase 3: the first warp combines the per-warp partial results. Lanes
    // without a matching warp contribute the identity instead of reading an
    // unwritten shared-memory slot.
    if (warpId == 0)
    {
        float blockMin = (lanId < numWarps) ? sdata[lanId] : identity;
        blockMin = reduction_min(blockMin, lanId);
        if (lanId == 0) d_out[0] = blockMin;
    }
}



/**
 * Test driver: for several input sizes, fills an array with 0, 1, ..., n-1,
 * computes its minimum on the GPU with `kernel`, and prints it next to the
 * value obtained on the host with std::min_element.
 *
 * Every CUDA call is checked; on failure the error is reported on stderr and
 * the program exits with EXIT_FAILURE. Device buffers are released at the end
 * of each iteration.
 */
int main()
{
    // Reports a failed CUDA call and terminates the program.
    auto check = [](cudaError_t err, const char* what)
    {
        if (err != cudaSuccess)
        {
            std::cerr << "CUDA error in " << what << ": "
                      << cudaGetErrorString(err) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    std::vector<int> N = { 1024, 524288, 1048576 };

    for (auto n : N)
    {
        std::cout << "---N : " << n << std::endl;

        // Host input: 0, 1, ..., n-1 (allocated once instead of grown by push_back).
        std::vector<float> h_val(n);
        for (int i = 0; i < n; i++) h_val[i] = float(i);

        float* d_val = nullptr;
        float* d_out = nullptr;
        float h_min = 0.0f;  // host copy of the device result

        check(cudaMalloc(&d_val, sizeof(float) * n), "cudaMalloc(d_val)");
        check(cudaMalloc(&d_out, sizeof(float)), "cudaMalloc(d_out)");
        check(cudaMemcpy(d_val, h_val.data(), sizeof(float) * n, cudaMemcpyHostToDevice),
              "cudaMemcpy(host -> device)");

        // The kernel expects one block whose size is a multiple of the warp
        // size, plus one float of dynamic shared memory per warp (32 covers
        // the 1024-thread maximum).
        int block = 1024;
        int grid = 1;
        int sdata_size = 32;

        kernel << < grid, block, sizeof(float) * sdata_size >> > (d_val, d_out, n);
        check(cudaGetLastError(), "kernel launch");

        // The blocking copy also synchronizes with the kernel, so execution
        // errors raised inside the kernel surface here.
        check(cudaMemcpy(&h_min, d_out, sizeof(float), cudaMemcpyDeviceToHost),
              "cudaMemcpy(device -> host)");

        std::cout << "devicel min = " << h_min << std::endl;
        std::cout << "host    min = " << *std::min_element(h_val.begin(), h_val.end()) << std::endl;

        check(cudaFree(d_val), "cudaFree(d_val)");
        check(cudaFree(d_out), "cudaFree(d_out)");
    }

    return 0;
}

出力結果

---N : 1024
devicel min = 0
host    min = 0
---N : 524288
devicel min = 0
host    min = 0
---N : 1048576
devicel min = 0
host    min = 0

コメント

タイトルとURLをコピーしました