본문 바로가기
Computer Engineering/Tip

[cuda] udacity atomic add 관련 코드

by 임은천 2015. 6. 14.

#include <stdio.h>

#include "gputimer.h"


#define NUM_THREADS 10000000

#define ARRAY_SIZE  100


#define BLOCK_WIDTH 1000


// Writes the first `size` ints of `array` to stdout on a single line,
// space-separated and wrapped in braces, e.g. "{ 1 2 3 }".
//
//   array  host pointer to the values to print
//   size   number of elements to print; nothing is printed between the
//          braces when size <= 0
void print_array(int *array, int size)
{
    printf("{ ");

    int idx = 0;
    while (idx < size)
    {
        // Every value is followed by one space, including the last one.
        printf("%d ", array[idx]);
        ++idx;
    }

    printf("}\n");
}


// Kernel: every thread adds 1 to one element of `g` using a plain
// (non-atomic) read-modify-write.
//
// Indexing uses only the x dimension of the grid and block. The flat thread
// index is wrapped with modulo ARRAY_SIZE, so `g` must hold at least
// ARRAY_SIZE ints and threads whose indices are congruent modulo ARRAY_SIZE
// target the same element.
//
// Because the load, add and store are separate steps, threads that hit the
// same element concurrently can overwrite each other's result; the final
// counts are then lower than the number of increments issued.
__global__ void increment_naive(int *g)
{
    // Flat index of this thread across the whole launch.
    const int global_tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Element this thread increments, wrapping at ARRAY_SIZE.
    const int slot = global_tid % ARRAY_SIZE;

    // Unsynchronized read-modify-write.
    g[slot] += 1;
}


// Kernel: every thread adds 1 to one element of `g` with atomicAdd.
//
// Indexing uses only the x dimension of the grid and block. The flat thread
// index is wrapped with modulo ARRAY_SIZE, so `g` must hold at least
// ARRAY_SIZE ints and threads whose indices are congruent modulo ARRAY_SIZE
// target the same element.
//
// atomicAdd performs the read-modify-write as one indivisible operation, so
// no increment is lost even when many threads update the same element.
__global__ void increment_atomic(int *g)
{
    // Flat index of this thread across the whole launch.
    const int global_tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Element this thread increments, wrapping at ARRAY_SIZE.
    const int slot = global_tid % ARRAY_SIZE;

    // The previous value returned by atomicAdd is not needed here.
    atomicAdd(g + slot, 1);
}


// Reports a failed CUDA runtime call on stderr.
//
//   status  value returned by the CUDA runtime call
//   what    short label for the call, used in the error message
//
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) { return true; }

    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches NUM_THREADS threads (NUM_THREADS / BLOCK_WIDTH blocks of
// BLOCK_WIDTH threads) that each increment one element of a zeroed
// ARRAY_SIZE-element device array, then prints the resulting array and the
// elapsed time reported by GpuTimer.
//
// Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
// reports an error. argc and argv are unused.
int main(int argc,char **argv)
{
    GpuTimer timer;

    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

    // declare and allocate host memory
    int h_array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

    // declare, allocate, and zero out GPU memory
    int * d_array = NULL;
    if (!cuda_ok(cudaMalloc((void **) &d_array, ARRAY_BYTES), "cudaMalloc"))
    {
        return 1;
    }

    bool ok = cuda_ok(cudaMemset((void *) d_array, 0, ARRAY_BYTES), "cudaMemset");

    if (ok)
    {
        // launch the kernel - comment out one of these
        // NOTE(review): GpuTimer comes from gputimer.h, which is not visible
        // here; presumably Elapsed() waits for the timed work to finish -
        // confirm.
        timer.Start();

        // Instructions: This program is needed for the next quiz
        // uncomment increment_naive to measure speed and accuracy
        // of non-atomic increments or uncomment increment_atomic to
        // measure speed and accuracy of atomic increments
        // increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
        increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);

        // A kernel launch returns no status itself; an invalid launch
        // configuration is only visible through cudaGetLastError().
        const cudaError_t launch_status = cudaGetLastError();

        timer.Stop();

        ok = cuda_ok(launch_status, "kernel launch");
    }

    if (ok)
    {
        // copy back the array of sums from GPU; errors raised while the
        // kernel was executing are also reported by this call
        ok = cuda_ok(cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                     "cudaMemcpy");
    }

    if (ok)
    {
        // print only when h_array is known to hold the device results
        print_array(h_array, ARRAY_SIZE);
        printf("Time elapsed = %g ms\n", timer.Elapsed());
    }

    // free GPU memory allocation (also on the failure paths) and exit
    const bool freed = cuda_ok(cudaFree(d_array), "cudaFree");

    return (ok && freed) ? 0 : 1;
}


결과 (스레드 수 -> 배열 원소 수: 실행 시간, 결과의 정확성)


10^6 -> 10^6: 0.117984 ms, Correct

10^6 -> atomic 10^6: 0.16848 ms, Correct

10^6 -> 100: 0.31552 ms, Wrong

10^6 -> atomic 100: 0.368672 ms, Correct

10^7 -> atomic 100: 3.45574 ms, Correct

댓글