[cuda] udacity atomic add 관련 코드

#include <stdio.h>

#include "gputimer.h"

#define NUM_THREADS 10000000

#define ARRAY_SIZE 100

#define BLOCK_WIDTH 1000

// Prints the first `size` integers of `array` to stdout on a single line,
// wrapped in braces and separated by spaces, e.g. "{ 1 2 3 }".
//
// array: host-readable buffer holding at least `size` ints.
// size:  number of elements to print; nothing is printed between the
//        braces when it is zero or negative.
void print_array(int *array, int size)
{
    printf("{ ");

    int idx = 0;
    while (idx < size) {
        printf("%d ", array[idx]);
        ++idx;
    }

    printf("}\n");
}

// Kernel: every thread adds 1 to one element of `g` using a plain
// (non-atomic) read-modify-write.
//
// Launch layout: 1D grid of 1D blocks; only the .x components are used.
// g: device buffer with at least ARRAY_SIZE ints.
//
// The global thread index is wrapped modulo ARRAY_SIZE, so whenever more
// than ARRAY_SIZE threads are launched several threads target the same
// element. Because the load, add and store are separate steps, concurrent
// updates to one element can overwrite each other and increments may be lost.
__global__ void increment_naive(int *g)
{
    // Flat global index of this thread within the 1D launch.
    const int global_tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Element this thread updates; consecutive threads hit consecutive
    // elements, wrapping around at ARRAY_SIZE.
    const int slot = global_tid % ARRAY_SIZE;

    // Unsynchronized read, increment, write back.
    const int previous = g[slot];
    g[slot] = previous + 1;
}

// Kernel: every thread adds 1 to one element of `g` using atomicAdd, so
// increments from threads that share an element are all applied.
//
// Launch layout: 1D grid of 1D blocks; only the .x components are used.
// g: device buffer with at least ARRAY_SIZE ints.
__global__ void increment_atomic(int *g)
{
    // Flat global index of this thread within the 1D launch.
    const int global_tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Element this thread updates; consecutive threads hit consecutive
    // elements, wrapping around at ARRAY_SIZE.
    const int slot = global_tid % ARRAY_SIZE;

    // Indivisible read-modify-write on the shared element.
    atomicAdd(g + slot, 1);
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Launches NUM_THREADS threads that each increment one of ARRAY_SIZE
// counters on the GPU, then prints the resulting counters and the time
// reported by GpuTimer.
//
// Returns 0 on success and 1 if any CUDA runtime call or the kernel
// launch reports an error (the error is printed to stderr).
int main(int argc, char **argv)
{
    // Command-line arguments are not used.
    (void) argc;
    (void) argv;

    GpuTimer timer;

    printf("%d total threads in %d blocks writing into %d array elements\n",
           NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);

    // declare and allocate host memory
    int h_array[ARRAY_SIZE];
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);

    // declare, allocate, and zero out GPU memory
    int *d_array = NULL;
    if (!cuda_ok(cudaMalloc((void **) &d_array, ARRAY_BYTES), "cudaMalloc")) {
        return 1;
    }

    bool ok = cuda_ok(cudaMemset((void *) d_array, 0, ARRAY_BYTES), "cudaMemset");

    if (ok) {
        // launch the kernel - comment out one of these
        timer.Start();

        // Instructions: This program is needed for the next quiz
        // uncomment increment_naive to measure speed and accuracy
        // of non-atomic increments or uncomment increment_atomic to
        // measure speed and accuracy of atomic increments

        // increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
        increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);

        // A kernel launch returns no status; a bad launch configuration is
        // only visible through cudaGetLastError(). Read it right after the
        // launch so it cannot be confused with a later call's error.
        const cudaError_t launch_status = cudaGetLastError();

        timer.Stop();

        ok = cuda_ok(launch_status, "kernel launch");
    }

    if (ok) {
        // copy back the array of sums from GPU; this blocking copy also
        // reports errors raised while the kernel was executing.
        ok = cuda_ok(cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost),
                     "cudaMemcpy (device to host)");
    }

    if (ok) {
        // Only print results that were actually produced by the GPU.
        print_array(h_array, ARRAY_SIZE);
        printf("Time elapsed = %g ms\n", timer.Elapsed());
    }

    // free GPU memory allocation on every path, then exit
    if (!cuda_ok(cudaFree(d_array), "cudaFree")) {
        ok = false;
    }

    return ok ? 0 : 1;
}

결과

10^6 -> 10^6: 0.117984 ms, Correct

10^6 -> atomic 10^6: 0.16848 ms, Correct

10^6 -> 100: 0.31552 ms, Wrong

10^6 -> atomic 100: 0.368672 ms, Correct

10^7 -> atomic 100: 3.45574 ms, Correct

저작자표시

Blue Frog's Wisdom

[cuda] udacity atomic add 관련 코드

댓글

티스토리툴바

[cuda] udacity atomic add 관련 코드

관련글

댓글

티스토리툴바