#include <stdio.h>
#include <stdlib.h>
#include <string.h>
__global__ void sumArraysOnGPU(float *A, float *B, float *C, int const N) {
   int i = threadIdx.x + blockDim.x * blockIdx.x;
   if(i < N) C[i] = A[i] + B[i];
}

int main(int argc, char **argv) {
   printf("%s Starting...\n", argv[0]);
   // set up device
   int dev = 0;
   cudaSetDevice(dev);
   // set up data size of vectors
   int nElem = 1 << 5;
   printf("Vector size %d\n", nElem);
   // malloc host memory
   size_t nBytes = nElem * sizeof(float);
   float *h_A, *h_B, *gpuRef;
   h_A = (float *) malloc(nBytes);
   h_B = (float *) malloc(nBytes);
   gpuRef = (float *) malloc(nBytes);
   // initialize data at host side
   for(int i = 0; i < nElem; i++) {
      h_A[i] = h_B[i] = i;
   }
   memset(gpuRef, 0, nBytes);
   // malloc device global memory
   float *d_A, *d_B, *d_C;
   cudaMalloc((float **) &d_A, nBytes);
   cudaMalloc((float **) &d_B, nBytes);
   cudaMalloc((float **) &d_C, nBytes);
   // transfer data from host to device
   cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
   cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
   cudaMemcpy(d_C, gpuRef, nBytes, cudaMemcpyHostToDevice);
   // invoke kernel at host side
   dim3 block(nElem);
   dim3 grid(1);
   sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C, nElem);
   printf("Execution configure <<<%d, %d>>>\n", grid.x, block.x);
   // copy kernel result back to host side
   cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);
   // free device global memory
   cudaFree(d_A);
   cudaFree(d_B);
   cudaFree(d_C);
   // free host memory
   free(h_A);
   free(h_B);
   free(gpuRef);
   cudaDeviceReset();
   
   return (0);
}