1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
| #include <cuda_runtime.h>
__global__ void addVectors(float *a, float *b, float *c, int n) { int i = threadIdx.x + blockIdx.x * blockDim.x; if (i < n) c[i] = a[i] + b[i]; }
int main() { int n = 1024; float *h_a, *h_b, *h_c; float *d_a, *d_b, *d_c;
cudaMalloc(&d_a, n * sizeof(float)); cudaMalloc(&d_b, n * sizeof(float)); cudaMalloc(&d_c, n * sizeof(float));
cudaMemcpy(d_a, h_a, n * sizeof(float), cudaMemcpyHostToDevice); cudaMemcpy(d_b, h_b, n * sizeof(float), cudaMemcpyHostToDevice);
addVectors<<<ceil(n/256.0), 256>>>(d_a, d_b, d_c, n);
cudaMemcpy(h_c, d_c, n * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(d_a); cudaFree(d_b); cudaFree(d_c); return 0; }
|