-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkernel.cu
More file actions
121 lines (99 loc) · 3.33 KB
/
kernel.cu
File metadata and controls
121 lines (99 loc) · 3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#include <iostream>
#include <nvtx3/nvToolsExt.h>
// Structure for the vector addition result.
typedef struct
{
double amount;
double time;
} VECTOR_ADD_RESULT, *PVECTOR_ADD_RESULT;
/**
* @brief CUDA kernel for adding two vectors element-wise.
*
* This kernel function performs element-wise addition of two vectors and stores the result in a third vector.
*
* @param a Pointer to the first input vector.
* @param b Pointer to the second input vector.
* @param c Pointer to the output vector.
* @param n Number of elements in the vectors.
*/
__global__ void VectorAddKernel(double *a, double *b, double *c, int n)
{
// Calculate our global thread id.
int id = blockIdx.x * blockDim.x + threadIdx.x;
// We might run with more threads than elements, so we need to make sure we don't do any work outside of our data.
if (id < n)
c[id] = a[id] + b[id];
}
/**
* Calculates the sum of two vectors using CUDA.
*
* @param seed The seed value for random number generation.
* @return A pointer to a VECTOR_ADD_RESULT struct containing the result of the vector addition.
*/
__declspec(dllexport) PVECTOR_ADD_RESULT __cdecl VectorAdd(unsigned int seed)
{
nvtxRangePushA("VectorAdd");
nvtxMark("InitializeReturnStruct");
PVECTOR_ADD_RESULT result = (PVECTOR_ADD_RESULT)malloc(sizeof(VECTOR_ADD_RESULT));
result->amount = 0;
result->time = 123.456;
nvtxMark("SetRandomSeed");
srand(seed);
// Number of elements in each vector.
int n = 500000;
// Host memory pointers for the input and output vectors.
double *h_a, *h_b, *h_c;
// Device memory pointers for the input and output vectors.
double *d_a, *d_b, *d_c;
// Number of bytes to allocate.
size_t bytes = n * sizeof(double);
// Allocate memory on the host.
nvtxRangePushA("HostMemAlloc");
h_a = (double *)malloc(bytes);
h_b = (double *)malloc(bytes);
h_c = (double *)malloc(bytes);
nvtxRangePop();
// Allocate memory on the device.
nvtxRangePushA("DeviceMemAlloc");
cudaMalloc(&d_a, bytes);
cudaMalloc(&d_b, bytes);
cudaMalloc(&d_c, bytes);
nvtxRangePop();
// Initialize vectors on host.
int i, x;
for (i = 0; i < n; i++)
{
x = rand();
h_a[i] = sin(i) * sin(x) + x;
h_b[i] = cos(i) * cos(i) - x;
}
nvtxRangePushA("CopyVectorsToDevice");
cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice);
nvtxRangePop();
// Calculate the number of thread blocks.
int blockSize, gridSize;
blockSize = 5000;
gridSize = (int)ceil((float)n / blockSize);
nvtxRangePushA("KernelExecution");
VectorAddKernel<<<gridSize, blockSize>>>(d_a, d_b, d_c, n);
nvtxRangePop();
nvtxRangePushA("CopyResultBackToHost");
cudaMemcpy(h_c, d_c, bytes, cudaMemcpyDeviceToHost);
nvtxRangePop();
// Consolidate the result.
double sum = 0;
for (i = 0; i < n; i++)
sum += h_c[i] + 1;
// Set the result on the result struct.
result->amount = 2 + sum / n;
nvtxMark("FreeBothDeviceAndHostMemory");
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_a);
free(h_b);
free(h_c);
nvtxRangePop();
return result;
}