cuda编程笔记3(理解共享内存和线程同步使用)

it2023-11-04  75

目录

共享内存和线程同步计算

共享内存和线程同步计算

功能:对于长度为10的数组,用10个线程同步计算当前元素之前所有元素的平均值。

#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Problem size: one thread per element; thread i computes the running
// average of elements 0..i.
#define N 10

// Abort with file/line and the CUDA error string if a runtime call fails.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Kernel: overwrite d_a[i] with the average of d_a[0..i].
//
// Expected launch: a single block; blockDim.x may be any size — threads with
// index >= N do nothing (the original indexed sh_arr unguarded, so launching
// with more than N threads corrupted memory).
__global__ void gpu_shared_memory(float *d_a)
{
    int index = threadIdx.x;

    // Stage the input in on-chip shared memory so every thread's prefix-sum
    // loop reads fast block-local storage instead of global memory.
    __shared__ float sh_arr[N];
    if (index < N) {
        sh_arr[index] = d_a[index];
    }
    // Barrier sits OUTSIDE the guard so all threads of the block reach it;
    // after this point every write to sh_arr is visible to every thread.
    __syncthreads();

    if (index < N) {
        float sum = 0.0f;
        for (int i = 0; i <= index; i++) {
            sum += sh_arr[i];
            // %.2f (two decimals) — the original's "%2f" was a broken
            // format spec (minimum field width 2, default precision).
            printf("i : %d , thread_Id: %d , sum: %.2f\n", i, index, sum);
        }
        // Thread `index` summed index+1 elements.
        float average = sum / (index + 1.0f);
        d_a[index] = average;
        // NOTE: the original also did `sh_arr[index] = average;` here.
        // That store raced with slower threads still summing sh_arr (no
        // barrier after it) and the value was never read again, so the
        // dead, racy write has been removed.
    }
}

int main(int argc, char **argv)
{
    float h_a[N];       // host input/output buffer
    float *d_a = NULL;  // device copy of the array

    for (int i = 0; i < N; i++) {
        h_a[i] = 2.0f * i;
    }

    // Allocate device global memory and upload the input.
    CUDA_CHECK(cudaMalloc((void **)&d_a, sizeof(float) * N));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, sizeof(float) * N,
                          cudaMemcpyHostToDevice));

    gpu_shared_memory<<<1, N>>>(d_a);
    CUDA_CHECK(cudaGetLastError());       // catches bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // catches in-kernel faults

    // Download the results and release the allocation (the original
    // leaked d_a — cudaFree was never called).
    CUDA_CHECK(cudaMemcpy(h_a, d_a, sizeof(float) * N,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_a));

    printf("Use of Shared Memory on GPU: \n");
    for (int i = 0; i < N; i++) {
        printf("The running average after %d element is %f \n", i, h_a[i]);
    }
    return 0;
}

结果如下:

如上图结果所示,在将GPU上全局内存数组中的值全部赋给共享内存并经过__syncthreads()同步后,10个线程同时开始for循环:线程0的for循环执行1次后就执行循环下面的求均值,线程1的for循环执行2次后再求均值,依此类推;每个线程都从i=0开始累加。

作者: LEDyexu 博客: https://blog.csdn.net/LEDyexu 更新ing…
最新回复(0)