cuda编程笔记3(理解共享内存和线程同步使用)

it2023-11-04  75

目录

共享内存和线程同步计算

共享内存和线程同步计算

功能:对于长度为10的数组,用10个线程同步计算当前元素之前所有元素的平均值。

#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Problem size: one thread per element; thread i computes the running
// average of elements 0..i.
#define N 10

// Abort with file/line and the CUDA error string if a runtime call fails.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Kernel: overwrite d_a[i] with the average of d_a[0..i].
//
// Expected launch: a single block; blockDim.x may be any size — threads with
// index >= N do nothing (the original indexed sh_arr unguarded, so launching
// with more than N threads corrupted memory).
__global__ void gpu_shared_memory(float *d_a)
{
    int index = threadIdx.x;

    // Stage the input in on-chip shared memory so every thread's prefix-sum
    // loop reads fast block-local storage instead of global memory.
    __shared__ float sh_arr[N];
    if (index < N) {
        sh_arr[index] = d_a[index];
    }
    // Barrier sits OUTSIDE the guard so all threads of the block reach it;
    // after this point every write to sh_arr is visible to every thread.
    __syncthreads();

    if (index < N) {
        float sum = 0.0f;
        for (int i = 0; i <= index; i++) {
            sum += sh_arr[i];
            // %.2f (two decimals) — the original's "%2f" was a broken
            // format spec (minimum field width 2, default precision).
            printf("i : %d , thread_Id: %d , sum: %.2f\n", i, index, sum);
        }
        // Thread `index` summed index+1 elements.
        float average = sum / (index + 1.0f);
        d_a[index] = average;
        // NOTE: the original also did `sh_arr[index] = average;` here.
        // That store raced with slower threads still summing sh_arr (no
        // barrier after it) and the value was never read again, so the
        // dead, racy write has been removed.
    }
}

int main(int argc, char **argv)
{
    float h_a[N];       // host input/output buffer
    float *d_a = NULL;  // device copy of the array

    for (int i = 0; i < N; i++) {
        h_a[i] = 2.0f * i;
    }

    // Allocate device global memory and upload the input.
    CUDA_CHECK(cudaMalloc((void **)&d_a, sizeof(float) * N));
    CUDA_CHECK(cudaMemcpy(d_a, h_a, sizeof(float) * N,
                          cudaMemcpyHostToDevice));

    gpu_shared_memory<<<1, N>>>(d_a);
    CUDA_CHECK(cudaGetLastError());       // catches bad launch configuration
    CUDA_CHECK(cudaDeviceSynchronize());  // catches in-kernel faults

    // Download the results and release the allocation (the original
    // leaked d_a — cudaFree was never called).
    CUDA_CHECK(cudaMemcpy(h_a, d_a, sizeof(float) * N,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(d_a));

    printf("Use of Shared Memory on GPU: \n");
    for (int i = 0; i < N; i++) {
        printf("The running average after %d element is %f \n", i, h_a[i]);
    }
    return 0;
}

结果如下:

如上图结果所示,在将GPU上全局内存数组中的值全部赋给共享内存并经过__syncthreads()同步后,10个线程同时开始for循环:线程0的for循环执行1次后就执行循环下面的求均值,线程1的for循环执行2次后再求均值,依此类推;每个线程都从i=0开始累加。

作者: LEDyexu 博客: https://blog.csdn.net/LEDyexu 更新ing…
最新回复(0)