CUDA上的自旋锁(SpinLock)实现问题

最近在研究CUDA。其中有一个需求是要对一段代码加锁。CUDA里只提供了现成的对单个变量进行操作的原子操作库[1]，而没有提供对一段代码加锁的机制，于是我就想怎么改造一下这个原子库。

一开始自己写了一段代码，差不多是这个样子的[2]。

1
2
3
4
5
6

__device__ inline void lock(int* p_mutex ){
while(atomicCAS(p_mutex, 0, 1) != 0); // compare and swap
}
__device__ inline void unlock(int* p_mutex ){
atomicExch(p_mutex, 0);// exchange
}

调用了之后发现从加锁的循环里压根就跑不出来。自己想了半天死活也搞不懂。然后从StackOverFlow上查到资料。发现这个和CUDA的Warp有关[3]。每个Warp由32个线程组成，它们必须同时执行相同的指令。如果用上述代码实现自旋锁。假定Warp中有一个Thread获得了锁，需要向下执行才能够释放锁。但是其他Thread却没得到锁，因此整个Warp就不能向下执行，始终卡在while循环里形成死锁。这种死锁叫SIMD死锁(SIMD DeadLock)。

之后翻了一大堆文献和网站，里面成堆的人都是实现同上，没有一个能用的。最后翻到了这篇文章[4]，解决了问题。整理出的自旋锁的模板如下：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20

/*
!!! Strictly Keep to the Following Code Format Or Cause DeadLock !!!
*/
__global__ void SpinLockFrameWork() {
//SpinLock Template
int idx = threadIdx.x;
__shared__ int lock_var;
if (idx == 0)lock_var = 0;
__syncthreads();

bool loopFlag = false;
do {
if ((loopFlag = atomicCAS(&lock_var, 0, 1) == 0)) {
//Critical Section Code Here
}
__threadfence(); //Or __threadfence_block(), __threadfence_system() according to your Memory Fence demand
if (loopFlag)atomicExch(&lock_var, 0);
} while (!loopFlag);

}

这里面巧妙地用了一个Do-While循环解决了SIMD DeadLock问题。将加锁和解锁都放在循环里面，使得CUDA引擎能够绕开SIMD死锁。

还有一个问题是__threadfence()，这个东西是用来解决内存一致性问题的。对于多线程机器来说，一个线程修改了内存，对于其他线程不一定可见。threadfence类函数可以根据你的不同种类需要，确保其他线程对你刚修改的内存可见。代码中三个线程内存一致性同步函数的具体用法可以查询CUDA Manual。加上这个之后，在Critical Section里面做的就可以不是原子操作了。

下面是自旋锁的测试代码：

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

#include <HostMemory.h>
#include <DeviceMemory.cuh>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cuda_device_runtime_api.h>

/*
!!! Strictly Keep to the Following Code Format Or Cause DeadLock !!!
*/
__global__ void SpinLockFrameWork() {
//SpinLock Template
int idx = threadIdx.x;
__shared__ int lock_var;
if (idx == 0)lock_var = 0;
__syncthreads();

bool loopFlag = false;
do {
if ((loopFlag = atomicCAS(&lock_var, 0, 1) == 0)) {
//Critical Section Code Here
}
__threadfence(); //Or __threadfence_block(), __threadfence_system() According to your Memory Fence demand
if (loopFlag)atomicExch(&lock_var, 0);
} while (!loopFlag);

}

__global__ void testSpinLock(int* arr) {
//Utilize SpinLock to accumulate
int idx = threadIdx.x;
__shared__ int lock_var;
if (idx == 0)lock_var = 0;
__syncthreads();

//printf("Before Critical: %d\n", idx);

bool loopFlag = false;
do {
//A Do Loop well prevent from CUDA SIMD Deadlock
if ((loopFlag = atomicCAS(&lock_var, 0, 1) == 0)) {
//Start of Critical Section
//printf("Entered Critical: %d\n", idx);
arr[0] += 1;
//End of Critical Section
}
__threadfence_block();
if (loopFlag)atomicExch(&lock_var, 0);
} while (!loopFlag);

//printf("Leaved Critical: %d\n", idx);
}

__global__ void testNoSpinLock(int* arr) {git push
arr[0] += 1;
__threadfence_block();
}
int main() {
//Self-Defined Template Class DeviceMemory & HostMemory
DeviceMemory<int> dm(1);
HostMemory<int> hm(1);
hm[0] = 0;
//Overloaded cudaMemcpy
cudaMemcpy(dm, hm);
testSpinLock << <1, 512 >> > (dm());
cudaMemcpy(hm, dm);
std::cout << "SpinLock:" << hm[0] << std::endl;
hm[0] = 0;
cudaMemcpy(dm, hm);
testNoSpinLock << <1, 512 >> > (dm());
cudaMemcpy(hm, dm);
std::cout << "NoSpinLock:" << hm[0] << std::endl;
}

这个测试代码使用GPU的512个线程，使用自旋锁，向同一块内存累加1。最后的结果应该是512。请注意，在关键区结束之后，必须加上__threadfence_block()来保持同一block内thread内存一致性。否则其他线程就会取到修改前的值或者缓存，导致结果错误。我就迷惑了好半天，经Foobar群友提醒才知道还有Memory Fence这东西。非常感谢 Foobar 院计算机系统结构学部专家猴哥！

参考文献：
[1]https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions
[2]Fine-grained synchronizations and dataflow programming on GPUs, Li A,Van Den Braak G,Corporaal H et al, Proceedings of the International Conference on Supercomputing (2015) 2015-June 109-118, DOI: 10.1145/2751205.2751232
[3]https://stackoverflow.com/questions/2021019/implementing-a-critical-section-in-cuda#comment74082158_2021173
[4]https://stackoverflow.com/questions/21341495/cuda-mutex-and-atomiccas

杰克部落

梦想本不会发光，发光的是追梦的你

CUDA上的自旋锁(SpinLock)实现问题

《CUDA上的自旋锁(SpinLock)实现问题》有4个想法

发表回复

2024 年 10 月
一	二	三	四	五	六	日
	1	2	3	4	5	6
7	8	9	10	11	12	13
14	15	16	17	18	19	20
21	22	23	24	25	26	27
28	29	30	31