East test_socket deadlock troubleshooting

在封装完Socket的常用函数后,需要测试其正确性。
测试中发现,线程在打印了服务器的回复之后就一直挂起、没有响应,这篇博客用来记录排查此问题的过程。

test_socket.cc

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/*
* @Author: Xudong0722
* @Date: 2025-05-11 18:26:13
* @Last Modified by: Xudong0722
* @Last Modified time: 2025-05-11 18:44:57
*/

#include "../East/include/Elog.h"
#include "../East/include/IOManager.h"
#include "../East/include/Socket.h"

// File-local logger used by every log statement in this test; obtained
// through ELOG_NAME with the name "root".
static East::Logger::sptr g_logger = ELOG_NAME("root");
/**
 * Exercise East::Socket end to end: resolve www.baidu.com, open a TCP socket,
 * connect to port 80, send a minimal HTTP/1.0 request and log the first chunk
 * of the reply (at most 4096 bytes).
 *
 * Every failure is logged and aborts the test early. The function takes no
 * arguments and returns nothing so it can be handed directly to
 * IOManager::schedule.
 */
void test_socket() {
  East::IPAddress::sptr addr =
      East::Address::LookupAnyIPAddress("www.baidu.com");
  if (nullptr == addr) {
    ELOG_ERROR(g_logger) << "LookupAnyIPAddress failed";
    return;
  }
  ELOG_INFO(g_logger) << "LookupAnyIPAddress: " << addr->toString();

  East::Socket::sptr sock = East::Socket::CreateTCP(addr);
  if (nullptr == sock) {
    ELOG_ERROR(g_logger) << "CreateTCP failed";
    return;
  }

  addr->setPort(80);
  if (!sock->connect(addr)) {
    ELOG_ERROR(g_logger) << "connect failed";
    return;
  }
  ELOG_INFO(g_logger) << "connect " << addr->toString() << " success";

  const char buf[] = "GET / HTTP/1.0\r\n\r\n";
  // sizeof(buf) counts the terminating '\0' of the literal; subtract it so
  // only the request text goes on the wire.
  int res = sock->send(buf, sizeof(buf) - 1);
  if (res <= 0) {
    ELOG_ERROR(g_logger) << "send failed";
    return;
  }

  std::string rcv_buf;
  rcv_buf.resize(4096);
  res = sock->recv(&rcv_buf[0], rcv_buf.size());
  if (res <= 0) {
    ELOG_ERROR(g_logger) << "recv failed";
    return;
  }
  // Trim the buffer to the number of bytes actually received before logging.
  rcv_buf.resize(res);
  ELOG_INFO(g_logger) << "recv: " << rcv_buf;
}

int main() {
East::IOManager iom;
iom.schedule(test_socket);
iom.stop();
ELOG_INFO(g_logger) << "test_socket finished";
return 0;
}

程序在析构Socket时线程阻塞, 查看调用栈:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
__futex_abstimed_wait_common (cancel=false, private=0, abstime=0x0, clockid=0, expected=2, futex_word=0x7ffff7fba670 <East::Singleton<East::FdManager, void, 0>::GetInst()::t+16>) at ./nptl/futex-internal.c:103
103 ./nptl/futex-internal.c: No such file or directory.
(gdb) bt
#0 __futex_abstimed_wait_common (cancel=false, private=0, abstime=0x0, clockid=0, expected=2, futex_word=0x7ffff7fba670 <East::Singleton<East::FdManager, void, 0>::GetInst()::t+16>) at ./nptl/futex-internal.c:103
#1 __GI___futex_abstimed_wait64 (futex_word=futex_word@entry=0x7ffff7fba670 <East::Singleton<East::FdManager, void, 0>::GetInst()::t+16>, expected=expected@entry=2, clockid=clockid@entry=0, abstime=abstime@entry=0x0, private=private@entry=0) at ./nptl/futex-internal.c:128
#2 0x00007ffff79ee315 in __pthread_rwlock_wrlock_full64 (abstime=0x0, clockid=0, rwlock=0x7ffff7fba668 <East::Singleton<East::FdManager, void, 0>::GetInst()::t+8>) at ./nptl/pthread_rwlock_common.c:829
#3 ___pthread_rwlock_wrlock (rwlock=0x7ffff7fba668 <East::Singleton<East::FdManager, void, 0>::GetInst()::t+8>) at ./nptl/pthread_rwlock_wrlock.c:26
#4 0x00007ffff7efbf8c in East::RWLock::wrlock (this=0x7ffff7fba668 <East::Singleton<East::FdManager, void, 0>::GetInst()::t+8>) at /home/elvis/East/East/include/Mutex.h:122
#5 0x00007ffff7efcd36 in East::WriteScopedLock<East::RWLock>::lock (this=0x7ffff7663bb0) at /home/elvis/East/East/include/Mutex.h:93
#6 0x00007ffff7efc449 in East::WriteScopedLock<East::RWLock>::WriteScopedLock (this=0x7ffff7663bb0, lock=...) at /home/elvis/East/East/include/Mutex.h:87
#7 0x00007ffff7efbe9a in East::FdManager::deleteFd (this=0x7ffff7fba660 <East::Singleton<East::FdManager, void, 0>::GetInst()::t>, fd=6) at /home/elvis/East/East/src/FdManager.cc:105
#8 0x00007ffff7f0e899 in close (fd=6) at /home/elvis/East/East/src/Hook.cc:420
#9 0x00007ffff7f5f628 in East::Socket::close (this=0x555555666cf0) at /home/elvis/East/East/src/Socket.cc:226
#10 0x00007ffff7f5d572 in East::Socket::~Socket (this=0x555555666cf0, __in_chrg=<optimized out>) at /home/elvis/East/East/src/Socket.cc:60
#11 0x00007ffff7f64cec in __gnu_cxx::new_allocator<East::Socket>::destroy<East::Socket> (this=0x555555666cf0, __p=0x555555666cf0) at /usr/include/c++/11/ext/new_allocator.h:168
#12 0x00007ffff7f64c7b in std::allocator_traits<std::allocator<East::Socket> >::destroy<East::Socket> (__a=..., __p=0x555555666cf0) at /usr/include/c++/11/bits/alloc_traits.h:535
#13 0x00007ffff7f64a81 in std::_Sp_counted_ptr_inplace<East::Socket, std::allocator<East::Socket>, (__gnu_cxx::_Lock_policy)2>::_M_dispose (this=0x555555666ce0) at /usr/include/c++/11/bits/shared_ptr_base.h:528
#14 0x00005555555d2133 in std::_Sp_counted_base<(__gnu_cxx::_Lock_policy)2>::_M_release (this=0x555555666ce0) at /usr/include/c++/11/bits/shared_ptr_base.h:168
#15 0x00005555555d19e7 in std::__shared_count<(__gnu_cxx::_Lock_policy)2>::~__shared_count (this=0x7ffff7663e08, __in_chrg=<optimized out>) at /usr/include/c++/11/bits/shared_ptr_base.h:705
#16 0x00005555555d18a6 in std::__shared_ptr<East::Socket, (__gnu_cxx::_Lock_policy)2>::~__shared_ptr (this=0x7ffff7663e00, __in_chrg=<optimized out>) at /usr/include/c++/11/bits/shared_ptr_base.h:1154
#17 0x00005555555d18c6 in std::shared_ptr<East::Socket>::~shared_ptr (this=0x7ffff7663e00, __in_chrg=<optimized out>) at /usr/include/c++/11/bits/shared_ptr.h:122
#18 0x00005555555d0da2 in test_socket () at /home/elvis/East/tests/test_socket.cc:51
#19 0x00005555555d436d in std::__invoke_impl<void, void (*&)()> (__f=@0x555555666130: 0x5555555d0219 <test_socket()>) at /usr/include/c++/11/bits/invoke.h:61
#20 0x00005555555d3bf2 in std::__invoke_r<void, void (*&)()> (__fn=@0x555555666130: 0x5555555d0219 <test_socket()>) at /usr/include/c++/11/bits/invoke.h:111

发现是读写锁导致的问题, 我们看一下最后的几个函数分别是什么:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// Look up the FileDescriptor record stored for `fd`.
//
// Returns the existing entry when there is one. When there is none, returns
// nullptr unless `create_when_notfound` is true, in which case a new
// FileDescriptor is created, stored in m_fds (growing it if needed) and
// returned. A negative `fd` always yields nullptr.
//
// Locking: the lookup runs under a read guard on m_mutex; the creation path
// then takes a write guard on the same mutex.
FileDescriptor::sptr FdManager::getFd(int fd, bool create_when_notfound) {
if (fd < 0)
return nullptr;

// Phase 1: read-only lookup. The read guard is released at the end of this
// scope, before the write guard below is taken.
{
RWMutexType::RLockGuard rlock(m_mutex);
if (fd >= (int)m_fds.size()) {
if (!create_when_notfound) {
return nullptr;
}
} else {
if (nullptr == m_fds.at(fd)) {
if (!create_when_notfound) {
return nullptr;
}
} else {
return m_fds.at(fd);
}
}
}
// Phase 2: create and register a new entry.
// NOTE(review): the slot is not re-checked after the write guard is taken, so
// an entry stored by another thread between the two phases would be
// overwritten here.
// NOTE(review): the FileDescriptor constructor runs while the write guard is
// held. If construction re-enters getFd on the same thread (for example via a
// hooked fcntl), that thread would request a read lock on a mutex it already
// holds for writing — verify.
RWMutexType::WLockGuard wlock(m_mutex);
auto new_fd = std::make_shared<FileDescriptor>(fd);
//RWMutexType::WLockGuard wlock(m_mutex);

// Grow the table when `fd` does not fit yet.
// NOTE(review): fd * 1.5 truncates to 0 for fd == 0 and to 1 for fd == 1,
// which is not larger than fd; if m_fds can ever hold fewer than two entries,
// the indexing below would be out of range. `tmp` already starts as a copy of
// m_fds, so the copy() call looks redundant — confirm.
if((int)m_fds.size() <= fd) {
auto tmp = m_fds;
tmp.resize(fd * 1.5); //now, fd * 1.5 > m_fds.size()
copy(m_fds.begin(), m_fds.end(), tmp.begin());
m_fds.swap(tmp);
}

m_fds[fd] = std::move(new_fd);
return m_fds[fd];
}

/**
 * Forget the FileDescriptor record stored for `fd`.
 *
 * Negative descriptors and descriptors beyond the end of m_fds are ignored.
 * The slot is cleared while a write guard on m_mutex is held.
 */
void FdManager::deleteFd(int fd) {
  if (fd >= 0) {
    RWMutexType::WLockGuard guard(m_mutex);
    const bool in_range = fd < (int)m_fds.size();
    if (in_range) {
      m_fds[fd].reset();
    }
  }
}

getFd:
在getFd中会先获取读锁,看m_fds中是否有我们的fd,有就直接返回。
如果没有且需要创建的话,我们尝试获取写锁,再构造一个FileDescriptor放入m_fds中。

deleteFd:
获取写锁,然后删除对应的fd

一些关键信息:

  • 1.通过查看m_mutex被引用的地方,一共就是这三处。
  • 2.单线程

通过以上信息可以判断出大概率就在getFd函数中发生了锁的可重入问题。

代码中使用的是pthread_rwlock_t,读写锁,也称之为共享互斥锁。
多个线程可以同时持有读锁,但写锁同一时间只有一个线程可以获取。

  • 一个线程持有写锁时,其他线程再尝试加写锁或读锁都会被阻塞
  • 一个线程持有读锁时,其他线程再尝试加写锁会被阻塞
  • 同一个线程在已经持有该锁的情况下再次加锁(例如持有写锁时再加读锁),POSIX不保证其行为:可能死锁,也可能直接返回EDEADLK

那我们看看getFd中加锁两处之间的代码,只有一个FileDescriptor的构造函数,看看实现,非常可疑:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// Build the bookkeeping record for descriptor `fd`: every flag starts out
// false, both timeouts start at -1, and init() is then called to probe the
// descriptor.
// NOTE(review): init() runs inside the constructor, so anything it calls
// executes in the caller's locking context — confirm this is safe for callers
// that construct the object while holding a lock.
FileDescriptor::FileDescriptor(int fd)
: m_init(false),
m_isSocket(false),
m_sysNonBlock(false),
m_userNonBlock(false),
m_closed(false),
m_fd(fd),
m_recvTimeout(-1),
m_sendTimeout(-1) {
init();
}

// Destructor with an empty body; it does not touch m_fd.
FileDescriptor::~FileDescriptor() {}

/**
 * Probe m_fd once and cache what was learned.
 *
 * Uses fstat to decide whether the descriptor is a socket. For a socket,
 * O_NONBLOCK is switched on if it is not already set, and m_sysNonBlock is
 * set to true. m_userNonBlock and m_closed are reset to false in every case.
 *
 * @return true if the object is initialised (fstat succeeded now or on an
 *         earlier call), false otherwise.
 */
bool FileDescriptor::init() {
  if (m_init) {
    return true;
  }

  struct stat st;
  const bool stat_ok = (0 == fstat(m_fd, &st));
  m_isSocket = stat_ok && S_ISSOCK(st.st_mode);
  m_init = stat_ok;

  if (!m_isSocket) {
    m_sysNonBlock = false;
  } else {
    // Per the author's notes, both fcntl calls below end up calling getFd.
    const int cur_flags = fcntl(m_fd, F_GETFL, 0);
    const bool already_nonblocking = (cur_flags & O_NONBLOCK) != 0;
    if (!already_nonblocking) {
      fcntl(m_fd, F_SETFL, cur_flags | O_NONBLOCK);
    }
    m_sysNonBlock = true;
  }

  m_userNonBlock = false;
  m_closed = false;
  return m_init;
}

我们来看一下调用关系:
某一次调用getFd,发现需要创建一个新的fd,我们尝试加写锁,然后构造一个FileDescriptor,
构造函数中调用了fcntl, fcntl又调用了getFd,这里面会尝试获取读锁,根据实际调试结果,
这里是在持有写锁的情况下再获取读锁,调用并没有阻塞,程序继续执行。接着返回结果,RAII对象析构时“释放读锁”——但此时读锁真的加上了吗?

最后执行到close函数中去,在deleteFd中死锁:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/**
 * Replacement for close(2).
 *
 * With hooking disabled this forwards straight to close_f. Otherwise, if the
 * FdManager has a record for `fd`, cancelAll(fd) is called on the current
 * IOManager (when there is one) and the record is removed, before close_f is
 * called.
 *
 * @return whatever close_f(fd) returns.
 */
int close(int fd) {
  if (East::is_hook_enable()) {
    // Held in a local so the record stays alive until this function returns,
    // even after deleteFd() clears the manager's slot.
    auto tracked = East::FdMgr::GetInst()->getFd(fd);
    if (tracked != nullptr) {
      auto scheduler = East::IOManager::GetThis();
      if (scheduler != nullptr) {
        scheduler->cancelAll(fd);
      }
      East::FdMgr::GetInst()->deleteFd(fd);
    }
  }
  return close_f(fd);
}

如果把getFd函数中构造FileDescriptor和获取写锁的顺序调换一下(先构造、后加锁),问题就可以解决。
或者将fcntl中调用getfd的代码注释掉,死锁现象也消失了。

那么基本就可以确定是getFd中的锁没有正确释放导致的问题。
我尝试将deleteFd中的锁换成读锁是没有问题的,说明getFd中的读锁没有正确释放(如果是写锁的话,deleteFd中替换成读锁还是会有问题)

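但是为什么我们使用了RAII,理想状况下读锁应该会被正确释放,却还是出了问题?
原因很可能在于:同一线程在持有写锁时再去加读锁,这一步并没有真正加锁成功。POSIX允许pthread_rwlock_rdlock在这种情况下返回EDEADLK,glibc的实现正是如此,所以它“不会阻塞”。如果RWLock::rdlock没有检查返回值,读锁的RAII对象就会认为自己持有锁,并在析构时调用pthread_rwlock_unlock;此时当前线程仍是写者,这次unlock实际释放的是外层的写锁。随后外层写锁的RAII对象析构时又unlock了一次,而解锁一把自己并未持有的锁属于未定义行为,读写锁的内部状态因此被破坏,之后deleteFd中的wrlock便永远等不到锁。也就是说,RAII本身没有问题,问题在于加锁失败被忽略了。
TODO: 在RWLock中检查pthread_rwlock_*的返回值来验证这一点。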

  • Copyright: Copyright is owned by the author. For commercial reprints, please contact the author for authorization. For non-commercial reprints, please indicate the source.
  • Copyrights © 2015-2025 Xudong0722
  • Visitors: | Views:

请我喝杯咖啡吧~

支付宝
微信