记录一次 C 语言调用 go 生成的动静库的踩坑过程
问题景象
因为某些非凡起因,须要在 C 语言中调用 go 语言生成的 so,原本挺顺利,所有都运行的很好。忽然某一天,不晓得怎么回事,再一个新程序中无奈失常运行了,看到的景象是程序无任何响应,相似于间接卡死了。应用 gdb 查看过程以后的信息,看到如下调用栈:
(gdb) source /root/go/src/runtime/runtime-gdb.py
Loading Go Runtime support.
(gdb) bt
#0 runtime.futex () at /usr/local/go/src/runtime/sys_linux_amd64.s:520
#1 0x00007fff23c550f6 in runtime.futexsleep (addr=0xfffffffffffffe00, val=0, ns=140733793710979) at /usr/local/go/src/runtime/os_linux.go:44
#2 0x00007fff23c35667 in runtime.notetsleep_internal (n=0x7fff23dc66e8 <runtime.work+232>, ns=-1) at /usr/local/go/src/runtime/lock_futex.go:183
#3 0x00007fff23c35785 in runtime.notetsleepg (n=0x7fff23dc66e8 <runtime.work+232>, ns=-1) at /usr/local/go/src/runtime/lock_futex.go:237
#4 0x00007fff23c41338 in runtime.gcBgMarkStartWorkers () at /usr/local/go/src/runtime/mgc.go:1126
#5 0x00007fff23c3fd92 in runtime.gcStart (trigger=...) at /usr/local/go/src/runtime/mgc.go:637
#6 0x00007fff23c36e3d in runtime.mallocgc (size=565248, typ=0x7fff23cfd160, needzero=true) at /usr/local/go/src/runtime/malloc.go:1174
#7 0x00007fff23c6dfaa in runtime.growslice (et=, old=..., cap=) at /usr/local/go/src/runtime/slice.go:267
#8 0x00007fff23c918ef in regexp/syntax.(*compiler).inst (c=0xc000044bf8, op=) at /usr/local/go/src/regexp/syntax/compile.go:164
#9 regexp/syntax.(*compiler).rune (c=0xc000044bf8, r=[]int32 = {...}, flags=212) at /usr/local/go/src/regexp/syntax/compile.go:273
#10 0x00007fff23c90f1e in regexp/syntax.(*compiler).compile (c=0xc000044bf8, re=0xc000190000) at /usr/local/go/src/regexp/syntax/compile.go:101
#11 0x00007fff23c8f805 in regexp/syntax.Compile (re=0xc000190000) at /usr/local/go/src/regexp/syntax/compile.go:74
#12 0x00007fff23ca4a9a in regexp.compile (expr=,
mode=, longest=false) at /usr/local/go/src/regexp/regexp.go:178
#13 0x00007fff23ca5591 in regexp.Compile (expr="") at /usr/local/go/src/regexp/regexp.go:133
#14 regexp.MustCompile (str="") at /usr/local/go/src/regexp/regexp.go:309
#15 0x00007fff23ca8e0a in main.CheckPostAddress (buf=, bufLen=) at /root/codes/di_rechk/rechk.go:540
#16 0x00007fff23ca9a88 in _cgoexp_67df0785bef2_CheckPostAddress (a=0x7fffffffc740) at _cgo_gotypes.go:162
#17 0x00007fff23c2e21a in runtime.cgocallbackg1 (fn=0x7fff23ca9a60 <_cgoexp_67df0785bef2_CheckPostAddress>, frame=0x7fffffffc740, ctxt=0)
at /usr/local/go/src/runtime/cgocall.go:306
#18 0x00007fff23c2dee9 in runtime.cgocallbackg (fn=0x7fff23ca9a60 <_cgoexp_67df0785bef2_CheckPostAddress>, frame=0x7fffffffc740, ctxt=0)
at /usr/local/go/src/runtime/cgocall.go:232
#19 0x00007fff23c83791 in runtime.cgocallbackg (fn=0x7fff23ca9a60 <_cgoexp_67df0785bef2_CheckPostAddress>, frame=0x7fffffffc740, ctxt=0)
at :1
#20 0x00007fff23c813f3 in runtime.cgocallback () at /usr/local/go/src/runtime/asm_amd64.s:915
#21 0x00007fff23c81601 in runtime.goexit () at /usr/local/go/src/runtime/asm_amd64.s:1581
#22 0x0000000000000000 in ?? ()
(gdb) info goroutines
17 syscall runtime.notetsleepg
2 waiting runtime.gopark
3 waiting runtime.gopark
4 waiting runtime.gopark
19 runnable runtime.gcBgMarkWorker
(gdb) goroutine 17 bt
#0 runtime.notetsleepg (n=0x7fff23dc66e8 <runtime.work+232>, ns=-1) at /usr/local/go/src/runtime/lock_futex.go:237
#1 0x00007fff23c41338 in runtime.gcBgMarkStartWorkers () at /usr/local/go/src/runtime/mgc.go:1126
#2 0x00007fff23c3fd92 in runtime.gcStart (trigger=...) at /usr/local/go/src/runtime/mgc.go:637
#3 0x00007fff23c36e3d in runtime.mallocgc (size=565248, typ=0x7fff23cfd160, needzero=true) at /usr/local/go/src/runtime/malloc.go:1174
#4 0x00007fff23c6dfaa in runtime.growslice (et=, old=..., cap=) at /usr/local/go/src/runtime/slice.go:267
#5 0x00007fff23c918ef in regexp/syntax.(*compiler).inst (c=0xc000044bf8, op=) at /usr/local/go/src/regexp/syntax/compile.go:164
#6 regexp/syntax.(*compiler).rune (c=0xc000044bf8, r=[]int32 = {...}, flags=212) at /usr/local/go/src/regexp/syntax/compile.go:273
#7 0x00007fff23c90f1e in regexp/syntax.(*compiler).compile (c=0xc000044bf8, re=0xc00022a000) at /usr/local/go/src/regexp/syntax/compile.go:101
#8 0x00007fff23c8f805 in regexp/syntax.Compile (re=0xc00022a000) at /usr/local/go/src/regexp/syntax/compile.go:74
#9 0x00007fff23ca4a9a in regexp.compile (expr=,
mode=, longest=false) at /usr/local/go/src/regexp/regexp.go:178
#10 0x00007fff23ca5591 in regexp.Compile (expr=""") at /usr/local/go/src/regexp/regexp.go:133
#11 regexp.MustCompile (str=""") at /usr/local/go/src/regexp/regexp.go:309
#12 0x00007fff23ca8e0a in main.CheckPostAddress (buf=, bufLen=) at /root/codes/di_rechk/rechk.go:540
#13 0x00007fff23ca9a88 in _cgoexp_67df0785bef2_CheckPostAddress (a=0x7fffffffc740) at _cgo_gotypes.go:162
#14 0x00007fff23c2e21a in runtime.cgocallbackg1 (fn=0x7fff23ca9a60 <_cgoexp_67df0785bef2_CheckPostAddress>, frame=0x7fffffffc740, ctxt=0)
at /usr/local/go/src/runtime/cgocall.go:306
#15 0x00007fff23c2dee9 in runtime.cgocallbackg (fn=0x7fff23ca9a60 <_cgoexp_67df0785bef2_CheckPostAddress>, frame=0x7fffffffc740, ctxt=0)
at /usr/local/go/src/runtime/cgocall.go:232
#16 0x00007fff23c83791 in runtime.cgocallbackg (fn=0x7fff23ca9a60 <_cgoexp_67df0785bef2_CheckPostAddress>, frame=0x7fffffffc740, ctxt=0)
at :1
#17 0x00007fff23c813f3 in runtime.cgocallback () at /usr/local/go/src/runtime/asm_amd64.s:915
#18 0x00007fff23c81601 in runtime.goexit () at /usr/local/go/src/runtime/asm_amd64.s:1581
#19 0x0000000000000000 in ?? ()
(gdb) goroutine 2 bt
#0 runtime.gopark (unlockf=, lock=, reason=, traceEv=, traceskip=)
at /usr/local/go/src/runtime/proc.go:367
#1 0x00007fff23c5b86d in runtime.goparkunlock (reason=, traceEv=, traceskip=, lock=)
at /usr/local/go/src/runtime/proc.go:372
#2 runtime.forcegchelper () at /usr/local/go/src/runtime/proc.go:306
#3 0x00007fff23c81601 in runtime.goexit () at /usr/local/go/src/runtime/asm_amd64.s:1581
#4 0x0000000000000000 in ?? ()
(gdb) goroutine 3 bt
#0 runtime.gopark (unlockf=, lock=, reason=, traceEv=, traceskip=)
at /usr/local/go/src/runtime/proc.go:367
#1 0x00007fff23c48ee8 in runtime.goparkunlock (reason=, traceEv=, traceskip=, lock=)
at /usr/local/go/src/runtime/proc.go:372
#2 runtime.bgsweep () at /usr/local/go/src/runtime/mgcsweep.go:163
#3 0x00007fff23c81601 in runtime.goexit () at /usr/local/go/src/runtime/asm_amd64.s:1581
#4 0x0000000000000000 in ?? ()
(gdb) goroutine 4 bt
#0 runtime.gopark (unlockf=, lock=, reason=, traceEv=, traceskip=)
at /usr/local/go/src/runtime/proc.go:367
#1 0x00007fff23c46fed in runtime.goparkunlock (reason=, traceEv=, traceskip=, lock=)
at /usr/local/go/src/runtime/proc.go:372
#2 runtime.bgscavenge () at /usr/local/go/src/runtime/mgcscavenge.go:265
#3 0x00007fff23c81601 in runtime.goexit () at /usr/local/go/src/runtime/asm_amd64.s:1581
#4 0x0000000000000000 in ?? ()
(gdb) goroutine 19 bt
#0 runtime.gcBgMarkWorker () at /usr/local/go/src/runtime/mgc.go:1166
#1 0x00007fff23c81601 in runtime.goexit () at /usr/local/go/src/runtime/asm_amd64.s:1581
#2 0x0000000000000000 in ?? ()
(gdb) info threads
Id Target Id Frame
1 Thread 0x7ffff7966b80 (LWP 2024707) "nginx" runtime.futex () at /usr/local/go/src/runtime/sys_linux_amd64.s:520
2 Thread 0x7ffef1ffb700 (LWP 2024717) "nginx" 0x00007ffff7a483bf in __GI___clock_nanosleep (clock_id=clock_id@entry=0, flags=flags@entry=0,
req=req@entry=0x7ffef1ffae80, rem=rem@entry=0x0) at ../sysdeps/unix/sysv/linux/clock_nanosleep.c:78
3 Thread 0x7ffef27fc700 (LWP 2024718) "ZMQbg/Reaper" 0x00007ffff7a8a5ce in epoll_wait (epfd=42, events=events@entry=0x7ffef27fb200,
maxevents=maxevents@entry=256, timeout=timeout@entry=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:30
4 Thread 0x7ffef2ffd700 (LWP 2024719) "ZMQbg/IO/0" 0x00007ffff7a8a5ce in epoll_wait (epfd=44, events=events@entry=0x7ffef2ffc200,
maxevents=maxevents@entry=256, timeout=timeout@entry=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:30
问题解决过程
因为对 go 不太熟悉,所以不晓得出了什么问题。go 自身的代码也比较简单,没有太简单的操作。而且之前的程序调用这些 go 代码生成的 so,也能整成运行。从网上查资料也是一头雾水,不晓得该怎么查。通过一段时间的搜寻,终于在 stackoverflow 中看到了一个相似问题。依照这个起因进行批改,果然批改好了。
问题起因总结
依照 stackoverflow 中的总结。起因是在程序启动的时候,如果调用了 go 生成的 so,那么 go 会调用一些函数建设 go 运行时环境,会创立一些线程。而我的新程序中,具体调用 go 函数的中央,是在子过程中调用的,而父过程 fork 子过程的时候,不会把 go 建设的那几个运行时线程复制到子过程中,即使在子过程中从新 dlopen go 生成的 so 也不会从新创立那几个运行时线程,所以导致 go 中那些波及到一些须要运行时程序干涉的代码(如 gc 等)无奈失常运行。而我的 go 代码中刚好触发了 GC,导致 go 无奈运行,始终处在死循环之中。
解决办法是:在 fork 之后运行 dlopen 关上 go 生成的 so,这样就能够了。
批改之后,程序运行的信息如下:
(gdb) info threads
Id Target Id Frame
* 1 Thread 0x7ffff7966b80 (LWP 2366880) "nginx" 0x00007ffff7a8a5ce in epoll_wait (epfd=31, events=0x555555d0af90, maxevents=512,
timeout=timeout@entry=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:30
2 Thread 0x7fff481fd700 (LWP 2366893) "nginx" 0x00007ffff7a483bf in __GI___clock_nanosleep (clock_id=clock_id@entry=0, flags=flags@entry=0,
req=req@entry=0x7fff481fce80, rem=rem@entry=0x0) at ../sysdeps/unix/sysv/linux/clock_nanosleep.c:78
3 Thread 0x7fff489fe700 (LWP 2366894) "ZMQbg/Reaper" 0x00007ffff7a8a5ce in epoll_wait (epfd=42, events=events@entry=0x7fff489fd200,
maxevents=maxevents@entry=256, timeout=timeout@entry=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:30
4 Thread 0x7fff491ff700 (LWP 2366895) "ZMQbg/IO/0" 0x00007ffff7a8a5ce in epoll_wait (epfd=44, events=events@entry=0x7fff491fe200,
maxevents=maxevents@entry=256, timeout=timeout@entry=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:30
5 Thread 0x7fff245cb700 (LWP 2368456) "nginx" runtime.futex () at /root/go/src/runtime/sys_linux_amd64.s:560
6 Thread 0x7fff20c34700 (LWP 2368457) "nginx" runtime.futex () at /root/go/src/runtime/sys_linux_amd64.s:560
7 Thread 0x7fff091a1700 (LWP 2368458) "nginx" runtime.futex () at /root/go/src/runtime/sys_linux_amd64.s:560
8 Thread 0x7fff089a0700 (LWP 2368459) "nginx" runtime.futex () at /root/go/src/runtime/sys_linux_amd64.s:560
9 Thread 0x7ffed7fff700 (LWP 2368460) "nginx" runtime.futex () at /root/go/src/runtime/sys_linux_amd64.s:560
10 Thread 0x7ffed77fe700 (LWP 2368461) "nginx" runtime.futex () at /root/go/src/runtime/sys_linux_amd64.s:560
11 Thread 0x7ffed6ffd700 (LWP 2368462) "nginx" runtime.futex () at /root/go/src/runtime/sys_linux_amd64.s:560
12 Thread 0x7ffed67fc700 (LWP 2368463) "nginx" runtime.futex () at /root/go/src/runtime/sys_linux_amd64.s:560
和有问题时的 info threads 信息比照,发现多了好几个 runtime.futex 的线程,这些线程即是运行 go 程序须要的线程。之前的程序之所以没出问题,是因为之前的程序没有调用 fork,所以不会呈现此问题。
额定问题
在有问题的时候,为什么触发 GC 的时候导致 go 程序无奈运行,始终处在死循环中?这个须要理解 go 的垃圾回收机制能力晓得起因,我自身对 go 不太熟悉,所以只是简略理解了一点,还无奈具体解释分明。