关于golang:手撸golang-GO与微服务-ChatServer之3-压测与诊断

49次阅读

共计 6647 个字符,预计需要花费 17 分钟才能阅读完成。

缘起

最近浏览 <<Go 微服务实战 >> (刘金亮, 2021.1)
本系列笔记拟采纳 golang 练习之

案例需要 (聊天服务器)

  • 用户能够连贯到服务器。
  • 用户能够设定本人的用户名。
  • 用户能够向服务器发送音讯,同时服务器也会向其余用户播送该音讯。

指标 (Day 3)

  • 在 Day 2 中, 已根本实现功能性编码, 测试也通过了, 明天尝试压测, 并诊断是否存在内存和 routine 透露

过程

  • 批改单元测试代码, 并发客户端减少到 50 个, 每个客户端随机 1 - 3 秒发送一条音讯
  • 增加 ILoggingService, 以更不便的采集全局日志
  • 应用 pprof 包的 api, 采集 heap 和 goroutine 的诊断日志
  • 应用 go tool pprof < 诊断日志 >, 察看是否存在内存和 routine 透露
  • 发现单元测试总是 failed, 总有若干 tChatClient, 服务端未打印敞开日志
  • 查看 routine 诊断日志, 发现存在 routine 透露
  • 在 tChatClient 中增加更具体的函数调用日志, 发现是 closeChan 通道写入阻塞导致
  • 修复 routine 透露的 bug, 加大并发客户端到 500 个, 并进行多轮测试, 察看是否存在内存和 routine 透露

单元测试

  • ChatServer_test.go
  • 增大客户端并发数, 并增加 pprof 诊断
  • 诊断 ILoggingService.AllLogs(), 判断所有客户端是否实现连贯 / 断开的闭环
package chat_server

import (
    "fmt"
    cs "learning/gooop/chat_server"
    "math/rand"
    "os"
    "runtime"
    "runtime/pprof"
    "strings"
    "sync"
    "testing"
    "time"
)

func Test_ChatServer(t *testing.T) {fnAssertTrue := func(b bool, msg string) {
        if !b {t.Fatal(msg)
        }
    }

    // create dump file
    memProfile, err := os.Create("/home/ioly/chat_server_mem.profile")
    if err != nil {t.Fatal(err)
    }
    cpuProfile, err := os.Create("/home/ioly/chat_server_cpu.profile")
    if err != nil {t.Fatal(err)
    }
    rtnProfile, err := os.Create("/home/ioly/chat_server_routine.profile")
    if err != nil {t.Fatal(err)
    }
    err = pprof.StartCPUProfile(cpuProfile)
    if err != nil {t.Fatal(err)
    }
    defer pprof.StopCPUProfile()

    port := 3333
    server := cs.NewChatServer()
    err = server.Open(port)
    if err != nil {t.Fatal(err)
    }

    clientCount := 500
    wg := &sync.WaitGroup{}

    rnd := rand.New(rand.NewSource(time.Now().UnixNano()))
    address := fmt.Sprintf("localhost:%v", port)
    for i := 0;i < clientCount;i++ {err, client := cs.DialChatClient(address)
        if err != nil {t.Fatal(err)
        }

        id := fmt.Sprintf("c%02d", i)
        //client.RecvHandler(func(client cs.IChatClient, msg cs.IMsg) {//    t.Logf("%v recv: %v\n", id, msg)
        //})

        wg.Add(1)
        go func() {client.SetName(id)
            client.Send(&cs.NameMsg{id})

            n := 0
            duration := rnd.Intn(3) + 1
            for range time.Tick(time.Duration(duration) * time.Second) {client.Send(&cs.ChatMsg{id, fmt.Sprintf("msg %02d from %v", n, id) })

                n++
                if n > 10 {break}
            }

            client.Close()
            wg.Done()}()}

    // wait and set status
    done := []bool{false}
    go func() {wg.Wait()
        done[0] = true
    }()

    // print passed seconds
    passedSeconds := 0
    for range time.Tick(time.Second) {
        passedSeconds++
        t.Logf("%v seconds passed", passedSeconds)

        if done[0] {break}
    }

    time.Sleep(5*time.Second)
    server.Close()

    // dump heap file
    runtime.GC()
    err = pprof.Lookup("heap").WriteTo(memProfile, 0)
    if err != nil {t.Fatal(err)
    }
    err = pprof.Lookup("goroutine").WriteTo(rtnProfile, 0)
    if err != nil {t.Fatal(err)
    }

    // check server logs
    logs := cs.Logging.AllLogs()
    fnHasLog := func(log string) bool {
        for _,it := range logs {if strings.Contains(it, log) {return true}
        }
        return false
    }
    for i := 0;i < clientCount;i++ {msg := fmt.Sprintf("tChatClient.postConnClosed, c%02d, serverFlag=false", i)
        fnAssertTrue(fnHasLog(msg), "expecting log:" + msg)

        msg = fmt.Sprintf("tChatServer.handleIncomingConn, clientCount=%v", i + 1)
        fnAssertTrue(fnHasLog(msg), "expecting log:" + msg)

        msg = fmt.Sprintf("tChatClient.postConnClosed, c%02d, serverFlag=true", i)
        fnAssertTrue(fnHasLog(msg), "expecting log:" + msg)

        msg = fmt.Sprintf("tChatServer.handleClientClosed, c%02d", i)
        fnAssertTrue(fnHasLog(msg), "expecting log:" + msg)
    }
}

测试输入

并发 500, 仅开端局部

tChatClient.beginWrite, write error, c496, serverFlag=true
tChatClient.beginWrite, <- closeChan, c496, serverFlag=true
tChatClient.postConnClosed, c496, serverFlag=true
tChatServer.handleClientClosed, c496
tChatServer.handleClientClosed, c496, clientCount=1
tChatClient.closeConn, c498, serverFlag=false
tChatClient.beginWrite, <- closeChan, c498, serverFlag=false
tChatClient.beginRead, read error, c498, serverFlag=false
tChatClient.postConnClosed, c498, serverFlag=false
tChatClient.beginRead, read error, c498, serverFlag=true
tChatClient.closeConn, c498, serverFlag=true
tChatClient.beginWrite, <- closeChan, c498, serverFlag=true
tChatClient.postConnClosed, c498, serverFlag=true
tChatServer.handleClientClosed, c498
tChatServer.handleClientClosed, c498, clientCount=0
    ChatServer_test.go:97: 34 seconds passed
--- PASS: Test_ChatServer (39.50s)
PASS
ok      command-line-arguments  39.535s

内存透露诊断

无泄漏

$ go tool pprof ~/chat_server_mem.profile 
File: chat_server.test
Type: inuse_space
Time: Mar 9, 2021 at 8:56am (CST)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top
Showing nodes accounting for 8461.35kB, 100% of 8461.35kB total
Showing top 10 nodes out of 15
      flat  flat%   sum%        cum   cum%
 5720.44kB 67.61% 67.61%  5720.44kB 67.61%  time.startTimer
 1184.27kB 14.00% 81.60%  1184.27kB 14.00%  runtime/pprof.StartCPUProfile
 1024.38kB 12.11% 93.71%  1024.38kB 12.11%  runtime.malg
  532.26kB  6.29%   100%   532.26kB  6.29%  runtime/pprof.(*profMap).lookup
         0     0%   100%  1184.27kB 14.00%  command-line-arguments.Test_ChatServer
         0     0%   100%  5720.44kB 67.61%  learning/gooop/chat_server.(*tChatClient).beginWrite
         0     0%   100%  1024.38kB 12.11%  runtime.mstart
         0     0%   100%  1024.38kB 12.11%  runtime.newproc.func1
         0     0%   100%  1024.38kB 12.11%  runtime.newproc1
         0     0%   100%  1024.38kB 12.11%  runtime.systemstack
(pprof) quit

routine 透露诊断 (修复前)

有透露时的诊断日志 (并发 50)

$ go tool pprof ~/chat_server_routine.profile 
File: chat_server.test
Type: goroutine
Time: Mar 9, 2021 at 9:12am (CST)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top
Showing nodes accounting for 4, 100% of 4 total
Showing top 10 nodes out of 21
      flat  flat%   sum%        cum   cum%
         3 75.00% 75.00%          3 75.00%  runtime.gopark
         1 25.00%   100%          1 25.00%  runtime/pprof.runtime_goroutineProfileWithLabels
         0     0%   100%          1 25.00%  command-line-arguments.Test_ChatServer
         0     0%   100%          1 25.00%  learning/gooop/chat_server.(*tChatClient).beginWrite
         0     0%   100%          1 25.00%  learning/gooop/chat_server.(*tChatClient).closeConn
         0     0%   100%          1 25.00%  main.main
         0     0%   100%          1 25.00%  runtime.chanrecv
         0     0%   100%          1 25.00%  runtime.chanrecv1
         0     0%   100%          1 25.00%  runtime.chansend
         0     0%   100%          1 25.00%  runtime.chansend1
(pprof) 

能够看到 closeConn 函数阻塞

func (me *tChatClient) closeConn() {if !atomic.CompareAndSwapInt32(&me.closeFlag, 0, 1) {return}

    Logging.Logf("tChatClient.closeConn, %v, serverFlag=%v", me.name, me.serverFlag)
    me.closeChan <- true
}

从代码看, closeConn 函数阻塞只有一种可能, 就是 closeChan 阻塞.
这个起因有点诡异, 因为曾经加了 atomic.CompareAndSwapInt32 爱护,
依然导致阻塞, 惟一起因是并发环境下, closeChan 尚未开始读取, 是无奈写入的.

修复 routine 透露

就是给导致写阻塞的 closeChan 加上缓冲, 大小等于 closeConn 函数的可能调用入口数量

func openChatClient(conn net.Conn, serverFlag bool) IChatClient {
    it := &tChatClient{
        conn: conn,
        openFlag: 0,
        closeFlag: 0,
        serverFlag: serverFlag,

        closeChan: make(chan bool, 3),    // 给 closeChan 增加缓冲区!!
        sendChan: make(chan IMsg, gMaxPendingSend),

        name: "anonymous",
        sendLogs: []IMsg{},
        dropLogs: []IMsg{},
        recvLogs: []IMsg{},
    }
    it.open()
    return it
}

routine 透露诊断 (修复后)

修复后, 加大并发到 500, 未发现显著 routine 透露 (不存在业务代码导致挂起的 routine)

$ go tool pprof ~/chat_server_routine.profile 
File: chat_server.test
Type: goroutine
Time: Mar 9, 2021 at 9:28am (CST)
Entering interactive mode (type "help" for commands, "o" for options)
(pprof) top
Showing nodes accounting for 3, 100% of 3 total
Showing top 10 nodes out of 17
      flat  flat%   sum%        cum   cum%
         2 66.67% 66.67%          2 66.67%  runtime.gopark
         1 33.33%   100%          1 33.33%  runtime/pprof.runtime_goroutineProfileWithLabels
         0     0%   100%          1 33.33%  command-line-arguments.Test_ChatServer
         0     0%   100%          1 33.33%  main.main
         0     0%   100%          1 33.33%  runtime.chanrecv
         0     0%   100%          1 33.33%  runtime.chanrecv1
         0     0%   100%          1 33.33%  runtime.main
         0     0%   100%          1 33.33%  runtime/pprof.(*Profile).WriteTo
         0     0%   100%          1 33.33%  runtime/pprof.profileWriter
         0     0%   100%          1 33.33%  runtime/pprof.writeGoroutine
(pprof) 

(end)

正文完
 0