Nsq原理分析二

27次阅读

共计 11848 个字符,预计需要花费 30 分钟才能阅读完成。

在上一篇文章中对 nsq 进行了简单的介绍,从 nsq 的 golang 的客户端代码分析了一下 nsq 的使用,这篇文章会分析 nsqd 的代码

Nsqd 代码分析

nsqd 做了什么

  • nsqd 接收对 topic、channel 的创建以及对消息的存储和分发
  • nsqd 向 nsqlookup 注册自己的服务信息,ip 和端口,向 nsqlookup 注册自己的元数据信息 (topic、channel),nsqd 也会向 nsqdlook 查询 topic、和 channel 信息


nsqadmin 是一个简单的管理界面,通过它可以查询 topic、channel、消费者等等一些基本信息,nsqadmin 是从 nsqlookup 中获取信息的,通过 nsqadmin 也可以创建 topic、channel,创建到了 nsqlookup 中,在 nsqlookup 中的内存中维护者,nsqd 会在某一个合适的时刻将这些信息拉回本地然后创建
nsqd 启动

func (n *NSQD) Main() error {ctx := &context{n}

    exitCh := make(chan error)
    var once sync.Once
    exitFunc := func(err error) {once.Do(func() {
            if err != nil {n.logf(LOG_FATAL, "%s", err)
            }
            exitCh <- err
        })
    }

    n.tcpServer.ctx = ctx
    // 启动 tcp 监听
    n.waitGroup.Wrap(func() {exitFunc(protocol.TCPServer(n.tcpListener, n.tcpServer, n.logf))
    })
    // 启动 http 监听
    httpServer := newHTTPServer(ctx, false, n.getOpts().TLSRequired == TLSRequired)
    n.waitGroup.Wrap(func() {exitFunc(http_api.Serve(n.httpListener, httpServer, "HTTP", n.logf))
    })

    if n.tlsConfig != nil && n.getOpts().HTTPSAddress != "" {httpsServer := newHTTPServer(ctx, true, true)
        n.waitGroup.Wrap(func() {exitFunc(http_api.Serve(n.httpsListener, httpsServer, "HTTPS", n.logf))
        })
    }
   // 队列扫描,处理超时、延迟等信息
    n.waitGroup.Wrap(n.queueScanLoop)
    // 向 nsqlookup 注册自己的元数据信息
    n.waitGroup.Wrap(n.lookupLoop)
    if n.getOpts().StatsdAddress != "" {n.waitGroup.Wrap(n.statsdLoop)
    }

    err := <-exitCh
    return err
}
func TCPServer(listener net.Listener, handler TCPHandler, logf lg.AppLogFunc) error {logf(lg.INFO, "TCP: listening on %s", listener.Addr())

    var wg sync.WaitGroup

    for {
       // 等待请求的到来
        clientConn, err := listener.Accept()
        if err != nil {if nerr, ok := err.(net.Error); ok && nerr.Temporary() {logf(lg.WARN, "temporary Accept() failure - %s", err)
                runtime.Gosched()
                continue
            }
            // theres no direct way to detect this error because it is not exposed
            if !strings.Contains(err.Error(), "use of closed network connection") {return fmt.Errorf("listener.Accept() error - %s", err)
            }
            break
        }

        wg.Add(1)
        // 每当到来一个请求都启动一个 goroutine 进行处理
        go func() {handler.Handle(clientConn)
            wg.Done()}()}

    // wait to return until all handler goroutines complete
    wg.Wait()

    logf(lg.INFO, "TCP: closing %s", listener.Addr())

    return nil
}
unc (p *tcpServer) Handle(clientConn net.Conn) {p.ctx.nsqd.logf(LOG_INFO, "TCP: new client(%s)", clientConn.RemoteAddr())

    // The client should initialize itself by sending a 4 byte sequence indicating
    // the version of the protocol that it intends to communicate, this will allow us
    // to gracefully upgrade the protocol away from text/line oriented to whatever...
    buf := make([]byte, 4)
    _, err := io.ReadFull(clientConn, buf)
    if err != nil {p.ctx.nsqd.logf(LOG_ERROR, "failed to read protocol version - %s", err)
        clientConn.Close()
        return
    }
    // 协商协议版本
    protocolMagic := string(buf)

    p.ctx.nsqd.logf(LOG_INFO, "CLIENT(%s): desired protocol magic'%s'",
        clientConn.RemoteAddr(), protocolMagic)

    var prot protocol.Protocol
    switch protocolMagic {
    case "V2":
        prot = &protocolV2{ctx: p.ctx}
    default:
        protocol.SendFramedResponse(clientConn, frameTypeError, []byte("E_BAD_PROTOCOL"))
        clientConn.Close()
        p.ctx.nsqd.logf(LOG_ERROR, "client(%s) bad protocol magic'%s'",
            clientConn.RemoteAddr(), protocolMagic)
        return
    }

    p.conns.Store(clientConn.RemoteAddr(), clientConn)
    // 开始一个死循环
    err = prot.IOLoop(clientConn)
    if err != nil {p.ctx.nsqd.logf(LOG_ERROR, "client(%s) - %s", clientConn.RemoteAddr(), err)
    }

    p.conns.Delete(clientConn.RemoteAddr())
}
func (p *protocolV2) IOLoop(conn net.Conn) error {
    var err error
    var line []byte
    var zeroTime time.Time

    clientID := atomic.AddInt64(&p.ctx.nsqd.clientIDSequence, 1)
    client := newClientV2(clientID, conn, p.ctx)
    p.ctx.nsqd.AddClient(client.ID, client)

    // synchronize the startup of messagePump in order
    // to guarantee that it gets a chance to initialize
    // goroutine local state derived from client attributes
    // and avoid a potential race with IDENTIFY (where a client
    // could have changed or disabled said attributes)
    messagePumpStartedChan := make(chan bool)
    go p.messagePump(client, messagePumpStartedChan)
    // 消息分发,向消费者发送消息
    <-messagePumpStartedChan

    for {
    // 设置 socket 读取超时,如果 consumer 未在指定的时间内发送过来,那么会断开连接,导致 consumer 退出
        if client.HeartbeatInterval > 0 {client.SetReadDeadline(time.Now().Add(client.HeartbeatInterval * 2))
        } else {client.SetReadDeadline(zeroTime)
        }

        // ReadSlice does not allocate new space for the data each request
        // ie. the returned slice is only valid until the next call to it
        // 读取生产者或者消费者发送过来的请求
        line, err = client.Reader.ReadSlice('\n')
        if err != nil {
            if err == io.EOF {err = nil} else {err = fmt.Errorf("failed to read command - %s", err)
            }
            break
        }

        // trim the '\n'
        line = line[:len(line)-1]
        // optionally trim the '\r'
        if len(line) > 0 && line[len(line)-1] == '\r' {line = line[:len(line)-1]
        }
        params := bytes.Split(line, separatorBytes)

        p.ctx.nsqd.logf(LOG_DEBUG, "PROTOCOL(V2): [%s] %s", client, params)

        var response []byte
        // 根据不同的命令执行不同的动作
        response, err = p.Exec(client, params)
        if err != nil {
            ctx := ""
            if parentErr := err.(protocol.ChildErr).Parent(); parentErr != nil {ctx = "-" + parentErr.Error()
            }
            p.ctx.nsqd.logf(LOG_ERROR, "[%s] - %s%s", client, err, ctx)

            sendErr := p.Send(client, frameTypeError, []byte(err.Error()))
            if sendErr != nil {p.ctx.nsqd.logf(LOG_ERROR, "[%s] - %s%s", client, sendErr, ctx)
                break
            }

            // errors of type FatalClientErr should forceably close the connection
            if _, ok := err.(*protocol.FatalClientErr); ok {break}
            continue
        }

        if response != nil {err = p.Send(client, frameTypeResponse, response)
            if err != nil {err = fmt.Errorf("failed to send response - %s", err)
                break
            }
        }
    }

    p.ctx.nsqd.logf(LOG_INFO, "PROTOCOL(V2): [%s] exiting ioloop", client)
    conn.Close()
    close(client.ExitChan)
    if client.Channel != nil {client.Channel.RemoveClient(client.ID)
    }

    p.ctx.nsqd.RemoveClient(client.ID)
    return err
}

在继续向下看前,看一下生产者的 PUB 请求在 nsqd 中做了什么

func (p *protocolV2) PUB(client *clientV2, params [][]byte) ([]byte, error) {
    var err error

    if len(params) < 2 {return nil, protocol.NewFatalClientErr(nil, "E_INVALID", "PUB insufficient number of parameters")
    }

    topicName := string(params[1])
    if !protocol.IsValidTopicName(topicName) {
        return nil, protocol.NewFatalClientErr(nil, "E_BAD_TOPIC",
            fmt.Sprintf("PUB topic name %q is not valid", topicName))
    }

    bodyLen, err := readLen(client.Reader, client.lenSlice)
    if err != nil {return nil, protocol.NewFatalClientErr(err, "E_BAD_MESSAGE", "PUB failed to read message body size")
    }

    if bodyLen <= 0 {
        return nil, protocol.NewFatalClientErr(nil, "E_BAD_MESSAGE",
            fmt.Sprintf("PUB invalid message body size %d", bodyLen))
    }

    if int64(bodyLen) > p.ctx.nsqd.getOpts().MaxMsgSize {
        return nil, protocol.NewFatalClientErr(nil, "E_BAD_MESSAGE",
            fmt.Sprintf("PUB message too big %d > %d", bodyLen, p.ctx.nsqd.getOpts().MaxMsgSize))
    }

    messageBody := make([]byte, bodyLen)
    _, err = io.ReadFull(client.Reader, messageBody)
    if err != nil {return nil, protocol.NewFatalClientErr(err, "E_BAD_MESSAGE", "PUB failed to read message body")
    }

    if err := p.CheckAuth(client, "PUB", topicName, ""); err != nil {return nil, err}
    // topic 在 nsqd 中的创建的 lazy create,只有当某个生产者向该 topic 中发送消息时才会创建 topic,topic := p.ctx.nsqd.GetTopic(topicName)
    msg := NewMessage(topic.GenerateID(), messageBody)
    err = topic.PutMessage(msg)
    if err != nil {return nil, protocol.NewFatalClientErr(err, "E_PUB_FAILED", "PUB failed"+err.Error())
    }

    client.PublishedMessage(topicName, 1)

    return okBytes, nil
}
/ GetTopic performs a thread safe operation
// to return a pointer to a Topic object (potentially new)
func (n *NSQD) GetTopic(topicName string) *Topic {
    // most likely, we already have this topic, so try read lock first.
    n.RLock()
    // 当 topic 在 nsqd 中创建过时就直接返回该 topic
    t, ok := n.topicMap[topicName]
    n.RUnlock()
    if ok {return t}

    n.Lock()

    t, ok = n.topicMap[topicName]
    if ok {n.Unlock()
        return t
    }
    deleteCallback := func(t *Topic) {n.DeleteExistingTopic(t.name)
    }
    // 稍后看一下这个函数
    t = NewTopic(topicName, &context{n}, deleteCallback)
    n.topicMap[topicName] = t

    n.Unlock()

    n.logf(LOG_INFO, "TOPIC(%s): created", t.name)
    // topic is created but messagePump not yet started

    // if loading metadata at startup, no lookupd connections yet, topic started after load
    if atomic.LoadInt32(&n.isLoading) == 1 {return t}

    // if using lookupd, make a blocking call to get the topics, and immediately create them.
    // this makes sure that any message received is buffered to the right channels
    // 如果使用了 nsqlookup,那么从 nsqlookup 中查询该 topic 的 channel 信息,如果没有在 nsqd 中创建就创建出来
    lookupdHTTPAddrs := n.lookupdHTTPAddrs()
    if len(lookupdHTTPAddrs) > 0 {channelNames, err := n.ci.GetLookupdTopicChannels(t.name, lookupdHTTPAddrs)
        if err != nil {n.logf(LOG_WARN, "failed to query nsqlookupd for channels to pre-create for topic %s - %s", t.name, err)
        }
        for _, channelName := range channelNames {if strings.HasSuffix(channelName, "#ephemeral") {continue // do not create ephemeral channel with no consumer client}
            t.GetChannel(channelName)
        }
    } else if len(n.getOpts().NSQLookupdTCPAddresses) > 0 {n.logf(LOG_ERROR, "no available nsqlookupd to query for channels to pre-create for topic %s", t.name)
    }

    // now that all channels are added, start topic messagePump
    t.Start()
    return t
}
// Topic constructor
func NewTopic(topicName string, ctx *context, deleteCallback func(*Topic)) *Topic {
    t := &Topic{
        name:              topicName,
        channelMap:        make(map[string]*Channel),
        memoryMsgChan:     nil,
        startChan:         make(chan int, 1),
        exitChan:          make(chan int),
        channelUpdateChan: make(chan int),
        ctx:               ctx,
        paused:            0,
        pauseChan:         make(chan int),
        deleteCallback:    deleteCallback,
        idFactory:         NewGUIDFactory(ctx.nsqd.getOpts().ID),
    }
    // create mem-queue only if size > 0 (do not use unbuffered chan)
    if ctx.nsqd.getOpts().MemQueueSize > 0 {t.memoryMsgChan = make(chan *Message, ctx.nsqd.getOpts().MemQueueSize)
    }
    if strings.HasSuffix(topicName, "#ephemeral") {
        t.ephemeral = true
        t.backend = newDummyBackendQueue()} else {dqLogf := func(level diskqueue.LogLevel, f string, args ...interface{}) {opts := ctx.nsqd.getOpts()
            lg.Logf(opts.Logger, opts.LogLevel, lg.LogLevel(level), f, args...)
        }
        // 持久化的结构
        t.backend = diskqueue.New(
            topicName,
            ctx.nsqd.getOpts().DataPath,
            ctx.nsqd.getOpts().MaxBytesPerFile,
            int32(minValidMsgLength),
            int32(ctx.nsqd.getOpts().MaxMsgSize)+minValidMsgLength,
            ctx.nsqd.getOpts().SyncEvery,
            ctx.nsqd.getOpts().SyncTimeout,
            dqLogf,
        )
    }
   // topic 中也启动了一个 messagePump,在 protocolv2 中也启动了一个同名函数,前一个是为了向 consumer 推送消息,这个是向 topic 下的一个或者多个队列中发送消息
    t.waitGroup.Wrap(t.messagePump)
    // 通知持久化
    t.ctx.nsqd.Notify(t)

    return t
}

func (t *Topic) Start() {
    select {
    case t.startChan <- 1:
    default:
    }
}

看一下 nsqd 是如何向 nsqlookup 注册自己的元数据信息的,在 nsqd 启动时起了一个 goroutine lookuploop

func (n *NSQD) lookupLoop() {var lookupPeers []*lookupPeer
    var lookupAddrs []string
    connect := true

    hostname, err := os.Hostname()
    if err != nil {n.logf(LOG_FATAL, "failed to get hostname - %s", err)
        os.Exit(1)
    }

    // for announcements, lookupd determines the host automatically
    ticker := time.Tick(15 * time.Second)
    for {
        if connect {for _, host := range n.getOpts().NSQLookupdTCPAddresses {if in(host, lookupAddrs) {continue}
                n.logf(LOG_INFO, "LOOKUP(%s): adding peer", host)
                lookupPeer := newLookupPeer(host, n.getOpts().MaxBodySize, n.logf,
                    connectCallback(n, hostname))
                lookupPeer.Command(nil) // start the connection
                lookupPeers = append(lookupPeers, lookupPeer)
                lookupAddrs = append(lookupAddrs, host)
            }
            n.lookupPeers.Store(lookupPeers)
            connect = false
        }

        select {
        case <-ticker:
        // 向 nsqlookup 发送心跳信息
            // send a heartbeat and read a response (read detects closed conns)
            for _, lookupPeer := range lookupPeers {n.logf(LOG_DEBUG, "LOOKUPD(%s): sending heartbeat", lookupPeer)
                cmd := nsq.Ping()
                _, err := lookupPeer.Command(cmd)
                if err != nil {n.logf(LOG_ERROR, "LOOKUPD(%s): %s - %s", lookupPeer, cmd, err)
                }
            }
        case val := <-n.notifyChan:
            var cmd *nsq.Command
            var branch string

            switch val.(type) {
            // 注册 channel
            case *Channel:
                // notify all nsqlookupds that a new channel exists, or that it's removed
                branch = "channel"
                channel := val.(*Channel)
                if channel.Exiting() == true {cmd = nsq.UnRegister(channel.topicName, channel.name)
                } else {cmd = nsq.Register(channel.topicName, channel.name)
                }
            // 注册 topic
            case *Topic:
           
                // notify all nsqlookupds that a new topic exists, or that it's removed
                branch = "topic"
                topic := val.(*Topic)
                if topic.Exiting() == true {cmd = nsq.UnRegister(topic.name, "")
                } else {cmd = nsq.Register(topic.name, "")
                }
            }

            for _, lookupPeer := range lookupPeers {n.logf(LOG_INFO, "LOOKUPD(%s): %s %s", lookupPeer, branch, cmd)
                _, err := lookupPeer.Command(cmd)
                if err != nil {n.logf(LOG_ERROR, "LOOKUPD(%s): %s - %s", lookupPeer, cmd, err)
                }
            }
        case <-n.optsNotificationChan:
            var tmpPeers []*lookupPeer
            var tmpAddrs []string
            for _, lp := range lookupPeers {if in(lp.addr, n.getOpts().NSQLookupdTCPAddresses) {tmpPeers = append(tmpPeers, lp)
                    tmpAddrs = append(tmpAddrs, lp.addr)
                    continue
                }
                n.logf(LOG_INFO, "LOOKUP(%s): removing peer", lp)
                lp.Close()}
            lookupPeers = tmpPeers
            lookupAddrs = tmpAddrs
            connect = true
        case <-n.exitChan:
            goto exit
        }
    }

exit:
    n.logf(LOG_INFO, "LOOKUP: closing")
}

在 nsqd 启动 lookuploop 这个 goroutine 时还启动了另一 queueScanLoop goroutine,主要用来监控超时消息的处理。
总结一下

  • nsqd 启动时分别监听 tcp、http 端口
  • 启动 loopuploop goroutine 向 nsqlookup 注册自己的相关信息
  • 启动 queueScanLoop goroutine 对超时消息进行处理
  • 启动 statsdLoop goroutine 进行性能和 topic、channel 等一些参数进行统计
  • 当有 producer client 通过 PUB 命令接入进来时,nsqd 会情动一个单独的 goroutine 进行处理,此时会创建 topic、channel,topic 会启动一个 messagepump 的 goroutine,将消息发送给下面的各个 channel
  • 当有 consumer client 接入进来时,启动单独 goroutine 进行处理,会启动一个 messagepump goroutine 将消息发送给各个 consumer

注意,consumer 消费消息是有超时配置的,消费者的每一条消息要在超时范围内,要不然会导致一些问题。

正文完
 0