关于c:IPVS分析

68次阅读

共计 12057 个字符,预计需要花费 31 分钟才能阅读完成。

1.Netfilter & CONNTRACK & IPVS 结构图

2.IPVS

ipvs 只有 DNAT 和 de-DNAT 功能, 它独立于 iptables 和 conntrack, 实现了自己的一套连接跟踪表和 NAT 机制

2.1 ipvs 与 conntrack 的联系:

ipvs 仅仅在做 DNAT 后对 conntrack 连接进行更新, 避免回包因为没有记录而被丢弃
ipvs 在 TUNNEL 模式下, 会调用 nf_conntrack_confirm 函数对连接进行确认

2.2 ipvs 注册的钩子函数

/*
 * IPv4 netfilter hook registrations for IPVS.
 *
 * Each entry binds one handler (.hook) to one IPv4 hook point (.hooknum)
 * with a priority. The LOCAL_IN and LOCAL_OUT priorities are expressed
 * relative to NF_IP_PRI_NAT_SRC / NF_IP_PRI_NAT_DST; the FORWARD entries
 * use the literal values 99 and 100.
 *
 * NOTE(review): handlers sharing a hooknum are presumably invoked in
 * ascending .priority order (the per-entry comments below rely on that
 * ordering) - confirm against the netfilter core.
 */
static const struct nf_hook_ops ip_vs_ops[] = {
    /* After packet filtering, change source only for VS/NAT */
    {
        .hook       = ip_vs_reply4, /* ip_vs_out */
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_IN,
        .priority   = NF_IP_PRI_NAT_SRC - 2,
    },
    /* After packet filtering, forward packet through VS/DR, VS/TUN,
     * or VS/NAT(change destination), so that filtering rules can be
     * applied to IPVS. */
    {
        .hook       = ip_vs_remote_request4,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_IN,
        .priority   = NF_IP_PRI_NAT_SRC - 1,
    },
    /* Before ip_vs_in, change source only for VS/NAT */
    {
        .hook       = ip_vs_local_reply4,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_OUT,
        .priority   = NF_IP_PRI_NAT_DST + 1,
    },
    /* After mangle, schedule and forward local requests */
    {
        .hook       = ip_vs_local_request4,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_LOCAL_OUT,
        .priority   = NF_IP_PRI_NAT_DST + 2,
    },
    /* After packet filtering (but before ip_vs_out_icmp), catch icmp
     * destined for 0.0.0.0/0, which is for incoming IPVS connections */
    {
        .hook       = ip_vs_forward_icmp,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_FORWARD,
        .priority   = 99,
    },
    /* After packet filtering, change source only for VS/NAT */
    {
        .hook       = ip_vs_reply4,
        .pf     = NFPROTO_IPV4,
        .hooknum    = NF_INET_FORWARD,
        .priority   = 100,
    },
};

2.3 IPVS 中 tcp 协议的状态转换表

/*
 *  Timeout table[state]
 *
 *  One entry per IP_VS_TCP_S_* state, expressed as a multiple of HZ.
 *  The trailing comments give the same duration in wall-clock terms,
 *  assuming HZ is the number of ticks per second.
 */
static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
    [IP_VS_TCP_S_NONE]        = 2 * HZ,         /* 2 s    */
    [IP_VS_TCP_S_ESTABLISHED] = 15 * 60 * HZ,   /* 15 min */
    [IP_VS_TCP_S_SYN_SENT]    = 2 * 60 * HZ,    /* 2 min  */
    [IP_VS_TCP_S_SYN_RECV]    = 1 * 60 * HZ,    /* 1 min  */
    [IP_VS_TCP_S_FIN_WAIT]    = 2 * 60 * HZ,    /* 2 min  */
    [IP_VS_TCP_S_TIME_WAIT]   = 2 * 60 * HZ,    /* 2 min  */
    [IP_VS_TCP_S_CLOSE]       = 10 * HZ,        /* 10 s   */
    [IP_VS_TCP_S_CLOSE_WAIT]  = 60 * HZ,        /* 60 s   */
    [IP_VS_TCP_S_LAST_ACK]    = 30 * HZ,        /* 30 s   */
    [IP_VS_TCP_S_LISTEN]      = 2 * 60 * HZ,    /* 2 min  */
    [IP_VS_TCP_S_SYNACK]      = 120 * HZ,       /* 120 s  */
    [IP_VS_TCP_S_LAST]        = 2 * HZ,         /* 2 s    */
};

/*
 * Three-character aliases for the IP_VS_TCP_S_* states, so that each row
 * of the tcp_states[] transition table fits on one line.
 */
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK

/*
 * TCP state transition table.
 *
 * The table holds three groups of four rows. Within a group the rows are
 * labelled syn/fin/ack/rst (the TCP flag seen on the packet); the column is
 * the connection's current state, in the order given by the header comment
 * above each group; the cell is the next state.
 *
 * NOTE(review): how a group is selected (direction / "input-only" offset)
 * is decided by code outside this excerpt - confirm against the state
 * update routine.
 */
static struct tcp_states_t tcp_states[] = {
/*  INPUT: used from ip_vs_in */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA    current state   */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR}},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW}},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES}},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR}},


/*  OUTPUT: used from ip_vs_out */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA    current state   */
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR}},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW}},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES}},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL}},


/*  INPUT-ONLY: used from ip_vs_in when no reply packets have been seen */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR}},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW}},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES}},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL}},
};

2.4 ip_vs_in

为目的地为虚拟服务器的数据包确认连接, 并为连接分配后端, 然后转发数据包

  1. 数据包四元组匹配到了连接记录

    1. 连接不复用

      • 释放连接
    2. 连接复用

      • 复用连接
  2. 数据包四元组没有匹配到连接记录, 或者连接被释放

    1. 目的地是虚拟服务器

      • 分配后端, 新建连接
    2. 目的地不是虚拟服务器

      • 返回 ACCEPT
  3. 统计计数, 更新四层协议连接状态
  4. 执行 DNAT, 转发数据包到 LOCAL_OUT
  5. 更新连接保持时间

源码剖析:

/*
 *  Check if it's for virtual services, look it up,
 *  and send it on its way...
 */
static unsigned int
ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
    struct ip_vs_iphdr iph;
    struct ip_vs_protocol *pp;
    struct ip_vs_proto_data *pd;
    struct ip_vs_conn *cp;
    int ret, pkts;
    int conn_reuse_mode;
    struct sock *sk;

    /* 曾经被 ipvs 解决过则不解决 */
    /* Already marked as IPVS request or reply? */
    if (skb->ipvs_property)
        return NF_ACCEPT;


    /*
     *  Big tappo:
     *  - remote client: only PACKET_HOST
     *  - route: used for struct net when skb->dev is unset
     */
    if (unlikely((skb->pkt_type != PACKET_HOST &&
              hooknum != NF_INET_LOCAL_OUT) ||
             !skb_dst(skb))) {ip_vs_fill_iph_skb(af, skb, false, &iph);
        IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"
                  "ignored in hook %u\n",
                  skb->pkt_type, iph.protocol,
                  IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);
        return NF_ACCEPT;
    }
    /* ipvs enabled in this netns ? */
    if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
        return NF_ACCEPT;

    /* 获取 ip 头 */
    ip_vs_fill_iph_skb(af, skb, false, &iph);

    /* 获取数据包所属 sock */
    /* Bad... Do not break raw sockets */
    sk = skb_to_full_sk(skb);
    if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
             af == AF_INET)) {if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
            return NF_ACCEPT;
    }


#ifdef CONFIG_IP_VS_IPV6
    if (af == AF_INET6) {if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
            int related;
            int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related,
                               hooknum, &iph);


            if (related)
                return verdict;
        }
    } else
#endif
        if (unlikely(iph.protocol == IPPROTO_ICMP)) {
            int related;
            int verdict = ip_vs_in_icmp(ipvs, skb, &related,
                            hooknum);


            if (related)
                return verdict;
        }


    /* Protocol supported? */
    /* 判断是否为 ipvs 反对的协定 */
    pd = ip_vs_proto_data_get(ipvs, iph.protocol);
    if (unlikely(!pd)) {
        /* The only way we'll see this packet again is if it's
         * encapsulated, so mark it with ipvs_property=1 so we
         * skip it if we're ignoring tunneled packets
         */
        if (sysctl_ignore_tunneled(ipvs))
            skb->ipvs_property = 1;


        return NF_ACCEPT;
    }
    pp = pd->pp;
    /*
     * Check if the packet belongs to an existing connection entry
     */
    /* 在 ipvs 连贯跟踪表里查找数据包所属连贯 */
    cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
                 ipvs, af, skb, &iph);

    /* conn_reuse_mode 是 ipvs 连贯复用参数
     * frag 是分片偏移量
     * is_new_conn()是判断 tcp 头的 syn 标记位
     */
    conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
    if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
        /* 找到了所属连贯并且是 SYN, 非分片,reuse_mode==1, 时会走到这里 */
        bool uses_ct = false, resched = false;

        /* 判断 expire_nodest_conn 和连贯的目的地的 weight */
        if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
            unlikely(!atomic_read(&cp->dest->weight))) {
            /* expire_nodest_conn 示意开释不可用后端的连贯
             * 后端不可用会走到这里
             */
            resched = true;
            /* 是否应用了 nf_conntrack */
            uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
        /* 判断之前的连贯是否能够开释 */
        } else if (is_new_conn_expected(cp, conn_reuse_mode)) {
            /* 是否应用了 nf_conntrack */
            uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
            if (!atomic_read(&cp->n_control)) {resched = true;} else {
                /* Do not reschedule controlling connection
                 * that uses conntrack while it is still
                 * referenced by controlled connection(s).
                 */
                resched = !uses_ct;
            }
        }


        if (resched) {
            /* 提前开释之前的连贯 */
            if (!atomic_read(&cp->n_control))
                ip_vs_conn_expire_now(cp);
            __ip_vs_conn_put(cp);
            /* 这里有一个 bug, 如果应用了 conntrack, 间接丢包, 客户端必须重传
             * 重传导致产生 1s 提早 */
            if (uses_ct)
                return NF_DROP;
            cp = NULL;
        }
    }

    if (unlikely(!cp)) {
        /* 没有连贯记录和不复用连贯记录会走到这里 */
        int v;
        /* 进行连贯记录的创立和目的地的确认 */
        if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))
            /* 没有匹配到 service 的不属于 ipvs 的数据包返回 ACCEPT */
            return v;
    }

    /* 属于 IPVS 的 service 的数据包会走到这里 */

    IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");


    /* Check the server status */
    if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
        /* the destination server is not available */


        __u32 flags = cp->flags;


        /* when timer already started, silently drop the packet.*/
        if (timer_pending(&cp->timer))
            __ip_vs_conn_put(cp);
        else
            ip_vs_conn_put(cp);


        if (sysctl_expire_nodest_conn(ipvs) &&
            !(flags & IP_VS_CONN_F_ONE_PACKET)) {
            /* try to expire the connection immediately */
            ip_vs_conn_expire_now(cp);
        }


        return NF_DROP;
    }

    /* 统计计数 */
    ip_vs_in_stats(cp, skb);
    /* 更新四层协定连贯状态 */
    ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
    if (cp->packet_xmit)
        /* DNAT 之后, 发送数据包到 local_out
         * 发送胜利 ret = NF_STOLEN
         */
        ret = cp->packet_xmit(skb, cp, pp, &iph);
        /* do not touch skb anymore */
    else {IP_VS_DBG_RL("warning: packet_xmit is null");
        ret = NF_ACCEPT;
    }


    /* Increase its packet counter and check if it is needed
     * to be synchronized
     *
     * Sync connection if it is about to close to
     * encorage the standby servers to update the connections timeout
     *
     * For ONE_PKT let ip_vs_sync_conn() do the filter work.
     */


    if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
        pkts = sysctl_sync_threshold(ipvs);
    else
        pkts = atomic_add_return(1, &cp->in_pkts);


    if (ipvs->sync_state & IP_VS_STATE_MASTER)
        ip_vs_sync_conn(ipvs, cp, pkts);
    else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
        /* increment is done inside ip_vs_sync_conn too */
        atomic_inc(&cp->control->in_pkts);

    /* 更新连贯记录放弃工夫 */
    ip_vs_conn_put(cp);
    return ret;
}

2.5 ip_vs_out

为回包确认所属连接, 并将回包做还原处理
源码剖析:

/*
 *  Check if outgoing packet belongs to the established ip_vs_conn.
 *
 *  @ipvs:    IPVS state of the network namespace the packet belongs to
 *  @hooknum: netfilter hook point the packet is traversing
 *  @skb:     the packet
 *  @af:      address family of the packet (AF_INET or AF_INET6)
 *
 *  Returns a netfilter verdict:
 *   - NF_ACCEPT when the packet is not handled here or matches no
 *     connection that needs rewriting;
 *   - the verdict of the ICMP handler for related ICMP/ICMPv6 packets;
 *   - NF_STOLEN when ip_vs_gather_frags() returns non-zero for a fragment;
 *   - the result of handle_response() for packets of a VS/NAT (MASQ)
 *     connection or of a real-server-started UDP connection;
 *   - NF_DROP after a port-unreachable ICMP error has been sent back.
 */
static unsigned int
ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af)
{
    struct ip_vs_iphdr iph;
    struct ip_vs_protocol *pp;
    struct ip_vs_proto_data *pd;
    struct ip_vs_conn *cp;
    struct sock *sk;


    EnterFunction(11);


    /* Already handled by IPVS: nothing more to do. */
    /* Already marked as IPVS request or reply? */
    if (skb->ipvs_property)
        return NF_ACCEPT;


    /* Fetch the socket the packet is attached to, if any (this is the
     * socket, not the IPVS connection). */
    sk = skb_to_full_sk(skb);
    /* Bad... Do not break raw sockets */
    if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
             af == AF_INET)) {if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
            return NF_ACCEPT;
    }


    if (unlikely(!skb_dst(skb)))
        return NF_ACCEPT;


    if (!ipvs->enable)
        return NF_ACCEPT;


    /* Parse the IP header into iph. */
    ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
    if (af == AF_INET6) {if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
            int related;
            int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,
                            hooknum, &iph);


            if (related)
                return verdict;
        }
    } else
#endif
        if (unlikely(iph.protocol == IPPROTO_ICMP)) {
            int related;
            int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);


            if (related)
                return verdict;
        }


    /* Is this a layer-4 protocol IPVS has per-netns data for? */
    pd = ip_vs_proto_data_get(ipvs, iph.protocol);
    if (unlikely(!pd))
        return NF_ACCEPT;
    pp = pd->pp;


    /* reassemble IP fragments */
    /* When IPv6 support is compiled in, the check below is restricted to
     * AF_INET; otherwise it applies unconditionally. */
#ifdef CONFIG_IP_VS_IPV6
    if (af == AF_INET)
#endif
        if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {
            if (ip_vs_gather_frags(ipvs, skb,
                           ip_vs_defrag_user(hooknum)))
                return NF_STOLEN;


            /* Re-parse the header after ip_vs_gather_frags() returned 0. */
            ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);
        }


    /*
     * Check if the packet belongs to an existing entry
     */
    /* Lookup goes through the protocol's conn_out_get method. */
    cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,
                 ipvs, af, skb, &iph);


    if (likely(cp)) {if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
            goto ignore_cp;
        /* VS/NAT connection: undo the DNAT and update connection state. */
        return handle_response(af, skb, pd, cp, &iph, hooknum);
    }

    /* No matching entry: maybe traffic initiated by a real server
     * (UDP only, see below). */
    /* Check for real-server-started requests */
    if (atomic_read(&ipvs->conn_out_counter)) {
        /* Currently only for UDP:
         * connection oriented protocols typically use
         * ephemeral ports for outgoing connections, so
         * related incoming responses would not match any VS
         */
        if (pp->protocol == IPPROTO_UDP) {cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
            if (likely(cp))
                return handle_response(af, skb, pd, cp, &iph,
                               hooknum);
        }
    }

    /* Optionally (nat_icmp_send sysctl) answer with an ICMP port
     * unreachable when the source is a known real service but no
     * connection entry exists. */
    if (sysctl_nat_icmp_send(ipvs) &&
        (pp->protocol == IPPROTO_TCP ||
         pp->protocol == IPPROTO_UDP ||
         pp->protocol == IPPROTO_SCTP)) {__be16 _ports[2], *pptr;


        pptr = frag_safe_skb_hp(skb, iph.len,
                     sizeof(_ports), _ports);
        if (pptr == NULL)
            return NF_ACCEPT;   /* Not for me */
        if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,
                       pptr[0])) {
            /*
             * Notify the real server: there is no
             * existing entry if it is not RST
             * packet or not TCP packet.
             */
            if ((iph.protocol != IPPROTO_TCP &&
                 iph.protocol != IPPROTO_SCTP)
                 || ((iph.protocol == IPPROTO_TCP
                  && !is_tcp_reset(skb, iph.len))
                 || (iph.protocol == IPPROTO_SCTP
                    && !is_sctp_abort(skb,
                        iph.len)))) {
#ifdef CONFIG_IP_VS_IPV6
                if (af == AF_INET6) {if (!skb->dev)
                        skb->dev = ipvs->net->loopback_dev;
                    icmpv6_send(skb,
                            ICMPV6_DEST_UNREACH,
                            ICMPV6_PORT_UNREACH,
                            0);
                } else
#endif
                    icmp_send(skb,
                          ICMP_DEST_UNREACH,
                          ICMP_PORT_UNREACH, 0);
                return NF_DROP;
            }
        }
    }


out:
    IP_VS_DBG_PKT(12, af, pp, skb, iph.off,
              "ip_vs_out: packet continues traversal as normal");
    return NF_ACCEPT;


    /* Connection found but not VS/NAT: drop our reference and let the
     * packet continue untouched. */
ignore_cp:
    __ip_vs_conn_put(cp);
    goto out;
}

/* Handle response packets: rewrite addresses and send away...
 */
static unsigned int
handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
        struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,
        unsigned int hooknum)
{
    struct ip_vs_protocol *pp = pd->pp;


    IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");


    if (skb_ensure_writable(skb, iph->len))
        goto drop;


    /* mangle the packet */
    /* 四层 de-DNAT */
    if (pp->snat_handler &&
        !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))
        goto drop;


#ifdef CONFIG_IP_VS_IPV6
    if (af == AF_INET6)
        ipv6_hdr(skb)->saddr = cp->vaddr.in6;
    else
#endif
    {
        /* 三层 de-DNAT */
        ip_hdr(skb)->saddr = cp->vaddr.ip;
        ip_send_check(ip_hdr(skb));
    }


    /*
     * nf_iterate does not expect change in the skb->dst->dev.
     * It looks like it is not fatal to enable this code for hooks
     * where our handlers are at the end of the chain list and
     * when all next handlers use skb->dst->dev and not outdev.
     * It will definitely route properly the inout NAT traffic
     * when multiple paths are used.
     */


    /* For policy routing, packets originating from this
     * machine itself may be routed differently to packets
     * passing through.  We want this packet to be routed as
     * if it came from this machine itself.  So re-compute
     * the routing information.
     */
    /* 从新路由, 由 snat_reroute 参数决定 */
    if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))
        goto drop;


    IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");

    /* 统计计数 */
    ip_vs_out_stats(cp, skb);
    /* 更新四层协定状态 */
    ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
    skb->ipvs_property = 1;
    if (!(cp->flags & IP_VS_CONN_F_NFCT))
        ip_vs_notrack(skb);
    else
        ip_vs_update_conntrack(skb, cp, 0);
    /* 更新连贯记录放弃工夫 */
    ip_vs_conn_put(cp);

    LeaveFunction(11);
    return NF_ACCEPT;


drop:
    ip_vs_conn_put(cp);
    kfree_skb(skb);
    LeaveFunction(11);
    return NF_STOLEN;
}

正文完
 0