1.Netfilter & CONNTRACK & IPVS结构图

2.IPVS

ipvs只有DNAT和de-DNAT功能,它独立于iptables和conntrack,实现了自己的一套连接跟踪表和NAT机制

2.1 ipvs与conntrack的联系:

ipvs仅仅在做DNAT后对conntrack连接进行更新,避免回包因为没有记录而被丢弃
ipvs在TUNNEL模式下,会调用nf_conntrack_confirm函数对连接进行确认

2.2 ipvs注册的钩子函数

static const struct nf_hook_ops ip_vs_ops[] = {    /* After packet filtering, change source only for VS/NAT */    {        .hook       = ip_vs_reply4, /* ip_vs_out */        .pf     = NFPROTO_IPV4,        .hooknum    = NF_INET_LOCAL_IN,        .priority   = NF_IP_PRI_NAT_SRC - 2,    },    /* After packet filtering, forward packet through VS/DR, VS/TUN,     * or VS/NAT(change destination), so that filtering rules can be     * applied to IPVS. */    {        .hook       = ip_vs_remote_request4,        .pf     = NFPROTO_IPV4,        .hooknum    = NF_INET_LOCAL_IN,        .priority   = NF_IP_PRI_NAT_SRC - 1,    },    /* Before ip_vs_in, change source only for VS/NAT */    {        .hook       = ip_vs_local_reply4,        .pf     = NFPROTO_IPV4,        .hooknum    = NF_INET_LOCAL_OUT,        .priority   = NF_IP_PRI_NAT_DST + 1,    },    /* After mangle, schedule and forward local requests */    {        .hook       = ip_vs_local_request4,        .pf     = NFPROTO_IPV4,        .hooknum    = NF_INET_LOCAL_OUT,        .priority   = NF_IP_PRI_NAT_DST + 2,    },    /* After packet filtering (but before ip_vs_out_icmp), catch icmp     * destined for 0.0.0.0/0, which is for incoming IPVS connections */    {        .hook       = ip_vs_forward_icmp,        .pf     = NFPROTO_IPV4,        .hooknum    = NF_INET_FORWARD,        .priority   = 99,    },    /* After packet filtering, change source only for VS/NAT */    {        .hook       = ip_vs_reply4,        .pf     = NFPROTO_IPV4,        .hooknum    = NF_INET_FORWARD,        .priority   = 100,    },};

2.3 IPVS中tcp协议的状态转换表

/* *  Timeout table[state] */static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {    [IP_VS_TCP_S_NONE]      =   2*HZ,    [IP_VS_TCP_S_ESTABLISHED]   =   15*60*HZ,    [IP_VS_TCP_S_SYN_SENT]      =   2*60*HZ,    [IP_VS_TCP_S_SYN_RECV]      =   1*60*HZ,    [IP_VS_TCP_S_FIN_WAIT]      =   2*60*HZ,    [IP_VS_TCP_S_TIME_WAIT]     =   2*60*HZ,    [IP_VS_TCP_S_CLOSE]     =   10*HZ,    [IP_VS_TCP_S_CLOSE_WAIT]    =   60*HZ,    [IP_VS_TCP_S_LAST_ACK]      =   30*HZ,    [IP_VS_TCP_S_LISTEN]        =   2*60*HZ,    [IP_VS_TCP_S_SYNACK]        =   120*HZ,    [IP_VS_TCP_S_LAST]      =   2*HZ,};#define sNO IP_VS_TCP_S_NONE#define sES IP_VS_TCP_S_ESTABLISHED#define sSS IP_VS_TCP_S_SYN_SENT#define sSR IP_VS_TCP_S_SYN_RECV#define sFW IP_VS_TCP_S_FIN_WAIT#define sTW IP_VS_TCP_S_TIME_WAIT#define sCL IP_VS_TCP_S_CLOSE#define sCW IP_VS_TCP_S_CLOSE_WAIT#define sLA IP_VS_TCP_S_LAST_ACK#define sLI IP_VS_TCP_S_LISTEN#define sSA IP_VS_TCP_S_SYNACKstatic struct tcp_states_t tcp_states[] = {/*  INPUT ip_vs_in调用 *//*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA    初始状态   *//*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},/*  OUTPUT ip_vs_out调用 *//*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA    初始状态   *//*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},/*  INPUT-ONLY ip_vs_in在没有收到回包时调用*//*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA *//*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, 
sCL, sLI, sES }},/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},};

2.4 ip_vs_in

为目的地为虚拟服务器的数据包确认连接,并为连接分配后端,然后转发数据包

数据包四元组匹配到了连接记录
1. 连接不复用
  - 释放连接
2. 连接复用
  - 复用连接
数据包四元组没有匹配到连接记录,或者连接被释放
1. 目的地是虚拟服务器
  - 分配后端,新建连接
2. 目的地不是虚拟服务器
  - 返回ACCEPT
统计计数,更新四层协议连接状态
执行DNAT,转发数据包到LOCAL_OUT
更新连接保持时间

源码分析:

/* *  Check if it's for virtual services, look it up, *  and send it on its way... */static unsigned intip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af){    struct ip_vs_iphdr iph;    struct ip_vs_protocol *pp;    struct ip_vs_proto_data *pd;    struct ip_vs_conn *cp;    int ret, pkts;    int conn_reuse_mode;    struct sock *sk;    /* 曾经被ipvs解决过则不解决 */    /* Already marked as IPVS request or reply? */    if (skb->ipvs_property)        return NF_ACCEPT;    /*     *  Big tappo:     *  - remote client: only PACKET_HOST     *  - route: used for struct net when skb->dev is unset     */    if (unlikely((skb->pkt_type != PACKET_HOST &&              hooknum != NF_INET_LOCAL_OUT) ||             !skb_dst(skb))) {        ip_vs_fill_iph_skb(af, skb, false, &iph);        IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s"                  " ignored in hook %u\n",                  skb->pkt_type, iph.protocol,                  IP_VS_DBG_ADDR(af, &iph.daddr), hooknum);        return NF_ACCEPT;    }    /* ipvs enabled in this netns ? */    if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))        return NF_ACCEPT;    /* 获取ip头 */    ip_vs_fill_iph_skb(af, skb, false, &iph);    /* 获取数据包所属sock */    /* Bad... 
Do not break raw sockets */    sk = skb_to_full_sk(skb);    if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&             af == AF_INET)) {        if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)            return NF_ACCEPT;    }#ifdef CONFIG_IP_VS_IPV6    if (af == AF_INET6) {        if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {            int related;            int verdict = ip_vs_in_icmp_v6(ipvs, skb, &related,                               hooknum, &iph);            if (related)                return verdict;        }    } else#endif        if (unlikely(iph.protocol == IPPROTO_ICMP)) {            int related;            int verdict = ip_vs_in_icmp(ipvs, skb, &related,                            hooknum);            if (related)                return verdict;        }    /* Protocol supported? */    /* 判断是否为ipvs反对的协定 */    pd = ip_vs_proto_data_get(ipvs, iph.protocol);    if (unlikely(!pd)) {        /* The only way we'll see this packet again is if it's         * encapsulated, so mark it with ipvs_property=1 so we         * skip it if we're ignoring tunneled packets         */        if (sysctl_ignore_tunneled(ipvs))            skb->ipvs_property = 1;        return NF_ACCEPT;    }    pp = pd->pp;    /*     * Check if the packet belongs to an existing connection entry     */    /* 在ipvs连贯跟踪表里查找数据包所属连贯 */    cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,                 ipvs, af, skb, &iph);    /* conn_reuse_mode是ipvs连贯复用参数     * frag是分片偏移量     * is_new_conn()是判断tcp头的syn标记位     */    conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);    if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {        /* 找到了所属连贯并且是SYN,非分片,reuse_mode==1,时会走到这里 */        bool uses_ct = false, resched = false;        /* 判断expire_nodest_conn和连贯的目的地的weight */        if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&            unlikely(!atomic_read(&cp->dest->weight))) {            /* expire_nodest_conn示意开释不可用后端的连贯             * 后端不可用会走到这里  
           */            resched = true;            /* 是否应用了nf_conntrack */            uses_ct = ip_vs_conn_uses_conntrack(cp, skb);        /* 判断之前的连贯是否能够开释 */        } else if (is_new_conn_expected(cp, conn_reuse_mode)) {            /* 是否应用了nf_conntrack */            uses_ct = ip_vs_conn_uses_conntrack(cp, skb);            if (!atomic_read(&cp->n_control)) {                resched = true;            } else {                /* Do not reschedule controlling connection                 * that uses conntrack while it is still                 * referenced by controlled connection(s).                 */                resched = !uses_ct;            }        }        if (resched) {            /* 提前开释之前的连贯 */            if (!atomic_read(&cp->n_control))                ip_vs_conn_expire_now(cp);            __ip_vs_conn_put(cp);            /* 这里有一个bug,如果应用了conntrack,间接丢包,客户端必须重传             * 重传导致产生1s提早 */            if (uses_ct)                return NF_DROP;            cp = NULL;        }    }    if (unlikely(!cp)) {        /* 没有连贯记录和不复用连贯记录会走到这里 */        int v;        /* 进行连贯记录的创立和目的地的确认 */        if (!ip_vs_try_to_schedule(ipvs, af, skb, pd, &v, &cp, &iph))            /* 没有匹配到service的不属于ipvs的数据包返回ACCEPT */            return v;    }    /* 属于IPVS的service的数据包会走到这里 */    IP_VS_DBG_PKT(11, af, pp, skb, iph.off, "Incoming packet");    /* Check the server status */    if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {        /* the destination server is not available */        __u32 flags = cp->flags;        /* when timer already started, silently drop the packet.*/        if (timer_pending(&cp->timer))            __ip_vs_conn_put(cp);        else            ip_vs_conn_put(cp);        if (sysctl_expire_nodest_conn(ipvs) &&            !(flags & IP_VS_CONN_F_ONE_PACKET)) {            /* try to expire the connection immediately */            ip_vs_conn_expire_now(cp);        }        return NF_DROP;    }    /* 统计计数 */    ip_vs_in_stats(cp, skb);    /* 更新四层协定连贯状态 */    
ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);    if (cp->packet_xmit)        /* DNAT之后,发送数据包到local_out         * 发送胜利ret = NF_STOLEN         */        ret = cp->packet_xmit(skb, cp, pp, &iph);        /* do not touch skb anymore */    else {        IP_VS_DBG_RL("warning: packet_xmit is null");        ret = NF_ACCEPT;    }    /* Increase its packet counter and check if it is needed     * to be synchronized     *     * Sync connection if it is about to close to     * encorage the standby servers to update the connections timeout     *     * For ONE_PKT let ip_vs_sync_conn() do the filter work.     */    if (cp->flags & IP_VS_CONN_F_ONE_PACKET)        pkts = sysctl_sync_threshold(ipvs);    else        pkts = atomic_add_return(1, &cp->in_pkts);    if (ipvs->sync_state & IP_VS_STATE_MASTER)        ip_vs_sync_conn(ipvs, cp, pkts);    else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)        /* increment is done inside ip_vs_sync_conn too */        atomic_inc(&cp->control->in_pkts);    /* 更新连贯记录放弃工夫 */    ip_vs_conn_put(cp);    return ret;}

2.5 ip_vs_out

为回包确认所属连接,并将回包做还原处理
源码分析:

/* *  Check if outgoing packet belongs to the established ip_vs_conn. */static unsigned intip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af){    struct ip_vs_iphdr iph;    struct ip_vs_protocol *pp;    struct ip_vs_proto_data *pd;    struct ip_vs_conn *cp;    struct sock *sk;    EnterFunction(11);    /* 曾经被ipvs解决过 */    /* Already marked as IPVS request or reply? */    if (skb->ipvs_property)        return NF_ACCEPT;    /* 获取所属连贯?? */    sk = skb_to_full_sk(skb);    /* Bad... Do not break raw sockets */    if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&             af == AF_INET)) {        if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)            return NF_ACCEPT;    }    if (unlikely(!skb_dst(skb)))        return NF_ACCEPT;    if (!ipvs->enable)        return NF_ACCEPT;    /* 获取ip协定头 */    ip_vs_fill_iph_skb(af, skb, false, &iph);#ifdef CONFIG_IP_VS_IPV6    if (af == AF_INET6) {        if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {            int related;            int verdict = ip_vs_out_icmp_v6(ipvs, skb, &related,                            hooknum, &iph);            if (related)                return verdict;        }    } else#endif        if (unlikely(iph.protocol == IPPROTO_ICMP)) {            int related;            int verdict = ip_vs_out_icmp(ipvs, skb, &related, hooknum);            if (related)                return verdict;        }    /* 判断是否是ipvs反对的四层协定类型 */    pd = ip_vs_proto_data_get(ipvs, iph.protocol);    if (unlikely(!pd))        return NF_ACCEPT;    pp = pd->pp;    /* reassemble IP fragments */#ifdef CONFIG_IP_VS_IPV6    if (af == AF_INET)#endif        if (unlikely(ip_is_fragment(ip_hdr(skb)) && !pp->dont_defrag)) {            if (ip_vs_gather_frags(ipvs, skb,                           ip_vs_defrag_user(hooknum)))                return NF_STOLEN;            ip_vs_fill_iph_skb(AF_INET, skb, false, &iph);        }    /*     * Check if the packet belongs to an existing entry     */    /* 是否属于现有连贯 
*/    cp = INDIRECT_CALL_1(pp->conn_out_get, ip_vs_conn_out_get_proto,                 ipvs, af, skb, &iph);    if (likely(cp)) {        if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)            goto ignore_cp;        /* de-DNAT和连贯状态更新 */        return handle_response(af, skb, pd, cp, &iph, hooknum);    }    /* UDP协定 */    /* Check for real-server-started requests */    if (atomic_read(&ipvs->conn_out_counter)) {        /* Currently only for UDP:         * connection oriented protocols typically use         * ephemeral ports for outgoing connections, so         * related incoming responses would not match any VS         */        if (pp->protocol == IPPROTO_UDP) {            cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);            if (likely(cp))                return handle_response(af, skb, pd, cp, &iph,                               hooknum);        }    }    /* icmp协定 */    if (sysctl_nat_icmp_send(ipvs) &&        (pp->protocol == IPPROTO_TCP ||         pp->protocol == IPPROTO_UDP ||         pp->protocol == IPPROTO_SCTP)) {        __be16 _ports[2], *pptr;        pptr = frag_safe_skb_hp(skb, iph.len,                     sizeof(_ports), _ports);        if (pptr == NULL)            return NF_ACCEPT;   /* Not for me */        if (ip_vs_has_real_service(ipvs, af, iph.protocol, &iph.saddr,                       pptr[0])) {            /*             * Notify the real server: there is no             * existing entry if it is not RST             * packet or not TCP packet.             
*/            if ((iph.protocol != IPPROTO_TCP &&                 iph.protocol != IPPROTO_SCTP)                 || ((iph.protocol == IPPROTO_TCP                  && !is_tcp_reset(skb, iph.len))                 || (iph.protocol == IPPROTO_SCTP                    && !is_sctp_abort(skb,                        iph.len)))) {#ifdef CONFIG_IP_VS_IPV6                if (af == AF_INET6) {                    if (!skb->dev)                        skb->dev = ipvs->net->loopback_dev;                    icmpv6_send(skb,                            ICMPV6_DEST_UNREACH,                            ICMPV6_PORT_UNREACH,                            0);                } else#endif                    icmp_send(skb,                          ICMP_DEST_UNREACH,                          ICMP_PORT_UNREACH, 0);                return NF_DROP;            }        }    }out:    IP_VS_DBG_PKT(12, af, pp, skb, iph.off,              "ip_vs_out: packet continues traversal as normal");    return NF_ACCEPT;ignore_cp:    __ip_vs_conn_put(cp);    goto out;}/* Handle response packets: rewrite addresses and send away... */static unsigned inthandle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,        struct ip_vs_conn *cp, struct ip_vs_iphdr *iph,        unsigned int hooknum){    struct ip_vs_protocol *pp = pd->pp;    IP_VS_DBG_PKT(11, af, pp, skb, iph->off, "Outgoing packet");    if (skb_ensure_writable(skb, iph->len))        goto drop;    /* mangle the packet */    /* 四层de-DNAT */    if (pp->snat_handler &&        !SNAT_CALL(pp->snat_handler, skb, pp, cp, iph))        goto drop;#ifdef CONFIG_IP_VS_IPV6    if (af == AF_INET6)        ipv6_hdr(skb)->saddr = cp->vaddr.in6;    else#endif    {        /* 三层de-DNAT */        ip_hdr(skb)->saddr = cp->vaddr.ip;        ip_send_check(ip_hdr(skb));    }    /*     * nf_iterate does not expect change in the skb->dst->dev.     
* It looks like it is not fatal to enable this code for hooks     * where our handlers are at the end of the chain list and     * when all next handlers use skb->dst->dev and not outdev.     * It will definitely route properly the inout NAT traffic     * when multiple paths are used.     */    /* For policy routing, packets originating from this     * machine itself may be routed differently to packets     * passing through.  We want this packet to be routed as     * if it came from this machine itself.  So re-compute     * the routing information.     */    /* 从新路由,由snat_reroute参数决定 */    if (ip_vs_route_me_harder(cp->ipvs, af, skb, hooknum))        goto drop;    IP_VS_DBG_PKT(10, af, pp, skb, iph->off, "After SNAT");    /* 统计计数 */    ip_vs_out_stats(cp, skb);    /* 更新四层协定状态 */    ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);    skb->ipvs_property = 1;    if (!(cp->flags & IP_VS_CONN_F_NFCT))        ip_vs_notrack(skb);    else        ip_vs_update_conntrack(skb, cp, 0);    /* 更新连贯记录放弃工夫 */    ip_vs_conn_put(cp);    LeaveFunction(11);    return NF_ACCEPT;drop:    ip_vs_conn_put(cp);    kfree_skb(skb);    LeaveFunction(11);    return NF_STOLEN;}