乐趣区

关于c:SNAT的MASQUERADE地址选择与端口选择

环境:

  1. 版本:kernel-5.4.54 amd64 双核 ubuntu18.04
  2. k8s 集群网络组件:flannel,kube-proxy: ipvs
  3. 代码工具:vs code

1. 概述

  • SNAT(源地址转换)是 IPTABLES 的 NAT 表的核心功能,广泛应用于路由器、云服务器、K8S 集群等内网环境中,是内核网络子系统中不可或缺的功能
  • IPTABLES 的 NAT 完全依赖于 netfilter 的 conntrack,对于没有进行 conntrack 的数据包无法进行 NAT
  • 在 K8S 集群中 DNAT 用于负载均衡,SNAT 用来保证节点转发的数据包能回到节点去完成 de-DNAT 还原,而不是直接发给客户端。

    • 客户端访问的是负载均衡 IP,后端 IP 直接回包给客户端的话,客户端无法识别;
    • 后端 IP 回包先转给负载均衡器,将后端 IP 还原成负载均衡 IP 之后再发给客户端
  • IPTABLES 和 IPVS 都可以实现 DNAT 负载均衡的功能,但是 SNAT 只能由 IPTABLES 实现
  • 查看集群中 IPTABLES 的 SNAT 规则
root@cluster1-worker1:~# iptables -t nat -nL
Chain PREROUTING (policy ACCEPT)
target     prot opt source               destination         
KUBE-SERVICES  all  --  0.0.0.0/0            0.0.0.0/0            /* kubernetes service portals */
DOCKER     all  --  0.0.0.0/0            0.0.0.0/0            ADDRTYPE match dst-type LOCAL

Chain INPUT (policy ACCEPT)
target     prot opt source               destination         

Chain OUTPUT (policy ACCEPT)
target     prot opt source               destination         
KUBE-SERVICES  all  --  0.0.0.0/0            0.0.0.0/0            /* kubernetes service portals */
DOCKER     all  --  0.0.0.0/0           !127.0.0.0/8          ADDRTYPE match dst-type LOCAL

Chain POSTROUTING (policy ACCEPT)
target     prot opt source               destination         
KUBE-POSTROUTING  all  --  0.0.0.0/0            0.0.0.0/0            /* kubernetes postrouting rules */
MASQUERADE  all  --  172.17.0.0/16        0.0.0.0/0           
RETURN     all  --  10.244.0.0/16        10.244.0.0/16       
MASQUERADE  all  --  10.244.0.0/16       !224.0.0.0/4         
RETURN     all  -- !10.244.0.0/16        10.244.2.0/24       
MASQUERADE  all  -- !10.244.0.0/16        10.244.0.0/16 
...
Chain KUBE-POSTROUTING (1 references)
target     prot opt source        destination   
/* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */      
MASQUERADE  all  --  0.0.0.0/0    0.0.0.0/0     match-set KUBE-LOOP-BACK dst,dst,src

RETURN     all  --  0.0.0.0/0     0.0.0.0/0     mark match ! 0x4000/0x4000
MARK       all  --  0.0.0.0/0     0.0.0.0/0     MARK xor 0x4000

/* kubernetes service traffic requiring SNAT */
MASQUERADE  all  --  0.0.0.0/0    0.0.0.0/0     
...

分析 MASQUERADE 是如何进行 SNAT 的,对于我们理解集群间网络通信很有帮助

2. 概念

2.1 de-SNAT

为什么要做 de-SNAT?
假如本机将 POD1 发出的包进行了 SNAT,源 IP 从 POD1-IP 变成了 HOST-IP;这样服务端回包的目的地是 HOST-IP,但是需要收包的是 POD1,如果不做 de-SNAT 把回包的目的地改为 POD1-IP,POD1 就无法收到数据包

2.2 netfilter 中与 SNAT 有关的钩子点


K8S 集群的 SNAT 规则是在 POST_ROUTING 做 SNAT,在 PRE_ROUTING 做 de-SNAT

3. 代码分析

3.1 MASQUERADE 在 NAT 表中注册的 target 处理函数

static struct xt_target masquerade_tg_reg[] __read_mostly = {
    {#if IS_ENABLED(CONFIG_IPV6)
        .name       = "MASQUERADE",
        .family     = NFPROTO_IPV6,
        .target     = masquerade_tg6,
        .targetsize = sizeof(struct nf_nat_range),
        .table      = "nat",
        .hooks      = 1 << NF_INET_POST_ROUTING,
        .checkentry = masquerade_tg6_checkentry,
        .destroy    = masquerade_tg_destroy,
        .me     = THIS_MODULE,
    }, {
#endif
        .name       = "MASQUERADE",
        .family     = NFPROTO_IPV4,
        .target     = masquerade_tg,
        .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
        .table      = "nat",
        .hooks      = 1 << NF_INET_POST_ROUTING,
        .checkentry = masquerade_tg_check,
        .destroy    = masquerade_tg_destroy,
        .me     = THIS_MODULE,
    }
};

3.2 masquerade_tg 分析

static unsigned int
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
    struct nf_nat_range2 range;
    const struct nf_nat_ipv4_multi_range_compat *mr;

    /* 获取规定的配置和 SNAT 的可用端口范畴 */
    mr = par->targinfo;
    range.flags = mr->range[0].flags;
    range.min_proto = mr->range[0].min;
    range.max_proto = mr->range[0].max;

    /* 外围函数 */
    return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
                      xt_out(par));
}

3.2.1 nf_nat_masquerade_ipv4 分析

unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
               const struct nf_nat_range2 *range,
               const struct net_device *out)
{
    struct nf_conn *ct;
    struct nf_conn_nat *nat;
    enum ip_conntrack_info ctinfo;
    struct nf_nat_range2 newrange;
    const struct rtable *rt;
    __be32 newsrc, nh;


    WARN_ON(hooknum != NF_INET_POST_ROUTING);

    /* 获取 conntrack 连贯信息 */
    ct = nf_ct_get(skb, &ctinfo);


    WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
             ctinfo == IP_CT_RELATED_REPLY)));


    /* Source address is 0.0.0.0 - locally generated packet that is
     * probably not supposed to be masqueraded.
     */
    if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
        return NF_ACCEPT;

    /* 获取路由表 */
    rt = skb_rtable(skb);
    /* 下一跳的地址 */
    nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
    /* 抉择最合适的 SNAT 源地址 */
    newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
    if (!newsrc) {pr_info("%s ate my IP address\n", out->name);
        return NF_DROP;
    }


    nat = nf_ct_nat_ext_add(ct);
    if (nat)
        nat->masq_index = out->ifindex;


    /* Transfer from original range. */
    /* 设置可用的源地址和源端口范畴 */
    memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
    memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
    newrange.flags       = range->flags | NF_NAT_RANGE_MAP_IPS;
    newrange.min_addr.ip = newsrc;
    newrange.max_addr.ip = newsrc;
    newrange.min_proto   = range->min_proto;
    newrange.max_proto   = range->max_proto;


    /* Hand modified range to generic setup. */
    /* 依据可用范畴确定 SNAT 源地址,并批改连贯记录 */
    return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}

3.2.2 nf_nat_setup_info 分析

unsigned int
nf_nat_setup_info(struct nf_conn *ct,
          const struct nf_nat_range2 *range,
          enum nf_nat_manip_type maniptype)
{struct net *net = nf_ct_net(ct);
    struct nf_conntrack_tuple curr_tuple, new_tuple;


    /* Can't setup nat info for confirmed ct. */
    if (nf_ct_is_confirmed(ct))
        return NF_ACCEPT;


    WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
        maniptype != NF_NAT_MANIP_DST);


    if (WARN_ON(nf_nat_initialized(ct, maniptype)))
        return NF_DROP;


    /* What we've got will look like inverse of reply. Normally
     * this is what is in the conntrack, except for prior
     * manipulations (future optimization: if num_manips == 0,
     * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
     */
    nf_ct_invert_tuple(&curr_tuple,
               &ct->tuplehash[IP_CT_DIR_REPLY].tuple);

    /* 从可用范畴中获取惟一的五元组 */
    get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);


    if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
        struct nf_conntrack_tuple reply;


        /* Alter conntrack table so will recognize replies. */
        /* 批改 conntrack 中的回包的五元组 */
        nf_ct_invert_tuple(&reply, &new_tuple);
        nf_conntrack_alter_reply(ct, &reply);


        /* Non-atomic: we own this at the moment. */
        /* 标识须要做的 nat 类型 */
        if (maniptype == NF_NAT_MANIP_SRC)
            ct->status |= IPS_SRC_NAT;
        else
            ct->status |= IPS_DST_NAT;


        if (nfct_help(ct) && !nfct_seqadj(ct))
            if (!nfct_seqadj_ext_add(ct))
                return NF_DROP;
    }

    /* 将连贯记录增加到 bysource 表中 */
    if (maniptype == NF_NAT_MANIP_SRC) {
        unsigned int srchash;
        spinlock_t *lock;


        srchash = hash_by_src(net,
                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
        spin_lock_bh(lock);
        hlist_add_head_rcu(&ct->nat_bysource,
                   &nf_nat_bysource[srchash]);
        spin_unlock_bh(lock);
    }


    /* It's done. */
    if (maniptype == NF_NAT_MANIP_DST)
        ct->status |= IPS_DST_NAT_DONE;
    else
        ct->status |= IPS_SRC_NAT_DONE;


    return NF_ACCEPT;
}

3.2.3 get_unique_tuple 分析

/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __nf_conntrack_confirm and drop the packet.
 *
 * @tuple:      output; receives the chosen tuple
 * @orig_tuple: tuple as it stands before this manipulation
 * @range:      permitted address/port range and selection flags
 * @ct:         connection the tuple belongs to
 * @maniptype:  which side (source or destination) is being rewritten
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
         const struct nf_conntrack_tuple *orig_tuple,
         const struct nf_nat_range2 *range,
         struct nf_conn *ct,
         enum nf_nat_manip_type maniptype)
{
    struct net *net = nf_ct_net(ct);
    const struct nf_conntrack_zone *zone = nf_ct_zone(ct);
    const unsigned int flags = range->flags;

    /* 1) If this srcip/proto/src-proto-part is currently mapped,
     * and that same mapping gives a unique tuple within the given
     * range, use that.
     *
     * This is only required for source (ie. NAT/masq) mappings.
     * So far, we don't do local source mappings, so multiple
     * manips not an issue.
     *
     * Only taken for source NAT without any of the random-port flags.
     */
    if (maniptype == NF_NAT_MANIP_SRC &&
        !(flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        /* Try the original tuple first: if it already lies inside the
         * range and is not in use, no rewrite is needed at all.
         */
        if (in_range(orig_tuple, range)) {
            if (!nf_nat_used_tuple(orig_tuple, ct)) {
                *tuple = *orig_tuple;
                return;
            }
        /* Otherwise look for an existing source mapping (per the
         * original author's note: found via a hash on the source
         * address) and reuse it if the resulting tuple is unused.
         */
        } else if (find_appropriate_src(net, zone,
                        orig_tuple, tuple, range)) {
            pr_debug("get_unique_tuple: Found current src map\n");
            if (!nf_nat_used_tuple(tuple, ct))
                return;
        }
    }

    /* Reached for random-port ranges, for destination NAT, or when step 1
     * did not yield a usable tuple.
     */
    /* 2) Select the least-used IP/proto combination in the given range */
    *tuple = *orig_tuple;
    find_best_ips_proto(zone, tuple, range, ct, maniptype);

    /* 3) The per-protocol part of the manip is made to map into
     * the range to make a unique tuple.
     */

    /* Only bother mapping if it's not already in range and unique:
     * first check whether the tuple is acceptable with its port untouched.
     */
    if (!(flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        if (flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
            /* Non-random, explicit port range, no offset mode: keep
             * the current port if it is inside the range and either
             * the range is a single value or the tuple is unused.
             */
            if (!(flags & NF_NAT_RANGE_PROTO_OFFSET) &&
                l4proto_in_range(tuple, maniptype,
                         &range->min_proto,
                         &range->max_proto) &&
                (range->min_proto.all == range->max_proto.all ||
                 !nf_nat_used_tuple(tuple, ct)))
                return;
        } else if (!nf_nat_used_tuple(tuple, ct)) {
            /* Non-random, no port range configured, tuple unused:
             * keep it as is.
             */
            return;
        }
    }

    /* Last chance: get protocol to try to obtain unique tuple, i.e. let
     * the L4 protocol code pick a port for the tuple.
     */
    nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}

这里先不对数据包做修改,只修改 conntrack 连接记录,后续再依据连接记录对数据包进行修改。
对数据包的修改和 de-SNAT 见 NAT 分析文档:《IPTABLES 的连接跟踪与 NAT 分析》

3.3 SNAT 与 MASQ 区别

3.3.1 SNAT 钩子函数

static struct xt_target xt_nat_target_reg[] __read_mostly = {
    {
        .name       = "SNAT",
        .revision   = 0,
        .checkentry = xt_nat_checkentry_v0,
        .destroy    = xt_nat_destroy,
        .target     = xt_snat_target_v0,
        .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
        .family     = NFPROTO_IPV4,
        .table      = "nat",
        .hooks      = (1 << NF_INET_POST_ROUTING) |
                  (1 << NF_INET_LOCAL_IN),
        .me     = THIS_MODULE,
    },
...

3.3.2 xt_snat_target_v0 分析

static unsigned int
xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
    const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
    struct nf_nat_range2 range;
    enum ip_conntrack_info ctinfo;
    struct nf_conn *ct;


    ct = nf_ct_get(skb, &ctinfo);
    WARN_ON(!(ct != NULL &&
         (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
          ctinfo == IP_CT_RELATED_REPLY)));

    /* 获取范畴 */
    xt_nat_convert_range(&range, &mr->range[0]);
    /* 依据可用范畴确定 SNAT 源地址,并批改连贯记录 */
    return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
}

可以看到 SNAT 和 MASQ 最后都调用了 nf_nat_setup_info,区别是 MASQ 在前面多了一个选择最合适源 IP 的步骤。

退出移动版