环境:
- 版本:kernel-5.4.54 amd64 双核 ubuntu18.04
- k8s集群网络组件:flannel,kube-proxy: ipvs
- 代码工具:vs code
1.概述
- SNAT(源地址转换)是IPTABLES的NAT表的核心功能,广泛应用于路由器,云服务器,K8S集群等内网环境中,是内核网络子系统中不可或缺的功能
- IPTABLES的NAT完全依赖于netfilter的conntrack,对于没有进行conntrack的数据包无法进行NAT
在K8S集群中DNAT用于负载均衡,SNAT用来保证节点转发的数据包能回到节点去完成de-DNAT还原,而不是直接发给客户端。
- 客户端访问的是负载均衡IP,后端IP直接回包给客户端的话,客户端无法识别;
- 后端IP回包先转给负载均衡器,将后端IP还原成负载均衡IP之后再发给客户端
- IPTABLES和IPVS都可以实现DNAT负载均衡的功能,但是SNAT只能由IPTABLES实现
- 查看集群中IPTABLES的SNAT规则
root@cluster1-worker1:~# iptables -t nat -nL
Chain PREROUTING (policy ACCEPT)
target prot opt source destination
KUBE-SERVICES all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
DOCKER all -- 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type LOCAL

Chain INPUT (policy ACCEPT)
target prot opt source destination

Chain OUTPUT (policy ACCEPT)
target prot opt source destination
KUBE-SERVICES all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
DOCKER all -- 0.0.0.0/0 !127.0.0.0/8 ADDRTYPE match dst-type LOCAL

Chain POSTROUTING (policy ACCEPT)
target prot opt source destination
KUBE-POSTROUTING all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */
MASQUERADE all -- 172.17.0.0/16 0.0.0.0/0
RETURN all -- 10.244.0.0/16 10.244.0.0/16
MASQUERADE all -- 10.244.0.0/16 !224.0.0.0/4
RETURN all -- !10.244.0.0/16 10.244.2.0/24
MASQUERADE all -- !10.244.0.0/16 10.244.0.0/16
...

Chain KUBE-POSTROUTING (1 references)
target prot opt source destination
/* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */
MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0 match-set KUBE-LOOP-BACK dst,dst,src
RETURN all -- 0.0.0.0/0 0.0.0.0/0 mark match ! 0x4000/0x4000
MARK all -- 0.0.0.0/0 0.0.0.0/0 MARK xor 0x4000
/* kubernetes service traffic requiring SNAT */
MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0
...
分析MASQUERADE是如何做SNAT的,对于我们理解集群间网络通信很有帮助
2.概念
2.1 de-SNAT
为什么要做de-SNAT?
假如本机将POD1发出的包进行了SNAT,源IP从POD1-IP变成了HOST-IP;这样服务端回包的目的地是HOST-IP,但是需要收包的是POD1,如果不做de-SNAT把回包的目的地改为POD1-IP,POD1就无法收到数据包
2.2 netfilter中与SNAT有关的钩子点
K8S集群的SNAT规则是在POST_ROUTING做SNAT,在PRE_ROUTING做de-SNAT
3.代码分析
3.1 MASQUERADE在NAT表中注册的钩子函数
static struct xt_target masquerade_tg_reg[] __read_mostly = { {#if IS_ENABLED(CONFIG_IPV6) .name = "MASQUERADE", .family = NFPROTO_IPV6, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg6_checkentry, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, }, {#endif .name = "MASQUERADE", .family = NFPROTO_IPV4, .target = masquerade_tg, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg_check, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, }};
3.2 masquerade_tg分析
/*
 * xtables target handler for the IPv4 MASQUERADE rule.
 *
 * Copies the flags and the min/max protocol bounds of the rule's first
 * configured range into a local nf_nat_range2 and passes it, together with
 * the hook number and the outgoing device taken from @par, to
 * nf_nat_masquerade_ipv4().  The address fields of the local range are not
 * filled in here.
 *
 * @skb: packet being processed
 * @par: xtables action parameters; par->targinfo holds the rule's
 *       nf_nat_ipv4_multi_range_compat configuration
 *
 * Returns the value returned by nf_nat_masquerade_ipv4().
 */
static unsigned int
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
	struct nf_nat_range2 range;
	const struct nf_nat_ipv4_multi_range_compat *mr;

	/* Fetch the rule configuration and the port range usable for SNAT. */
	mr = par->targinfo;
	range.flags = mr->range[0].flags;
	range.min_proto = mr->range[0].min;
	range.max_proto = mr->range[0].max;

	/* Core function: picks the source address and sets up the mapping. */
	return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
				      xt_out(par));
}
3.2.1 nf_nat_masquerade_ipv4分析
unsigned intnf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, const struct nf_nat_range2 *range, const struct net_device *out){ struct nf_conn *ct; struct nf_conn_nat *nat; enum ip_conntrack_info ctinfo; struct nf_nat_range2 newrange; const struct rtable *rt; __be32 newsrc, nh; WARN_ON(hooknum != NF_INET_POST_ROUTING); /* 获取conntrack连贯信息 */ ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. */ if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0) return NF_ACCEPT; /* 获取路由表 */ rt = skb_rtable(skb); /* 下一跳的地址 */ nh = rt_nexthop(rt, ip_hdr(skb)->daddr); /* 抉择最合适的SNAT源地址 */ newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE); if (!newsrc) { pr_info("%s ate my IP address\n", out->name); return NF_DROP; } nat = nf_ct_nat_ext_add(ct); if (nat) nat->masq_index = out->ifindex; /* Transfer from original range. */ /* 设置可用的源地址和源端口范畴 */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.ip = newsrc; newrange.max_addr.ip = newsrc; newrange.min_proto = range->min_proto; newrange.max_proto = range->max_proto; /* Hand modified range to generic setup. */ /* 依据可用范畴确定SNAT源地址,并批改连贯记录 */ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);}
3.2.2 nf_nat_setup_info分析
unsigned intnf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype){ struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); if (WARN_ON(nf_nat_initialized(ct, maniptype))) return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ nf_ct_invert_tuple(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); /* 从可用范畴中获取惟一的五元组 */ get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ /* 批改conntrack中的回包的五元组 */ nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ /* 标识须要做的nat类型 */ if (maniptype == NF_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } /* 将连贯记录增加到bysource表中 */ if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; spinlock_t *lock; srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(lock); } /* It's done. */ if (maniptype == NF_NAT_MANIP_DST) ct->status |= IPS_DST_NAT_DONE; else ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT;}
3.2.3 get_unique_tuple分析
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __nf_conntrack_confirm and drop the packet. */static voidget_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype){ const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. * * This is only required for source (ie. NAT/masq) mappings. * So far, we don't do local source mappings, so multiple * manips not an issue. */ /* 先尝试判断不做SNAT是否满足可用范畴,或者在最近SNAT的连贯记录中获取SNAT源地址 */ if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* SNAT和非随机端口会走到这里 */ /* try the original tuple first */ /* 不做SNAT判断是否满足可用范畴 */ if (in_range(orig_tuple, range)) { /* 判断五元组是否惟一 */ if (!nf_nat_used_tuple(orig_tuple, ct)) { *tuple = *orig_tuple; return; } /* 依据源地址hash,在最近SNAT的连贯记录中获取SNAT源地址 */ } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); /* 判断五元组是否惟一 */ if (!nf_nat_used_tuple(tuple, ct)) return; } } /* 随机端口或者没有找到合乎下面判断的五元组时会走到这里 */ /* 2) Select the least-used IP/proto combination in the given range */ *tuple = *orig_tuple; /* 从源地址范畴中获取最合适的源地址 */ find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into * the range to make a unique tuple. 
*/ /* Only bother mapping if it's not already in range and unique */ /* 先不批改端口判断五元组是否满足范畴 */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) /* 非随机端口 && 设置了端口范畴 && 端口满足范畴 && 五元组惟一 * 会走到这里 间接返回确认的五元组*/ return; } else if (!nf_nat_used_tuple(tuple, ct)) { /* 非随机端口 && 没有设置了端口范畴 && 五元组惟一 * 会走到这里 间接返回确认的五元组*/ return; } } /* Last chance: get protocol to try to obtain unique tuple. */ /* 在可用范畴中抉择一个适合的端口(五元组惟一,端口在范畴内) */ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);}
这里先不对数据包做修改,只修改conntrack连接记录,后续再根据连接记录对数据包进行修改
对数据包的修改和de-SNAT见NAT分析文档:IPTABLES的连接跟踪与NAT分析
3.3 SNAT与MASQ区别
3.3.1 SNAT钩子函数
/*
 * xtables target registrations for the NAT targets (excerpt: only the
 * first entry is shown, the rest is elided with "...").
 *
 * The entry shown registers the IPv4 "SNAT" target, revision 0, in the
 * "nat" table, handled by xt_snat_target_v0 and permitted on the
 * NF_INET_POST_ROUTING and NF_INET_LOCAL_IN hooks.
 */
static struct xt_target xt_nat_target_reg[] __read_mostly = {
	{
		.name = "SNAT",
		.revision = 0,
		.checkentry = xt_nat_checkentry_v0,
		.destroy = xt_nat_destroy,
		.target = xt_snat_target_v0,
		.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
		.family = NFPROTO_IPV4,
		.table = "nat",
		.hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN),
		.me = THIS_MODULE,
	},
...
3.3.2 xt_snat_target_v0分析
static unsigned intxt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par){ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; struct nf_nat_range2 range; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); WARN_ON(!(ct != NULL && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY))); /* 获取范畴 */ xt_nat_convert_range(&range, &mr->range[0]); /* 依据可用范畴确定SNAT源地址,并批改连贯记录 */ return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);}
可以看到SNAT和MASQ最后都调用了nf_nat_setup_info,区别是MASQ在前面多了一个选择最合适源IP的步骤。