共计 9016 个字符,预计需要花费 23 分钟才能阅读完成。
环境:
- 版本:kernel-5.4.54 amd64 双核 ubuntu18.04
- k8s 集群网络组件:flannel,kube-proxy: ipvs
- 代码工具:vs code
1. 概述
- SNAT(源地址转换)是 IPTABLES 的 NAT 表的核心功能,广泛应用于路由器,云服务器,K8S 集群等内网环境中,是内核网络子系统中不可或缺的功能
- IPTABLES 的 NAT 完全依赖于 netfilter 的 conntrack,对于没有进行 conntrack 的数据包无法进行 NAT
- 在 K8S 集群中 DNAT 用于负载均衡,SNAT 用来保证节点转发的数据包能回到节点去完成 de-DNAT 还原,而不是直接发给客户端。
- 客户端访问的是负载均衡 IP,后端 IP 直接回包给客户端的话,客户端无法识别;
- 后端 IP 回包先转给负载均衡器,将后端 IP 还原成负载均衡 IP 之后再发给客户端
- IPTABLES 和 IPVS 都可以实现 DNAT 负载均衡的功能,但是 SNAT 只能由 IPTABLES 实现
- 查看集群中 IPTABLES 的 SNAT 规则
root@cluster1-worker1:~# iptables -t nat -nL
Chain PREROUTING (policy ACCEPT)
target prot opt source destination
KUBE-SERVICES all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
DOCKER all -- 0.0.0.0/0 0.0.0.0/0 ADDRTYPE match dst-type LOCAL
Chain INPUT (policy ACCEPT)
target prot opt source destination
Chain OUTPUT (policy ACCEPT)
target prot opt source destination
KUBE-SERVICES all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes service portals */
DOCKER all -- 0.0.0.0/0 !127.0.0.0/8 ADDRTYPE match dst-type LOCAL
Chain POSTROUTING (policy ACCEPT)
target prot opt source destination
KUBE-POSTROUTING all -- 0.0.0.0/0 0.0.0.0/0 /* kubernetes postrouting rules */
MASQUERADE all -- 172.17.0.0/16 0.0.0.0/0
RETURN all -- 10.244.0.0/16 10.244.0.0/16
MASQUERADE all -- 10.244.0.0/16 !224.0.0.0/4
RETURN all -- !10.244.0.0/16 10.244.2.0/24
MASQUERADE all -- !10.244.0.0/16 10.244.0.0/16
...
Chain KUBE-POSTROUTING (1 references)
target prot opt source destination
/* Kubernetes endpoints dst ip:port, source ip for solving hairpin purpose */
MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0 match-set KUBE-LOOP-BACK dst,dst,src
RETURN all -- 0.0.0.0/0 0.0.0.0/0 mark match ! 0x4000/0x4000
MARK all -- 0.0.0.0/0 0.0.0.0/0 MARK xor 0x4000
/* kubernetes service traffic requiring SNAT */
MASQUERADE all -- 0.0.0.0/0 0.0.0.0/0
...
分析 MASQUERADE 是如何做 SNAT 的,对于我们理解集群间网络通信很有帮助
2. 概念
2.1 de-SNAT
为什么要做 de-SNAT?
假如本机将 POD1 发出的包进行了 SNAT,源 IP 从 POD1-IP 变成了 HOST-IP;这样服务端回包的目的地是 HOST-IP,然而需要收包的是 POD1,如果不做 de-SNAT 把回包的目的地改回 POD1-IP,POD1 就无法收到数据包
2.2 netfilter 中与 SNAT 相关的钩子点
K8S 集群的 SNAT 规则是在 POST_ROUTING 做 SNAT,在 PRE_ROUTING 做 de-SNAT
3. 代码剖析
3.1 MASQUERADE 在 NAT 表中注册的钩子函数
static struct xt_target masquerade_tg_reg[] __read_mostly = {
{#if IS_ENABLED(CONFIG_IPV6)
.name = "MASQUERADE",
.family = NFPROTO_IPV6,
.target = masquerade_tg6,
.targetsize = sizeof(struct nf_nat_range),
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg6_checkentry,
.destroy = masquerade_tg_destroy,
.me = THIS_MODULE,
}, {
#endif
.name = "MASQUERADE",
.family = NFPROTO_IPV4,
.target = masquerade_tg,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.table = "nat",
.hooks = 1 << NF_INET_POST_ROUTING,
.checkentry = masquerade_tg_check,
.destroy = masquerade_tg_destroy,
.me = THIS_MODULE,
}
};
3.2 masquerade_tg 剖析
static unsigned int
masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
struct nf_nat_range2 range;
const struct nf_nat_ipv4_multi_range_compat *mr;
/* 获取规定的配置和 SNAT 的可用端口范畴 */
mr = par->targinfo;
range.flags = mr->range[0].flags;
range.min_proto = mr->range[0].min;
range.max_proto = mr->range[0].max;
/* 外围函数 */
return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
xt_out(par));
}
3.2.1 nf_nat_masquerade_ipv4 剖析
unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
const struct nf_nat_range2 *range,
const struct net_device *out)
{
struct nf_conn *ct;
struct nf_conn_nat *nat;
enum ip_conntrack_info ctinfo;
struct nf_nat_range2 newrange;
const struct rtable *rt;
__be32 newsrc, nh;
WARN_ON(hooknum != NF_INET_POST_ROUTING);
/* 获取 conntrack 连贯信息 */
ct = nf_ct_get(skb, &ctinfo);
WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY)));
/* Source address is 0.0.0.0 - locally generated packet that is
* probably not supposed to be masqueraded.
*/
if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
return NF_ACCEPT;
/* 获取路由表 */
rt = skb_rtable(skb);
/* 下一跳的地址 */
nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
/* 抉择最合适的 SNAT 源地址 */
newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
if (!newsrc) {pr_info("%s ate my IP address\n", out->name);
return NF_DROP;
}
nat = nf_ct_nat_ext_add(ct);
if (nat)
nat->masq_index = out->ifindex;
/* Transfer from original range. */
/* 设置可用的源地址和源端口范畴 */
memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
newrange.min_addr.ip = newsrc;
newrange.max_addr.ip = newsrc;
newrange.min_proto = range->min_proto;
newrange.max_proto = range->max_proto;
/* Hand modified range to generic setup. */
/* 依据可用范畴确定 SNAT 源地址,并批改连贯记录 */
return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
3.2.2 nf_nat_setup_info 剖析
/*
 * Set up NAT of the given type on a conntrack entry.
 *
 * @ct:        conntrack entry to configure
 * @range:     range passed on to get_unique_tuple() when choosing the
 *             new tuple
 * @maniptype: NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST (anything else trips
 *             a WARN_ON)
 *
 * This function takes no packet argument: it only updates the conntrack
 * entry (reply tuple, status bits, bysource list).
 *
 * Returns NF_ACCEPT immediately if @ct is already confirmed, NF_DROP if
 * NAT of this type was already initialized on @ct or if
 * nfct_seqadj_ext_add() fails, and NF_ACCEPT once setup is complete.
 */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range2 *range,
enum nf_nat_manip_type maniptype)
{struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
/* Can't setup nat info for confirmed ct. */
if (nf_ct_is_confirmed(ct))
return NF_ACCEPT;
WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
maniptype != NF_NAT_MANIP_DST);
if (WARN_ON(nf_nat_initialized(ct, maniptype)))
return NF_DROP;
/* What we've got will look like inverse of reply. Normally
* this is what is in the conntrack, except for prior
* manipulations (future optimization: if num_manips == 0,
* orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
*/
nf_ct_invert_tuple(&curr_tuple,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
/* Obtain a unique tuple from within the allowed range. */
get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
struct nf_conntrack_tuple reply;
/* Alter conntrack table so will recognize replies. */
/* The reply-direction tuple becomes the inverse of the new tuple. */
nf_ct_invert_tuple(&reply, &new_tuple);
nf_conntrack_alter_reply(ct, &reply);
/* Non-atomic: we own this at the moment. */
/* Flag which kind of NAT this connection needs. */
if (maniptype == NF_NAT_MANIP_SRC)
ct->status |= IPS_SRC_NAT;
else
ct->status |= IPS_DST_NAT;
/* NOTE(review): appears to add the seqadj extension when a helper is
* attached and no such extension exists yet - confirm. If the add
* fails, NF_DROP is returned.
*/
if (nfct_help(ct) && !nfct_seqadj(ct))
if (!nfct_seqadj_ext_add(ct))
return NF_DROP;
}
/* For source manips, link the entry into the bysource table. */
if (maniptype == NF_NAT_MANIP_SRC) {
unsigned int srchash;
spinlock_t *lock;
/* Bucket is chosen by hashing the original-direction tuple. */
srchash = hash_by_src(net,
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
/* Insert while holding the nf_nat_locks[] entry picked by srchash. */
lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
spin_lock_bh(lock);
hlist_add_head_rcu(&ct->nat_bysource,
&nf_nat_bysource[srchash]);
spin_unlock_bh(lock);
}
/* It's done. */
if (maniptype == NF_NAT_MANIP_DST)
ct->status |= IPS_DST_NAT_DONE;
else
ct->status |= IPS_SRC_NAT_DONE;
return NF_ACCEPT;
}
3.2.3 get_unique_tuple 分析
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
* we change the source to map into the range. For NF_INET_PRE_ROUTING
* and NF_INET_LOCAL_OUT, we change the destination to map into the
* range. It might not be possible to get a unique tuple, but we try.
* At worst (or if we race), we will end up with a final duplicate in
* __nf_conntrack_confirm and drop the packet.
*
* @tuple:      output; receives the chosen tuple
* @orig_tuple: tuple to start from (not modified)
* @range:      allowed range and its flags
* @ct:         conntrack entry the tuple is being chosen for
* @maniptype:  which side of the tuple is being manipulated
*/
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig_tuple,
const struct nf_nat_range2 *range,
struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
{
const struct nf_conntrack_zone *zone;
struct net *net = nf_ct_net(ct);
zone = nf_ct_zone(ct);
/* 1) If this srcip/proto/src-proto-part is currently mapped,
* and that same mapping gives a unique tuple within the given
* range, use that.
*
* This is only required for source (ie. NAT/masq) mappings.
* So far, we don't do local source mappings, so multiple
* manips not an issue.
*/
/* First see whether the tuple can be used without any rewrite, or
* whether an existing source mapping can be reused.
*/
if (maniptype == NF_NAT_MANIP_SRC &&
!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
/* Only source manips without any random-port flag get here. */
/* try the original tuple first */
/* Is the unmodified tuple already inside the range? */
if (in_range(orig_tuple, range)) {
/* Use it as is when nf_nat_used_tuple() reports it as not in use. */
if (!nf_nat_used_tuple(orig_tuple, ct)) {
*tuple = *orig_tuple;
return;
}
/* Otherwise ask find_appropriate_src() for a mapping that is
* already in use for this source.
*/
} else if (find_appropriate_src(net, zone,
orig_tuple, tuple, range)) {pr_debug("get_unique_tuple: Found current src map\n");
/* Accept the found mapping only if the resulting tuple is not in use. */
if (!nf_nat_used_tuple(tuple, ct))
return;
}
}
/* Reached whenever step 1 did not return: destination manip, a
* random-port flag is set, or no usable tuple was found above.
*/
/* 2) Select the least-used IP/proto combination in the given range */
*tuple = *orig_tuple;
/* Let find_best_ips_proto() choose the address within the range. */
find_best_ips_proto(zone, tuple, range, ct, maniptype);
/* 3) The per-protocol part of the manip is made to map into
* the range to make a unique tuple.
*/
/* Only bother mapping if it's not already in range and unique */
/* Before touching the port, check whether the tuple is already acceptable. */
if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) &&
l4proto_in_range(tuple, maniptype,
&range->min_proto,
&range->max_proto) &&
(range->min_proto.all == range->max_proto.all ||
!nf_nat_used_tuple(tuple, ct)))
/* No random port, a port range was specified without
* PROTO_OFFSET, the port is inside that range, and either
* the range is a single value or the tuple is not in use:
* return the tuple as it stands.
*/
return;
} else if (!nf_nat_used_tuple(tuple, ct)) {
/* No random port, no port range specified, and the tuple is
* not in use: return the tuple as it stands.
*/
return;
}
}
/* Last chance: get protocol to try to obtain unique tuple. */
/* Port selection within the range is delegated to the l4proto helper. */
nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct);
}
这里先不修改数据包,只修改 conntrack 连接记录,后续再根据连接记录对数据包进行修改
对数据包的修改和 de-SNAT 见 NAT 分析文档:IPTABLES 的连接跟踪与 NAT 分析
3.3 SNAT 与 MASQ 区别
3.3.1 SNAT 钩子函数
static struct xt_target xt_nat_target_reg[] __read_mostly = {
{
.name = "SNAT",
.revision = 0,
.checkentry = xt_nat_checkentry_v0,
.destroy = xt_nat_destroy,
.target = xt_snat_target_v0,
.targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
.family = NFPROTO_IPV4,
.table = "nat",
.hooks = (1 << NF_INET_POST_ROUTING) |
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
},
...
3.3.2 xt_snat_target_v0 剖析
static unsigned int
xt_snat_target_v0(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
struct nf_nat_range2 range;
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
ct = nf_ct_get(skb, &ctinfo);
WARN_ON(!(ct != NULL &&
(ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
ctinfo == IP_CT_RELATED_REPLY)));
/* 获取范畴 */
xt_nat_convert_range(&range, &mr->range[0]);
/* 依据可用范畴确定 SNAT 源地址,并批改连贯记录 */
return nf_nat_setup_info(ct, &range, NF_NAT_MANIP_SRC);
}
可以看到 SNAT 和 MASQ 最后都调用了 nf_nat_setup_info,区别是 MASQ 在前面有一个选择最合适源 IP 的步骤。
正文完