1.Netfilter 结构图
Netfilter 框架的核心是五个钩子点, 可以通过在钩子点注册函数, 实现过滤、修改数据包的功能
IPTABLES 和 IPVS 就是通过注册钩子函数的方式来实现它们的主要功能的
ip_rcv 是三层协议栈的入口函数
dev_queue_xmit 最后会调用网络设备驱动发送数据包
2.Netfilter & CONNTRACK & IPTABLES NAT 结构图
2.1 Netfilter 的每个钩子点的钩子函数都有不同的优先级
/* Default priorities for hook functions: the numerically smaller the value,
 * the higher the priority (the earlier the hook runs at a given hook point). */
enum nf_ip_hook_priorities {
NF_IP_PRI_FIRST = INT_MIN, /* highest priority */
NF_IP_PRI_RAW_BEFORE_DEFRAG = -450, /* RAW, related to IP fragment reassembly */
NF_IP_PRI_CONNTRACK_DEFRAG = -400, /* connection tracking's IP fragment reassembly */
NF_IP_PRI_RAW = -300, /* RAW table, used to exempt packets from connection tracking */
NF_IP_PRI_SELINUX_FIRST = -225,
NF_IP_PRI_CONNTRACK = -200, /* connection tracking starts here */
NF_IP_PRI_MANGLE = -150,
NF_IP_PRI_NAT_DST = -100, /* NAT destination-address rewrite: DNAT or de-SNAT */
NF_IP_PRI_FILTER = 0, /* iptables packet filtering */
NF_IP_PRI_SECURITY = 50,
NF_IP_PRI_NAT_SRC = 100, /* NAT source-address rewrite: SNAT or de-DNAT */
NF_IP_PRI_SELINUX_LAST = 225,
NF_IP_PRI_CONNTRACK_HELPER = 300,
NF_IP_PRI_CONNTRACK_CONFIRM = INT_MAX, /* connection confirmation */
NF_IP_PRI_LAST = INT_MAX, /* lowest priority */
};
优先级 CONNTRACK > DNAT > FILTER > SNAT > CONNTRACK_CONFIRM
3.CONNTRACK
3.1 conntrack 注册的钩子
/* IPv4 hooks registered by connection tracking.
 * Two hooks run at NF_IP_PRI_CONNTRACK (PRE_ROUTING, LOCAL_OUT) and two run at
 * NF_IP_PRI_CONNTRACK_CONFIRM (POST_ROUTING, LOCAL_IN). */
static const struct nf_hook_ops ipv4_conntrack_ops[] = {
{
.hook = ipv4_conntrack_in, /* returns nf_conntrack_in (per the original note; body not shown) */
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_conntrack_local, /* returns nf_conntrack_in (per the original note; body not shown) */
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_CONNTRACK,
},
{
.hook = ipv4_confirm, /* calls nf_conntrack_confirm (per the original note; body not shown) */
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
{
.hook = ipv4_confirm, /* calls nf_conntrack_confirm (per the original note; body not shown) */
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_CONNTRACK_CONFIRM,
},
};
3.2 nf_conntrack_in
nf_conntrack_in 是 conntrack 的核心函数, 主要作用是:
- 获取数据包所对应的连接, 如果没有则新建连接记录
- 获取连接或者新建连接后, 更新连接状态, 设置 skb->_nfct 字段保存数据包的所属连接指针和连接的状态
所有没有标注 UNTRACKED (IP_CT_UNTRACKED) 的数据包在 nf_conntrack_in 中会获取所属连接, 为后续做 NAT 提供基础
3.2.1 nf_conntrack_in 源码分析:
/*
 * nf_conntrack_in - find or create the conntrack entry for a packet.
 * @skb:   packet being processed; skb->_nfct is read and may be rewritten
 * @state: hook state; state->net is used for statistics, state->pf for L4 lookup
 *
 * Returns a netfilter verdict: NF_ACCEPT on the ignore/invalid paths, NF_DROP
 * when resolve_normal_ct() fails, otherwise the (sign-adjusted) value produced
 * by the ICMP or L4 protocol handler.  A reference held on a template
 * conntrack is released on exit through the "out" label.
 */
unsigned int
nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct, *tmpl;
u_int8_t protonum;
int dataoff, ret;
/* First try to read the conntrack pointer and state from skb->_nfct.
 * (Original author's note: skb->_nfct is an unsigned long; its low 3 bits
 * hold the connection state and the remaining bits hold the conntrack
 * pointer -- a common kernel trick to save memory.) */
tmpl = nf_ct_get(skb, &ctinfo);
/* Either a conntrack pointer was obtained, or the packet is marked untracked */
if (tmpl || ctinfo == IP_CT_UNTRACKED) {/* Previously seen (loopback or untracked)? Ignore. */
/* Three kinds of packets reach this point:
 * 1. an skb that already has a conntrack attached
 * 2. an skb that is exempt from connection tracking
 * 3. an skb that carries a template conntrack */
if ((tmpl && !nf_ct_is_template(tmpl)) ||
ctinfo == IP_CT_UNTRACKED) {
/* Already-tracked and untracked skbs bump the per-netns "ignore" counter and return ACCEPT */
NF_CT_STAT_INC_ATOMIC(state->net, ignore);
return NF_ACCEPT;
}
/* Only template-carrying skbs get here: skb->_nfct is cleared,
 * while tmpl/ctinfo still hold what was read above */
skb->_nfct = 0;
}
/* skbs without a conntrack, and skbs that carried a template, continue below */
/* rcu_read_lock()ed by nf_hook_thresh */
/* Locate the layer-4 header offset and protocol number of the skb */
dataoff = get_l4proto(skb, skb_network_offset(skb), state->pf, &protonum);
if (dataoff <= 0) {pr_debug("not prepared to track yet or error occurred\n");
NF_CT_STAT_INC_ATOMIC(state->net, error);
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
ret = NF_ACCEPT;
goto out;
}
/* ICMP / ICMPv6 handling (not analysed in detail here) */
if (protonum == IPPROTO_ICMP || protonum == IPPROTO_ICMPV6) {
ret = nf_conntrack_handle_icmp(tmpl, skb, dataoff,
protonum, state);
if (ret <= 0) {
ret = -ret;
goto out;
}
/* ICMP[v6] protocol trackers may assign one conntrack. */
if (skb->_nfct)
goto out;
}
repeat:
/* Core step of nf_conntrack_in.  resolve_normal_ct() is not shown here;
 * per the original notes it:
 * 1. looks the skb's 5-tuple up in the global hash table
 * 2. creates a new conntrack if none matches
 * 3. updates the connection state after matching/creating
 * 4. stores the conntrack pointer and state in skb->_nfct */
ret = resolve_normal_ct(tmpl, skb, dataoff,
protonum, state);
if (ret < 0) {
/* Too stressed to deal. */
NF_CT_STAT_INC_ATOMIC(state->net, drop);
ret = NF_DROP;
goto out;
}
/* The skb's conntrack has been resolved by now; re-read the pointer and state */
ct = nf_ct_get(skb, &ctinfo);
if (!ct) {
/* Not valid part of a connection */
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
ret = NF_ACCEPT;
goto out;
}
/* Layer-4 protocol tracking, e.g. TCP connection state transitions */
ret = nf_conntrack_handle_packet(ct, skb, dataoff, ctinfo, state);
if (ret <= 0) {
/* Invalid: inverse of the return code tells
 * the netfilter core what to do */
pr_debug("nf_conntrack_in: Can't track with proto module\n");
nf_conntrack_put(&ct->ct_general);
skb->_nfct = 0;
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
if (ret == -NF_DROP)
NF_CT_STAT_INC_ATOMIC(state->net, drop);
/* Special case: TCP tracker reports an attempt to reopen a
 * closed/aborted connection. We have to go back and create a
 * fresh conntrack.
 */
if (ret == -NF_REPEAT)
goto repeat;
ret = -ret;
goto out;
}
/* First packet seen in the reply direction: set SEEN_REPLY once and cache an IPCT_REPLY event */
if (ctinfo == IP_CT_ESTABLISHED_REPLY &&
!test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
nf_conntrack_event_cache(IPCT_REPLY, ct);
out:
/* Drop the template conntrack obtained at the top, if any */
if (tmpl)
nf_ct_put(tmpl);
return ret;
}
3.2.2 init_conntrack 是 conntrack 新建连接的函数, 源码分析:
/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress. Otherwise it really is unclassifiable. */
/*
 * @net:     network namespace the conntrack belongs to
 * @tmpl:    optional template conntrack (may be NULL); source of zone,
 *           timeout and event-cache settings
 * @tuple:   original-direction tuple of the packet
 * @skb:     packet that triggers the creation
 * @dataoff: layer-4 header offset (not used directly in this body)
 * @hash:    value forwarded to __nf_conntrack_alloc()
 *
 * Returns the ORIGINAL-direction tuplehash of the new conntrack, NULL if the
 * tuple cannot be inverted, or an ERR_PTR on allocation failure.
 */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_tuple *tuple,
struct sk_buff *skb,
unsigned int dataoff, u32 hash)
{
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
struct nf_conntrack_ecache *ecache;
struct nf_conntrack_expect *exp = NULL;
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
struct nf_conntrack_zone tmp;
/* Invert the packet's tuple to obtain the tuple expected for reply packets */
if (!nf_ct_invert_tuple(&repl_tuple, tuple)) {pr_debug("Can't invert tuple.\n");
return NULL;
}
/* Zone taken from the template conntrack (if any) */
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Allocate the conntrack from namespace, zone, original tuple and reply tuple */
ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
hash);
if (IS_ERR(ct))
return (struct nf_conntrack_tuple_hash *)ct;
/* synproxy related */
if (!nf_ct_add_synproxy(ct, tmpl)) {nf_conntrack_free(ct);
return ERR_PTR(-ENOMEM);
}
/* Copy the timeout policy from the template, then add the accounting,
 * timestamp, label and event-cache extensions */
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
if (timeout_ext)
nf_ct_timeout_ext_add(ct, rcu_dereference(timeout_ext->timeout),
GFP_ATOMIC);
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
ecache ? ecache->expmask : 0,
GFP_ATOMIC);
/* Expected child connections; only a few protocols have them (e.g. FTP) */
local_bh_disable();
if (net->ct.expect_count) {spin_lock(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
pr_debug("expectation arrives ct=%p exp=%p\n",
ct, exp);
/* Welcome, Mr. Bond. We've been expecting you... */
__set_bit(IPS_EXPECTED_BIT, &ct->status);
/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
ct->master = exp->master;
if (exp->helper) {help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
if (help)
rcu_assign_pointer(help->helper, exp->helper);
}
#ifdef CONFIG_NF_CONNTRACK_MARK
ct->mark = exp->master->mark;
#endif
#ifdef CONFIG_NF_CONNTRACK_SECMARK
ct->secmark = exp->master->secmark;
#endif
NF_CT_STAT_INC(net, expect_new);
}
spin_unlock(&nf_conntrack_expect_lock);
}
/* No expectation matched: try to assign a helper based on the template */
if (!exp)
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
/* Now it is inserted into the unconfirmed list, bump refcount */
/* Take a reference, then put the conntrack on the unconfirmed list
 * (a per-CPU list, according to the original note) */
nf_conntrack_get(&ct->ct_general);
nf_ct_add_to_unconfirmed_list(ct);
local_bh_enable();
/* Run the expectation's callback (if set) outside the BH-disabled section, then release it */
if (exp) {if (exp->expectfn)
exp->expectfn(ct, exp);
nf_ct_expect_put(exp);
}
return &ct->tuplehash[IP_CT_DIR_ORIGINAL];
}
3.2.3 CONNTRACK 的连接记录有两个五元组
- 第一个是初始方向的五元组
- 第二个是期望回包的五元组
这两个五元组在 nf_conntrack_confirm 中会被插入到同一个全局哈希表中, nf_conntrack_in 中通过查找全局哈希表来确定数据包所属的连接
nf_conntrack_in 新建的连接的两个五元组不会立刻添加到全局哈希表中, 而是先将初始方向五元组插入未确认链表.
nf_conntrack_in 新建的连接经过 nf_conntrack_confirm 之后, 它的两个五元组才会被插入全局哈希表中
这种先建立后确认机制的原因是: 数据包可能在经过 Netfilter 的途中就被内核丢弃 (比如被 filter 表丢弃).
连接跟踪在三层协议栈入口位置 PRE_ROUTING 和 LOCAL_OUT 注册了调用 nf_conntrack_in 的钩子函数, 确保所有数据包的连接能够被记录
连接跟踪在三层协议栈出口位置 POST_ROUTING 和 LOCAL_IN 注册了调用 nf_conntrack_confirm 的钩子函数, 确保新建的连接能够被确认
3.3 nf_conntrack_confirm
3.3.1 nf_conntrack_confirm 源码分析:
/*
 * Confirm a connection: returns NF_DROP if packet must be dropped.
 *
 * @skb: packet whose attached conntrack (skb_nfct()) should be confirmed.
 *
 * A packet with no conntrack attached is accepted untouched.  Otherwise an
 * unconfirmed conntrack is handed to __nf_conntrack_confirm(), and cached
 * events are delivered whenever the resulting verdict is NF_ACCEPT.
 */
static inline int nf_conntrack_confirm(struct sk_buff *skb)
{
	struct nf_conn *conn = (struct nf_conn *)skb_nfct(skb);
	int verdict;

	/* Nothing attached to this skb: nothing to confirm. */
	if (!conn)
		return NF_ACCEPT;

	/* Only conntracks that are not yet confirmed need the slow path. */
	verdict = nf_ct_is_confirmed(conn) ? NF_ACCEPT
					   : __nf_conntrack_confirm(skb);

	if (likely(verdict == NF_ACCEPT))
		nf_ct_deliver_cached_events(conn);

	return verdict;
}
/* Confirm a connection given skb; places it in hash table */
/*
 * @skb: packet whose conntrack (read via nf_ct_get()) is to be confirmed.
 *
 * Returns NF_ACCEPT when the packet is not in the original direction or the
 * conntrack was inserted into the hash table; NF_DROP when the conntrack is
 * already confirmed or dying; otherwise the result of nf_ct_resolve_clash()
 * when an equal tuple is already present in the table.
 */
int
__nf_conntrack_confirm(struct sk_buff *skb)
{
const struct nf_conntrack_zone *zone;
unsigned int hash, reply_hash;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conn_tstamp *tstamp;
struct hlist_nulls_node *n;
enum ip_conntrack_info ctinfo;
struct net *net;
unsigned int sequence;
int ret = NF_DROP;
/* Read the conntrack pointer and state from the skb */
ct = nf_ct_get(skb, &ctinfo);
net = nf_ct_net(ct);
/* ipt_REJECT uses nf_conntrack_attach to attach related
   ICMP/TCP RST packets in other direction. Actual packet
   which created connection will be IP_CT_NEW or for an
   expected connection, IP_CT_RELATED. */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return NF_ACCEPT;
/* Zone of the conntrack */
zone = nf_ct_zone(ct);
local_bh_disable();
/* Compute the hashes of the original and reply tuples, retrying until
 * nf_conntrack_double_lock() succeeds for the sampled generation */
do {sequence = read_seqcount_begin(&nf_conntrack_generation);
/* reuse the hash saved before */
hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
hash = scale_hash(hash);
reply_hash = hash_conntrack(net,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
/* We're not in hash table, and we refuse to set up related
 * connections for unconfirmed conns. But packet copies and
 * REJECT will give spurious warnings here.
 */
/* Another skb with the same unconfirmed conntrack may
 * win the race. This may happen for bridge(br_flood)
 * or broadcast/multicast packets do skb_clone with
 * unconfirmed conntrack.
 */
if (unlikely(nf_ct_is_confirmed(ct))) {WARN_ON_ONCE(1);
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
return NF_DROP;
}
pr_debug("Confirming conntrack %p\n", ct);
/* We have to check the DYING flag after unlink to prevent
 * a race against nf_ct_get_next_corpse() possibly called from
 * user context, else we insert an already 'dead' hash, blocking
 * further use of that particular connection -JM.
 */
nf_ct_del_from_dying_or_unconfirmed_list(ct);
if (unlikely(nf_ct_is_dying(ct))) {nf_ct_add_to_dying_list(ct);
goto dying;
}
/* See if there's one in the list already, including reverse:
   NAT could have grabbed it without realizing, since we're
   not in the hash. If there is, we lost race. */
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode)
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
zone, net))
goto out;
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode)
if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
zone, net))
goto out;
/* Timer relative to confirmation time, not original
   setting time, otherwise we'd get timer wrap in
   weird delay cases. */
ct->timeout += nfct_time_stamp;
atomic_inc(&ct->ct_general.use);
/* Mark the connection as confirmed */
ct->status |= IPS_CONFIRMED;
/* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct);
if (tstamp)
tstamp->start = ktime_get_real_ns();
/* Since the lookup is lockless, hash insertion must be done after
 * starting the timer and setting the CONFIRMED bit. The RCU barriers
 * guarantee that no other CPU can find the conntrack before the above
 * stores are visible.
 */
/* Insert both the original tuple and the reply tuple into the global hash table */
__nf_conntrack_hash_insert(ct, hash, reply_hash);
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
/* Cache events: IPCT_HELPER if a helper is attached, then RELATED or NEW
 * depending on whether the conntrack has a master */
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
nf_conntrack_event_cache(master_ct(ct) ?
IPCT_RELATED : IPCT_NEW, ct);
return NF_ACCEPT;
out:
/* An equal tuple is already hashed: park ct on the dying list and let
 * nf_ct_resolve_clash() decide the verdict */
nf_ct_add_to_dying_list(ct);
ret = nf_ct_resolve_clash(net, skb, ctinfo, h);
dying:
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert_failed);
local_bh_enable();
return ret;
}
4.IPTABLES NAT
IPTABLES 的 NAT 依赖于连接跟踪, 对于没有连接跟踪的数据包不做 NAT 处理
4.1 NAT 注册的钩子
/* IPv4 hooks registered by NAT.  Notes on which function each hook calls
 * first come from the original annotations; those hook bodies are not shown. */
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {/* Layer-3 stack entry point, before packet filtering: rewrite the destination address (DNAT or de-SNAT) */
{
.hook = nf_nat_ipv4_in, /* calls nf_nat_ipv4_fn first */
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
/* Layer-3 stack exit point, after packet filtering: rewrite the source address (SNAT or de-DNAT) */
{
.hook = nf_nat_ipv4_out, /* calls nf_nat_ipv4_fn first */
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
/* Layer-3 stack entry point, before packet filtering: rewrite the destination address (DNAT or de-SNAT) */
{
.hook = nf_nat_ipv4_local_fn, /* calls nf_nat_ipv4_fn first */
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
/* Layer-3 stack exit point, after packet filtering: rewrite the source address (SNAT or de-DNAT) */
{
.hook = nf_nat_ipv4_fn, /* nf_nat_ipv4_fn itself */
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
},
};
4.2 nf_nat_ipv4_fn
NAT 注册的钩子函数都会先调用 nf_nat_ipv4_fn
函数 nf_nat_ipv4_fn 中会先获取数据包的 conntrack 连接指针和连接状态, 没有 conntrack 连接的数据包不会进行 NAT
4.2.1 nf_nat_ipv4_fn 源码分析:
/*
 * nf_nat_ipv4_fn - common IPv4 NAT hook body.
 * @priv:  opaque hook private data, forwarded to nf_nat_inet_fn()
 * @skb:   packet being processed
 * @state: hook state; state->hook selects the hook point
 *
 * Packets without a conntrack are accepted without any NAT.  RELATED ICMP
 * packets are translated by nf_nat_icmp_reply_translation() (NF_ACCEPT on
 * success, NF_DROP on failure).  Everything else is handed to
 * nf_nat_inet_fn().
 */
static unsigned int
nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
	       const struct nf_hook_state *state)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *conn = nf_ct_get(skb, &ctinfo);

	/* No conntrack attached to the skb: skip NAT entirely. */
	if (!conn)
		return NF_ACCEPT;

	/* ICMP packets related to an existing connection get their own path. */
	if ((ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) &&
	    ip_hdr(skb)->protocol == IPPROTO_ICMP)
		return nf_nat_icmp_reply_translation(skb, conn, ctinfo,
						     state->hook)
			? NF_ACCEPT : NF_DROP;

	/* Generic NAT processing. */
	return nf_nat_inet_fn(priv, skb, state);
}
/*
 * nf_nat_inet_fn - core NAT hook: set up the NAT binding for new connections
 * if needed, then translate the packet.
 * @priv:  hook private data; used as struct nf_nat_lookup_hook_priv * to
 *         reach the NAT lookup hook entries
 * @skb:   packet being processed
 * @state: hook state; state->hook selects the manipulation type
 *
 * Returns NF_ACCEPT for packets without a conntrack, the first non-ACCEPT
 * verdict of a lookup hook or of nf_nat_alloc_null_binding(), NF_DROP when
 * nf_nat_oif_changed() reports a change, or else the result of
 * nf_nat_packet().
 */
unsigned int
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
struct nf_conn_nat *nat;
/* maniptype == SRC for postrouting. */
enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook);
/* Read the skb's conntrack pointer and state once more */
ct = nf_ct_get(skb, &ctinfo);
/* Can't track? It's not due to stress, or conntrack would
 * have dropped it. Hence it's the user's responsibilty to
 * packet filter it out, or implement conntrack/NAT for that
 * protocol. 8) --RR
 */
if (!ct)
return NF_ACCEPT;
/* Fetch the conntrack's NAT data (struct nf_conn_nat); only passed on to
 * nf_nat_oif_changed() below */
nat = nfct_nat(ct);
/* Handle the packet according to its connection state */
switch (ctinfo) {
case IP_CT_RELATED:
case IP_CT_RELATED_REPLY:
/* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */
case IP_CT_NEW:
/* Seen it before? This can happen for loopback, retrans,
 * or local packets.
 */
if (!nf_nat_initialized(ct, maniptype)) {
struct nf_nat_lookup_hook_priv *lpriv = priv;
/* Hook entry list kept by the NAT lookup hook itself */
struct nf_hook_entries *e = rcu_dereference(lpriv->entries);
unsigned int ret;
int i;
if (!e)
goto null_bind;
/* Run every hook stored in the entry list (per the original note, the
 * nat-table hook walks its rules in order); stop at the first
 * non-ACCEPT verdict or as soon as a binding has been set up */
for (i = 0; i < e->num_hook_entries; i++) {ret = e->hooks[i].hook(e->hooks[i].priv, skb,
state);
if (ret != NF_ACCEPT)
return ret;
if (nf_nat_initialized(ct, maniptype))
goto do_nat;
}
null_bind:
/* No hook set up a binding: allocate a null binding for this hook */
ret = nf_nat_alloc_null_binding(ct, state->hook);
if (ret != NF_ACCEPT)
return ret;
} else {pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n",
maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST",
ct, ct->status);
if (nf_nat_oif_changed(state->hook, ctinfo, nat,
state->out))
goto oif_changed;
}
break;
default:
/* ESTABLISHED */
WARN_ON(ctinfo != IP_CT_ESTABLISHED &&
ctinfo != IP_CT_ESTABLISHED_REPLY);
if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out))
goto oif_changed;
}
do_nat:
/* Apply NAT to the packet according to the conntrack record */
return nf_nat_packet(ct, ctinfo, state->hook, skb);
oif_changed:
/* nf_nat_oif_changed() reported a change: kill the conntrack and drop the packet */
nf_ct_kill_acct(ct, ctinfo, skb);
return NF_DROP;
}
/* Do packet manipulations according to nf_nat_setup_info. */
unsigned int nf_nat_packet(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
unsigned int hooknum,
struct sk_buff *skb)
{enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
unsigned int verdict = NF_ACCEPT;
unsigned long statusbit;
if (mtype == NF_NAT_MANIP_SRC)
statusbit = IPS_SRC_NAT;1
else
statusbit = IPS_DST_NAT;10
/* 回包异或取反 */
/* Invert if this is reply dir. */
if (dir == IP_CT_DIR_REPLY)
statusbit ^= IPS_NAT_MASK;11
/* Non-atomic: these bits don't change. */
if (ct->status & statusbit)
/* NAT 批改数据包 */
verdict = nf_nat_manip_pkt(skb, ct, mtype, dir);
return verdict;
}
unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct,
enum nf_nat_manip_type mtype,
enum ip_conntrack_dir dir)
{
struct nf_conntrack_tuple target;
/* We are aiming to look like inverse of other direction. */
/* 原始包依据回复五元组 NAT,回包依据原始五元组 de-NAT */
nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);
switch (target.src.l3num) {
case NFPROTO_IPV6:
if (nf_nat_ipv6_manip_pkt(skb, 0, &target, mtype))
return NF_ACCEPT;
break;
case NFPROTO_IPV4:
if (nf_nat_ipv4_manip_pkt(skb, 0, &target, mtype))
return NF_ACCEPT;
break;
default:
WARN_ON_ONCE(1);
break;
}
return NF_DROP;
}
/*
 * nf_nat_ipv4_manip_pkt - apply a NAT rewrite to an IPv4 packet.
 * @skb:       packet to modify
 * @iphdroff:  offset of the IPv4 header from skb->data
 * @target:    tuple supplying the new address (and L4 data for the helper)
 * @maniptype: NF_NAT_MANIP_SRC rewrites saddr, anything else rewrites daddr
 *
 * Returns false if the header cannot be made writable or the layer-4
 * rewrite fails; true once the address and IP header checksum are updated.
 */
static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
				  unsigned int iphdroff,
				  const struct nf_conntrack_tuple *target,
				  enum nf_nat_manip_type maniptype)
{
	struct iphdr *ip4;
	unsigned int l4off;

	/* The IP header must be writable before anything is touched. */
	if (skb_ensure_writable(skb, iphdroff + sizeof(*ip4)))
		return false;

	/* Locate the IP header and, from its length field, the L4 header. */
	ip4 = (void *)skb->data + iphdroff;
	l4off = iphdroff + ip4->ihl * 4;

	/* Layer-4 (port) rewrite first. */
	if (!l4proto_manip_pkt(skb, iphdroff, l4off, target, maniptype))
		return false;

	/*
	 * Re-derive the header pointer after the L4 helper ran
	 * (presumably because it may relocate skb->data -- TODO confirm).
	 */
	ip4 = (void *)skb->data + iphdroff;

	/* Address rewrite: fix the checksum first, then store the new address. */
	if (maniptype == NF_NAT_MANIP_SRC) {
		csum_replace4(&ip4->check, ip4->saddr, target->src.u3.ip);
		ip4->saddr = target->src.u3.ip;
		return true;
	}

	csum_replace4(&ip4->check, ip4->daddr, target->dst.u3.ip);
	ip4->daddr = target->dst.u3.ip;
	return true;
}