连接跟踪之ZONE

39次阅读

共计 9657 个字符,预计需要花费 25 分钟才能阅读完成。

简介

目前一个连接跟踪的五元组为源目的 IP,传输层协议,源目的端口。多租户环境下,租户的私有地址网络可能存在重叠,如果只用这五个元素来区分一个 CT 的话,无法满足多租户的需求。所以引入 zone 的概念,zone 是一个 16bit 的整型数,不同用户使用不同的 id,从而保证租户之间的隔离。

实现

连接跟踪控制块中的 zone 成员:

/* Connection tracking control block (excerpt; further members are elided). */
struct nf_conn {
    /* Usage count in here is 1 for hash table, 1 per skb,
     * plus 1 for any connection(s) we are `master' for
     *
     * Hint, SKB address this struct and refcnt via skb->_nfct and
     * helpers nf_conntrack_get() and nf_conntrack_put().
     * Helper nf_ct_put() equals nf_conntrack_put() by dec refcnt,
     * beware nf_ct_get() is different and don't inc refcnt.
     */
    struct nf_conntrack ct_general;

    spinlock_t    lock;
    u16        cpu;
    /* Conntrack zone of this entry; only present when zone support is built in. */
#ifdef CONFIG_NF_CONNTRACK_ZONES
    struct nf_conntrack_zone zone;
#endif
    ...
};

zone 定义

/* Describes the conntrack zone a connection belongs to. */
struct nf_conntrack_zone {
    u16    id;    /* zone id */
    /*
     * Flags. The only flag described here is NF_CT_FLAG_MARK: when set,
     * skb->mark is used as the zone id, otherwise the id member is used.
     */
    u8    flags;
    /*
     * Direction(s) the zone applies to. The default is both directions
     * (see NF_CT_DEFAULT_ZONE_DIR), i.e. packets use the same zone id no
     * matter whether they travel in the original or the reply direction.
     */
    u8    dir;
};
#define NF_CT_DEFAULT_ZONE_DIR    (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)
/*
 * For details see this function: select the zone to use for a packet.
 *
 * @tmpl: conntrack template attached to the packet, may be NULL
 * @skb:  the packet; its mark is read when the template has NF_CT_FLAG_MARK
 * @tmp:  caller-provided scratch zone, filled in for the mark-based case
 *
 * Returns the default zone when there is no template, a zone built in @tmp
 * from skb->mark when the template requests it, and otherwise the zone
 * obtained through nf_ct_zone(tmpl).
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb,
        struct nf_conntrack_zone *tmp)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
    if (!tmpl)
        return &nf_ct_zone_dflt;
    /* Zone id comes from the packet mark; direction from the template. */
    if (tmpl->zone.flags & NF_CT_FLAG_MARK)
        return nf_ct_zone_init(tmp, skb->mark, tmpl->zone.dir, 0);
#endif
    return nf_ct_zone(tmpl);
}
/*
 * Return the zone recorded in @ct. When zone support is not compiled in,
 * the built-in default zone is returned instead.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone(const struct nf_conn *ct)
{
#ifndef CONFIG_NF_CONNTRACK_ZONES
    return &nf_ct_zone_dflt;
#else
    return &ct->zone;
#endif
}
/*
 * Initialise a conntrack zone.
 *
 * Stores @id, @dir and @flags into @zone and returns @zone so the call can
 * be used directly as an expression.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags)
{
    zone->id = id;
    zone->flags = flags;
    zone->dir = dir;

    return zone;
}

默认的连接跟踪 zone 定义

/* Built-in default zone used e.g. by modules. */
/* Its id is NF_CT_DEFAULT_ZONE_ID and it covers both directions; flags is left unset. */
const struct nf_conntrack_zone nf_ct_zone_dflt = {
    .id    = NF_CT_DEFAULT_ZONE_ID,
    .dir    = NF_CT_DEFAULT_ZONE_DIR,
};
EXPORT_SYMBOL_GPL(nf_ct_zone_dflt);
/* Zone id of the default zone. */
#define NF_CT_DEFAULT_ZONE_ID    0

/* Per-direction bits for the zone 'dir' member, indexed by conntrack direction. */
#define NF_CT_ZONE_DIR_ORIG    (1 << IP_CT_DIR_ORIGINAL)
#define NF_CT_ZONE_DIR_REPL    (1 << IP_CT_DIR_REPLY)

/* Default direction mask: original and reply direction together. */
#define NF_CT_DEFAULT_ZONE_DIR    (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL)

zone 的常见操作函数

/*
 * Return the zone recorded in @ct. When zone support is not compiled in,
 * the built-in default zone is returned instead.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone(const struct nf_conn *ct)
{
#ifndef CONFIG_NF_CONNTRACK_ZONES
    return &nf_ct_zone_dflt;
#else
    return &ct->zone;
#endif
}
/*
 * Fill in a conntrack zone.
 *
 * Stores @id, @dir and @flags into @zone and returns @zone so the call can
 * be used directly as an expression.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags)
{
    zone->id = id;
    zone->flags = flags;
    zone->dir = dir;

    return zone;
}

/*
 * Select the zone to use for a packet that may carry a conntrack template.
 *
 * With zone support: no template yields the default zone; a template with
 * NF_CT_FLAG_MARK yields a zone built in @tmp from skb->mark and the
 * template's direction; any other template yields its own zone.
 * Without zone support the result is whatever nf_ct_zone() returns.
 */
static inline const struct nf_conntrack_zone *
nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb,
        struct nf_conntrack_zone *tmp)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
    if (tmpl) {
        if (!(tmpl->zone.flags & NF_CT_FLAG_MARK))
            return nf_ct_zone(tmpl);

        /* zone id taken from the packet mark */
        return nf_ct_zone_init(tmp, skb->mark, tmpl->zone.dir, 0);
    }

    return &nf_ct_zone_dflt;
#else
    return nf_ct_zone(tmpl);
#endif
}
/*
 * Copy *zone into the conntrack entry @ct.
 * Does nothing when zone support is not compiled in.
 */
static inline void nf_ct_zone_add(struct nf_conn *ct,
                  const struct nf_conntrack_zone *zone)
{
#if defined(CONFIG_NF_CONNTRACK_ZONES)
    ct->zone = *zone;
#endif
}

/* Tell whether the direction bit for @dir is set in the zone's dir mask. */
static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone,
                      enum ip_conntrack_dir dir)
{
    return (zone->dir & (1 << dir)) != 0;
}
/*
 * Get the zone id that applies to one direction: the zone's own id if the
 * zone covers @dir, otherwise NF_CT_DEFAULT_ZONE_ID. Without zone support
 * the result is always NF_CT_DEFAULT_ZONE_ID.
 */
static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone,
                enum ip_conntrack_dir dir)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
    if (nf_ct_zone_matches_dir(zone, dir))
        return zone->id;
#endif
    return NF_CT_DEFAULT_ZONE_ID;
}
/*
 * Check whether conntrack @a and zone @b resolve to the same zone id in
 * direction @dir. Always true when zone support is not compiled in.
 */
static inline bool nf_ct_zone_equal(const struct nf_conn *a,
                    const struct nf_conntrack_zone *b,
                    enum ip_conntrack_dir dir)
{
#ifdef CONFIG_NF_CONNTRACK_ZONES
    const u16 id_a = nf_ct_zone_id(nf_ct_zone(a), dir);
    const u16 id_b = nf_ct_zone_id(b, dir);

    return id_a == id_b;
#else
    return true;
#endif
}
/*
 * Compare the raw zone id of conntrack @a with that of zone @b, without
 * regard to direction. Always true when zone support is not compiled in.
 */
static inline bool nf_ct_zone_equal_any(const struct nf_conn *a,
                    const struct nf_conntrack_zone *b)
{
#ifndef CONFIG_NF_CONNTRACK_ZONES
    return true;
#else
    const struct nf_conntrack_zone *zone_a = nf_ct_zone(a);

    return zone_a->id == b->id;
#endif
}

ZONE 的使用

通过将设备映射到不同的 zone,实现租户流量到 zone 的映射,也可以使用 iptables 的 mark 功能设置流量的 zone。linux 使用 CT target 来设置流量的 zone。CT 命令会在内核创建一个连接跟踪模板,命中该规则的 flow 将会设置模板 CT,首包在创建 CT 的时候会以模板为参考进行初始化,从而将我们在 CT target 设置的参数传递给连接跟踪。

   CT
       The  CT  target  sets parameters for a packet or its associated connection. The target attaches a "template" connection tracking entry to the packet, which is then used by the conntrack core
       when initializing a new ct entry. This target is thus only valid in the "raw" table.

       --notrack
              Disables connection tracking for this packet.

       --helper name
              Use the helper identified by name for the connection. This is more flexible than loading the conntrack helper modules with preset ports.

       --ctevents event[,...]
              Only generate the specified conntrack events for this connection. Possible event types are: new, related, destroy, reply, assured, protoinfo, helper, mark (this refers to the  ctmark,
              not nfmark), natseqinfo, secmark (ctsecmark).

       --expevents event[,...]
              Only generate the specified expectation events for this connection.  Possible event types are: new.

       --zone-orig {id|mark}
              For  traffic  coming from ORIGINAL direction, assign this packet to zone id and only have lookups done in that zone. If mark is used instead of id, the zone is derived from the packet
              nfmark.

       --zone-reply {id|mark}
              For traffic coming from REPLY direction, assign this packet to zone id and only have lookups done in that zone. If mark is used instead of id, the zone  is  derived  from  the  packet
              nfmark.

       --zone {id|mark}
              Assign  this  packet  to  zone id and only have lookups done in that zone.  If mark is used instead of id, the zone is derived from the packet nfmark. By default, packets have zone 0.
              This option applies to both directions.

       --timeout name
              Use the timeout policy identified by name for the connection. This provides a more flexible timeout policy definition than the global timeout values available at
              /proc/sys/net/netfilter/nf_conntrack_*_timeout_*.

案例

sudo iptables -t raw -A PREROUTING -i ens39 -j CT --zone 2
#该命令将 ens39 网口收到的报文映射到 zone 2 中,实现不同接口收到的流量连接跟踪隔离。

CT target 实现分析

/* Rule data of the CT target (v1 layout). */
struct xt_ct_target_info_v1 {
    __u16 flags;// flag bits, see the XT_CT_* enum
    __u16 zone;// zone id
    __u32 ct_events;// presumably the event mask from --ctevents; TODO confirm
    __u32 exp_events;// presumably the event mask from --expevents; TODO confirm
    char helper[16];// presumably the helper name from --helper; TODO confirm
    char timeout[32];// user-defined timeout (presumably the policy name from --timeout)

    /* Used internally by the kernel */
    /* conntrack template */
    struct nf_conn    *ct __attribute__((aligned(8)));
};

/* Flag bits stored in xt_ct_target_info_v1.flags. */
enum {
    XT_CT_NOTRACK        = 1 << 0,// --notrack was given
    XT_CT_NOTRACK_ALIAS    = 1 << 1,// --notrack was given (NOTE(review): how this differs from XT_CT_NOTRACK is not visible here)
    XT_CT_ZONE_DIR_ORIG    = 1 << 2,// zone applies to the original (request) direction
    XT_CT_ZONE_DIR_REPL    = 1 << 3,// zone applies to the reply direction; both direction flags may be set
    XT_CT_ZONE_MARK        = 1 << 4,// zone is taken from the nfmark

    /* All flag bits accepted by the flags check. */
    XT_CT_MASK        = XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS |
                  XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL |
                  XT_CT_ZONE_MARK,
};

构建模板

static int xt_ct_tg_check_v2(const struct xt_tgchk_param *par)
{
    struct xt_ct_target_info_v1 *info = par->targinfo;

    if (info->flags & ~XT_CT_MASK)// 一个选项也没设置,直接退出。return -EINVAL;

    return xt_ct_tg_check(par, par->targinfo);
}

/*
 * Build the conntrack template for a CT target rule (excerpt; the rest of
 * the body and the out/err1/err2 labels are elided).
 *
 * With XT_CT_NOTRACK no template is used (ct = NULL). Otherwise a zone is
 * assembled from info->zone and the direction/mark flags, and a template
 * conntrack is allocated for it.
 */
static int xt_ct_tg_check(const struct xt_tgchk_param *par,
              struct xt_ct_target_info_v1 *info)
{
    struct nf_conntrack_zone zone;
    struct nf_conn_help *help;
    struct nf_conn *ct;
    int ret = -EOPNOTSUPP;

    if (info->flags & XT_CT_NOTRACK) {
        ct = NULL;
        goto out;
    }

#ifndef CONFIG_NF_CONNTRACK_ZONES
    /* Any zone request without zone support bails out with ret still -EOPNOTSUPP. */
    if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG |
                     XT_CT_ZONE_DIR_REPL |
                     XT_CT_ZONE_MARK))
        goto err1;
#endif

    ret = nf_ct_netns_get(par->net, par->family);
    if (ret < 0)
        goto err1;

    /* Describe the zone requested by the rule. */
    memset(&zone, 0, sizeof(zone));
    zone.id = info->zone;
    zone.dir = xt_ct_flags_to_dir(info);
    if (info->flags & XT_CT_ZONE_MARK)
        zone.flags |= NF_CT_FLAG_MARK;
    // allocate the ct template
    ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL);
    if (!ct) {
        ret = -ENOMEM;
        goto err2;
    }
    ...
        
    return ret;
}

执行 target

/*
 * Target entry point: look up the rule data and apply the rule's
 * conntrack template to the packet via xt_ct_target().
 */
static unsigned int xt_ct_target_v1(struct sk_buff *skb,
                    const struct xt_action_param *par)
{
    const struct xt_ct_target_info_v1 *rule = par->targinfo;

    /* rule->ct is the conntrack template stored in the rule data. */
    return xt_ct_target(skb, rule->ct);
}
static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct)
{/* Previously seen (loopback)? Ignore. */
    if (skb->_nfct != 0)
        return XT_CONTINUE;

    if (ct) {// 设置报文的 CT 模板。atomic_inc(&ct->ct_general.use);
        nf_ct_set(skb, ct, IP_CT_NEW);
    } else {nf_ct_set(skb, ct, IP_CT_UNTRACKED);
    }

    return XT_CONTINUE;
}

连接跟踪对模板的处理

/*
 * Conntrack entry point for a packet (excerpt; the main part of the body is
 * elided). The part shown handles packets that already carry conntrack
 * state: real entries and untracked packets are accepted immediately, while
 * a template is detached from the skb so that processing continues.
 */
unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
        struct sk_buff *skb)
{
    const struct nf_conntrack_l3proto *l3proto;
    const struct nf_conntrack_l4proto *l4proto;
    struct nf_conn *ct, *tmpl;
    enum ip_conntrack_info ctinfo;
    unsigned int *timeouts;
    unsigned int dataoff;
    u_int8_t protonum;
    int ret;

    tmpl = nf_ct_get(skb, &ctinfo); /* fetch the template */
    if (tmpl || ctinfo == IP_CT_UNTRACKED) {
        /* Previously seen (loopback or untracked)?  Ignore. */
        /*
         * A packet pinged to one of the host's own addresses re-enters
         * the stack at PREROUTING through lo after it has already been
         * tracked on the output path, so it is accepted right here.
         * Only such loopback packets carry a conntrack that is not a
         * template. A packet marked IP_CT_UNTRACKED by the CT target
         * returns immediately as well. When a zone has been configured
         * there is a tmpl and nf_ct_is_template() is true for it.
         */
        if ((tmpl && !nf_ct_is_template(tmpl)) ||
            ctinfo == IP_CT_UNTRACKED) {
            NF_CT_STAT_INC_ATOMIC(net, ignore);
            return NF_ACCEPT;
        }
        skb->_nfct = 0;
    }
    ...

    return ret;
}

/* On success, returns 0, sets skb->_nfct | ctinfo */
/*
 * Excerpt; the tail of the body is elided. The part shown extracts the
 * tuple from the packet, looks the connection up in the zone selected
 * through the template, and falls back to init_conntrack() when no entry
 * exists.
 */
static int
resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
          struct sk_buff *skb,
          unsigned int dataoff,
          u_int16_t l3num,
          u_int8_t protonum,
          const struct nf_conntrack_l3proto *l3proto,
          const struct nf_conntrack_l4proto *l4proto)
{
    const struct nf_conntrack_zone *zone;
    struct nf_conntrack_tuple tuple;
    struct nf_conntrack_tuple_hash *h;
    enum ip_conntrack_info ctinfo;
    struct nf_conntrack_zone tmp;
    struct nf_conn *ct;
    u32 hash;

    if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
                 dataoff, l3num, protonum, net, &tuple, l3proto,
                 l4proto)) {pr_debug("Can't get tuple\n");
        return 0;
    }

    /* look for tuple match; the lookup uses the template's zone */
    zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
    hash = hash_conntrack_raw(&tuple, net);
    h = __nf_conntrack_find_get(net, zone, &tuple, hash);
    if (!h) {
        // no existing entry: set up a new conntrack via init_conntrack()
        h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
                   skb, dataoff, hash);
        if (!h)
            return 0;
        if (IS_ERR(h))
            return PTR_ERR(h);
    }
    ...
    return 0;
}



/* Allocate a new conntrack: we return -ENOMEM if classification
   failed due to stress.  Otherwise it really is unclassifiable. */
/*
 * Excerpt; the rest of the body is elided. The part shown inverts the
 * tuple to obtain the reply tuple, selects the zone through the template
 * and allocates the new conntrack in that zone.
 */
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
           const struct nf_conntrack_tuple *tuple,
           const struct nf_conntrack_l3proto *l3proto,
           const struct nf_conntrack_l4proto *l4proto,
           struct sk_buff *skb,
           unsigned int dataoff, u32 hash)
{
    struct nf_conn *ct;
    struct nf_conn_help *help;
    struct nf_conntrack_tuple repl_tuple;
    struct nf_conntrack_ecache *ecache;
    struct nf_conntrack_expect *exp = NULL;
    const struct nf_conntrack_zone *zone;
    struct nf_conn_timeout *timeout_ext;
    struct nf_conntrack_zone tmp;
    unsigned int *timeouts;

    if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
        pr_debug("Can't invert tuple.\n");
        return NULL;
    }
    /* Initialising the ct also uses the template's zone. */
    zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
    /* allocate the conntrack entry */
    ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC,
                  hash);
    ...
}

正文完
 0