OVS分类器三Trie

25次阅读

共计 10796 个字符,预计需要花费 27 分钟才能阅读完成。

简介

​ ovs 分类器中的前缀树是一个过滤器,将规则的网络层的源目的 IP 构建前缀树。在查找时,先使用 Trie 进行匹配,如果不能命中的话,说明整个子表都不会命中,跳过该子表,加速匹配过程。其数据结构就是一个二叉树。

​ Trie 属于分类器 classifier,一个分类器 classifier 可以有多个 Trie,典型的是两个:IPV4 源目的 IP。

数据结构

typedef OVSRCU_TYPE(struct trie_node *) rcu_trie_ptr;

/* A longest-prefix match tree. */
/* 一个最长匹配的前缀树,前缀树节点      */
struct trie_node {
    uint32_t prefix;           /* Prefix bits for this node, MSB first. 前缀,msb 优先 */
    uint8_t  n_bits;           /* Never zero, except for the root node. 前缀长度,用于限制上面的 prefix */
    unsigned int n_rules;      /* Number of rules that have this prefix. 拥有这个前缀的规则数 */
    rcu_trie_ptr edges[2];     /* Both NULL if leaf. 叶子的话,两边的都为空,树的两个节点,左右节点,因为一个 bit 只有两个值,要么 0,要么 1,所以是二叉树 */
};
/* Prefix trie for a 'field' */
/* 为 field 域构建前缀树 */
struct cls_trie {
    const struct mf_field *field; /* Trie field, or NULL. 构成该树的报文域 */
    rcu_trie_ptr root;            /* NULL if none. 树根 */
};

函数

trie_init

该函数用于初始化一个 Trie

/* 进行分类器的前缀树的初始化:1 计算前缀长度,2 将已经存在的规则加入该前缀树 */
static void
trie_init(struct classifier *cls, int trie_idx, const struct mf_field *field)
{struct cls_trie *trie = &cls->tries[trie_idx];/* 从前缀树数组中获取指定的树描述控制块 */
    struct cls_subtable *subtable;/* 子表 */

    /* 先将以前的树进行删除 */
    if (trie_idx < cls->n_tries) {trie_destroy(&trie->root);
    } else {ovsrcu_set_hidden(&trie->root, NULL);
    }
    // 设置该前缀树对应的域
    trie->field = field;

    /* Add existing rules to the new trie. */
    /* 添加已经存在的规则到新树中 */
    CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) {/* 遍历每一个子表 */
        unsigned int plen;/* 前缀长度局部变量 */
        // 计算前缀树的前缀长度
        plen = field ? minimask_get_prefix_len(&subtable->mask, field) : 0;/* 获取匹配域的掩码的长度 */
        if (plen) {/* 存在前缀长度,那么将该子表下的规则插入该前缀树 */
            struct cls_match *head;

            CMAP_FOR_EACH (head, cmap_node, &subtable->rules) {/* 遍历每一个规则 */
                trie_insert(trie, head->cls_rule, plen);/* 将规则插入前缀树 */
            }
        }
        /* Initialize subtable's prefix length on this field.  This will
         * allow readers to use the trie. */
        atomic_thread_fence(memory_order_release);
        subtable->trie_plen[trie_idx] = plen; /* 设置子表的前缀树的前缀长度,子表的前缀长度是确定的 */
    }
}

trie_insert

插入一个规则到前缀树

/* 插入的是前缀掩码 */
/* edge 为当前节点 */
static void
trie_insert_prefix(rcu_trie_ptr *edge, const ovs_be32 *prefix, int mlen)/* 前缀与前缀长度 */
{
    struct trie_node *node;
    int ofs = 0;

    /* Walk the tree. */
    /* 遍历整棵树 */
    for (; (node = ovsrcu_get_protected(struct trie_node *, edge));/* 从根节点开始 */
         edge = trie_next_edge(node, prefix, ofs)) {/* 根据前缀 bit 的值为 1 还是 0 获取下一个节点 */
        unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);/*  */
        ofs += eqbits;/* 得到需要插入的前缀与本节点相同的 bit 个数 */
        if (eqbits < node->n_bits) {/* bit 相等的个数比该节点的前缀 bit 个数要小,说明两者不相等 */
            /* Mismatch, new node needs to be inserted above. 所以需要新建一个节点 */
            int old_branch = get_bit_at(node->prefix, eqbits);/* 从相等的位置开始一个新的分支 */
            struct trie_node *new_parent;
            /* 创建一个新的分支 */
            new_parent = trie_branch_create(prefix, ofs - eqbits, eqbits,
                                            ofs == mlen ? 1 : 0);
            /* Copy the node to modify it. */
            /* 修改一个节点 */
            node = trie_node_rcu_realloc(node);
            /* Adjust the new node for its new position in the tree. */
            node->prefix <<= eqbits; /* 去掉相同的几个 bit,因为这几个 bit 分给新的父节点 */
            node->n_bits -= eqbits;  /* 前缀个数减小,因为新的节点已经占用了一部分 */
            ovsrcu_set_hidden(&new_parent->edges[old_branch], node);/* 原来的节点作为这个新的节点的子节点 */

            /* Check if need a new branch for the new rule. */
            if (ofs < mlen) {ovsrcu_set_hidden(&new_parent->edges[!old_branch],
                                  trie_branch_create(prefix, ofs, mlen - ofs,
                                                     1));
            }
            ovsrcu_set(edge, new_parent); /* Publish changes. */
            return;
        }
        /* Full match so far. */

        if (ofs == mlen) {
            /* Full match at the current node, rule needs to be added here. */
            node->n_rules++;
            return;
        }
    }
    /* Must insert a new tree branch for the new rule. */
    ovsrcu_set(edge, trie_branch_create(prefix, ofs, mlen - ofs, 1));
}
/*
 * This is called only when mask prefix is known to be CIDR and non-zero.
 * Relies on the fact that the flow and mask have the same map, and since
 * the mask is CIDR, the storage for the flow field exists even if it
 * happened to be zeros.
 * 获取指定的前缀。*/
static const ovs_be32 *
minimatch_get_prefix(const struct minimatch *match, const struct mf_field *mf)
{
    size_t u64_ofs = mf->flow_be32ofs / 2;/* 4 字节单位偏移转换成 8 个字节单位偏移 */

    return (OVS_FORCE const ovs_be32 *)miniflow_get__(match->flow, u64_ofs)
        + (mf->flow_be32ofs & 1);
}
/* Insert rule in to the prefix tree.
 * 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
 * in 'rule'. 
 * 插入一个规则到前缀树,前缀掩码必须不为 0
 */
static void
trie_insert(struct cls_trie *trie/* 需要插入的树 */, const struct cls_rule *rule/* 需要插入的规则 */, int mlen/* 掩码长度 */)
{
    trie_insert_prefix(&trie->root,/* 根节点 */
                       minimatch_get_prefix(&rule->match, trie->field)/* 获取匹配域的掩码起始地址 */, mlen/* 掩码长度 */);
}

trie_remove

从前缀树中删除一个规则

/* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
 * in 'rule'. 移除规则 cls_rule 对应的前缀 */
static void
trie_remove(struct cls_trie *trie,/* 规则的前缀树 */ 
                const struct cls_rule *rule, /* 要移除的规则描述 */
                int mlen)/* 掩码长度 */
{
    trie_remove_prefix(&trie->root,
                       minimatch_get_prefix(&rule->match, trie->field), mlen);
}

/* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask
 * in 'rule'. */
static void
trie_remove_prefix(rcu_trie_ptr *root, const ovs_be32 *prefix, int mlen)
{
    struct trie_node *node;
    rcu_trie_ptr *edges[sizeof(union trie_prefix) * CHAR_BIT];
    int depth = 0, ofs = 0;

    /* Walk the tree. */
    for (edges[0] = root;
         (node = ovsrcu_get_protected(struct trie_node *, edges[depth]));
         edges[++depth] = trie_next_edge(node, prefix, ofs)) {/* 根据前缀获取下一个节点 */
        unsigned int eqbits = trie_prefix_equal_bits(node, prefix, ofs, mlen);/* 判断两个节点前缀相等的位置 */

        if (eqbits < node->n_bits) {/* 不相等,说明不能命中 */
            /* Mismatch, nothing to be removed.  This should never happen, as
             * only rules in the classifier are ever removed. */
            break; /* Log a warning. */
        }
        /* Full match so far. 全匹配,到目前位置 */
        ofs += eqbits;

        if (ofs == mlen) {/* 全匹配了 */
            /* Full prefix match at the current node, remove rule here. */
            if (!node->n_rules) {/* 没有规则,直接退出,一般是不可能的 */
                break; /* Log a warning. */
            }
            node->n_rules--;/* 规则数减掉 1 */

            /* Check if can prune the tree. 检查是否能阶段这个树,一般来说,最后一个规则被删除的话,是可以截断这棵树的 */
            while (!node->n_rules) {
                struct trie_node *next,/* 获取当前节点的左右分支 */
                    *edge0 = ovsrcu_get_protected(struct trie_node *,
                                                  &node->edges[0]),
                    *edge1 = ovsrcu_get_protected(struct trie_node *,
                                                  &node->edges[1]);

                if (edge0 && edge1) {/* 左右分支都存在不能截断 */
                    break; /* A branching point, cannot prune. */
                }

                /* Else have at most one child node, remove this node. 只有一个分支,截断他,这个分支就是最后一个规则的分支 */
                next = edge0 ? edge0 : edge1;

                if (next) {if (node->n_bits + next->n_bits > TRIE_PREFIX_BITS) {break;   /* Cannot combine. */}
                    next = trie_node_rcu_realloc(next); /* Modify. */

                    /* Combine node with next. */
                    next->prefix = node->prefix | next->prefix >> node->n_bits;
                    next->n_bits += node->n_bits;
                }
                /* Update the parent's edge. */
                ovsrcu_set(edges[depth], next); /* Publish changes. 更新变化 */
                trie_node_destroy(node);/* 销毁当前节点 */

                if (next || !depth) {
                    /* Branch not pruned or at root, nothing more to do. */
                    break;
                }
                node = ovsrcu_get_protected(struct trie_node *,
                                            edges[--depth]);
            }
            return;
        }
    }
    /* Cannot go deeper. This should never happen, since only rules
     * that actually exist in the classifier are ever removed. */
}

trie_lookup

在前缀树中查找是否命中规则

/* Returns the number of bits in the prefix mask necessary to determine a
 * mismatch, in case there are longer prefixes in the tree below the one that
 * matched.
 * '*plens' will have a bit set for each prefix length that may have matching
 * rules.  The caller is responsible for clearing the '*plens' prior to
 * calling this.
 * 前缀树查找函数
 * trie: 树根
 * value: 需要匹配的内容,一般是 ip 地址 (ipv4 或者 ipv6)
 * plens: 命中结果存储的位置,它跟 value 是等长的,用于记录存在规则的 bit,即多长的前缀
 * 存在规则的命中。* n_bits: 本域的匹配长度 (bit 数),要么是 32 要么是 128。* 返回值为匹配过的 bit 数,如果最后一个 node miss-match,返回值为 match_len + 1
 */
static unsigned int
trie_lookup_value(const rcu_trie_ptr *trie, const ovs_be32 value[],
                  ovs_be32 plens[]/* 返回存在命中规则的 bit,用于记录命中 */, unsigned int n_bits/* 本区域最长的 bit 数 */)
{
    const struct trie_node *prev = NULL;
    const struct trie_node *node = ovsrcu_get(struct trie_node *, trie);
    unsigned int match_len = 0; /* Number of matching bits. */

    /* 从树根开始遍历节点 */
    for (; node; prev = node, node = trie_next_node(node, value, match_len)) {
        unsigned int eqbits;
        /* Check if this edge can be followed. 判断下一个节点的前缀与需要查找的前缀相等的个数 */
        eqbits = prefix_equal_bits(node->prefix, node->n_bits, value,
                                   match_len);
        match_len += eqbits;/* 得到相等的个数 */
        if (eqbits < node->n_bits) { /* Mismatch, nothing more to be found. */
            /* Bit at offset 'match_len' differed. 如果相等的 bit 个数不一致的话,说明这两个节点代表的掩码不一样 */
            return match_len + 1; /* Includes the first mismatching bit. 返回的长度还包含了第一个不匹配的 bit */
        }
        /* Full match, check if rules exist at this prefix length.
         * 如果两者的前缀掩码完全一样,判断该节点是否存在规则 */
        /* 如果存在的话,说明已经命中了规则,设置当前位置命中了规则
         */
        if (node->n_rules > 0) {be_set_bit_at(plens, match_len - 1);
        }
        if (match_len >= n_bits) {/* 全匹配了 */
            return n_bits; /* Full prefix. */
        }
    }
    /* node == NULL.  Full match so far, but we tried to follow an
     * non-existing branch.  Need to exclude the other branch if it exists
     * (it does not if we were called on an empty trie or 'prev' is a leaf
     * node). 
     * 能走到这里说明整个前缀树都匹配完毕了,即 node == NULL 跳出了循环。* 没有全前缀的命中。* prev 为 NULL,说明整棵树为空。* prev 为 leaf,说明整棵树匹配完毕。* 其它情况应该不会走到这里,所以不会存在返回 match_len + 1 的情况。* match_len + 1 表示包含了第一个不匹配的 bit。*/
    return !prev || trie_is_leaf(prev) ? match_len : match_len + 1;
}

/* 使用分类器的前缀树进行查找 flow */
/* trie: 分类器前缀树
** flow: 需要匹配的报文的 flow
** plens: 返回的匹配的前缀
 */
static unsigned int
trie_lookup(const struct cls_trie *trie, const struct flow *flow,
            union trie_prefix *plens)
{
    const struct mf_field *mf = trie->field;/* 树的匹配元数据 */

    /* Check that current flow matches the prerequisites for the trie
     * field.  Some match fields are used for multiple purposes, so we
     * must check that the trie is relevant for this flow.
     * 检查当前流的匹配先决条件,一些匹配域可能用于多个目的。比如我们需要查找的
     * ipv4 的话,那么在 Ethernet 头部的 ethertype 字段必须要等于 ipv4
     */
    if (mf_are_prereqs_ok(mf, flow, NULL)) {/* 进行查找 */
        return trie_lookup_value(&trie->root,/* 树根节点 */
                                 &((ovs_be32 *)flow)[mf->flow_be32ofs],/* 报文在该域的值 */
                                 &plens->be32, mf->n_bits);// 该域的长度
    }
    // 不满足条件,说明本报文不需要参与前缀树过滤,所以对任何 subtable 都是符合条件的。// 不需要跳过,继续查找 subtable。memset(plens, 0xff, sizeof *plens); /* All prefixes, no skipping. */
    return 0; /* Value not used in this case. 这种情况下,该报文不需要参与 Trie 过滤 */
}

过滤器 check_tries

该函数通过 Trie 进行过滤,报文不可能匹配的话,返回 True,否则返回 False。

/* Return 'true' if can skip rest of the subtable based on the prefix trie
 * lookup results. 
 * 基于前缀树查找结果,前缀树一般用于匹配源目的 ip 字段。用于快速过滤。*/
static inline bool
check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries,
            const unsigned int field_plen[CLS_MAX_TRIES],
            const struct flowmap range_map, const struct flow *flow,// 报文内容
            struct flow_wildcards *wc)// 报文域 bit 位
{
    int j;

    /* Check if we could avoid fully unwildcarding the next level of
     * fields using the prefix tries.  The trie checks are done only as
     * needed to avoid folding in additional bits to the wildcards mask. 
     * */
    for (j = 0; j < n_tries; j++) {/* 遍历每一棵树 */
        /* Is the trie field relevant for this subtable, and
           is the trie field within the current range of fields? */
        if (field_plen[j] &&/* 前缀长度存在并且在当前的 stage 中,则需要进行匹配 */
            flowmap_is_set(&range_map, trie_ctx[j].be32ofs / 2)) {struct trie_ctx *ctx = &trie_ctx[j];/* 树匹配上下文 */

            /* On-demand trie lookup. */
            /* 对需要查找的树进行查找,整个报文每个 trie 只需要查找一次,后续子表直接拿前面的结果来用即可 */
            if (!ctx->lookup_done) {/* 树还没查找过,进行查找 */
                memset(&ctx->match_plens, 0, sizeof ctx->match_plens);/* 清空匹配了的长度为 0 */
                /* 返回的是参与了匹配的 bit 数,命中 */
                ctx->maskbits = trie_lookup(ctx->trie, flow, &ctx->match_plens);
                ctx->lookup_done = true;/* 设置本树已经查找完了 */
            }
            /* Possible to skip the rest of the subtable if subtable's
             * prefix on the field is not included in the lookup result. 
             * 查看本 subtable 的前缀长度是否有规则命中,没有的话,说明该子表不存在
             * 命中的可能,直接跳过。没看明白里面的两个 if 的作用。*/
            if (!be_get_bit_at(&ctx->match_plens.be32, field_plen[j] - 1)) {
                /* We want the trie lookup to never result in unwildcarding
                 * any bits that would not be unwildcarded otherwise.
                 * Since the trie is shared by the whole classifier, it is
                 * possible that the 'maskbits' contain bits that are
                 * irrelevant for the partition relevant for the current
                 * packet.  Hence the checks below. 
                 * Trie 是整个分类器共享的。它是可能的 maskbits 可能包含一些与这个报文不相关的部分。* 因此需要进行下面的检查。*/

                /* Check that the trie result will not unwildcard more bits
                 * than this subtable would otherwise. 
                 * 这棵树最长 */
                if (ctx->maskbits <= field_plen[j]) {// 没有全匹配
                    /* Unwildcard the bits and skip the rest. */
                    /* 设置非通配 bit,用于跳过其子表其他 */
                    mask_set_prefix_bits(wc, ctx->be32ofs, ctx->maskbits);
                    /* Note: Prerequisite already unwildcarded, as the only
                     * prerequisite of the supported trie lookup fields is
                     * the ethertype, which is always unwildcarded. */
                    return true;
                }
                /* Can skip if the field is already unwildcarded. */
                if (mask_prefix_bits_set(wc, ctx->be32ofs, ctx->maskbits)) {return true;}
            }
        }
    }
    return false;
}

正文完
 0