VPP supports two QoS implementations: one is QoS based on policers, the other is HQoS (hierarchical QoS) built on top of the DPDK QoS framework.
Basic flow diagram
As shown in the diagram above, a worker thread reads packets from the NIC and processes them. When the DPDK device transmit function is called and HQoS is enabled on the device, the HQoS-related metadata is set on the packet and the packet is pushed into a SWQ (software queue; SWQs and worker threads have a 1:1 relationship). That is where the worker thread's job ends. The HQoS threads (their number is determined by the configuration) poll packets out of the SWQs and run them through QoS processing, which uses the DPDK QoS framework. A minimal sketch of this hand-off is shown below.
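The hand-off itself is just a per-worker DPDK rte_ring. The following is a minimal, self-contained sketch of that pattern under illustrative assumptions: the ring size, burst size and the helper names swq_create/worker_tx/hqos_poll are made up for this sketch and are not VPP symbols.

#include <stdio.h>
#include <rte_ring.h>
#include <rte_mbuf.h>
#include <rte_sched.h>

#define SWQ_SIZE   4096   /* illustrative ring size */
#define BURST_SIZE 256    /* illustrative burst size */

/* One SWQ per (worker, device): exactly one producer and one consumer,
 * so the ring can be created single-producer / single-consumer. */
static struct rte_ring *
swq_create (unsigned worker_id, int socket_id)
{
  char name[32];
  snprintf (name, sizeof (name), "swq-wk%u", worker_id);
  return rte_ring_create (name, SWQ_SIZE, socket_id,
                          RING_F_SP_ENQ | RING_F_SC_DEQ);
}

/* Worker side: instead of calling rte_eth_tx_burst() directly, the device
 * tx function pushes the packets into this worker's SWQ. */
static void
worker_tx (struct rte_ring *swq, struct rte_mbuf **pkts, unsigned n_pkts)
{
  unsigned n = rte_ring_sp_enqueue_burst (swq, (void **) pkts, n_pkts, NULL);
  while (n < n_pkts)            /* drop what does not fit */
    rte_pktmbuf_free (pkts[n++]);
}

/* HQoS thread side: drain the SWQ and hand the packets to the scheduler. */
static void
hqos_poll (struct rte_ring *swq, struct rte_sched_port *port)
{
  struct rte_mbuf *pkts[BURST_SIZE];
  unsigned n = rte_ring_sc_dequeue_burst (swq, (void **) pkts, BURST_SIZE, NULL);
  if (n > 0)
    rte_sched_port_enqueue (port, pkts, n);
}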
HQoS configuration
dpdk configuration
dpdk {
socket-mem 16384,16384
dev 0000:02:00.0 {
num-rx-queues 2
hqos # enable HQoS on this NIC
}
dev 0000:06:00.0 {
num-rx-queues 2
hqos # enable HQoS on this NIC
}
num-mbufs 1000000
}
cpu configuration
cpu {
main-core 0
corelist-workers 1,2,3,4
corelist-hqos-threads 5,6 # start two HQoS threads, on CPU 5 and CPU 6
}
With the two settings above HQoS is enabled on the ports, using the default HQoS parameters.
DPDK QoS parameter configuration
- port configuration
port {
rate 1250000000 /* Assuming 10GbE port */
frame_overhead 24 /* Overhead fields per Ethernet frame:
* 7B (Preamble) +
* 1B (Start of Frame Delimiter (SFD)) +
* 4B (Frame Check Sequence (FCS)) +
* 12B (Inter Frame Gap (IFG))
*/
mtu 1522 /* Assuming Ethernet/IPv4 pkt (FCS not included) */
n_subports_per_port 1 /* Number of subports per output interface */
n_pipes_per_subport 4096 /* Number of pipes (users/subscribers) */
queue_sizes 64 64 64 64 /* Packet queue size for each traffic class.
* All queues within the same pipe traffic class
* have the same size. Queues from different
* pipes serving the same traffic class have
* the same size. */
}
- subport configuration
subport 0 {
tb_rate 1250000000 /* Subport level token bucket rate (bytes per second) */
tb_size 1000000 /* Subport level token bucket size (bytes) */
tc0_rate 1250000000 /* Subport level token bucket rate for traffic class 0 (bytes per second) */
tc1_rate 1250000000 /* Subport level token bucket rate for traffic class 1 (bytes per second) */
tc2_rate 1250000000 /* Subport level token bucket rate for traffic class 2 (bytes per second) */
tc3_rate 1250000000 /* Subport level token bucket rate for traffic class 3 (bytes per second) */
tc_period 10 /* Time interval for refilling the token bucket associated with traffic class (Milliseconds) */
pipe 0 4095 profile 0 /* Pipes 0 through 4095 (users/subscribers) are configured with pipe profile 0 */
}
- pipe configuration
pipe_profile 0 {
tb_rate 305175 /* Pipe level token bucket rate (bytes per second) */
tb_size 1000000 /* Pipe level token bucket size (bytes) */
tc0_rate 305175 /* Pipe level token bucket rate for traffic class 0 (bytes per second) */
tc1_rate 305175 /* Pipe level token bucket rate for traffic class 1 (bytes per second) */
tc2_rate 305175 /* Pipe level token bucket rate for traffic class 2 (bytes per second) */
tc3_rate 305175 /* Pipe level token bucket rate for traffic class 3 (bytes per second) */
tc_period 40 /* Time interval for refilling the token bucket associated with traffic class at pipe level (Milliseconds) */
tc3_oversubscription_weight 1 /* Weight traffic class 3 oversubscription */
tc0_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 0 */
tc1_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 1 */
tc2_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 2 */
tc3_wrr_weights 1 1 1 1 /* Pipe queues WRR weights for traffic class 3 */
}
- RED congestion control
red {
tc0_wred_min 48 40 32 /* Minimum threshold for traffic class 0 queue (min_th) in number of packets */
tc0_wred_max 64 64 64 /* Maximum threshold for traffic class 0 queue (max_th) in number of packets */
tc0_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 0 queue (maxp = 1 / maxp_inv) */
tc0_wred_weight 9 9 9 /* Traffic Class 0 queue weight */
tc1_wred_min 48 40 32 /* Minimum threshold for traffic class 1 queue (min_th) in number of packets */
tc1_wred_max 64 64 64 /* Maximum threshold for traffic class 1 queue (max_th) in number of packets */
tc1_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 1 queue (maxp = 1 / maxp_inv) */
tc1_wred_weight 9 9 9 /* Traffic Class 1 queue weight */
tc2_wred_min 48 40 32 /* Minimum threshold for traffic class 2 queue (min_th) in number of packets */
tc2_wred_max 64 64 64 /* Maximum threshold for traffic class 2 queue (max_th) in number of packets */
tc2_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 2 queue (maxp = 1 / maxp_inv) */
tc2_wred_weight 9 9 9 /* Traffic Class 2 queue weight */
tc3_wred_min 48 40 32 /* Minimum threshold for traffic class 3 queue (min_th) in number of packets */
tc3_wred_max 64 64 64 /* Maximum threshold for traffic class 3 queue (max_th) in number of packets */
tc3_wred_inv_prob 10 10 10 /* Inverse of packet marking probability for traffic class 3 queue (maxp = 1 / maxp_inv) */
tc3_wred_weight 9 9 9 /* Traffic Class 3 queue weight */
}
Some of the port, subport, pipe, tc and queue parameters above can also be configured at run time through the CLI or the binary API. For reference, a sketch of how these file parameters map onto the DPDK rte_sched API is shown below.
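This is a hedged sketch only: the struct and function names come from the older rte_sched API that VPP's HQoS was built on (rte_sched_port_config / rte_sched_subport_config / rte_sched_pipe_config); field layouts changed in later DPDK releases, and the port name here is illustrative, so treat it as a mapping aid rather than authoritative code.

#include <stdint.h>
#include <rte_sched.h>

/* Mirrors the "pipe_profile 0" block above. */
static struct rte_sched_pipe_params pipe_profiles[] = {
  {
    .tb_rate = 305175,
    .tb_size = 1000000,
    .tc_rate = { 305175, 305175, 305175, 305175 },
    .tc_period = 40,
    /* 4 queues per traffic class, 4 traffic classes -> 16 WRR weights */
    .wrr_weights = { 1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1,  1, 1, 1, 1 },
  },
};

/* Mirrors the "port" block above. The tcX_wred_* values would go into
 * port_params.red_params[tc][color] when DPDK is built with RTE_SCHED_RED. */
static struct rte_sched_port_params port_params = {
  .name = "hqos-port-0",          /* illustrative name */
  .socket = 0,
  .rate = 1250000000,             /* 10GbE, in bytes per second */
  .mtu = 1522,
  .frame_overhead = 24,
  .n_subports_per_port = 1,
  .n_pipes_per_subport = 4096,
  .qsize = { 64, 64, 64, 64 },
  .pipe_profiles = pipe_profiles,
  .n_pipe_profiles = 1,
};

/* Mirrors the "subport 0" block above. */
static struct rte_sched_subport_params subport_params = {
  .tb_rate = 1250000000,
  .tb_size = 1000000,
  .tc_rate = { 1250000000, 1250000000, 1250000000, 1250000000 },
  .tc_period = 10,
};

static struct rte_sched_port *
hqos_port_init (void)
{
  struct rte_sched_port *port = rte_sched_port_config (&port_params);
  if (port == NULL)
    return NULL;

  rte_sched_subport_config (port, 0, &subport_params);

  /* "pipe 0 4095 profile 0": attach pipe profile 0 to every pipe of subport 0 */
  for (uint32_t pipe = 0; pipe < port_params.n_pipes_per_subport; pipe++)
    rte_sched_pipe_config (port, 0, pipe, 0);

  return port;
}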
Configuring QoS via the CLI
- Configure subport parameters:
set dpdk interface hqos subport <if-name> subport <n> [rate <n>] [bktsize <n>] [tc0 <n>] [tc1 <n>] [tc2 <n>] [tc3 <n>] [period <n>]
- Configure pipe parameters:
set dpdk interface hqos pipe <if-name> subport <n> pipe <n> profile <n>
- Assign an interface to a specific HQoS thread:
set dpdk interface hqos placement <if-name> thread <n>
- Select which packet fields are used for classification, where id is the hqos_field index:
set dpdk interface hqos pktfield <if-name> id <n> offset <n> mask <n>
- Set the TC mapping table, which maps a DSCP value to a specific traffic class and traffic-class queue (a concrete example follows the show output below):
set dpdk interface hqos tctbl <if-name> entry <n> tc <n> queue <n>
- Show the HQoS configuration of an interface:
vpp# show dpdk interface hqos TenGigabitEthernet2/0/0
Thread:
Input SWQ size = 4096 packets
Enqueue burst size = 256 packets
Dequeue burst size = 220 packets
Packet field 0: slab position = 0, slab bitmask = 0x0000000000000000
Packet field 1: slab position = 40, slab bitmask = 0x0000000fff000000
Packet field 2: slab position = 8, slab bitmask = 0x00000000000000fc
Packet field 2 translation table:
[0 .. 15]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
[16 .. 31]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
[32 .. 47]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
[48 .. 63]: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
Port:
Rate = 1250000000 bytes/second
MTU = 1514 bytes
Frame overhead = 24 bytes
Number of subports = 1
Number of pipes per subport = 4096
Packet queue size: TC0 = 64, TC1 = 64, TC2 = 64, TC3 = 64 packets
Number of pipe profiles = 1
Pipe profile 0:
Rate = 305175 bytes/second
Token bucket size = 1000000 bytes
Traffic class rate: TC0 = 305175, TC1 = 305175, TC2 = 305175, TC3 = 305175 bytes/second
TC period = 40 milliseconds
TC0 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
TC1 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
TC2 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
TC3 WRR weights: Q0 = 1, Q1 = 1, Q2 = 1, Q3 = 1
- Show which HQoS thread each device is assigned to:
vpp# show dpdk interface hqos placement
Thread 5 (vpp_hqos-threads_0 at lcore 5):
TenGigabitEthernet2/0/0 queue 0
Thread 6 (vpp_hqos-threads_1 at lcore 6):
TenGigabitEthernet4/0/1 queue 0
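As a purely illustrative example (interface name and field layout taken from the show output above, so check the exact offset/mask semantics against your VPP version), the following commands classify on the IPv4 DSCP field and map DSCP 46 (EF) to traffic class 0, queue 0:
vpp# set dpdk interface hqos pktfield TenGigabitEthernet2/0/0 id 2 offset 8 mask 0x00000000000000fc
vpp# set dpdk interface hqos tctbl TenGigabitEthernet2/0/0 entry 46 tc 0 queue 0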
DPDK QoS framework
For this part, please refer to:
http://doc.dpdk.org/guides/pr…
HQoS source code analysis
Data structures
dpdk_device_t
HQoS operates per DPDK device, so the DPDK device descriptor contains HQoS-related members.
typedef struct
{
......
/* HQoS related. There is a mapping between worker threads and HQoS threads, because the two counts differ */
dpdk_device_hqos_per_worker_thread_t *hqos_wt;// per-worker data: workers write packets into hqos_wt.swq, which points at hqos_ht.swq
dpdk_device_hqos_per_hqos_thread_t *hqos_ht;// per-HQoS-thread data: the HQoS thread reads packets from hqos_ht.swq
......
} dpdk_device_t;
dpdk_device_hqos_per_worker_thread_t
typedef struct
{
/* Required for vec_validate_aligned */
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
struct rte_ring *swq;// ring into which this worker writes this device's packets
// the fields below are used to classify packets, i.e. to derive the DPDK QoS subport, pipe, tc and queue for each packet
u64 hqos_field0_slabmask;
u32 hqos_field0_slabpos;
u32 hqos_field0_slabshr;
u64 hqos_field1_slabmask;
u32 hqos_field1_slabpos;
u32 hqos_field1_slabshr;
u64 hqos_field2_slabmask;
u32 hqos_field2_slabpos;
u32 hqos_field2_slabshr;
u32 hqos_tc_table[64];
} dpdk_device_hqos_per_worker_thread_t;
The structure above is the per-worker-thread HQoS private data; the DPDK device keeps an array of these, one entry per worker thread. Its members are used to classify packets. As can be seen, this classification is quite simple and cannot satisfy fine-grained classification needs; it could be improved by combining it with the classifier. A simplified sketch of how these fields are consumed is shown below.
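The sketch below is modeled on the idea behind VPP's dpdk_hqos_metadata_set(): read a 64-bit slab at the configured byte position, mask and shift it to obtain subport/pipe/DSCP, then store the result in the mbuf scheduler metadata. It assumes VPP's u8/u32/u64 typedefs and the dpdk_device_hqos_per_worker_thread_t structure shown above; the helper name hqos_bitfield is invented here, and the exact rte_sched_port_pkt_write() signature differs across DPDK versions, so this is illustrative rather than a copy of the VPP code.

#include <rte_mbuf.h>
#include <rte_sched.h>
#include <rte_byteorder.h>

/* Read a 64-bit big-endian slab at byte offset `pos`, mask it and shift it down. */
static inline u64
hqos_bitfield (u8 * data, u32 pos, u64 mask, u32 shr)
{
  u64 slab = rte_be_to_cpu_64 (*(u64 *) (data + pos));
  return (slab & mask) >> shr;
}

static inline void
hqos_classify_one (dpdk_device_hqos_per_worker_thread_t * hqos,
                   struct rte_mbuf *pkt)
{
  u8 *data = rte_pktmbuf_mtod (pkt, u8 *);

  u32 subport = hqos_bitfield (data, hqos->hqos_field0_slabpos,
                               hqos->hqos_field0_slabmask,
                               hqos->hqos_field0_slabshr);
  u32 pipe = hqos_bitfield (data, hqos->hqos_field1_slabpos,
                            hqos->hqos_field1_slabmask,
                            hqos->hqos_field1_slabshr);
  u32 dscp = hqos_bitfield (data, hqos->hqos_field2_slabpos,
                            hqos->hqos_field2_slabmask,
                            hqos->hqos_field2_slabshr);

  /* hqos_tc_table[] packs the traffic class in the upper bits and the
   * traffic-class queue in the lowest 2 bits */
  u32 tc = hqos->hqos_tc_table[dscp & 0x3F] >> 2;
  u32 tc_q = hqos->hqos_tc_table[dscp & 0x3F] & 0x3;

  /* Store subport/pipe/tc/queue in the mbuf sched metadata so that
   * rte_sched_port_enqueue() can consume it later; 0 = green color. */
  rte_sched_port_pkt_write (pkt, subport, pipe, tc, tc_q, 0);
}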
dpdk_device_hqos_per_hqos_thread_t
typedef struct
{
/* Required for vec_validate_aligned */
CLIB_CACHE_LINE_ALIGN_MARK (cacheline0);
struct rte_ring **swq;// array of worker-to-HQoS hand-off rings; its size equals the number of worker threads
struct rte_mbuf **pkts_enq;// enqueue buffer: packets pulled from the swq rings before being pushed into the QoS port
struct rte_mbuf **pkts_deq;// dequeue buffer: packets pulled out of the QoS port before being transmitted on the NIC
struct rte_sched_port *hqos;// the device's DPDK QoS (rte_sched) port and its configuration
u32 hqos_burst_enq;// limit on the number of packets enqueued into the QoS port in one burst
u32 hqos_burst_deq;// limit on the number of packets dequeued from the QoS port in one burst
u32 pkts_enq_len;// number of packets currently buffered in pkts_enq
u32 swq_pos;// position of the swq ring the HQoS iterator will service next
u32 flush_count;// number of idle rounds without a full burst; once the threshold is reached the buffered packets are enqueued immediately instead of waiting any longer
} dpdk_device_hqos_per_hqos_thread_t;
The structure above is the per-HQoS-thread private data; a DPDK device belongs to exactly one HQoS thread. It mainly maintains the bridging rings towards the worker threads and the HQoS parameters of the port.
dpdk_main_t
typedef struct
{
......
// array indexed by HQoS CPU: for each HQoS thread, the vector of DPDK devices it manages; this builds the one-to-many relationship between HQoS threads and DPDK devices
dpdk_device_and_queue_t **devices_by_hqos_cpu;
......
} dpdk_main_t;
Registering the HQoS thread type
/* *INDENT-OFF* register the HQoS thread type */
VLIB_REGISTER_THREAD (hqos_thread_reg, static) =
{
.name = "hqos-threads",
.short_name = "hqos-threads",
.function = dpdk_hqos_thread_fn,// thread entry function
};
Recall the configuration corelist-hqos-threads 5,6 shown earlier. After parsing it, VPP searches the registered thread-type list for a type named hqos-threads; once found, it spawns the configured number of threads of that type, each running dpdk_hqos_thread_fn as its main function.
Function implementation
dpdk_hqos_thread_fn
// HQoS thread entry function
void
dpdk_hqos_thread_fn (void *arg)
{
vlib_worker_thread_t *w = (vlib_worker_thread_t *) arg;
vlib_worker_thread_init (w);// per-thread initialization
dpdk_hqos_thread (w);
}
void
dpdk_hqos_thread (vlib_worker_thread_t * w)
{
vlib_main_t *vm;
vlib_thread_main_t *tm = vlib_get_thread_main ();
dpdk_main_t *dm = &dpdk_main;
vm = vlib_get_main ();
ASSERT (vm->thread_index == vlib_get_thread_index ());
clib_time_init (&vm->clib_time);
clib_mem_set_heap (w->thread_mheap);
/* Wait until the dpdk init sequence is complete */
while (tm->worker_thread_release == 0)
vlib_worker_thread_barrier_check ();
// devices assigned to this HQoS thread, indexed by thread index
if (vec_len (dm->devices_by_hqos_cpu[vm->thread_index]) == 0)
return clib_error ("current I/O TX thread does not have any devices assigned to it");
if (DPDK_HQOS_DBG_BYPASS)
dpdk_hqos_thread_internal_hqos_dbg_bypass (vm);// debug-only bypass path
else
dpdk_hqos_thread_internal (vm);// main processing function
}
dpdk_hqos_thread_internal
static_always_inline void
dpdk_hqos_thread_internal (vlib_main_t * vm)
{
dpdk_main_t *dm = &dpdk_main;
u32 thread_index = vm->thread_index;
u32 dev_pos;
dev_pos = 0;// start from the first device
while (1)// loop over this thread's devices forever
{
vlib_worker_thread_barrier_check ();// honour a sync request from the main thread
// devices handled by this HQoS thread
u32 n_devs = vec_len (dm->devices_by_hqos_cpu[thread_index]);
if (PREDICT_FALSE (n_devs == 0))
{
dev_pos = 0;
continue;
}
// one full round completed, start over from the first device
if (dev_pos >= n_devs)
dev_pos = 0;
// context of the device to service in this iteration
dpdk_device_and_queue_t *dq =
vec_elt_at_index (dm->devices_by_hqos_cpu[thread_index], dev_pos);
// DPDK device descriptor for this device
dpdk_device_t *xd = vec_elt_at_index (dm->devices, dq->device);
// per-HQoS-thread data of this device
dpdk_device_hqos_per_hqos_thread_t *hqos = xd->hqos_ht;
u32 device_index = xd->port_id;
u16 queue_id = dq->queue_id;
// enqueue buffer
struct rte_mbuf **pkts_enq = hqos->pkts_enq;
// dequeue buffer
struct rte_mbuf **pkts_deq = hqos->pkts_deq;
// number of packets already buffered for enqueue
u32 pkts_enq_len = hqos->pkts_enq_len;
u32 swq_pos = hqos->swq_pos;// swq ring to start from in this round
u32 n_swq = vec_len (hqos->swq), i;
u32 flush_count = hqos->flush_count;// number of rounds without a full burst
/*
* SWQ dequeue and HQoS enqueue for current device
* Pull packets from the SWQ rings and push them into the HQoS port
*/
for (i = 0; i < n_swq; i++)
{
/* Get current SWQ for this device */
struct rte_ring *swq = hqos->swq[swq_pos];
/* Read SWQ burst to packet buffer of this device */
/* Pull a burst of packets out of the swq ring */
pkts_enq_len += rte_ring_sc_dequeue_burst (swq,
(void **)
&pkts_enq[pkts_enq_len],
hqos->hqos_burst_enq, 0);
/* Get next SWQ for this device */
swq_pos++;
if (swq_pos >= n_swq)
swq_pos = 0;
hqos->swq_pos = swq_pos;
/* HQoS enqueue when burst available */
// push the packets into the HQoS port once at least hqos_burst_enq of them are buffered, then stop for this round
if (pkts_enq_len >= hqos->hqos_burst_enq)
{
rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);
pkts_enq_len = 0;
flush_count = 0;
break;
}
}
if (pkts_enq_len)// there are still pkts_enq_len packets waiting to be enqueued
{
flush_count++;// another round without a full burst
if (PREDICT_FALSE (flush_count == HQOS_FLUSH_COUNT_THRESHOLD))
{
rte_sched_port_enqueue (hqos->hqos, pkts_enq, pkts_enq_len);
pkts_enq_len = 0;
flush_count = 0;
}
}
hqos->pkts_enq_len = pkts_enq_len;
hqos->flush_count = flush_count;
/*
* HQoS dequeue and HWQ TX enqueue for current device
* Dequeue packets from the HQoS port and transmit them
*/
{
u32 pkts_deq_len, n_pkts;
pkts_deq_len = rte_sched_port_dequeue (hqos->hqos,
pkts_deq,
hqos->hqos_burst_deq);
for (n_pkts = 0; n_pkts < pkts_deq_len;)
// transmit the dequeued packets; loop until the NIC has accepted all of them
n_pkts += rte_eth_tx_burst (device_index,
(uint16_t) queue_id,
&pkts_deq[n_pkts],
(uint16_t) (pkts_deq_len - n_pkts));
}
/* Advance to next device */
dev_pos++;
}
}