2 * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/* Forward declarations for static helpers defined later in this file. */
32 static int mlx5e_rl_open_workers(struct mlx5e_priv *);
33 static void mlx5e_rl_close_workers(struct mlx5e_priv *);
34 static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
35 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
36 struct sysctl_oid *, const char *name, const char *desc);
37 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
38 struct sysctl_oid *node, const char *name, const char *desc);
39 static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
40 static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
/*
 * Fill in the send queue (SQ) creation parameters shared by all
 * rate limit SQs: WQ log size from the configured TX queue size,
 * WQE stride and protection domain.
 */
43 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
44 struct mlx5e_sq_param *param)
46 void *sqc = param->sqc;
47 void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
48 uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
50 MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
51 MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
52 MLX5_SET(wq, wq, pd, rl->priv->pdn);
/* NOTE(review): NUMA node is hard-coded to 0 for both buffers — confirm intentional. */
54 param->wq.buf_numa_node = 0;
55 param->wq.db_numa_node = 0;
/*
 * Fill in the completion queue (CQ) creation parameters for rate limit
 * SQs. The CQ is sized to match the SQ and is configured with the
 * user-selected interrupt coalescing settings.
 */
60 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
61 struct mlx5e_cq_param *param)
63 void *cqc = param->cqc;
64 uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
66 MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
67 MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
68 MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
/* Select the CQ moderation mode; CQE-based mode requires firmware support. */
70 switch (rl->param.tx_coalesce_mode) {
72 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
75 if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
76 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
/* fall back to EQE-based moderation when CQE mode is unsupported */
78 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
/*
 * Build the complete set of channel parameters (SQ + CQ) used when
 * opening rate limit channels. Clears the structure first.
 */
84 mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
85 struct mlx5e_rl_channel_param *cparam)
87 memset(cparam, 0, sizeof(*cparam));
89 mlx5e_rl_build_sq_param(rl, &cparam->sq);
90 mlx5e_rl_build_cq_param(rl, &cparam->cq);
/*
 * Allocate the software state for a rate limit SQ: DMA tag, cyclic
 * work queue, doorbell and per-entry DMA maps. On failure all
 * partially-created resources are released via the error labels.
 *
 * Returns 0 on success or a negative errno-style value on failure.
 */
94 mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
95 struct mlx5e_sq_param *param, int ix)
97 struct mlx5_core_dev *mdev = priv->mdev;
98 void *sqc = param->sqc;
99 void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
102 /* Create DMA descriptor TAG */
103 if ((err = -bus_dma_tag_create(
104 bus_get_dma_tag(mdev->pdev->dev.bsddev),
105 1, /* any alignment */
107 BUS_SPACE_MAXADDR, /* lowaddr */
108 BUS_SPACE_MAXADDR, /* highaddr */
109 NULL, NULL, /* filter, filterarg */
110 MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */
111 MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */
112 MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */
114 NULL, NULL, /* lockfunc, lockfuncarg */
/* all rate limit SQs share the UAR allocated at init time */
119 sq->uar = priv->rl.sq_uar;
121 err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
124 goto err_free_dma_tag;
126 sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
128 * The sq->bf_buf_size variable is intentionally left zero so
129 * that the doorbell writes will occur at the same memory
133 err = mlx5e_alloc_sq_db(sq);
135 goto err_sq_wq_destroy;
137 sq->mkey_be = cpu_to_be32(priv->mr.key);
141 mlx5e_update_sq_inline(sq);
146 mlx5_wq_destroy(&sq->wq_ctrl);
148 bus_dma_tag_destroy(sq->dma_tag);
/*
 * Free the software state of a rate limit SQ in the reverse order of
 * creation: per-entry DMA maps, work queue, then the DMA tag.
 */
154 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
157 mlx5e_free_sq_db(sq);
158 mlx5_wq_destroy(&sq->wq_ctrl);
159 bus_dma_tag_destroy(sq->dma_tag);
/*
 * Create a rate limit SQ, enable it in hardware under the rate limit
 * TIS domain, and move it from RST to RDY state. Marks the SQ as
 * running on success; unwinds via the error labels on failure.
 */
163 mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
164 struct mlx5e_sq_param *param, int ix)
168 err = mlx5e_rl_create_sq(priv, sq, param, ix);
172 err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
176 err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
180 WRITE_ONCE(sq->running, 1);
185 mlx5e_disable_sq(sq);
187 mlx5e_rl_destroy_sq(sq);
/*
 * Initialize the per-SQ mutexes, the completion event callout and the
 * completion event factor for a rate limit channel SQ.
 */
193 mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
195 mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
196 mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
198 callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
200 sq->cev_factor = priv->rl.param.tx_completion_fact;
202 /* ensure the TX completion event factor is not zero */
203 if (sq->cev_factor == 0)
/*
 * Allocate and open a complete rate limit channel: SQ state, CQ and
 * hardware SQ bound to completion vector "eq_ix". On success the SQ
 * pointer is published through "ppsq". On failure the allocation
 * failure statistic is bumped and an error is returned.
 */
208 mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
209 struct mlx5e_rl_channel_param *cparam,
210 struct mlx5e_sq *volatile *ppsq)
212 struct mlx5e_priv *priv = rlw->priv;
216 sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
219 mlx5e_rl_chan_mtx_init(priv, sq);
221 /* open TX completion queue */
222 err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
223 &mlx5e_tx_cq_comp, eq_ix);
227 err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
229 goto err_close_tx_cq;
231 /* store TX channel pointer */
234 /* poll TX queue initially */
235 sq->cq.mcq.comp(&sq->cq.mcq);
/* error unwind: close CQ, destroy mutexes, free SQ and count failure */
240 mlx5e_close_cq(&sq->cq);
243 /* destroy mutexes */
244 mtx_destroy(&sq->lock);
245 mtx_destroy(&sq->comp_lock);
247 atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
/*
 * Tear down a rate limit channel previously opened by
 * mlx5e_rl_open_channel(). Clears the published SQ pointer first so
 * no new users can pick it up, then destroys SQ, CQ and mutexes.
 * Safe to call on an already-closed channel.
 */
252 mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
254 struct mlx5e_sq *sq = *ppsq;
256 /* check if channel is already closed */
259 /* ensure channel pointer is no longer used */
262 /* teardown and destroy SQ */
264 mlx5e_disable_sq(sq);
265 mlx5e_rl_destroy_sq(sq);
268 mlx5e_close_cq(&sq->cq);
270 /* destroy mutexes */
271 mtx_destroy(&sq->lock);
272 mtx_destroy(&sq->comp_lock);
/*
 * Recompute the valid range for the TX completion factor after a
 * change of the TX queue size, and clamp the current value into that
 * range.
 */
278 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
281 * Limit the maximum distance between completion events to
282 * half of the currently set TX queue size.
284 * The maximum number of queue entries a single IP packet can
285 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
287 * The worst case max value is then given as below:
289 uint64_t max = rl->param.tx_queue_size /
290 (2 * MLX5_SEND_WQE_MAX_WQEBBS);
293 * Update the maximum completion factor value in case the
294 * tx_queue_size field changed. Ensure we don't overflow
/* the hardware field is 16 bits wide, hence the 65535 cap */
299 else if (max > 65535)
301 rl->param.tx_completion_fact_max = max;
304 * Verify that the current TX completion factor is within the
307 if (rl->param.tx_completion_fact < 1)
308 rl->param.tx_completion_fact = 1;
309 else if (rl->param.tx_completion_fact > max)
310 rl->param.tx_completion_fact = max;
/*
 * Point an already-ready SQ at a new hardware rate limit table entry
 * by issuing a MODIFY_SQ firmware command with the packet pacing
 * rate limit index set.
 *
 * Returns 0 on success or an error from the firmware command.
 */
314 mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
316 struct mlx5e_priv *priv = sq->priv;
317 struct mlx5_core_dev *mdev = priv->mdev;
324 inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
325 in = mlx5_vzalloc(inlen);
329 sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
331 MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
332 MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
/* modify_bitmask bit 0 selects the packet pacing rate limit index field */
333 MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
334 MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
335 MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);
337 err = mlx5_core_modify_sq(mdev, in, inlen);
345 * This function will search the configured rate limit table for the
346 * best match to avoid that a single socket based application can
347 * allocate all the available hardware rates. If the user selected
348 * rate deviates too much from the closest rate available in the rate
349 * limit table, unlimited rate will be selected.
352 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
/* smallest absolute difference seen so far; starts at "infinity" */
354 uint64_t distance = -1ULL;
356 uint64_t retval = 0; /* unlimited */
359 /* search for closest rate */
360 for (x = 0; x != rl->param.tx_rates_def; x++) {
361 uint64_t rate = rl->rate_limit_table[x];
365 if (rate > user_rate)
366 diff = rate - user_rate;
368 diff = user_rate - rate;
370 /* check if distance is smaller than previous rate */
371 if (diff < distance) {
377 /* range check for multiplication below */
378 if (user_rate > rl->param.tx_limit_max)
379 user_rate = rl->param.tx_limit_max;
381 /* fallback to unlimited, if rate deviates too much */
/* tx_allowed_deviation is in units of 0.1 percent, hence the 1000 divisor */
382 if (distance > howmany(user_rate *
383 rl->param.tx_allowed_deviation, 1000ULL))
390 * This function sets the requested rate for a rate limit channel, in
391 * bits per second. The requested rate will be filtered through the
392 * find best rate function above.
/*
 * Apply a new rate, in bits per second, to a rate limit channel.
 * The rate is snapped to the closest table entry, a firmware rate
 * limit reference is taken for the new rate and the reference on the
 * previous rate is dropped. Called and returns with the worker lock
 * held; the lock is temporarily dropped around firmware commands.
 */
395 mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
396 struct mlx5e_rl_channel *channel, uint64_t rate)
398 struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
/* drop the worker lock before acquiring the RL table lock below */
406 MLX5E_RL_WORKER_UNLOCK(rlw);
410 /* get current burst size in bytes */
411 temp = rl->param.tx_burst_size *
412 MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);
414 /* limit burst size to 64K currently */
420 rate = mlx5e_rl_find_best_rate_locked(rl, rate);
422 MLX5E_RL_RUNLOCK(rl);
425 /* rate doesn't exist, fallback to unlimited */
428 atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
430 /* get a reference on the new rate */
/* firmware rates are specified in units of 1000 bit/s */
431 error = -mlx5_rl_add_rate(rlw->priv->mdev,
432 howmany(rate, 1000), burst, &index);
435 /* adding rate failed, fallback to unlimited */
438 atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
441 MLX5E_RL_WORKER_LOCK(rlw);
444 burst = 0; /* default */
447 /* atomically swap rates */
448 temp = channel->last_rate;
449 channel->last_rate = rate;
452 /* atomically swap burst size */
453 temp = channel->last_burst;
454 channel->last_burst = burst;
457 MLX5E_RL_WORKER_UNLOCK(rlw);
458 /* put reference on the old rate, if any */
460 mlx5_rl_remove_rate(rlw->priv->mdev,
461 howmany(rate, 1000), burst);
464 /* set new rate, if SQ is running */
466 if (sq != NULL && READ_ONCE(sq->running) != 0) {
467 error = mlx5e_rl_modify_sq(sq, index);
469 atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
472 MLX5E_RL_WORKER_LOCK(rlw);
/*
 * Per-worker kernel thread main loop. Each worker owns a set of rate
 * limit channels: it pre-opens their SQs (unless channels are created
 * on demand), then services MODIFY/DESTROY requests queued on
 * "process_head" until "worker_done" signals teardown, at which point
 * all SQs are closed and the teardown initiator is woken up.
 */
478 mlx5e_rl_worker(void *arg)
481 struct mlx5e_rl_worker *rlw = arg;
482 struct mlx5e_rl_channel *channel;
483 struct mlx5e_priv *priv;
488 /* set thread priority */
492 sched_prio(td, PI_SWI(SWI_NET));
497 /* compute completion vector */
/* spread workers round-robin across the available completion vectors */
498 ix = (rlw - priv->rl.workers) %
499 priv->mdev->priv.eq_table.num_comp_vectors;
501 /* TODO bind to CPU */
503 /* open all the SQs */
504 MLX5E_RL_WORKER_LOCK(rlw);
505 for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
506 struct mlx5e_rl_channel *channel = rlw->channels + x;
508 #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
509 if (channel->state == MLX5E_RL_ST_FREE)
512 MLX5E_RL_WORKER_UNLOCK(rlw);
514 MLX5E_RL_RLOCK(&priv->rl);
515 error = mlx5e_rl_open_channel(rlw, ix,
516 &priv->rl.chan_param, &channel->sq);
517 MLX5E_RL_RUNLOCK(&priv->rl);
519 MLX5E_RL_WORKER_LOCK(rlw);
521 mlx5_en_err(priv->ifp,
522 "mlx5e_rl_open_channel failed: %d\n", error);
/* restore the channel's previously configured rate, if any */
525 mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
/* main service loop: sleep until work is queued or teardown starts */
528 if (STAILQ_FIRST(&rlw->process_head) == NULL) {
529 /* check if we are tearing down */
530 if (rlw->worker_done != 0)
532 cv_wait(&rlw->cv, &rlw->mtx);
534 /* check if we are tearing down */
535 if (rlw->worker_done != 0)
537 channel = STAILQ_FIRST(&rlw->process_head);
538 if (channel != NULL) {
539 STAILQ_REMOVE_HEAD(&rlw->process_head, entry);
541 switch (channel->state) {
542 case MLX5E_RL_ST_MODIFY:
543 channel->state = MLX5E_RL_ST_USED;
544 MLX5E_RL_WORKER_UNLOCK(rlw);
546 /* create channel by demand */
547 if (channel->sq == NULL) {
548 MLX5E_RL_RLOCK(&priv->rl);
549 error = mlx5e_rl_open_channel(rlw, ix,
550 &priv->rl.chan_param, &channel->sq);
551 MLX5E_RL_RUNLOCK(&priv->rl);
554 mlx5_en_err(priv->ifp,
555 "mlx5e_rl_open_channel failed: %d\n", error);
557 atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
560 mlx5e_resume_sq(channel->sq);
563 MLX5E_RL_WORKER_LOCK(rlw);
564 /* convert from bytes/s to bits/s and set new rate */
565 error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
566 channel->new_rate * 8ULL);
568 mlx5_en_err(priv->ifp,
569 "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
574 case MLX5E_RL_ST_DESTROY:
/* release the channel's rate reference before recycling it */
575 error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
577 mlx5_en_err(priv->ifp,
578 "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
581 if (channel->sq != NULL) {
583 * Make sure all packets are
584 * transmitted before SQ is
585 * returned to free list:
587 MLX5E_RL_WORKER_UNLOCK(rlw);
588 mlx5e_drain_sq(channel->sq);
589 MLX5E_RL_WORKER_LOCK(rlw);
591 /* put the channel back into the free list */
592 STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
593 channel->state = MLX5E_RL_ST_FREE;
594 atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
603 /* close all the SQs */
604 for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
605 struct mlx5e_rl_channel *channel = rlw->channels + x;
607 /* update the initial rate */
/* remember the last rate so it can be restored on worker restart */
608 channel->init_rate = channel->last_rate;
610 /* make sure we free up the rate resource */
611 mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
613 if (channel->sq != NULL) {
614 MLX5E_RL_WORKER_UNLOCK(rlw);
615 mlx5e_rl_close_channel(&channel->sq);
616 atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
617 MLX5E_RL_WORKER_LOCK(rlw);
/* signal mlx5e_rl_close_workers() that this worker has exited */
621 rlw->worker_done = 0;
622 cv_broadcast(&rlw->cv);
623 MLX5E_RL_WORKER_UNLOCK(rlw);
/*
 * Create the transport interface send (TIS) object that all rate
 * limit SQs are attached to. The TIS number is stored in
 * priv->rl.tisn.
 */
629 mlx5e_rl_open_tis(struct mlx5e_priv *priv)
631 struct mlx5_core_dev *mdev = priv->mdev;
632 u32 in[MLX5_ST_SZ_DW(create_tis_in)];
633 void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
635 memset(in, 0, sizeof(in));
637 MLX5_SET(tisc, tisc, prio, 0);
638 MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
640 return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
/* Destroy the rate limit TIS object created by mlx5e_rl_open_tis(). */
644 mlx5e_rl_close_tis(struct mlx5e_priv *priv)
646 mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn)
/*
 * Initialize all tunable rate limit parameters to their defaults,
 * clamped against the device capabilities (number of completion
 * vectors, firmware rate limit table size and min/max rates).
 */
650 mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
651 struct mlx5_core_dev *mdev)
653 /* ratelimit workers */
654 param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
655 param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
658 if (param->tx_worker_threads_def == 0 ||
659 param->tx_worker_threads_def > param->tx_worker_threads_max)
660 param->tx_worker_threads_def = param->tx_worker_threads_max;
662 /* ratelimit channels */
663 param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
664 param->tx_worker_threads_def;
665 param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
668 if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
669 param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
671 /* set default burst size */
672 param->tx_burst_size = 4; /* MTUs */
675 * Set maximum burst size
677 * The burst size is multiplied by the MTU and clamped to the
678 * range 0 ... 65535 bytes inclusively before being fed into the
681 * NOTE: If the burst size or MTU is changed only ratelimit
682 * connections made after the change will use the new burst
685 param->tx_burst_size_max = 255;
687 /* get firmware rate limits in 1000bit/s and convert them to bit/s */
688 param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
689 param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
691 /* ratelimit table size */
692 param->tx_rates_max = mdev->priv.rl_table.max_size;
695 if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
696 param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
698 /* set default number of rates */
699 param->tx_rates_def = param->tx_rates_max;
701 /* set maximum allowed rate deviation */
702 if (param->tx_limit_max != 0) {
704 * Make sure the deviation multiplication doesn't
705 * overflow unsigned 64-bit:
707 param->tx_allowed_deviation_max = -1ULL /
710 /* set default rate deviation */
711 param->tx_allowed_deviation = 50; /* 5.0% */
713 /* channel parameters */
714 param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
715 param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
716 param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
717 param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
718 param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
/*
 * Name/description string tables for the parameter, table-parameter
 * and statistics sysctl nodes. Each macro expands to name+description
 * pairs, hence the "2 * i" indexing at the sysctl creation sites.
 */
721 static const char *mlx5e_rl_params_desc[] = {
722 MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
725 static const char *mlx5e_rl_table_params_desc[] = {
726 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
729 static const char *mlx5e_rl_stats_desc[] = {
730 MLX5E_RL_STATS(MLX5E_STATS_DESC)
/*
 * One-time initialization of the rate limit subsystem for a port:
 * allocates the shared UAR and TIS, sets default parameters, creates
 * the sysctl tree, allocates worker/rate tables, seeds the rate table
 * from kernel environment tunables and finally starts the worker
 * threads. Returns 0 if packet pacing is unsupported or on success.
 */
734 mlx5e_rl_init(struct mlx5e_priv *priv)
736 struct mlx5e_rl_priv_data *rl = &priv->rl;
737 struct sysctl_oid *node;
738 struct sysctl_oid *stats;
744 /* check if there is support for packet pacing */
745 if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
750 sysctl_ctx_init(&rl->ctx);
752 sx_init(&rl->rl_sxlock, "ratelimit-sxlock");
754 /* allocate shared UAR for SQs */
755 error = mlx5_alloc_map_uar(priv->mdev, &rl->sq_uar);
759 /* open own TIS domain for ratelimit SQs */
760 error = mlx5e_rl_open_tis(priv);
764 /* setup default value for parameters */
765 mlx5e_rl_set_default_params(&rl->param, priv->mdev);
767 /* update the completion factor */
768 mlx5e_rl_sync_tx_completion_fact(rl);
770 /* create root node */
771 node = SYSCTL_ADD_NODE(&rl->ctx,
772 SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
773 "rate_limit", CTLFLAG_RW, NULL, "Rate limiting support");
/* add one sysctl per tunable parameter; desc table holds name/desc pairs */
777 for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
778 mlx5e_rl_sysctl_add_u64_oid(rl,
779 MLX5E_RL_PARAMS_INDEX(arg[i]),
780 node, mlx5e_rl_params_desc[2 * i],
781 mlx5e_rl_params_desc[2 * i + 1]);
784 stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
785 OID_AUTO, "stats", CTLFLAG_RD, NULL,
786 "Rate limiting statistics");
789 for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
790 mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
791 stats, mlx5e_rl_stats_desc[2 * i],
792 mlx5e_rl_stats_desc[2 * i + 1]);
797 /* allocate workers array */
798 rl->workers = malloc(sizeof(rl->workers[0]) *
799 rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);
801 /* allocate rate limit array */
802 rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
803 rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);
806 /* create more SYSCTls */
807 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
808 "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
809 CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
810 "A", "Show table of all configured TX rates");
812 /* try to fetch rate table from kernel environment */
813 for (i = 0; i != rl->param.tx_rates_def; i++) {
814 /* compute path for tunable */
815 snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
816 device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
817 if (TUNABLE_QUAD_FETCH(buf, &j))
818 mlx5e_rl_tx_limit_add(rl, j);
821 /* setup rate table sysctls */
822 for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
823 mlx5e_rl_sysctl_add_u64_oid(rl,
824 MLX5E_RL_PARAMS_INDEX(table_arg[i]),
825 node, mlx5e_rl_table_params_desc[2 * i],
826 mlx5e_rl_table_params_desc[2 * i + 1]);
/* initialize per-worker state and channel free lists */
830 for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
831 struct mlx5e_rl_worker *rlw = rl->workers + j;
835 cv_init(&rlw->cv, "mlx5-worker-cv");
836 mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
837 STAILQ_INIT(&rlw->index_list_head);
838 STAILQ_INIT(&rlw->process_head);
840 rlw->channels = malloc(sizeof(rlw->channels[0]) *
841 rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);
843 MLX5E_RL_WORKER_LOCK(rlw);
844 for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
845 struct mlx5e_rl_channel *channel = rlw->channels + i;
846 channel->worker = rlw;
847 channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT;
848 STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
850 MLX5E_RL_WORKER_UNLOCK(rlw);
854 error = mlx5e_rl_open_workers(priv);
858 mlx5_en_err(priv->ifp,
859 "mlx5e_rl_open_workers failed: %d\n", error);
/* error unwind: release UAR, sysctl context and the sx lock */
865 mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
867 sysctl_ctx_free(&rl->ctx);
868 sx_destroy(&rl->rl_sxlock);
/*
 * Start one kernel thread per configured worker. The channel
 * parameters are computed once up front and shared by all workers.
 * A thread that fails to start is marked done so teardown skips it.
 */
873 mlx5e_rl_open_workers(struct mlx5e_priv *priv)
875 struct mlx5e_rl_priv_data *rl = &priv->rl;
876 struct thread *rl_thread = NULL;
877 struct proc *rl_proc = NULL;
/* refuse to start when the device is detaching or already opened */
881 if (priv->gone || rl->opened)
885 /* compute channel parameters once */
886 mlx5e_rl_build_channel_param(rl, &rl->chan_param);
887 MLX5E_RL_WUNLOCK(rl);
889 for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
890 struct mlx5e_rl_worker *rlw = rl->workers + j;
892 /* start worker thread */
893 error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
894 RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
896 mlx5_en_err(rl->priv->ifp,
897 "kproc_kthread_add failed: %d\n", error);
898 rlw->worker_done = 1;
/*
 * Stop all worker threads. The teardown is signaled to every worker
 * first (so they shut down in parallel) and then each worker is
 * waited on. A worker clears "worker_done" just before exiting, which
 * is what the second loop waits for.
 */
908 mlx5e_rl_close_workers(struct mlx5e_priv *priv)
910 struct mlx5e_rl_priv_data *rl = &priv->rl;
916 /* tear down worker threads simultaneously */
917 for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
918 struct mlx5e_rl_worker *rlw = rl->workers + y;
920 /* tear down worker before freeing SQs */
921 MLX5E_RL_WORKER_LOCK(rlw);
922 if (rlw->worker_done == 0) {
923 rlw->worker_done = 1;
924 cv_broadcast(&rlw->cv);
926 /* XXX thread not started */
927 rlw->worker_done = 0;
929 MLX5E_RL_WORKER_UNLOCK(rlw);
932 /* wait for worker threads to exit */
933 for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
934 struct mlx5e_rl_worker *rlw = rl->workers + y;
936 /* tear down worker before freeing SQs */
937 MLX5E_RL_WORKER_LOCK(rlw);
938 while (rlw->worker_done != 0)
939 cv_wait(&rlw->cv, &rlw->mtx);
940 MLX5E_RL_WORKER_UNLOCK(rlw);
/* Clear every entry in the software rate limit table under the write lock. */
947 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
952 for (x = 0; x != rl->param.tx_rates_def; x++)
953 rl->rate_limit_table[x] = 0;
954 MLX5E_RL_WUNLOCK(rl);
/*
 * Tear down the rate limit subsystem: destroy the sysctl tree (first,
 * so no handlers race with the teardown), stop workers, reset rates,
 * release UAR and TIS, and free all per-worker and table memory.
 * No-op when packet pacing is unsupported.
 */
958 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
960 struct mlx5e_rl_priv_data *rl = &priv->rl;
963 /* check if there is support for packet pacing */
964 if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
967 /* TODO check if there is support for packet pacing */
969 sysctl_ctx_free(&rl->ctx);
972 mlx5e_rl_close_workers(priv);
975 mlx5e_rl_reset_rates(rl);
977 /* free shared UAR for SQs */
978 mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
980 /* close TIS domain */
981 mlx5e_rl_close_tis(priv);
983 for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
984 struct mlx5e_rl_worker *rlw = rl->workers + y;
986 cv_destroy(&rlw->cv);
987 mtx_destroy(&rlw->mtx);
988 free(rlw->channels, M_MLX5EN);
990 free(rl->rate_limit_table, M_MLX5EN);
991 free(rl->workers, M_MLX5EN);
992 sx_destroy(&rl->rl_sxlock);
/*
 * Queue a channel on the worker's processing list and wake the worker
 * thread. Caller must hold the worker lock.
 */
996 mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
997 struct mlx5e_rl_channel *channel)
999 STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
1000 cv_broadcast(&rlw->cv);
/*
 * Request asynchronous destruction of a rate limit channel. The state
 * is switched to DESTROY; a channel already queued for MODIFY is
 * re-labeled in place, a USED channel is queued for the worker.
 * NULL channels are ignored.
 */
1004 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
1006 if (channel == NULL)
1009 MLX5E_RL_WORKER_LOCK(rlw);
1010 switch (channel->state) {
1011 case MLX5E_RL_ST_MODIFY:
1012 channel->state = MLX5E_RL_ST_DESTROY;
1014 case MLX5E_RL_ST_USED:
1015 channel->state = MLX5E_RL_ST_DESTROY;
1016 mlx5e_rlw_queue_channel_locked(rlw, channel);
1021 MLX5E_RL_WORKER_UNLOCK(rlw);
/*
 * Request an asynchronous rate change for a channel. The new rate is
 * recorded and, if the channel is in use, the channel is queued so
 * the worker thread applies the change.
 */
1025 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
1028 MLX5E_RL_WORKER_LOCK(rlw);
1029 channel->new_rate = rate;
1030 switch (channel->state) {
1031 case MLX5E_RL_ST_USED:
1032 channel->state = MLX5E_RL_ST_MODIFY;
1033 mlx5e_rlw_queue_channel_locked(rlw, channel);
1038 MLX5E_RL_WORKER_UNLOCK(rlw);
/*
 * Report the channel's currently active rate and SQ fill level for
 * the if_snd_tag query interface. Only meaningful in the USED and
 * MODIFY states.
 */
1044 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
1045 union if_snd_tag_query_params *params)
1049 MLX5E_RL_WORKER_LOCK(rlw);
1050 switch (channel->state) {
1051 case MLX5E_RL_ST_USED:
1052 params->rate_limit.max_rate = channel->last_rate;
1053 params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1056 case MLX5E_RL_ST_MODIFY:
1057 params->rate_limit.max_rate = channel->last_rate;
1058 params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1065 MLX5E_RL_WORKER_UNLOCK(rlw);
/*
 * Take a free channel from the worker's free list and mark it USED.
 * Returns 0 and stores the channel in *pchannel on success; returns
 * ENOMEM (and bumps the resource failure counter) when the free list
 * is empty.
 */
1071 mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
1072 struct mlx5e_rl_channel **pchannel)
1074 struct mlx5e_rl_channel *channel;
1075 int retval = ENOMEM;
1077 MLX5E_RL_WORKER_LOCK(rlw);
1078 /* Check for available channel in free list */
1079 if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
1081 /* Remove head index from available list */
1082 STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
1083 channel->state = MLX5E_RL_ST_USED;
1084 atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
1086 atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
1088 MLX5E_RL_WORKER_UNLOCK(rlw);
1090 *pchannel = channel;
1091 #ifdef RATELIMIT_DEBUG
1092 mlx5_en_info(rlw->priv->ifp,
1093 "Channel pointer for rate limit connection is %p\n", channel);
/*
 * if_snd_tag allocation entry point. Picks a worker based on the
 * connection's flow ID, grabs a free channel, requests the initial
 * rate and returns the embedded send tag. EOPNOTSUPP is returned when
 * packet pacing is unavailable or the device is detaching.
 */
1099 mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
1100 union if_snd_tag_alloc_params *params,
1101 struct m_snd_tag **ppmt)
1103 struct mlx5e_rl_channel *channel;
1104 struct mlx5e_rl_worker *rlw;
1105 struct mlx5e_priv *priv;
1108 priv = ifp->if_softc;
1110 /* check if there is support for packet pacing or if device is going away */
1111 if (!MLX5_CAP_GEN(priv->mdev, qos) ||
1112 !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
1113 params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
1114 return (EOPNOTSUPP);
1116 /* compute worker thread this TCP connection belongs to */
1117 rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
1118 priv->rl.param.tx_worker_threads_def);
1120 error = mlx5e_find_available_tx_ring_index(rlw, &channel);
1124 error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
/* undo the channel allocation if setting the initial rate failed */
1126 mlx5e_rl_free(rlw, channel);
1130 /* store pointer to mbuf tag */
1131 MPASS(channel->tag.m_snd_tag.refcount == 0);
1132 m_snd_tag_init(&channel->tag.m_snd_tag, ifp);
1133 *ppmt = &channel->tag.m_snd_tag;
/* if_snd_tag modify callback: forward the new max rate to the channel. */
1140 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
1142 struct mlx5e_rl_channel *channel =
1143 container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
1145 return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
/* if_snd_tag query callback: report the channel's current rate/queue level. */
1149 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
1151 struct mlx5e_rl_channel *channel =
1152 container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
1154 return (mlx5e_rl_query(channel->worker, channel, params));
/* if_snd_tag free callback: schedule asynchronous channel destruction. */
1158 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
1160 struct mlx5e_rl_channel *channel =
1161 container_of(pmt, struct mlx5e_rl_channel, tag.m_snd_tag);
1163 mlx5e_rl_free(channel->worker, channel);
/*
 * Sysctl handler that renders the non-zero entries of the rate limit
 * table as a human-readable text table (entry index, burst, rate).
 */
1167 mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
1169 struct mlx5e_rl_priv_data *rl = arg1;
1170 struct mlx5e_priv *priv = rl->priv;
1175 error = sysctl_wire_old_buffer(req, 0);
1181 sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
1184 "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
1185 "\t" "--------------------------------------------\n");
/* skip unused (zero) rate table entries */
1188 for (x = 0; x != rl->param.tx_rates_def; x++) {
1189 if (rl->rate_limit_table[x] == 0)
1192 sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
1193 x, (unsigned)rl->param.tx_burst_size,
1194 (long long)rl->rate_limit_table[x]);
1196 MLX5E_RL_RUNLOCK(rl);
1198 error = sbuf_finish(&sbuf);
/*
 * Recompute the shared channel parameters and push the updated CQ
 * moderation settings to every open rate limit SQ. Uses the
 * mode-modify firmware command when the device supports changing the
 * moderation mode at runtime.
 */
1207 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
1213 /* compute channel parameters once */
1214 mlx5e_rl_build_channel_param(rl, &rl->chan_param);
1215 MLX5E_RL_WUNLOCK(rl);
1217 for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1218 struct mlx5e_rl_worker *rlw = rl->workers + y;
1220 for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1221 struct mlx5e_rl_channel *channel;
1222 struct mlx5e_sq *sq;
1224 channel = rlw->channels + x;
1230 if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
1231 mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
1232 rl->param.tx_coalesce_usecs,
1233 rl->param.tx_coalesce_pkts,
1234 rl->param.tx_coalesce_mode);
1236 mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
1237 rl->param.tx_coalesce_usecs,
1238 rl->param.tx_coalesce_pkts);
/*
 * Re-evaluate the minimum inline size for every open rate limit SQ,
 * taking each SQ's lock while updating it.
 */
1246 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
1251 for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1252 struct mlx5e_rl_worker *rlw = rl->workers + y;
1254 for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1255 struct mlx5e_rl_channel *channel;
1256 struct mlx5e_sq *sq;
1258 channel = rlw->channels + x;
1264 mtx_lock(&sq->lock);
1265 mlx5e_update_sq_inline(sq);
1266 mtx_unlock(&sq->lock);
/*
 * Add a rate, in bit/s, to the software rate limit table. Rejects
 * values outside the firmware-supported range, ignores duplicates
 * and stores the value in the first free (zero) slot.
 */
1272 mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
1278 mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
1284 /* check if rate already exists */
1285 for (x = 0; x != rl->param.tx_rates_def; x++) {
1286 if (rl->rate_limit_table[x] != value)
1292 /* check if there is a free rate entry */
1293 if (x == rl->param.tx_rates_def) {
1294 for (x = 0; x != rl->param.tx_rates_def; x++) {
1295 if (rl->rate_limit_table[x] != 0)
1297 rl->rate_limit_table[x] = value;
1302 MLX5E_RL_WUNLOCK(rl);
/*
 * Remove a rate, in bit/s, from the software rate limit table by
 * zeroing its entry. An error is indicated when the value was not
 * present.
 */
1308 mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
1318 /* check if rate already exists */
1319 for (x = 0; x != rl->param.tx_rates_def; x++) {
1320 if (rl->rate_limit_table[x] != value)
1323 rl->rate_limit_table[x] = 0;
1327 /* check if there is a free rate entry */
1328 if (x == rl->param.tx_rates_def)
1332 MLX5E_RL_WUNLOCK(rl);
/*
 * Generic sysctl handler for all 64-bit rate limit parameters. "arg2"
 * indexes into rl->param.arg[]. Depending on which parameter is being
 * changed, the new value is clamped against its min/max, and either
 * applied live (CQ moderation), or applied around a worker
 * stop/start cycle (queue size, completion factor, coalesce mode when
 * the hardware cannot modify the mode in place).
 */
1338 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
1340 struct mlx5e_rl_priv_data *rl = arg1;
1341 struct mlx5e_priv *priv = rl->priv;
1342 unsigned mode_modify;
1343 unsigned was_opened;
1351 value = rl->param.arg[arg2];
1352 MLX5E_RL_RUNLOCK(rl);
/* hand the value to userland; bail out early on read or unchanged value */
1356 error = sysctl_handle_64(oidp, &value, 0, req);
1357 if (error || req->newptr == NULL ||
1358 value == rl->param.arg[arg2])
1365 /* check if device is gone */
1370 was_opened = rl->opened;
1371 mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
1373 switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
1374 case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
1375 if (value > rl->param.tx_worker_threads_max)
1376 value = rl->param.tx_worker_threads_max;
1380 /* store new value */
1381 rl->param.arg[arg2] = value;
1384 case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
1385 if (value > rl->param.tx_channels_per_worker_max)
1386 value = rl->param.tx_channels_per_worker_max;
1390 /* store new value */
1391 rl->param.arg[arg2] = value;
1394 case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
1395 if (value > rl->param.tx_rates_max)
1396 value = rl->param.tx_rates_max;
1400 /* store new value */
1401 rl->param.arg[arg2] = value;
1404 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
1408 else if (value > MLX5E_FLD_MAX(cqc, cq_period))
1409 value = MLX5E_FLD_MAX(cqc, cq_period);
1411 /* store new value */
1412 rl->param.arg[arg2] = value;
1414 /* check to avoid down and up the network interface */
1416 error = mlx5e_rl_refresh_channel_params(rl);
1419 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
1420 /* import TX coal pkts */
1423 else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
1424 value = MLX5E_FLD_MAX(cqc, cq_max_count);
1426 /* store new value */
1427 rl->param.arg[arg2] = value;
1429 /* check to avoid down and up the network interface */
1431 error = mlx5e_rl_refresh_channel_params(rl);
1434 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
1435 /* network interface must be down */
1436 if (was_opened != 0 && mode_modify == 0)
1437 mlx5e_rl_close_workers(priv);
1439 /* import TX coalesce mode */
1443 /* store new value */
1444 rl->param.arg[arg2] = value;
1446 /* restart network interface, if any */
1447 if (was_opened != 0) {
1448 if (mode_modify == 0)
1449 mlx5e_rl_open_workers(priv);
1451 error = mlx5e_rl_refresh_channel_params(rl);
1455 case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
1456 /* network interface must be down */
1458 mlx5e_rl_close_workers(priv);
1460 /* import TX queue size */
1461 if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
1462 value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
1463 else if (value > priv->params_ethtool.tx_queue_size_max)
1464 value = priv->params_ethtool.tx_queue_size_max;
1466 /* store actual TX queue size */
/* round up to the next power of two, as required by the hardware WQ */
1467 value = 1ULL << order_base_2(value);
1469 /* store new value */
1470 rl->param.arg[arg2] = value;
1472 /* verify TX completion factor */
1473 mlx5e_rl_sync_tx_completion_fact(rl);
1475 /* restart network interface, if any */
1477 mlx5e_rl_open_workers(priv);
1480 case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
1481 /* network interface must be down */
1483 mlx5e_rl_close_workers(priv);
1485 /* store new value */
1486 rl->param.arg[arg2] = value;
1488 /* verify parameter */
1489 mlx5e_rl_sync_tx_completion_fact(rl);
1491 /* restart network interface, if any */
1493 mlx5e_rl_open_workers(priv);
1496 case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
1497 error = mlx5e_rl_tx_limit_add(rl, value);
1500 case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
1501 error = mlx5e_rl_tx_limit_clr(rl, value);
1504 case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
1506 if (value > rl->param.tx_allowed_deviation_max)
1507 value = rl->param.tx_allowed_deviation_max;
1508 else if (value < rl->param.tx_allowed_deviation_min)
1509 value = rl->param.tx_allowed_deviation_min;
1512 rl->param.arg[arg2] = value;
1513 MLX5E_RL_WUNLOCK(rl);
1516 case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
1518 if (value > rl->param.tx_burst_size_max)
1519 value = rl->param.tx_burst_size_max;
1520 else if (value < rl->param.tx_burst_size_min)
1521 value = rl->param.tx_burst_size_min;
1524 rl->param.arg[arg2] = value;
1525 MLX5E_RL_WUNLOCK(rl);
/*
 * Register one 64-bit parameter sysctl. Access flags are derived from
 * the parameter name: "_max"/"_min" parameters are read-only, "_def"
 * parameters are read-only tunables (exposed only with
 * RATELIMIT_DEBUG), everything else is a read-write tunable.
 */
1537 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1538 struct sysctl_oid *node, const char *name, const char *desc)
1541 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
1542 * take care of loading default sysctl value from the kernel
1543 * environment, if any:
1545 if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
1546 /* read-only SYSCTLs */
1547 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1548 name, CTLTYPE_U64 | CTLFLAG_RD |
1549 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1551 if (strstr(name, "_def") != 0) {
1552 #ifdef RATELIMIT_DEBUG
1553 /* tunable read-only advanced SYSCTLs */
1554 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1555 name, CTLTYPE_U64 | CTLFLAG_RDTUN |
1556 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1559 /* read-write SYSCTLs */
1560 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1561 name, CTLTYPE_U64 | CTLFLAG_RWTUN |
1562 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
/*
 * Register one read-only 64-bit statistics sysctl bound directly to
 * the statistics array entry at index "x".
 */
1568 mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1569 struct sysctl_oid *node, const char *name, const char *desc)
1571 /* read-only SYSCTLs */
1572 SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
1573 CTLFLAG_RD, &rl->stats.arg[x], 0, desc);