2 * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
13 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
/*
 * Forward declarations for file-local (static) helpers defined below.
 * NOTE(review): this chunk looks like an incomplete extraction of the
 * original file; lines appear to be missing between the ones shown.
 */
32 static int mlx5e_rl_open_workers(struct mlx5e_priv *);
33 static void mlx5e_rl_close_workers(struct mlx5e_priv *);
34 static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
35 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
36 struct sysctl_oid *, const char *name, const char *desc);
37 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
38 struct sysctl_oid *node, const char *name, const char *desc);
39 static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
40 static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
/*
 * Fill in the send-queue (SQ) creation parameters shared by all
 * ratelimit channels: log2 WQ size derived from the configured TX queue
 * size, WQ stride, protection domain, and NUMA placement (node 0).
 * NOTE(review): braces/blank lines are missing from this extraction;
 * code tokens below are kept byte-identical.
 */
43 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
44 struct mlx5e_sq_param *param)
46 void *sqc = param->sqc;
47 void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
48 uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
50 MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
51 MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
52 MLX5_SET(wq, wq, pd, rl->priv->pdn);
54 param->wq.buf_numa_node = 0;
55 param->wq.db_numa_node = 0;
/*
 * Fill in the completion-queue (CQ) creation parameters for ratelimit
 * channels: CQ sized to match the SQ, plus interrupt moderation
 * (coalescing) settings. The period mode falls back to EQE-based
 * moderation when the firmware lacks CQE-based support.
 */
60 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
61 struct mlx5e_cq_param *param)
63 void *cqc = param->cqc;
64 uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
66 MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
67 MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
68 MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
70 switch (rl->param.tx_coalesce_mode) {
72 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
/* CQE-based moderation only if the device advertises the capability */
75 if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
76 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
78 MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
/*
 * Build the combined channel parameter set (SQ + CQ) used when opening
 * ratelimit channels; zeroed first so unset fields default to zero.
 */
84 mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
85 struct mlx5e_rl_channel_param *cparam)
87 memset(cparam, 0, sizeof(*cparam));
89 mlx5e_rl_build_sq_param(rl, &cparam->sq);
90 mlx5e_rl_build_cq_param(rl, &cparam->cq);
/*
 * Allocate the software and DMA resources for one ratelimit SQ:
 * busdma tag, cyclic work queue, doorbell pointer, and the SQ
 * descriptor database. Unwinds via the error labels at the bottom.
 * NOTE(review): several lines of the original (e.g. some busdma tag
 * arguments and intermediate error checks) are missing from this
 * extraction; visible tokens are unchanged.
 */
94 mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
95 struct mlx5e_sq_param *param, int ix)
97 struct mlx5_core_dev *mdev = priv->mdev;
98 void *sqc = param->sqc;
99 void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
102 /* Create DMA descriptor TAG */
103 if ((err = -bus_dma_tag_create(
104 bus_get_dma_tag(mdev->pdev->dev.bsddev),
105 1, /* any alignment */
107 BUS_SPACE_MAXADDR, /* lowaddr */
108 BUS_SPACE_MAXADDR, /* highaddr */
109 NULL, NULL, /* filter, filterarg */
110 MLX5E_MAX_TX_PAYLOAD_SIZE, /* maxsize */
111 MLX5E_MAX_TX_MBUF_FRAGS, /* nsegments */
112 MLX5E_MAX_TX_MBUF_SIZE, /* maxsegsize */
114 NULL, NULL, /* lockfunc, lockfuncarg */
/* all ratelimit SQs share one UAR page -- see mlx5e_rl_init() */
119 sq->uar = priv->rl.sq_uar;
121 err = mlx5_wq_cyc_create(mdev, ¶m->wq, sqc_wq, &sq->wq,
124 goto err_free_dma_tag;
126 sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
128 * The sq->bf_buf_size variable is intentionally left zero so
129 * that the doorbell writes will occur at the same memory
133 err = mlx5e_alloc_sq_db(sq);
135 goto err_sq_wq_destroy;
137 sq->mkey_be = cpu_to_be32(priv->mr.key);
141 mlx5e_update_sq_inline(sq);
/* error unwind: destroy in reverse order of creation */
146 mlx5_wq_destroy(&sq->wq_ctrl);
148 bus_dma_tag_destroy(sq->dma_tag);
/* Free the SQ descriptor database and destroy its work queue. */
154 mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
157 mlx5e_free_sq_db(sq);
158 mlx5_wq_destroy(&sq->wq_ctrl);
/*
 * Create an SQ, enable it in firmware on the ratelimit TIS, and move
 * it from RST to RDY state. The trailing calls are the error-unwind
 * path (disable, then destroy).
 */
162 mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
163 struct mlx5e_sq_param *param, int ix)
167 err = mlx5e_rl_create_sq(priv, sq, param, ix);
171 err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
175 err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
182 mlx5e_disable_sq(sq);
184 mlx5e_rl_destroy_sq(sq);
/*
 * Initialize the per-SQ mutexes, the completion-event callout, and the
 * TX completion-event factor for a ratelimit channel.
 */
190 mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
192 mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
193 mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
195 callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
197 sq->cev_factor = priv->rl.param.tx_completion_fact;
199 /* ensure the TX completion event factor is not zero */
200 if (sq->cev_factor == 0)
/*
 * Allocate and open one ratelimit channel (CQ + SQ) on completion
 * vector "eq_ix" and publish the SQ pointer through *ppsq. On failure
 * the partially-created resources are torn down and the allocation
 * failure counter is bumped.
 */
205 mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
206 struct mlx5e_rl_channel_param *cparam,
207 struct mlx5e_sq *volatile *ppsq)
209 struct mlx5e_priv *priv = rlw->priv;
213 sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
216 mlx5e_rl_chan_mtx_init(priv, sq);
218 /* open TX completion queue */
219 err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
220 &mlx5e_tx_cq_comp, eq_ix);
224 err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
226 goto err_close_tx_cq;
228 /* store TX channel pointer */
231 /* poll TX queue initially */
232 sq->cq.mcq.comp(&sq->cq.mcq);
/* error unwind */
237 mlx5e_close_cq(&sq->cq);
240 /* destroy mutexes */
241 mtx_destroy(&sq->lock);
242 mtx_destroy(&sq->comp_lock);
244 atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
/*
 * Tear down a ratelimit channel: clear the published SQ pointer,
 * disable and destroy the SQ, close the CQ, and destroy the mutexes.
 * A NULL *ppsq means the channel is already closed.
 */
249 mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
251 struct mlx5e_sq *sq = *ppsq;
253 /* check if channel is already closed */
256 /* ensure channel pointer is no longer used */
259 /* teardown and destroy SQ */
261 mlx5e_disable_sq(sq);
262 mlx5e_rl_destroy_sq(sq);
265 mlx5e_close_cq(&sq->cq);
267 /* destroy mutexes */
268 mtx_destroy(&sq->lock);
269 mtx_destroy(&sq->comp_lock);
/*
 * Recompute the valid range for the TX completion factor after the TX
 * queue size changed, and clamp the current factor into [1, max].
 */
275 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
278 * Limit the maximum distance between completion events to
279 * half of the currently set TX queue size.
281 * The maximum number of queue entries a single IP packet can
282 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
284 * The worst case max value is then given as below:
286 uint64_t max = rl->param.tx_queue_size /
287 (2 * MLX5_SEND_WQE_MAX_WQEBBS);
290 * Update the maximum completion factor value in case the
291 * tx_queue_size field changed. Ensure we don't overflow
/* the factor is stored in a 16-bit firmware field, hence 65535 */
296 else if (max > 65535)
298 rl->param.tx_completion_fact_max = max;
301 * Verify that the current TX completion factor is within the
304 if (rl->param.tx_completion_fact < 1)
305 rl->param.tx_completion_fact = 1;
306 else if (rl->param.tx_completion_fact > max)
307 rl->param.tx_completion_fact = max;
/*
 * Point an already-RDY SQ at a firmware rate-limit table entry by
 * issuing a MODIFY_SQ command with the packet-pacing index set.
 */
311 mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
313 struct mlx5e_priv *priv = sq->priv;
314 struct mlx5_core_dev *mdev = priv->mdev;
321 inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
322 in = mlx5_vzalloc(inlen);
326 sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);
328 MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
329 MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
330 MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
331 MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
332 MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);
334 err = mlx5_core_modify_sq(mdev, in, inlen);
342 * This function will search the configured rate limit table for the
343 * best match to avoid that a single socket based application can
344 * allocate all the available hardware rates. If the user selected
345 * rate deviates too much from the closes rate available in the rate
346 * limit table, unlimited rate will be selected.
/*
 * Returns the chosen table rate in bit/s, or 0 for "unlimited".
 * Caller must hold the rate-limit lock (hence the _locked suffix).
 */
349 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
351 uint64_t distance = -1ULL;
353 uint64_t retval = 0; /* unlimited */
356 /* search for closest rate */
357 for (x = 0; x != rl->param.tx_rates_def; x++) {
358 uint64_t rate = rl->rate_limit_table[x];
/* absolute difference between table entry and requested rate */
362 if (rate > user_rate)
363 diff = rate - user_rate;
365 diff = user_rate - rate;
367 /* check if distance is smaller than previous rate */
368 if (diff < distance) {
374 /* range check for multiplication below */
375 if (user_rate > rl->param.tx_limit_max)
376 user_rate = rl->param.tx_limit_max;
378 /* fallback to unlimited, if rate deviates too much */
/* tx_allowed_deviation is in units of 0.1% (permille) */
379 if (distance > howmany(user_rate *
380 rl->param.tx_allowed_deviation, 1000ULL))
387 * This function sets the requested rate for a rate limit channel, in
388 * bits per second. The requested rate will be filtered through the
389 * find best rate function above.
/*
 * Called with the worker lock held; the lock is temporarily dropped
 * around firmware calls. Swaps the channel's (rate, burst) pair,
 * releases the reference on the previous firmware rate, and reprograms
 * the SQ if it is running. NOTE(review): some error-path lines are
 * missing from this extraction.
 */
392 mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
393 struct mlx5e_rl_channel *channel, uint64_t rate)
395 struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
403 MLX5E_RL_WORKER_UNLOCK(rlw);
407 /* get current burst size in bytes */
408 temp = rl->param.tx_burst_size *
409 MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);
411 /* limit burst size to 64K currently */
417 rate = mlx5e_rl_find_best_rate_locked(rl, rate);
419 MLX5E_RL_RUNLOCK(rl);
422 /* rate doesn't exist, fallback to unlimited */
425 atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
427 /* get a reference on the new rate */
/* firmware rates are programmed in units of 1000 bit/s */
428 error = -mlx5_rl_add_rate(rlw->priv->mdev,
429 howmany(rate, 1000), burst, &index);
432 /* adding rate failed, fallback to unlimited */
435 atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
438 MLX5E_RL_WORKER_LOCK(rlw);
441 burst = 0; /* default */
444 /* atomically swap rates */
445 temp = channel->last_rate;
446 channel->last_rate = rate;
449 /* atomically swap burst size */
450 temp = channel->last_burst;
451 channel->last_burst = burst;
454 MLX5E_RL_WORKER_UNLOCK(rlw);
455 /* put reference on the old rate, if any */
457 mlx5_rl_remove_rate(rlw->priv->mdev,
458 howmany(rate, 1000), burst);
461 /* set new rate, if SQ is running */
463 if (sq != NULL && READ_ONCE(sq->running) != 0) {
464 error = mlx5e_rl_modify_sq(sq, index);
466 atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
469 MLX5E_RL_WORKER_LOCK(rlw);
/*
 * Per-worker kernel thread. Opens this worker's share of SQ channels,
 * then loops servicing MODIFY/DESTROY requests queued on process_head
 * until worker_done is set, and finally closes all channels and signals
 * completion via the condition variable. The worker lock protects the
 * channel lists and is dropped around firmware/channel operations.
 * NOTE(review): loop braces and several lines are missing from this
 * extraction; visible tokens are unchanged.
 */
475 mlx5e_rl_worker(void *arg)
478 struct mlx5e_rl_worker *rlw = arg;
479 struct mlx5e_rl_channel *channel;
480 struct mlx5e_priv *priv;
485 /* set thread priority */
489 sched_prio(td, PI_SWI(SWI_NET));
494 /* compute completion vector */
495 ix = (rlw - priv->rl.workers) %
496 priv->mdev->priv.eq_table.num_comp_vectors;
498 /* TODO bind to CPU */
500 /* open all the SQs */
501 MLX5E_RL_WORKER_LOCK(rlw);
502 for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
503 struct mlx5e_rl_channel *channel = rlw->channels + x;
505 #if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
506 if (channel->state == MLX5E_RL_ST_FREE)
509 MLX5E_RL_WORKER_UNLOCK(rlw);
511 MLX5E_RL_RLOCK(&priv->rl);
512 error = mlx5e_rl_open_channel(rlw, ix,
513 &priv->rl.chan_param, &channel->sq);
514 MLX5E_RL_RUNLOCK(&priv->rl);
516 MLX5E_RL_WORKER_LOCK(rlw);
519 "mlx5e_rl_open_channel failed: %d\n", error);
522 mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
/* main service loop: wait for work or teardown */
525 if (STAILQ_FIRST(&rlw->process_head) == NULL) {
526 /* check if we are tearing down */
527 if (rlw->worker_done != 0)
529 cv_wait(&rlw->cv, &rlw->mtx);
531 /* check if we are tearing down */
532 if (rlw->worker_done != 0)
534 channel = STAILQ_FIRST(&rlw->process_head);
535 if (channel != NULL) {
536 STAILQ_REMOVE_HEAD(&rlw->process_head, entry);
538 switch (channel->state) {
539 case MLX5E_RL_ST_MODIFY:
540 channel->state = MLX5E_RL_ST_USED;
541 MLX5E_RL_WORKER_UNLOCK(rlw);
543 /* create channel by demand */
544 if (channel->sq == NULL) {
545 MLX5E_RL_RLOCK(&priv->rl);
546 error = mlx5e_rl_open_channel(rlw, ix,
547 &priv->rl.chan_param, &channel->sq);
548 MLX5E_RL_RUNLOCK(&priv->rl);
552 "mlx5e_rl_open_channel failed: %d\n", error);
554 atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
557 mlx5e_resume_sq(channel->sq);
560 MLX5E_RL_WORKER_LOCK(rlw);
561 /* convert from bytes/s to bits/s and set new rate */
562 error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
563 channel->new_rate * 8ULL);
566 "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
571 case MLX5E_RL_ST_DESTROY:
/* rate 0 == unlimited; drops the firmware rate reference */
572 error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
575 "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
578 if (channel->sq != NULL) {
580 * Make sure all packets are
581 * transmitted before SQ is
582 * returned to free list:
584 MLX5E_RL_WORKER_UNLOCK(rlw);
585 mlx5e_drain_sq(channel->sq);
586 MLX5E_RL_WORKER_LOCK(rlw);
588 /* put the channel back into the free list */
589 STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
590 channel->state = MLX5E_RL_ST_FREE;
591 atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
600 /* close all the SQs */
601 for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
602 struct mlx5e_rl_channel *channel = rlw->channels + x;
604 /* update the initial rate */
605 channel->init_rate = channel->last_rate;
607 /* make sure we free up the rate resource */
608 mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
610 if (channel->sq != NULL) {
611 MLX5E_RL_WORKER_UNLOCK(rlw);
612 mlx5e_rl_close_channel(&channel->sq);
613 atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
614 MLX5E_RL_WORKER_LOCK(rlw);
/* signal mlx5e_rl_close_workers() that this thread has exited */
618 rlw->worker_done = 0;
619 cv_broadcast(&rlw->cv);
620 MLX5E_RL_WORKER_UNLOCK(rlw);
/*
 * Create a dedicated TIS (transport interface send) object for the
 * ratelimit SQs in the device's transport domain; the TIS number is
 * stored in priv->rl.tisn.
 */
626 mlx5e_rl_open_tis(struct mlx5e_priv *priv)
628 struct mlx5_core_dev *mdev = priv->mdev;
629 u32 in[MLX5_ST_SZ_DW(create_tis_in)];
630 void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
632 memset(in, 0, sizeof(in));
634 MLX5_SET(tisc, tisc, prio, 0);
635 MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
637 return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
/* Destroy the ratelimit TIS created by mlx5e_rl_open_tis(). */
641 mlx5e_rl_close_tis(struct mlx5e_priv *priv)
643 mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn);
/*
 * Populate the ratelimit parameter structure with defaults derived
 * from device capabilities (completion vectors, firmware rate-table
 * limits) and compile-time bounds, prior to sysctl exposure.
 */
647 mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
648 struct mlx5_core_dev *mdev)
650 /* ratelimit workers */
651 param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
652 param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;
655 if (param->tx_worker_threads_def == 0 ||
656 param->tx_worker_threads_def > param->tx_worker_threads_max)
657 param->tx_worker_threads_def = param->tx_worker_threads_max;
659 /* ratelimit channels */
660 param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
661 param->tx_worker_threads_def;
662 param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;
665 if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
666 param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;
668 /* set default burst size */
669 param->tx_burst_size = 4; /* MTUs */
672 * Set maximum burst size
674 * The burst size is multiplied by the MTU and clamped to the
675 * range 0 ... 65535 bytes inclusivly before fed into the
678 * NOTE: If the burst size or MTU is changed only ratelimit
679 * connections made after the change will use the new burst
682 param->tx_burst_size_max = 255;
684 /* get firmware rate limits in 1000bit/s and convert them to bit/s */
685 param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
686 param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;
688 /* ratelimit table size */
689 param->tx_rates_max = mdev->priv.rl_table.max_size;
692 if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
693 param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;
695 /* set default number of rates */
696 param->tx_rates_def = param->tx_rates_max;
698 /* set maximum allowed rate deviation */
699 if (param->tx_limit_max != 0) {
701 * Make sure the deviation multiplication doesn't
702 * overflow unsigned 64-bit:
704 param->tx_allowed_deviation_max = -1ULL /
707 /* set default rate deviation */
708 param->tx_allowed_deviation = 50; /* 5.0% */
710 /* channel parameters */
711 param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
712 param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
713 param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
714 param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
715 param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
/*
 * Name/description string tables for the sysctl nodes, generated from
 * the parameter/statistics macro lists (two strings per entry).
 */
718 static const char *mlx5e_rl_params_desc[] = {
719 MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
722 static const char *mlx5e_rl_table_params_desc[] = {
723 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
726 static const char *mlx5e_rl_stats_desc[] = {
727 MLX5E_RL_STATS(MLX5E_STATS_DESC)
/*
 * One-time initialization of the ratelimit subsystem for a port:
 * verifies packet-pacing support, allocates the shared UAR and TIS,
 * sets defaults, builds the sysctl tree, allocates worker/rate tables,
 * seeds the rate table from kernel environment tunables, initializes
 * each worker's locks and channel free list, and finally starts the
 * worker threads. The trailing cleanup lines are the error-unwind
 * path. NOTE(review): several lines (returns, labels) are missing
 * from this extraction; visible tokens are unchanged.
 */
731 mlx5e_rl_init(struct mlx5e_priv *priv)
733 struct mlx5e_rl_priv_data *rl = &priv->rl;
734 struct sysctl_oid *node;
735 struct sysctl_oid *stats;
741 /* check if there is support for packet pacing */
742 if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
747 sysctl_ctx_init(&rl->ctx);
749 sx_init(&rl->rl_sxlock, "ratelimit-sxlock");
751 /* allocate shared UAR for SQs */
752 error = mlx5_alloc_map_uar(priv->mdev, &rl->sq_uar);
756 /* open own TIS domain for ratelimit SQs */
757 error = mlx5e_rl_open_tis(priv);
761 /* setup default value for parameters */
762 mlx5e_rl_set_default_params(&rl->param, priv->mdev);
764 /* update the completion factor */
765 mlx5e_rl_sync_tx_completion_fact(rl);
767 /* create root node */
768 node = SYSCTL_ADD_NODE(&rl->ctx,
769 SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
770 "rate_limit", CTLFLAG_RW, NULL, "Rate limiting support");
774 for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
775 mlx5e_rl_sysctl_add_u64_oid(rl,
776 MLX5E_RL_PARAMS_INDEX(arg[i]),
777 node, mlx5e_rl_params_desc[2 * i],
778 mlx5e_rl_params_desc[2 * i + 1]);
781 stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
782 OID_AUTO, "stats", CTLFLAG_RD, NULL,
783 "Rate limiting statistics");
786 for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
787 mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
788 stats, mlx5e_rl_stats_desc[2 * i],
789 mlx5e_rl_stats_desc[2 * i + 1]);
794 /* allocate workers array */
795 rl->workers = malloc(sizeof(rl->workers[0]) *
796 rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);
798 /* allocate rate limit array */
799 rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
800 rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);
803 /* create more SYSCTls */
804 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
805 "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
806 CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
807 "A", "Show table of all configured TX rates");
809 /* try to fetch rate table from kernel environment */
810 for (i = 0; i != rl->param.tx_rates_def; i++) {
811 /* compute path for tunable */
812 snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
813 device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
814 if (TUNABLE_QUAD_FETCH(buf, &j))
815 mlx5e_rl_tx_limit_add(rl, j);
818 /* setup rate table sysctls */
819 for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
820 mlx5e_rl_sysctl_add_u64_oid(rl,
821 MLX5E_RL_PARAMS_INDEX(table_arg[i]),
822 node, mlx5e_rl_table_params_desc[2 * i],
823 mlx5e_rl_table_params_desc[2 * i + 1]);
/* per-worker state: CV, mutex, channel array and free list */
827 for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
828 struct mlx5e_rl_worker *rlw = rl->workers + j;
832 cv_init(&rlw->cv, "mlx5-worker-cv");
833 mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
834 STAILQ_INIT(&rlw->index_list_head);
835 STAILQ_INIT(&rlw->process_head);
837 rlw->channels = malloc(sizeof(rlw->channels[0]) *
838 rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);
840 MLX5E_RL_WORKER_LOCK(rlw);
841 for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
842 struct mlx5e_rl_channel *channel = rlw->channels + i;
843 channel->worker = rlw;
844 channel->m_snd_tag.ifp = priv->ifp;
845 STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
847 MLX5E_RL_WORKER_UNLOCK(rlw);
851 error = mlx5e_rl_open_workers(priv);
856 "mlx5e_rl_open_workers failed: %d\n", error);
/* error unwind */
862 mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
864 sysctl_ctx_free(&rl->ctx);
865 sx_destroy(&rl->rl_sxlock);
/*
 * Start one kernel thread per configured worker. Skipped when the
 * device is going away or the workers are already running. Channel
 * parameters are computed once under the write lock before the
 * threads start.
 */
870 mlx5e_rl_open_workers(struct mlx5e_priv *priv)
872 struct mlx5e_rl_priv_data *rl = &priv->rl;
873 struct thread *rl_thread = NULL;
874 struct proc *rl_proc = NULL;
878 if (priv->gone || rl->opened)
882 /* compute channel parameters once */
883 mlx5e_rl_build_channel_param(rl, &rl->chan_param);
884 MLX5E_RL_WUNLOCK(rl);
886 for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
887 struct mlx5e_rl_worker *rlw = rl->workers + j;
889 /* start worker thread */
890 error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
891 RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
893 if_printf(rl->priv->ifp,
894 "kproc_kthread_add failed: %d\n", error);
/* mark as done so teardown doesn't wait on a thread that never ran */
895 rlw->worker_done = 1;
/*
 * Two-phase worker shutdown: first signal every worker thread to exit
 * (set worker_done and broadcast), then wait for each to clear
 * worker_done again, which mlx5e_rl_worker() does on its way out.
 */
905 mlx5e_rl_close_workers(struct mlx5e_priv *priv)
907 struct mlx5e_rl_priv_data *rl = &priv->rl;
913 /* tear down worker threads simultaneously */
914 for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
915 struct mlx5e_rl_worker *rlw = rl->workers + y;
917 /* tear down worker before freeing SQs */
918 MLX5E_RL_WORKER_LOCK(rlw);
919 if (rlw->worker_done == 0) {
920 rlw->worker_done = 1;
921 cv_broadcast(&rlw->cv);
923 /* XXX thread not started */
924 rlw->worker_done = 0;
926 MLX5E_RL_WORKER_UNLOCK(rlw);
929 /* wait for worker threads to exit */
930 for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
931 struct mlx5e_rl_worker *rlw = rl->workers + y;
933 /* tear down worker before freeing SQs */
934 MLX5E_RL_WORKER_LOCK(rlw);
935 while (rlw->worker_done != 0)
936 cv_wait(&rlw->cv, &rlw->mtx);
937 MLX5E_RL_WORKER_UNLOCK(rlw);
/* Zero every entry of the rate-limit table under the write lock. */
944 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
949 for (x = 0; x != rl->param.tx_rates_def; x++)
950 rl->rate_limit_table[x] = 0;
951 MLX5E_RL_WUNLOCK(rl);
955 mlx5e_rl_cleanup(struct mlx5e_priv *priv)
957 struct mlx5e_rl_priv_data *rl = &priv->rl;
960 /* check if there is support for packet pacing */
961 if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
964 /* TODO check if there is support for packet pacing */
966 sysctl_ctx_free(&rl->ctx);
969 mlx5e_rl_close_workers(priv);
972 mlx5e_rl_reset_rates(rl);
974 /* free shared UAR for SQs */
975 mlx5_unmap_free_uar(priv->mdev, &rl->sq_uar);
977 /* close TIS domain */
978 mlx5e_rl_close_tis(priv);
980 for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
981 struct mlx5e_rl_worker *rlw = rl->workers + y;
983 cv_destroy(&rlw->cv);
984 mtx_destroy(&rlw->mtx);
985 free(rlw->channels, M_MLX5EN);
987 free(rl->rate_limit_table, M_MLX5EN);
988 free(rl->workers, M_MLX5EN);
989 sx_destroy(&rl->rl_sxlock);
/*
 * Queue a channel on the worker's work list and wake the worker
 * thread. Caller must hold the worker lock.
 */
993 mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
994 struct mlx5e_rl_channel *channel)
996 STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
997 cv_broadcast(&rlw->cv);
/*
 * Release a ratelimit channel: mark it for destruction and hand it to
 * the worker thread, which drains the SQ and returns the channel to
 * the free list. Tolerates a NULL channel.
 */
1001 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
1003 if (channel == NULL)
1006 MLX5E_RL_WORKER_LOCK(rlw);
1007 switch (channel->state) {
1008 case MLX5E_RL_ST_MODIFY:
/* already queued for the worker; just retarget the state */
1009 channel->state = MLX5E_RL_ST_DESTROY;
1011 case MLX5E_RL_ST_USED:
1012 channel->state = MLX5E_RL_ST_DESTROY;
1013 mlx5e_rlw_queue_channel_locked(rlw, channel);
1018 MLX5E_RL_WORKER_UNLOCK(rlw);
/*
 * Request a rate change (bytes/s) for a channel; the actual firmware
 * update is performed asynchronously by the worker thread.
 */
1022 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
1025 MLX5E_RL_WORKER_LOCK(rlw);
1026 channel->new_rate = rate;
1027 switch (channel->state) {
1028 case MLX5E_RL_ST_USED:
1029 channel->state = MLX5E_RL_ST_MODIFY;
1030 mlx5e_rlw_queue_channel_locked(rlw, channel);
1035 MLX5E_RL_WORKER_UNLOCK(rlw);
/*
 * Report the channel's currently applied rate via *prate. Only valid
 * in the USED state; a pending MODIFY means the answer is not yet
 * settled.
 */
1041 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t *prate)
1045 MLX5E_RL_WORKER_LOCK(rlw);
1046 switch (channel->state) {
1047 case MLX5E_RL_ST_USED:
1048 *prate = channel->last_rate;
1051 case MLX5E_RL_ST_MODIFY:
1058 MLX5E_RL_WORKER_UNLOCK(rlw);
/*
 * Pop a free channel from the worker's free list and mark it USED.
 * Returns ENOMEM (and bumps the availability-failure counter) when no
 * channel is free; on success the channel is returned via *pchannel.
 */
1064 mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
1065 struct mlx5e_rl_channel **pchannel)
1067 struct mlx5e_rl_channel *channel;
1068 int retval = ENOMEM;
1070 MLX5E_RL_WORKER_LOCK(rlw);
1071 /* Check for available channel in free list */
1072 if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
1074 /* Remove head index from available list */
1075 STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
1076 channel->state = MLX5E_RL_ST_USED;
1077 atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
1079 atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
1081 MLX5E_RL_WORKER_UNLOCK(rlw);
1083 *pchannel = channel;
1084 #ifdef RATELIMIT_DEBUG
1085 if_printf(rlw->priv->ifp, "Channel pointer for rate limit connection is %p\n", channel);
/*
 * if_snd_tag_alloc() entry point for RATE_LIMIT tags: pick a worker by
 * flow ID, grab a free channel, set the requested rate, and hand back
 * the embedded mbuf send tag. Fails with EOPNOTSUPP when packet
 * pacing is unsupported, the device is detaching, or the tag type is
 * wrong.
 */
1091 mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
1092 union if_snd_tag_alloc_params *params,
1093 struct m_snd_tag **ppmt)
1095 struct mlx5e_rl_channel *channel;
1096 struct mlx5e_rl_worker *rlw;
1097 struct mlx5e_priv *priv;
1100 priv = ifp->if_softc;
1102 /* check if there is support for packet pacing or if device is going away */
1103 if (!MLX5_CAP_GEN(priv->mdev, qos) ||
1104 !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
1105 params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
1106 return (EOPNOTSUPP);
1108 /* compute worker thread this TCP connection belongs to */
1109 rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
1110 priv->rl.param.tx_worker_threads_def);
1112 error = mlx5e_find_available_tx_ring_index(rlw, &channel);
1116 error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
/* on modify failure release the channel we just claimed */
1118 mlx5e_rl_free(rlw, channel);
1122 /* store pointer to mbuf tag */
1123 *ppmt = &channel->m_snd_tag;
/* if_snd_tag_modify() entry point: forward the new max_rate to the channel. */
1130 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
1132 struct mlx5e_rl_channel *channel =
1133 container_of(pmt, struct mlx5e_rl_channel, m_snd_tag);
1135 return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
/* if_snd_tag_query() entry point: report the channel's current rate. */
1139 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
1141 struct mlx5e_rl_channel *channel =
1142 container_of(pmt, struct mlx5e_rl_channel, m_snd_tag);
1144 return (mlx5e_rl_query(channel->worker, channel, ¶ms->rate_limit.max_rate));
/* if_snd_tag_free() entry point: return the channel to the worker's free list. */
1148 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
1150 struct mlx5e_rl_channel *channel =
1151 container_of(pmt, struct mlx5e_rl_channel, m_snd_tag);
1153 mlx5e_rl_free(channel->worker, channel);
/*
 * Sysctl handler for "tx_rate_show": formats the non-zero entries of
 * the rate table (index, burst, rate in bit/s) into an sbuf for
 * userland consumption. The table is walked under the read lock.
 */
1157 mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
1159 struct mlx5e_rl_priv_data *rl = arg1;
1160 struct mlx5e_priv *priv = rl->priv;
1165 error = sysctl_wire_old_buffer(req, 0);
1171 sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);
1174 "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
1175 "\t" "--------------------------------------------\n");
1178 for (x = 0; x != rl->param.tx_rates_def; x++) {
1179 if (rl->rate_limit_table[x] == 0)
1182 sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
1183 x, (unsigned)rl->param.tx_burst_size,
1184 (long long)rl->rate_limit_table[x]);
1186 MLX5E_RL_RUNLOCK(rl);
1188 error = sbuf_finish(&sbuf);
/*
 * Re-apply coalescing parameters to every open channel's CQ after a
 * sysctl change, using the mode-modify firmware command when the
 * device supports it and the plain moderation command otherwise.
 */
1197 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
1203 /* compute channel parameters once */
1204 mlx5e_rl_build_channel_param(rl, &rl->chan_param);
1205 MLX5E_RL_WUNLOCK(rl);
1207 for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1208 struct mlx5e_rl_worker *rlw = rl->workers + y;
1210 for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1211 struct mlx5e_rl_channel *channel;
1212 struct mlx5e_sq *sq;
1214 channel = rlw->channels + x;
1220 if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
1221 mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
1222 rl->param.tx_coalesce_usecs,
1223 rl->param.tx_coalesce_pkts,
1224 rl->param.tx_coalesce_mode);
1226 mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
1227 rl->param.tx_coalesce_usecs,
1228 rl->param.tx_coalesce_pkts);
/*
 * Recompute the inline-header mode for every open ratelimit SQ,
 * taking each SQ's lock around the update.
 */
1236 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
1241 for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1242 struct mlx5e_rl_worker *rlw = rl->workers + y;
1244 for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1245 struct mlx5e_rl_channel *channel;
1246 struct mlx5e_sq *sq;
1248 channel = rlw->channels + x;
1254 mtx_lock(&sq->lock);
1255 mlx5e_update_sq_inline(sq);
1256 mtx_unlock(&sq->lock);
/*
 * Add a rate (bit/s) to the software rate table: reject values the
 * firmware cannot represent, skip duplicates, then claim the first
 * empty slot under the write lock.
 * NOTE(review): range-check and early-return lines are missing from
 * this extraction; visible tokens are unchanged.
 */
1262 mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
1268 mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
1274 /* check if rate already exists */
1275 for (x = 0; x != rl->param.tx_rates_def; x++) {
1276 if (rl->rate_limit_table[x] != value)
1282 /* check if there is a free rate entry */
1283 if (x == rl->param.tx_rates_def) {
1284 for (x = 0; x != rl->param.tx_rates_def; x++) {
1285 if (rl->rate_limit_table[x] != 0)
1287 rl->rate_limit_table[x] = value;
1292 MLX5E_RL_WUNLOCK(rl);
/*
 * Remove a rate (bit/s) from the software rate table by zeroing its
 * entry under the write lock; reaching the end of the table means the
 * value was not present.
 */
1298 mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
1308 /* check if rate already exists */
1309 for (x = 0; x != rl->param.tx_rates_def; x++) {
1310 if (rl->rate_limit_table[x] != value)
1313 rl->rate_limit_table[x] = 0;
1317 /* check if there is a free rate entry */
1318 if (x == rl->param.tx_rates_def)
1322 MLX5E_RL_WUNLOCK(rl);
/*
 * Shared sysctl handler for all 64-bit ratelimit parameters; arg2
 * indexes rl->param.arg[]. Reads return the current value; writes
 * clamp the new value to its legal range and, for parameters that
 * affect live channels, either refresh the channel parameters in
 * place or restart the worker threads around the change.
 * NOTE(review): several case labels, break statements and locking
 * lines are missing from this extraction; visible tokens are
 * unchanged.
 */
1328 mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
1330 struct mlx5e_rl_priv_data *rl = arg1;
1331 struct mlx5e_priv *priv = rl->priv;
1332 unsigned mode_modify;
1333 unsigned was_opened;
1341 value = rl->param.arg[arg2];
1342 MLX5E_RL_RUNLOCK(rl);
1346 error = sysctl_handle_64(oidp, &value, 0, req);
/* read-only access, handler error, or unchanged value: nothing to do */
1347 if (error || req->newptr == NULL ||
1348 value == rl->param.arg[arg2])
1355 /* check if device is gone */
1360 was_opened = rl->opened;
1361 mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);
1363 switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
1364 case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
1365 if (value > rl->param.tx_worker_threads_max)
1366 value = rl->param.tx_worker_threads_max;
1370 /* store new value */
1371 rl->param.arg[arg2] = value;
1374 case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
1375 if (value > rl->param.tx_channels_per_worker_max)
1376 value = rl->param.tx_channels_per_worker_max;
1380 /* store new value */
1381 rl->param.arg[arg2] = value;
1384 case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
1385 if (value > rl->param.tx_rates_max)
1386 value = rl->param.tx_rates_max;
1390 /* store new value */
1391 rl->param.arg[arg2] = value;
1394 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
1398 else if (value > MLX5E_FLD_MAX(cqc, cq_period))
1399 value = MLX5E_FLD_MAX(cqc, cq_period);
1401 /* store new value */
1402 rl->param.arg[arg2] = value;
1404 /* check to avoid down and up the network interface */
1406 error = mlx5e_rl_refresh_channel_params(rl);
1409 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
1410 /* import TX coal pkts */
1413 else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
1414 value = MLX5E_FLD_MAX(cqc, cq_max_count);
1416 /* store new value */
1417 rl->param.arg[arg2] = value;
1419 /* check to avoid down and up the network interface */
1421 error = mlx5e_rl_refresh_channel_params(rl);
1424 case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
1425 /* network interface must be down */
1426 if (was_opened != 0 && mode_modify == 0)
1427 mlx5e_rl_close_workers(priv);
1429 /* import TX coalesce mode */
1433 /* store new value */
1434 rl->param.arg[arg2] = value;
1436 /* restart network interface, if any */
1437 if (was_opened != 0) {
1438 if (mode_modify == 0)
1439 mlx5e_rl_open_workers(priv);
1441 error = mlx5e_rl_refresh_channel_params(rl);
1445 case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
1446 /* network interface must be down */
1448 mlx5e_rl_close_workers(priv);
1450 /* import TX queue size */
1451 if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
1452 value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
1453 else if (value > priv->params_ethtool.tx_queue_size_max)
1454 value = priv->params_ethtool.tx_queue_size_max;
1456 /* store actual TX queue size */
1457 value = 1ULL << order_base_2(value);
1459 /* store new value */
1460 rl->param.arg[arg2] = value;
1462 /* verify TX completion factor */
1463 mlx5e_rl_sync_tx_completion_fact(rl);
1465 /* restart network interface, if any */
1467 mlx5e_rl_open_workers(priv);
1470 case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
1471 /* network interface must be down */
1473 mlx5e_rl_close_workers(priv);
1475 /* store new value */
1476 rl->param.arg[arg2] = value;
1478 /* verify parameter */
1479 mlx5e_rl_sync_tx_completion_fact(rl);
1481 /* restart network interface, if any */
1483 mlx5e_rl_open_workers(priv);
1486 case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
1487 error = mlx5e_rl_tx_limit_add(rl, value);
1490 case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
1491 error = mlx5e_rl_tx_limit_clr(rl, value);
1494 case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
1496 if (value > rl->param.tx_allowed_deviation_max)
1497 value = rl->param.tx_allowed_deviation_max;
1498 else if (value < rl->param.tx_allowed_deviation_min)
1499 value = rl->param.tx_allowed_deviation_min;
1502 rl->param.arg[arg2] = value;
1503 MLX5E_RL_WUNLOCK(rl);
1506 case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
1508 if (value > rl->param.tx_burst_size_max)
1509 value = rl->param.tx_burst_size_max;
1510 else if (value < rl->param.tx_burst_size_min)
1511 value = rl->param.tx_burst_size_min;
1514 rl->param.arg[arg2] = value;
1515 MLX5E_RL_WUNLOCK(rl);
/*
 * Register one 64-bit parameter sysctl under "node". Access flags are
 * chosen from the name: "_max"/"_min" parameters are read-only,
 * "_def" parameters are read-only tunables (exposed only with
 * RATELIMIT_DEBUG), everything else is read-write tunable.
 */
1527 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1528 struct sysctl_oid *node, const char *name, const char *desc)
1531 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
1532 * take care of loading default sysctl value from the kernel
1533 * environment, if any:
1535 if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
1536 /* read-only SYSCTLs */
1537 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1538 name, CTLTYPE_U64 | CTLFLAG_RD |
1539 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1541 if (strstr(name, "_def") != 0) {
1542 #ifdef RATELIMIT_DEBUG
1543 /* tunable read-only advanced SYSCTLs */
1544 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1545 name, CTLTYPE_U64 | CTLFLAG_RDTUN |
1546 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1549 /* read-write SYSCTLs */
1550 SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1551 name, CTLTYPE_U64 | CTLFLAG_RWTUN |
1552 CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
/*
 * Register one read-only 64-bit statistics counter sysctl bound
 * directly to rl->stats.arg[x].
 */
1558 mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1559 struct sysctl_oid *node, const char *name, const char *desc)
1561 /* read-only SYSCTLs */
1562 SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
1563 CTLFLAG_RD, &rl->stats.arg[x], 0, desc);