]> CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - sys/netinet/cc/cc_cdg.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / sys / netinet / cc / cc_cdg.c
1 /*-
2  * Copyright (c) 2009-2013
3  *      Swinburne University of Technology, Melbourne, Australia
4  * All rights reserved.
5  *
6  * This software was developed at the Centre for Advanced Internet
7  * Architectures, Swinburne University of Technology, by David Hayes, made
8  * possible in part by a gift from The Cisco University Research Program Fund,
9  * a corporate advised fund of Silicon Valley Community Foundation. Development
10  * and testing were further assisted by a grant from the FreeBSD Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33
34 /*
35  * CAIA Delay-Gradient (CDG) congestion control algorithm
36  *
37  * An implemention of the delay-gradient congestion control algorithm proposed
38  * in the following paper:
39  *
40  * D. A. Hayes and G. Armitage, "Revisiting TCP Congestion Control using Delay
41  * Gradients", in IFIP Networking, Valencia, Spain, 9-13 May 2011.
42  *
43  * Developed as part of the NewTCP research project at Swinburne University of
44  * Technology's Centre for Advanced Internet Architectures, Melbourne,
45  * Australia. More details are available at:
46  *   http://caia.swin.edu.au/urp/newtcp/
47  */
48
49 #include <sys/cdefs.h>
50 __FBSDID("$FreeBSD$");
51
52 #include <sys/param.h>
53 #include <sys/hhook.h>
54 #include <sys/kernel.h>
55 #include <sys/khelp.h>
56 #include <sys/limits.h>
57 #include <sys/lock.h>
58 #include <sys/malloc.h>
59 #include <sys/module.h>
60 #include <sys/queue.h>
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <sys/sysctl.h>
64 #include <sys/systm.h>
65
66 #include <net/if.h>
67 #include <net/vnet.h>
68
69 #include <netinet/cc.h>
70 #include <netinet/tcp_seq.h>
71 #include <netinet/tcp_timer.h>
72 #include <netinet/tcp_var.h>
73
74 #include <netinet/cc/cc_module.h>
75
76 #include <netinet/khelp/h_ertt.h>
77
78 #include <vm/uma.h>
79
80 #define CDG_VERSION "0.1"
81
82 #define CAST_PTR_INT(X) (*((int*)(X)))
83
84 #ifndef VIMAGE
85 #define vnet_sysctl_handle_uint(oidp, arg1, arg2, req) \
86     sysctl_handle_int(oidp, arg1, arg2, req)
87 #endif
88
89 /* Private delay-gradient induced congestion control signal. */
90 #define CC_CDG_DELAY 0x01000000
91
92 /* NewReno window deflation factor on loss (as a percentage). */
93 #define RENO_BETA 50
94
95 /* Queue states. */
96 #define CDG_Q_EMPTY     1
97 #define CDG_Q_RISING    2
98 #define CDG_Q_FALLING   3
99 #define CDG_Q_FULL      4
100 #define CDG_Q_UNKNOWN   9999
101
102 /* Number of bit shifts used in probexp lookup table. */
103 #define EXP_PREC 15
104
105 /* Largest gradient represented in probexp lookup table. */
106 #define MAXGRAD 5
107
108 /*
109  * Delay Precision Enhance - number of bit shifts used for qtrend related
110  * integer arithmetic precision.
111  */
112 #define D_P_E 7
113
114 struct qdiff_sample {
115         long qdiff;
116         STAILQ_ENTRY(qdiff_sample) qdiff_lnk;
117 };
118
119 struct cdg {
120         long max_qtrend;
121         long min_qtrend;
122         STAILQ_HEAD(minrtts_head, qdiff_sample) qdiffmin_q;
123         STAILQ_HEAD(maxrtts_head, qdiff_sample) qdiffmax_q;
124         long window_incr;
125         /* rttcount for window increase when in congestion avoidance */
126         long rtt_count;
127         /* maximum measured rtt within an rtt period */
128         int maxrtt_in_rtt;
129         /* maximum measured rtt within prev rtt period */
130         int maxrtt_in_prevrtt;
131         /* minimum measured rtt within an rtt period */
132         int minrtt_in_rtt;
133         /* minimum measured rtt within prev rtt period */
134         int minrtt_in_prevrtt;
135         /* consecutive congestion episode counter */
136         uint32_t consec_cong_cnt;
137         /* when tracking a new reno type loss window */
138         uint32_t shadow_w;
139         /* maximum number of samples in the moving average queue */
140         int sample_q_size;
141         /* number of samples in the moving average queue */
142         int num_samples;
143         /* estimate of the queue state of the path */
144         int queue_state;
145 };
146
147 /*
148  * Lookup table for:
149  *   (1 - exp(-x)) << EXP_PREC, where x = [0,MAXGRAD] in 2^-7 increments
150  *
151  * Note: probexp[0] is set to 10 (not 0) as a safety for very low increase
152  * gradients.
153  */
154 static const int probexp[641] = {
155    10,255,508,759,1008,1255,1501,1744,1985,2225,2463,2698,2932,3165,3395,3624,
156    3850,4075,4299,4520,4740,4958,5175,5389,5602,5814,6024,6232,6438,6643,6846,
157    7048,7248,7447,7644,7839,8033,8226,8417,8606,8794,8981,9166,9350,9532,9713,
158    9892,10070,10247,10422,10596,10769,10940,11110,11278,11445,11611,11776,11939,
159    12101,12262,12422,12580,12737,12893,13048,13201,13354,13505,13655,13803,13951,
160    14097,14243,14387,14530,14672,14813,14952,15091,15229,15365,15500,15635,15768,
161    15900,16032,16162,16291,16419,16547,16673,16798,16922,17046,17168,17289,17410,
162    17529,17648,17766,17882,17998,18113,18227,18340,18453,18564,18675,18784,18893,
163    19001,19108,19215,19320,19425,19529,19632,19734,19835,19936,20036,20135,20233,
164    20331,20427,20523,20619,20713,20807,20900,20993,21084,21175,21265,21355,21444,
165    21532,21619,21706,21792,21878,21962,22046,22130,22213,22295,22376,22457,22537,
166    22617,22696,22774,22852,22929,23006,23082,23157,23232,23306,23380,23453,23525,
167    23597,23669,23739,23810,23879,23949,24017,24085,24153,24220,24286,24352,24418,
168    24483,24547,24611,24675,24738,24800,24862,24924,24985,25045,25106,25165,25224,
169    25283,25341,25399,25456,25513,25570,25626,25681,25737,25791,25846,25899,25953,
170    26006,26059,26111,26163,26214,26265,26316,26366,26416,26465,26514,26563,26611,
171    26659,26707,26754,26801,26847,26893,26939,26984,27029,27074,27118,27162,27206,
172    27249,27292,27335,27377,27419,27460,27502,27543,27583,27624,27664,27703,27743,
173    27782,27821,27859,27897,27935,27973,28010,28047,28084,28121,28157,28193,28228,
174    28263,28299,28333,28368,28402,28436,28470,28503,28536,28569,28602,28634,28667,
175    28699,28730,28762,28793,28824,28854,28885,28915,28945,28975,29004,29034,29063,
176    29092,29120,29149,29177,29205,29232,29260,29287,29314,29341,29368,29394,29421,
177    29447,29472,29498,29524,29549,29574,29599,29623,29648,29672,29696,29720,29744,
178    29767,29791,29814,29837,29860,29882,29905,29927,29949,29971,29993,30014,30036,
179    30057,30078,30099,30120,30141,30161,30181,30201,30221,30241,30261,30280,30300,
180    30319,30338,30357,30376,30394,30413,30431,30449,30467,30485,30503,30521,30538,
181    30555,30573,30590,30607,30624,30640,30657,30673,30690,30706,30722,30738,30753,
182    30769,30785,30800,30815,30831,30846,30861,30876,30890,30905,30919,30934,30948,
183    30962,30976,30990,31004,31018,31031,31045,31058,31072,31085,31098,31111,31124,
184    31137,31149,31162,31174,31187,31199,31211,31223,31235,31247,31259,31271,31283,
185    31294,31306,31317,31328,31339,31351,31362,31373,31383,31394,31405,31416,31426,
186    31436,31447,31457,31467,31477,31487,31497,31507,31517,31527,31537,31546,31556,
187    31565,31574,31584,31593,31602,31611,31620,31629,31638,31647,31655,31664,31673,
188    31681,31690,31698,31706,31715,31723,31731,31739,31747,31755,31763,31771,31778,
189    31786,31794,31801,31809,31816,31824,31831,31838,31846,31853,31860,31867,31874,
190    31881,31888,31895,31902,31908,31915,31922,31928,31935,31941,31948,31954,31960,
191    31967,31973,31979,31985,31991,31997,32003,32009,32015,32021,32027,32033,32038,
192    32044,32050,32055,32061,32066,32072,32077,32083,32088,32093,32098,32104,32109,
193    32114,32119,32124,32129,32134,32139,32144,32149,32154,32158,32163,32168,32173,
194    32177,32182,32186,32191,32195,32200,32204,32209,32213,32217,32222,32226,32230,
195    32234,32238,32242,32247,32251,32255,32259,32263,32267,32270,32274,32278,32282,
196    32286,32290,32293,32297,32301,32304,32308,32311,32315,32318,32322,32325,32329,
197    32332,32336,32339,32342,32346,32349,32352,32356,32359,32362,32365,32368,32371,
198    32374,32377,32381,32384,32387,32389,32392,32395,32398,32401,32404,32407,32410,
199    32412,32415,32418,32421,32423,32426,32429,32431,32434,32437,32439,32442,32444,
200    32447,32449,32452,32454,32457,32459,32461,32464,32466,32469,32471,32473,32476,
201    32478,32480,32482,32485,32487,32489,32491,32493,32495,32497,32500,32502,32504,
202    32506,32508,32510,32512,32514,32516,32518,32520,32522,32524,32526,32527,32529,
203    32531,32533,32535,32537,32538,32540,32542,32544,32545,32547};
204
205 static uma_zone_t qdiffsample_zone;
206
207 static MALLOC_DEFINE(M_CDG, "cdg data",
208   "Per connection data required for the CDG congestion control algorithm");
209
210 static int ertt_id;
211
212 static VNET_DEFINE(uint32_t, cdg_alpha_inc);
213 static VNET_DEFINE(uint32_t, cdg_beta_delay);
214 static VNET_DEFINE(uint32_t, cdg_beta_loss);
215 static VNET_DEFINE(uint32_t, cdg_smoothing_factor);
216 static VNET_DEFINE(uint32_t, cdg_exp_backoff_scale);
217 static VNET_DEFINE(uint32_t, cdg_consec_cong);
218 static VNET_DEFINE(uint32_t, cdg_hold_backoff);
219 #define V_cdg_alpha_inc         VNET(cdg_alpha_inc)
220 #define V_cdg_beta_delay        VNET(cdg_beta_delay)
221 #define V_cdg_beta_loss         VNET(cdg_beta_loss)
222 #define V_cdg_smoothing_factor  VNET(cdg_smoothing_factor)
223 #define V_cdg_exp_backoff_scale VNET(cdg_exp_backoff_scale)
224 #define V_cdg_consec_cong       VNET(cdg_consec_cong)
225 #define V_cdg_hold_backoff      VNET(cdg_hold_backoff)
226
227 /* Function prototypes. */
228 static int cdg_mod_init(void);
229 static void cdg_conn_init(struct cc_var *ccv);
230 static int cdg_cb_init(struct cc_var *ccv);
231 static void cdg_cb_destroy(struct cc_var *ccv);
232 static void cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type);
233 static void cdg_ack_received(struct cc_var *ccv, uint16_t ack_type);
234
235 struct cc_algo cdg_cc_algo = {
236         .name = "cdg",
237         .mod_init = cdg_mod_init,
238         .ack_received = cdg_ack_received,
239         .cb_destroy = cdg_cb_destroy,
240         .cb_init = cdg_cb_init,
241         .conn_init = cdg_conn_init,
242         .cong_signal = cdg_cong_signal
243 };
244
245 /* Vnet created and being initialised. */
246 static void
247 cdg_init_vnet(const void *unused __unused)
248 {
249
250         V_cdg_alpha_inc = 0;
251         V_cdg_beta_delay = 70;
252         V_cdg_beta_loss = 50;
253         V_cdg_smoothing_factor = 8;
254         V_cdg_exp_backoff_scale = 3;
255         V_cdg_consec_cong = 5;
256         V_cdg_hold_backoff = 5;
257 }
258
259 static int
260 cdg_mod_init(void)
261 {
262         VNET_ITERATOR_DECL(v);
263
264         ertt_id = khelp_get_id("ertt");
265         if (ertt_id <= 0)
266                 return (EINVAL);
267
268         qdiffsample_zone = uma_zcreate("cdg_qdiffsample",
269             sizeof(struct qdiff_sample), NULL, NULL, NULL, NULL, 0, 0);
270
271         VNET_LIST_RLOCK();
272         VNET_FOREACH(v) {
273                 CURVNET_SET(v);
274                 cdg_init_vnet(NULL);
275                 CURVNET_RESTORE();
276         }
277         VNET_LIST_RUNLOCK();
278
279         cdg_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
280         cdg_cc_algo.after_idle = newreno_cc_algo.after_idle;
281
282         return (0);
283 }
284
285 static int
286 cdg_cb_init(struct cc_var *ccv)
287 {
288         struct cdg *cdg_data;
289
290         cdg_data = malloc(sizeof(struct cdg), M_CDG, M_NOWAIT);
291         if (cdg_data == NULL)
292                 return (ENOMEM);
293
294         cdg_data->shadow_w = 0;
295         cdg_data->max_qtrend = 0;
296         cdg_data->min_qtrend = 0;
297         cdg_data->queue_state = CDG_Q_UNKNOWN;
298         cdg_data->maxrtt_in_rtt = 0;
299         cdg_data->maxrtt_in_prevrtt = 0;
300         cdg_data->minrtt_in_rtt = INT_MAX;
301         cdg_data->minrtt_in_prevrtt = 0;
302         cdg_data->window_incr = 0;
303         cdg_data->rtt_count = 0;
304         cdg_data->consec_cong_cnt = 0;
305         cdg_data->sample_q_size = V_cdg_smoothing_factor;
306         cdg_data->num_samples = 0;
307         STAILQ_INIT(&cdg_data->qdiffmin_q);
308         STAILQ_INIT(&cdg_data->qdiffmax_q);
309
310         ccv->cc_data = cdg_data;
311
312         return (0);
313 }
314
315 static void
316 cdg_conn_init(struct cc_var *ccv)
317 {
318         struct cdg *cdg_data = ccv->cc_data;
319
320         /*
321          * Initialise the shadow_cwnd in case we are competing with loss based
322          * flows from the start
323          */
324         cdg_data->shadow_w = CCV(ccv, snd_cwnd);
325 }
326
327 static void
328 cdg_cb_destroy(struct cc_var *ccv)
329 {
330         struct cdg *cdg_data;
331         struct qdiff_sample *qds, *qds_n;
332
333         cdg_data = ccv->cc_data;
334
335         qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
336         while (qds != NULL) {
337                 qds_n = STAILQ_NEXT(qds, qdiff_lnk);
338                 uma_zfree(qdiffsample_zone,qds);
339                 qds = qds_n;
340         }
341
342         qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
343         while (qds != NULL) {
344                 qds_n = STAILQ_NEXT(qds, qdiff_lnk);
345                 uma_zfree(qdiffsample_zone,qds);
346                 qds = qds_n;
347         }
348
349         free(ccv->cc_data, M_CDG);
350 }
351
352 static int
353 cdg_beta_handler(SYSCTL_HANDLER_ARGS)
354 {
355
356         if (req->newptr != NULL &&
357             (CAST_PTR_INT(req->newptr) == 0 || CAST_PTR_INT(req->newptr) > 100))
358                 return (EINVAL);
359
360         return (vnet_sysctl_handle_uint(oidp, arg1, arg2, req));
361 }
362
363 static int
364 cdg_exp_backoff_scale_handler(SYSCTL_HANDLER_ARGS)
365 {
366
367         if (req->newptr != NULL && CAST_PTR_INT(req->newptr) < 1)
368                 return (EINVAL);
369
370         return (vnet_sysctl_handle_uint(oidp, arg1, arg2, req));
371 }
372
373 static inline unsigned long
374 cdg_window_decrease(struct cc_var *ccv, unsigned long owin, unsigned int beta)
375 {
376
377         return ((ulmin(CCV(ccv, snd_wnd), owin) * beta) / 100);
378 }
379
380 /*
381  * Window increase function
382  * This window increase function is independent of the initial window size
383  * to ensure small window flows are not discriminated against (i.e. fairness).
384  * It increases at 1pkt/rtt like Reno for alpha_inc rtts, and then 2pkts/rtt for
385  * the next alpha_inc rtts, etc.
386  */
387 static void
388 cdg_window_increase(struct cc_var *ccv, int new_measurement)
389 {
390         struct cdg *cdg_data;
391         int incr, s_w_incr;
392
393         cdg_data = ccv->cc_data;
394         incr = s_w_incr = 0;
395
396         if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
397                 /* Slow start. */
398                 incr = CCV(ccv, t_maxseg);
399                 s_w_incr = incr;
400                 cdg_data->window_incr = cdg_data->rtt_count = 0;
401         } else {
402                 /* Congestion avoidance. */
403                 if (new_measurement) {
404                         s_w_incr = CCV(ccv, t_maxseg);
405                         if (V_cdg_alpha_inc == 0) {
406                                 incr = CCV(ccv, t_maxseg);
407                         } else {
408                                 if (++cdg_data->rtt_count >= V_cdg_alpha_inc) {
409                                         cdg_data->window_incr++;
410                                         cdg_data->rtt_count = 0;
411                                 }
412                                 incr = CCV(ccv, t_maxseg) *
413                                     cdg_data->window_incr;
414                         }
415                 }
416         }
417
418         if (cdg_data->shadow_w > 0)
419                 cdg_data->shadow_w = ulmin(cdg_data->shadow_w + s_w_incr,
420                     TCP_MAXWIN << CCV(ccv, snd_scale));
421
422         CCV(ccv, snd_cwnd) = ulmin(CCV(ccv, snd_cwnd) + incr,
423             TCP_MAXWIN << CCV(ccv, snd_scale));
424 }
425
426 static void
427 cdg_cong_signal(struct cc_var *ccv, uint32_t signal_type)
428 {
429         struct cdg *cdg_data = ccv->cc_data;
430
431         switch(signal_type) {
432         case CC_CDG_DELAY:
433                 CCV(ccv, snd_ssthresh) = cdg_window_decrease(ccv,
434                     CCV(ccv, snd_cwnd), V_cdg_beta_delay);
435                 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
436                 CCV(ccv, snd_recover) = CCV(ccv, snd_max);
437                 cdg_data->window_incr = cdg_data->rtt_count = 0;
438                 ENTER_CONGRECOVERY(CCV(ccv, t_flags));
439                 break;
440         case CC_NDUPACK:
441                 /*
442                  * If already responding to congestion OR we have guessed no
443                  * queue in the path is full.
444                  */
445                 if (IN_CONGRECOVERY(CCV(ccv, t_flags)) ||
446                     cdg_data->queue_state < CDG_Q_FULL) {
447                         CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
448                         CCV(ccv, snd_recover) = CCV(ccv, snd_max);
449                 } else {
450                         /*
451                          * Loss is likely to be congestion related. We have
452                          * inferred a queue full state, so have shadow window
453                          * react to loss as NewReno would.
454                          */
455                         if (cdg_data->shadow_w > 0)
456                                 cdg_data->shadow_w = cdg_window_decrease(ccv,
457                                     cdg_data->shadow_w, RENO_BETA);
458
459                         CCV(ccv, snd_ssthresh) = ulmax(cdg_data->shadow_w,
460                             cdg_window_decrease(ccv, CCV(ccv, snd_cwnd),
461                             V_cdg_beta_loss));
462
463                         cdg_data->window_incr = cdg_data->rtt_count = 0;
464                 }
465                 ENTER_RECOVERY(CCV(ccv, t_flags));
466                 break;
467         default:
468                 newreno_cc_algo.cong_signal(ccv, signal_type);
469                 break;
470         }
471 }
472
473 /*
474  * Using a negative exponential probabilistic backoff so that sources with
475  * varying RTTs which share the same link will, on average, have the same
476  * probability of backoff over time.
477  *
478  * Prob_backoff = 1 - exp(-qtrend / V_cdg_exp_backoff_scale), where
479  * V_cdg_exp_backoff_scale is the average qtrend for the exponential backoff.
480  */
481 static inline int
482 prob_backoff(long qtrend)
483 {
484         int backoff, idx, p;
485
486         backoff = (qtrend > ((MAXGRAD * V_cdg_exp_backoff_scale) << D_P_E));
487
488         if (!backoff) {
489                 if (V_cdg_exp_backoff_scale > 1)
490                         idx = (qtrend + V_cdg_exp_backoff_scale / 2) /
491                             V_cdg_exp_backoff_scale;
492                 else
493                         idx = qtrend;
494
495                 /* Backoff probability proportional to rate of queue growth. */
496                 p = (INT_MAX / (1 << EXP_PREC)) * probexp[idx];
497                 backoff = (random() < p);
498         }
499
500         return (backoff);
501 }
502
503 static inline void
504 calc_moving_average(struct cdg *cdg_data, long qdiff_max, long qdiff_min)
505 {
506         struct qdiff_sample *qds;
507
508         ++cdg_data->num_samples;
509         if (cdg_data->num_samples > cdg_data->sample_q_size) {
510                 /* Minimum RTT. */
511                 qds = STAILQ_FIRST(&cdg_data->qdiffmin_q);
512                 cdg_data->min_qtrend =  cdg_data->min_qtrend +
513                     (qdiff_min - qds->qdiff) / cdg_data->sample_q_size;
514                 STAILQ_REMOVE_HEAD(&cdg_data->qdiffmin_q, qdiff_lnk);
515                 qds->qdiff = qdiff_min;
516                 STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds, qdiff_lnk);
517
518                 /* Maximum RTT. */
519                 qds = STAILQ_FIRST(&cdg_data->qdiffmax_q);
520                 cdg_data->max_qtrend =  cdg_data->max_qtrend +
521                     (qdiff_max - qds->qdiff) / cdg_data->sample_q_size;
522                 STAILQ_REMOVE_HEAD(&cdg_data->qdiffmax_q, qdiff_lnk);
523                 qds->qdiff = qdiff_max;
524                 STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds, qdiff_lnk);
525                 --cdg_data->num_samples;
526         } else {
527                 qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
528                 if (qds != NULL) {
529                         cdg_data->min_qtrend = cdg_data->min_qtrend +
530                             qdiff_min / cdg_data->sample_q_size;
531                         qds->qdiff = qdiff_min;
532                         STAILQ_INSERT_TAIL(&cdg_data->qdiffmin_q, qds,
533                             qdiff_lnk);
534                 }
535
536                 qds = uma_zalloc(qdiffsample_zone, M_NOWAIT);
537                 if (qds) {
538                         cdg_data->max_qtrend = cdg_data->max_qtrend +
539                             qdiff_max / cdg_data->sample_q_size;
540                         qds->qdiff = qdiff_max;
541                         STAILQ_INSERT_TAIL(&cdg_data->qdiffmax_q, qds,
542                             qdiff_lnk);
543                 }
544         }
545 }
546
547 static void
548 cdg_ack_received(struct cc_var *ccv, uint16_t ack_type)
549 {
550         struct cdg *cdg_data;
551         struct ertt *e_t;
552         long qdiff_max, qdiff_min;
553         int congestion, new_measurement, slowstart;
554
555         cdg_data = ccv->cc_data;
556         e_t = (struct ertt *)khelp_get_osd(CCV(ccv, osd), ertt_id);
557         new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
558         congestion = 0;
559         cdg_data->maxrtt_in_rtt = imax(e_t->rtt, cdg_data->maxrtt_in_rtt);
560         cdg_data->minrtt_in_rtt = imin(e_t->rtt, cdg_data->minrtt_in_rtt);
561
562         if (new_measurement) {
563                 slowstart = (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh));
564                 /*
565                  * Update smoothed gradient measurements. Since we are only
566                  * using one measurement per RTT, use max or min rtt_in_rtt.
567                  * This is also less noisy than a sample RTT measurement. Max
568                  * RTT measurements can have trouble due to OS issues.
569                  */
570                 if (cdg_data->maxrtt_in_prevrtt) {
571                         qdiff_max = ((long)(cdg_data->maxrtt_in_rtt -
572                             cdg_data->maxrtt_in_prevrtt) << D_P_E );
573                         qdiff_min = ((long)(cdg_data->minrtt_in_rtt -
574                             cdg_data->minrtt_in_prevrtt) << D_P_E );
575
576                         calc_moving_average(cdg_data, qdiff_max, qdiff_min);
577
578                         /* Probabilistic backoff with respect to gradient. */
579                         if (slowstart && qdiff_min > 0)
580                                 congestion = prob_backoff(qdiff_min);
581                         else if (cdg_data->min_qtrend > 0)
582                                 congestion = prob_backoff(cdg_data->min_qtrend);
583                         else if (slowstart && qdiff_max > 0)
584                                 congestion = prob_backoff(qdiff_max);
585                         else if (cdg_data->max_qtrend > 0)
586                                 congestion = prob_backoff(cdg_data->max_qtrend);
587                         
588                         /* Update estimate of queue state. */
589                         if (cdg_data->min_qtrend > 0 &&
590                             cdg_data->max_qtrend <= 0) {
591                                 cdg_data->queue_state = CDG_Q_FULL;
592                         } else if (cdg_data->min_qtrend >= 0 &&
593                             cdg_data->max_qtrend < 0) {
594                                 cdg_data->queue_state = CDG_Q_EMPTY;
595                                 cdg_data->shadow_w = 0;
596                         } else if (cdg_data->min_qtrend > 0 &&
597                             cdg_data->max_qtrend > 0) {
598                                 cdg_data->queue_state = CDG_Q_RISING;
599                         } else if (cdg_data->min_qtrend < 0 &&
600                             cdg_data->max_qtrend < 0) {
601                                 cdg_data->queue_state = CDG_Q_FALLING;
602                         }
603
604                         if (cdg_data->min_qtrend < 0 ||
605                             cdg_data->max_qtrend < 0)
606                                 cdg_data->consec_cong_cnt = 0;
607                 }
608
609                 cdg_data->minrtt_in_prevrtt = cdg_data->minrtt_in_rtt;
610                 cdg_data->minrtt_in_rtt = INT_MAX;
611                 cdg_data->maxrtt_in_prevrtt = cdg_data->maxrtt_in_rtt;
612                 cdg_data->maxrtt_in_rtt = 0;
613                 e_t->flags &= ~ERTT_NEW_MEASUREMENT;
614         }
615
616         if (congestion) {
617                 cdg_data->consec_cong_cnt++;
618                 if (!IN_RECOVERY(CCV(ccv, t_flags))) {
619                         if (cdg_data->consec_cong_cnt <= V_cdg_consec_cong)
620                                 cdg_cong_signal(ccv, CC_CDG_DELAY);
621                         else
622                                 /*
623                                  * We have been backing off but the queue is not
624                                  * falling. Assume we are competing with
625                                  * loss-based flows and don't back off for the
626                                  * next V_cdg_hold_backoff RTT periods.
627                                  */
628                                 if (cdg_data->consec_cong_cnt >=
629                                     V_cdg_consec_cong + V_cdg_hold_backoff)
630                                         cdg_data->consec_cong_cnt = 0;
631
632                         /* Won't see effect until 2nd RTT. */
633                         cdg_data->maxrtt_in_prevrtt = 0;
634                         /*
635                          * Resync shadow window in case we are competing with a
636                          * loss based flow
637                          */
638                         cdg_data->shadow_w = ulmax(CCV(ccv, snd_cwnd),
639                             cdg_data->shadow_w);
640                 }
641         } else if (ack_type == CC_ACK)
642                 cdg_window_increase(ccv, new_measurement);
643 }
644
645 /* When a vnet is created and being initialised, init the per-stack CDG vars. */
646 VNET_SYSINIT(cdg_init_vnet, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
647     cdg_init_vnet, NULL);
648
649 SYSCTL_DECL(_net_inet_tcp_cc_cdg);
650 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, cdg, CTLFLAG_RW, NULL,
651     "CAIA delay-gradient congestion control related settings");
652
653 SYSCTL_STRING(_net_inet_tcp_cc_cdg, OID_AUTO, version,
654     CTLFLAG_RD, CDG_VERSION, sizeof(CDG_VERSION) - 1,
655     "Current algorithm/implementation version number");
656
657 SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, alpha_inc,
658     CTLFLAG_RW, &VNET_NAME(cdg_alpha_inc), 0,
659     "Increment the window increase factor alpha by 1 MSS segment every "
660     "alpha_inc RTTs during congestion avoidance mode.");
661
662 SYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_delay,
663     CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_beta_delay), 70,
664     &cdg_beta_handler, "IU",
665     "Delay-based window decrease factor as a percentage "
666     "(on delay-based backoff, w = w * beta_delay / 100)");
667
668 SYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, beta_loss,
669     CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_beta_loss), 50,
670     &cdg_beta_handler, "IU",
671     "Loss-based window decrease factor as a percentage "
672     "(on loss-based backoff, w = w * beta_loss / 100)");
673
674 SYSCTL_VNET_PROC(_net_inet_tcp_cc_cdg, OID_AUTO, exp_backoff_scale,
675     CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(cdg_exp_backoff_scale), 2,
676     &cdg_exp_backoff_scale_handler, "IU",
677     "Scaling parameter for the probabilistic exponential backoff");
678
679 SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg,  OID_AUTO, smoothing_factor,
680     CTLFLAG_RW, &VNET_NAME(cdg_smoothing_factor), 8,
681     "Number of samples used for moving average smoothing (0 = no smoothing)");
682
683 SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_consec_cong,
684     CTLFLAG_RW, &VNET_NAME(cdg_consec_cong), 5,
685     "Number of consecutive delay-gradient based congestion episodes which will "
686     "trigger loss based CC compatibility");
687
688 SYSCTL_VNET_UINT(_net_inet_tcp_cc_cdg, OID_AUTO, loss_compete_hold_backoff,
689     CTLFLAG_RW, &VNET_NAME(cdg_hold_backoff), 5,
690     "Number of consecutive delay-gradient based congestion episodes to hold "
691     "the window backoff for loss based CC compatibility");
692
693 DECLARE_CC_MODULE(cdg, &cdg_cc_algo);
694
695 MODULE_DEPEND(cdg, ertt, 1, 1, 1);