sys/netpfil/ipfw/dn_sched_qfq.c

   1 /*
   2  * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
   3  * All rights reserved
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  *
  14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  24  * SUCH DAMAGE.
  25  */
  26
  27 /*
  28  * $FreeBSD$
  29  */
  30
  31 #ifdef _KERNEL
  32 #include <sys/malloc.h>
  33 #include <sys/socket.h>
  34 #include <sys/socketvar.h>
  35 #include <sys/kernel.h>
  36 #include <sys/lock.h>
  37 #include <sys/mbuf.h>
  38 #include <sys/module.h>
  39 #include <sys/rwlock.h>
  40 #include <net/if.h>     /* IFNAMSIZ */
  41 #include <netinet/in.h>
  42 #include <netinet/ip_var.h>             /* ipfw_rule_ref */
  43 #include <netinet/ip_fw.h>      /* flow_id */
  44 #include <netinet/ip_dummynet.h>
  45 #include <netpfil/ipfw/ip_fw_private.h>
  46 #include <netpfil/ipfw/dn_heap.h>
  47 #include <netpfil/ipfw/ip_dn_private.h>
  48 #ifdef NEW_AQM
  49 #include <netpfil/ipfw/dn_aqm.h>
  50 #endif
  51 #include <netpfil/ipfw/dn_sched.h>
  52 #else
  53 #include <dn_test.h>
  54 #endif
  55
  56 #ifdef QFQ_DEBUG
  57 #define _P64    unsigned long long      /* cast for printing uint64_t */
  58 struct qfq_sched;
  59 static void dump_sched(struct qfq_sched *q, const char *msg);
  60 #define NO(x)   x
  61 #else
  62 #define NO(x)
  63 #endif
  64 #define DN_SCHED_QFQ    4 // XXX Where?
  65 typedef unsigned long   bitmap;
  66
  67 /*
  68  * bitmaps ops are critical. Some linux versions have __fls
  69  * and the bitmap ops. Some machines have ffs
  70  * NOTE: fls() returns 1 for the least significant bit,
  71  *       __fls() returns 0 for the same case.
  72  * We use the base-0 version __fls() to match the description in
  73  * the ToN QFQ paper
  74  */
  75 #if defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
  76 int fls(unsigned int n)
  77 {
  78         int i = 0;
  79         for (i = 0; n > 0; n >>= 1, i++)
  80                 ;
  81         return i;
  82 }
  83 #endif
  84
  85 #if !defined(_KERNEL) || defined( __FreeBSD__ ) || defined(_WIN32) || (defined(__MIPSEL__) && defined(LINUX_24))
  86 static inline unsigned long __fls(unsigned long word)
  87 {
  88         return fls(word) - 1;
  89 }
  90 #endif
  91
  92 #if !defined(_KERNEL) || !defined(__linux__)
  93 #ifdef QFQ_DEBUG
  94 static int test_bit(int ix, bitmap *p)
  95 {
  96         if (ix < 0 || ix > 31)
  97                 D("bad index %d", ix);
  98         return *p & (1<<ix);
  99 }
 100 static void __set_bit(int ix, bitmap *p)
 101 {
 102         if (ix < 0 || ix > 31)
 103                 D("bad index %d", ix);
 104         *p |= (1<<ix);
 105 }
 106 static void __clear_bit(int ix, bitmap *p)
 107 {
 108         if (ix < 0 || ix > 31)
 109                 D("bad index %d", ix);
 110         *p &= ~(1<<ix);
 111 }
 112 #else /* !QFQ_DEBUG */
 113 /* XXX do we have fast version, or leave it to the compiler ? */
 114 #define test_bit(ix, pData)     ((*pData) & (1<<(ix)))
 115 #define __set_bit(ix, pData)    (*pData) |= (1<<(ix))
 116 #define __clear_bit(ix, pData)  (*pData) &= ~(1<<(ix))
 117 #endif /* !QFQ_DEBUG */
 118 #endif /* !__linux__ */
 119
 120 #ifdef __MIPSEL__
 121 #define __clear_bit(ix, pData)  (*pData) &= ~(1<<(ix))
 122 #endif
 123
 124 /*-------------------------------------------*/
 125 /*
 126
 127 Virtual time computations.
 128
 129 S, F and V are all computed in fixed point arithmetic with
 130 FRAC_BITS decimal bits.
 131
 132    QFQ_MAX_INDEX is the maximum index allowed for a group. We need
 133         one bit per index.
 134    QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
 135    The layout of the bits is as below:
 136
 137                    [ MTU_SHIFT ][      FRAC_BITS    ]
 138                    [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
 139                                  ^.__grp->index = 0
 140                                  *.__grp->slot_shift
 141
 142    where MIN_SLOT_SHIFT is derived by difference from the others.
 143
 144 The max group index corresponds to Lmax/w_min, where
 145 Lmax=1<<MTU_SHIFT, w_min = 1 .
 146 From this, and knowing how many groups (MAX_INDEX) we want,
 147 we can derive the shift corresponding to each group.
 148
 149 Because we often need to compute
 150         F = S + len/w_i  and V = V + len/wsum
 151 instead of storing w_i store the value
 152         inv_w = (1<<FRAC_BITS)/w_i
 153 so we can do F = S + len * inv_w * wsum.
 154 We use W_TOT in the formulas so we can easily move between
 155 static and adaptive weight sum.
 156
 157 The per-scheduler-instance data contain all the data structures
 158 for the scheduler: bitmaps and bucket lists.
 159
 160  */
 161 /*
 162  * Maximum number of consecutive slots occupied by backlogged classes
 163  * inside a group. This is approx lmax/lmin + 5.
 164  * XXX check because it poses constraints on MAX_INDEX
 165  */
 166 #define QFQ_MAX_SLOTS   32
 167 /*
 168  * Shifts used for class<->group mapping. Class weights are
 169  * in the range [1, QFQ_MAX_WEIGHT], we to map each class i to the
 170  * group with the smallest index that can support the L_i / r_i
 171  * configured for the class.
 172  *
 173  * grp->index is the index of the group; and grp->slot_shift
 174  * is the shift for the corresponding (scaled) sigma_i.
 175  *
 176  * When computing the group index, we do (len<<FP_SHIFT)/weight,
 177  * then compute an FLS (which is like a log2()), and if the result
 178  * is below the MAX_INDEX region we use 0 (which is the same as
 179  * using a larger len).
 180  */
 181 #define QFQ_MAX_INDEX           19
 182 #define QFQ_MAX_WSHIFT          16      /* log2(max_weight) */
 183
 184 #define QFQ_MAX_WEIGHT          (1<<QFQ_MAX_WSHIFT)
 185 #define QFQ_MAX_WSUM            (2*QFQ_MAX_WEIGHT)
 186
 187 #define FRAC_BITS               30      /* fixed point arithmetic */
 188 #define ONE_FP                  (1UL << FRAC_BITS)
 189
 190 #define QFQ_MTU_SHIFT           11      /* log2(max_len) */
 191 #define QFQ_MIN_SLOT_SHIFT      (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
 192
 193 /*
 194  * Possible group states, also indexes for the bitmaps array in
 195  * struct qfq_queue. We rely on ER, IR, EB, IB being numbered 0..3
 196  */
 197 enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
 198
 199 struct qfq_group;
 200 /*
 201  * additional queue info. Some of this info should come from
 202  * the flowset, we copy them here for faster processing.
 203  * This is an overlay of the struct dn_queue
 204  */
 205 struct qfq_class {
 206         struct dn_queue _q;
 207         uint64_t S, F;          /* flow timestamps (exact) */
 208         struct qfq_class *next; /* Link for the slot list. */
 209
 210         /* group we belong to. In principle we would need the index,
 211          * which is log_2(lmax/weight), but we never reference it
 212          * directly, only the group.
 213          */
 214         struct qfq_group *grp;
 215
 216         /* these are copied from the flowset. */
 217         uint32_t        inv_w;  /* ONE_FP/weight */
 218         uint32_t        lmax;   /* Max packet size for this flow. */
 219 };
 220
 221 /* Group descriptor, see the paper for details.
 222  * Basically this contains the bucket lists
 223  */
 224 struct qfq_group {
 225         uint64_t S, F;                  /* group timestamps (approx). */
 226         unsigned int slot_shift;        /* Slot shift. */
 227         unsigned int index;             /* Group index. */
 228         unsigned int front;             /* Index of the front slot. */
 229         bitmap full_slots;              /* non-empty slots */
 230
 231         /* Array of lists of active classes. */
 232         struct qfq_class *slots[QFQ_MAX_SLOTS];
 233 };
 234
 235 /* scheduler instance descriptor. */
 236 struct qfq_sched {
 237         uint64_t        V;              /* Precise virtual time. */
 238         uint32_t        wsum;           /* weight sum */
 239         uint32_t        iwsum;          /* inverse weight sum */
 240         NO(uint32_t     i_wsum;)        /* ONE_FP/w_sum */
 241         NO(uint32_t     queued;)        /* debugging */
 242         NO(uint32_t     loops;)         /* debugging */
 243         bitmap bitmaps[QFQ_MAX_STATE];  /* Group bitmaps. */
 244         struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
 245 };
 246
 247 /*---- support functions ----------------------------*/
 248
 249 /* Generic comparison function, handling wraparound. */
 250 static inline int qfq_gt(uint64_t a, uint64_t b)
 251 {
 252         return (int64_t)(a - b) > 0;
 253 }
 254
 255 /* Round a precise timestamp to its slotted value. */
 256 static inline uint64_t qfq_round_down(uint64_t ts, unsigned int shift)
 257 {
 258         return ts & ~((1ULL << shift) - 1);
 259 }
 260
 261 /* return the pointer to the group with lowest index in the bitmap */
 262 static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
 263                                         unsigned long bitmap)
 264 {
 265         int index = ffs(bitmap) - 1; // zero-based
 266         return &q->groups[index];
 267 }
 268
 269 /*
 270  * Calculate a flow index, given its weight and maximum packet length.
 271  * index = log_2(maxlen/weight) but we need to apply the scaling.
 272  * This is used only once at flow creation.
 273  */
 274 static int qfq_calc_index(uint32_t inv_w, unsigned int maxlen)
 275 {
 276         uint64_t slot_size = (uint64_t)maxlen *inv_w;
 277         unsigned long size_map;
 278         int index = 0;
 279
 280         size_map = (unsigned long)(slot_size >> QFQ_MIN_SLOT_SHIFT);
 281         if (!size_map)
 282                 goto out;
 283
 284         index = __fls(size_map) + 1;    // basically a log_2()
 285         index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
 286
 287         if (index < 0)
 288                 index = 0;
 289
 290 out:
 291         ND("W = %d, L = %d, I = %d\n", ONE_FP/inv_w, maxlen, index);
 292         return index;
 293 }
 294 /*---- end support functions ----*/
 295
 296 /*-------- API calls --------------------------------*/
 297 /*
 298  * Validate and copy parameters from flowset.
 299  */
 300 static int
 301 qfq_new_queue(struct dn_queue *_q)
 302 {
 303         struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
 304         struct qfq_class *cl = (struct qfq_class *)_q;
 305         int i;
 306         uint32_t w;     /* approximated weight */
 307
 308         /* import parameters from the flowset. They should be correct
 309          * already.
 310          */
 311         w = _q->fs->fs.par[0];
 312         cl->lmax = _q->fs->fs.par[1];
 313         if (!w || w > QFQ_MAX_WEIGHT) {
 314                 w = 1;
 315                 D("rounding weight to 1");
 316         }
 317         cl->inv_w = ONE_FP/w;
 318         w = ONE_FP/cl->inv_w;
 319         if (q->wsum + w > QFQ_MAX_WSUM)
 320                 return EINVAL;
 321
 322         i = qfq_calc_index(cl->inv_w, cl->lmax);
 323         cl->grp = &q->groups[i];
 324         q->wsum += w;
 325         q->iwsum = ONE_FP / q->wsum; /* XXX note theory */
 326         // XXX cl->S = q->V; ?
 327         return 0;
 328 }
 329
 330 /* remove an empty queue */
 331 static int
 332 qfq_free_queue(struct dn_queue *_q)
 333 {
 334         struct qfq_sched *q = (struct qfq_sched *)(_q->_si + 1);
 335         struct qfq_class *cl = (struct qfq_class *)_q;
 336         if (cl->inv_w) {
 337                 q->wsum -= ONE_FP/cl->inv_w;
 338                 if (q->wsum != 0)
 339                         q->iwsum = ONE_FP / q->wsum;
 340                 cl->inv_w = 0; /* reset weight to avoid run twice */
 341         }
 342         return 0;
 343 }
 344
 345 /* Calculate a mask to mimic what would be ffs_from(). */
 346 static inline unsigned long
 347 mask_from(unsigned long bitmap, int from)
 348 {
 349         return bitmap & ~((1UL << from) - 1);
 350 }
 351
 352 /*
 353  * The state computation relies on ER=0, IR=1, EB=2, IB=3
 354  * First compute eligibility comparing grp->S, q->V,
 355  * then check if someone is blocking us and possibly add EB
 356  */
 357 static inline unsigned int
 358 qfq_calc_state(struct qfq_sched *q, struct qfq_group *grp)
 359 {
 360         /* if S > V we are not eligible */
 361         unsigned int state = qfq_gt(grp->S, q->V);
 362         unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
 363         struct qfq_group *next;
 364
 365         if (mask) {
 366                 next = qfq_ffs(q, mask);
 367                 if (qfq_gt(grp->F, next->F))
 368                         state |= EB;
 369         }
 370
 371         return state;
 372 }
 373
 374 /*
 375  * In principle
 376  *      q->bitmaps[dst] |= q->bitmaps[src] & mask;
 377  *      q->bitmaps[src] &= ~mask;
 378  * but we should make sure that src != dst
 379  */
 380 static inline void
 381 qfq_move_groups(struct qfq_sched *q, unsigned long mask, int src, int dst)
 382 {
 383         q->bitmaps[dst] |= q->bitmaps[src] & mask;
 384         q->bitmaps[src] &= ~mask;
 385 }
 386
 387 static inline void
 388 qfq_unblock_groups(struct qfq_sched *q, int index, uint64_t old_finish)
 389 {
 390         unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
 391         struct qfq_group *next;
 392
 393         if (mask) {
 394                 next = qfq_ffs(q, mask);
 395                 if (!qfq_gt(next->F, old_finish))
 396                         return;
 397         }
 398
 399         mask = (1UL << index) - 1;
 400         qfq_move_groups(q, mask, EB, ER);
 401         qfq_move_groups(q, mask, IB, IR);
 402 }
 403
 404 /*
 405  * perhaps
 406  *
 407         old_V ^= q->V;
 408         old_V >>= QFQ_MIN_SLOT_SHIFT;
 409         if (old_V) {
 410                 ...
 411         }
 412  *
 413  */
 414 static inline void
 415 qfq_make_eligible(struct qfq_sched *q, uint64_t old_V)
 416 {
 417         unsigned long mask, vslot, old_vslot;
 418
 419         vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
 420         old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
 421
 422         if (vslot != old_vslot) {
 423                 /* must be 2ULL, see ToN QFQ article fig.5, we use base-0 fls */
 424                 mask = (2ULL << (__fls(vslot ^ old_vslot))) - 1;
 425                 qfq_move_groups(q, mask, IR, ER);
 426                 qfq_move_groups(q, mask, IB, EB);
 427         }
 428 }
 429
 430 /*
 431  * XXX we should make sure that slot becomes less than 32.
 432  * This is guaranteed by the input values.
 433  * roundedS is always cl->S rounded on grp->slot_shift bits.
 434  */
 435 static inline void
 436 qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl, uint64_t roundedS)
 437 {
 438         uint64_t slot = (roundedS - grp->S) >> grp->slot_shift;
 439         unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
 440
 441         cl->next = grp->slots[i];
 442         grp->slots[i] = cl;
 443         __set_bit(slot, &grp->full_slots);
 444 }
 445
 446 /*
 447  * remove the entry from the slot
 448  */
 449 static inline void
 450 qfq_front_slot_remove(struct qfq_group *grp)
 451 {
 452         struct qfq_class **h = &grp->slots[grp->front];
 453
 454         *h = (*h)->next;
 455         if (!*h)
 456                 __clear_bit(0, &grp->full_slots);
 457 }
 458
 459 /*
 460  * Returns the first full queue in a group. As a side effect,
 461  * adjust the bucket list so the first non-empty bucket is at
 462  * position 0 in full_slots.
 463  */
 464 static inline struct qfq_class *
 465 qfq_slot_scan(struct qfq_group *grp)
 466 {
 467         int i;
 468
 469         ND("grp %d full %x", grp->index, grp->full_slots);
 470         if (!grp->full_slots)
 471                 return NULL;
 472
 473         i = ffs(grp->full_slots) - 1; // zero-based
 474         if (i > 0) {
 475                 grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
 476                 grp->full_slots >>= i;
 477         }
 478
 479         return grp->slots[grp->front];
 480 }
 481
 482 /*
 483  * adjust the bucket list. When the start time of a group decreases,
 484  * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
 485  * move the objects. The mask of occupied slots must be shifted
 486  * because we use ffs() to find the first non-empty slot.
 487  * This covers decreases in the group's start time, but what about
 488  * increases of the start time ?
 489  * Here too we should make sure that i is less than 32
 490  */
 491 static inline void
 492 qfq_slot_rotate(struct qfq_sched *q, struct qfq_group *grp, uint64_t roundedS)
 493 {
 494         unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
 495
 496         (void)q;
 497         grp->full_slots <<= i;
 498         grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
 499 }
 500
 501
 502 static inline void
 503 qfq_update_eligible(struct qfq_sched *q, uint64_t old_V)
 504 {
 505         bitmap ineligible;
 506
 507         ineligible = q->bitmaps[IR] | q->bitmaps[IB];
 508         if (ineligible) {
 509                 if (!q->bitmaps[ER]) {
 510                         struct qfq_group *grp;
 511                         grp = qfq_ffs(q, ineligible);
 512                         if (qfq_gt(grp->S, q->V))
 513                                 q->V = grp->S;
 514                 }
 515                 qfq_make_eligible(q, old_V);
 516         }
 517 }
 518
 519 /*
 520  * Updates the class, returns true if also the group needs to be updated.
 521  */
 522 static inline int
 523 qfq_update_class(struct qfq_sched *q, struct qfq_group *grp,
 524             struct qfq_class *cl)
 525 {
 526
 527         (void)q;
 528         cl->S = cl->F;
 529         if (cl->_q.mq.head == NULL)  {
 530                 qfq_front_slot_remove(grp);
 531         } else {
 532                 unsigned int len;
 533                 uint64_t roundedS;
 534
 535                 len = cl->_q.mq.head->m_pkthdr.len;
 536                 cl->F = cl->S + (uint64_t)len * cl->inv_w;
 537                 roundedS = qfq_round_down(cl->S, grp->slot_shift);
 538                 if (roundedS == grp->S)
 539                         return 0;
 540
 541                 qfq_front_slot_remove(grp);
 542                 qfq_slot_insert(grp, cl, roundedS);
 543         }
 544         return 1;
 545 }
 546
 547 static struct mbuf *
 548 qfq_dequeue(struct dn_sch_inst *si)
 549 {
 550         struct qfq_sched *q = (struct qfq_sched *)(si + 1);
 551         struct qfq_group *grp;
 552         struct qfq_class *cl;
 553         struct mbuf *m;
 554         uint64_t old_V;
 555
 556         NO(q->loops++;)
 557         if (!q->bitmaps[ER]) {
 558                 NO(if (q->queued)
 559                         dump_sched(q, "start dequeue");)
 560                 return NULL;
 561         }
 562
 563         grp = qfq_ffs(q, q->bitmaps[ER]);
 564
 565         cl = grp->slots[grp->front];
 566         /* extract from the first bucket in the bucket list */
 567         m = dn_dequeue(&cl->_q);
 568
 569         if (!m) {
 570                 D("BUG/* non-workconserving leaf */");
 571                 return NULL;
 572         }
 573         NO(q->queued--;)
 574         old_V = q->V;
 575         q->V += (uint64_t)m->m_pkthdr.len * q->iwsum;
 576         ND("m is %p F 0x%llx V now 0x%llx", m, cl->F, q->V);
 577
 578         if (qfq_update_class(q, grp, cl)) {
 579                 uint64_t old_F = grp->F;
 580                 cl = qfq_slot_scan(grp);
 581                 if (!cl) { /* group gone, remove from ER */
 582                         __clear_bit(grp->index, &q->bitmaps[ER]);
 583                         // grp->S = grp->F + 1; // XXX debugging only
 584                 } else {
 585                         uint64_t roundedS = qfq_round_down(cl->S, grp->slot_shift);
 586                         unsigned int s;
 587
 588                         if (grp->S == roundedS)
 589                                 goto skip_unblock;
 590                         grp->S = roundedS;
 591                         grp->F = roundedS + (2ULL << grp->slot_shift);
 592                         /* remove from ER and put in the new set */
 593                         __clear_bit(grp->index, &q->bitmaps[ER]);
 594                         s = qfq_calc_state(q, grp);
 595                         __set_bit(grp->index, &q->bitmaps[s]);
 596                 }
 597                 /* we need to unblock even if the group has gone away */
 598                 qfq_unblock_groups(q, grp->index, old_F);
 599         }
 600
 601 skip_unblock:
 602         qfq_update_eligible(q, old_V);
 603         NO(if (!q->bitmaps[ER] && q->queued)
 604                 dump_sched(q, "end dequeue");)
 605
 606         return m;
 607 }
 608
 609 /*
 610  * Assign a reasonable start time for a new flow k in group i.
 611  * Admissible values for \hat(F) are multiples of \sigma_i
 612  * no greater than V+\sigma_i . Larger values mean that
 613  * we had a wraparound so we consider the timestamp to be stale.
 614  *
 615  * If F is not stale and F >= V then we set S = F.
 616  * Otherwise we should assign S = V, but this may violate
 617  * the ordering in ER. So, if we have groups in ER, set S to
 618  * the F_j of the first group j which would be blocking us.
 619  * We are guaranteed not to move S backward because
 620  * otherwise our group i would still be blocked.
 621  */
 622 static inline void
 623 qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
 624 {
 625         unsigned long mask;
 626         uint64_t limit, roundedF;
 627         int slot_shift = cl->grp->slot_shift;
 628
 629         roundedF = qfq_round_down(cl->F, slot_shift);
 630         limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
 631
 632         if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
 633                 /* timestamp was stale */
 634                 mask = mask_from(q->bitmaps[ER], cl->grp->index);
 635                 if (mask) {
 636                         struct qfq_group *next = qfq_ffs(q, mask);
 637                         if (qfq_gt(roundedF, next->F)) {
 638                                 /* from pv 71261956973ba9e0637848a5adb4a5819b4bae83 */
 639                                 if (qfq_gt(limit, next->F))
 640                                         cl->S = next->F;
 641                                 else /* preserve timestamp correctness */
 642                                         cl->S = limit;
 643                                 return;
 644                         }
 645                 }
 646                 cl->S = q->V;
 647         } else { /* timestamp is not stale */
 648                 cl->S = cl->F;
 649         }
 650 }
 651
 652 static int
 653 qfq_enqueue(struct dn_sch_inst *si, struct dn_queue *_q, struct mbuf *m)
 654 {
 655         struct qfq_sched *q = (struct qfq_sched *)(si + 1);
 656         struct qfq_group *grp;
 657         struct qfq_class *cl = (struct qfq_class *)_q;
 658         uint64_t roundedS;
 659         int s;
 660
 661         NO(q->loops++;)
 662         DX(4, "len %d flow %p inv_w 0x%x grp %d", m->m_pkthdr.len,
 663                 _q, cl->inv_w, cl->grp->index);
 664         /* XXX verify that the packet obeys the parameters */
 665         if (m != _q->mq.head) {
 666                 if (dn_enqueue(_q, m, 0)) /* packet was dropped */
 667                         return 1;
 668                 NO(q->queued++;)
 669                 if (m != _q->mq.head)
 670                         return 0;
 671         }
 672         /* If reach this point, queue q was idle */
 673         grp = cl->grp;
 674         qfq_update_start(q, cl); /* adjust start time */
 675         /* compute new finish time and rounded start. */
 676         cl->F = cl->S + (uint64_t)(m->m_pkthdr.len) * cl->inv_w;
 677         roundedS = qfq_round_down(cl->S, grp->slot_shift);
 678
 679         /*
 680          * insert cl in the correct bucket.
 681          * If cl->S >= grp->S we don't need to adjust the
 682          * bucket list and simply go to the insertion phase.
 683          * Otherwise grp->S is decreasing, we must make room
 684          * in the bucket list, and also recompute the group state.
 685          * Finally, if there were no flows in this group and nobody
 686          * was in ER make sure to adjust V.
 687          */
 688         if (grp->full_slots) {
 689                 if (!qfq_gt(grp->S, cl->S))
 690                         goto skip_update;
 691                 /* create a slot for this cl->S */
 692                 qfq_slot_rotate(q, grp, roundedS);
 693                 /* group was surely ineligible, remove */
 694                 __clear_bit(grp->index, &q->bitmaps[IR]);
 695                 __clear_bit(grp->index, &q->bitmaps[IB]);
 696         } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
 697                 q->V = roundedS;
 698
 699         grp->S = roundedS;
 700         grp->F = roundedS + (2ULL << grp->slot_shift); // i.e. 2\sigma_i
 701         s = qfq_calc_state(q, grp);
 702         __set_bit(grp->index, &q->bitmaps[s]);
 703         ND("new state %d 0x%x", s, q->bitmaps[s]);
 704         ND("S %llx F %llx V %llx", cl->S, cl->F, q->V);
 705 skip_update:
 706         qfq_slot_insert(grp, cl, roundedS);
 707
 708         return 0;
 709 }
 710
 711
 712 #if 0
 713 static inline void
 714 qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
 715         struct qfq_class *cl, struct qfq_class **pprev)
 716 {
 717         unsigned int i, offset;
 718         uint64_t roundedS;
 719
 720         roundedS = qfq_round_down(cl->S, grp->slot_shift);
 721         offset = (roundedS - grp->S) >> grp->slot_shift;
 722         i = (grp->front + offset) % QFQ_MAX_SLOTS;
 723
 724 #ifdef notyet
 725         if (!pprev) {
 726                 pprev = &grp->slots[i];
 727                 while (*pprev && *pprev != cl)
 728                         pprev = &(*pprev)->next;
 729         }
 730 #endif
 731
 732         *pprev = cl->next;
 733         if (!grp->slots[i])
 734                 __clear_bit(offset, &grp->full_slots);
 735 }
 736
 737 /*
 738  * called to forcibly destroy a queue.
 739  * If the queue is not in the front bucket, or if it has
 740  * other queues in the front bucket, we can simply remove
 741  * the queue with no other side effects.
 742  * Otherwise we must propagate the event up.
 743  * XXX description to be completed.
 744  */
 745 static void
 746 qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
 747                                  struct qfq_class **pprev)
 748 {
 749         struct qfq_group *grp = &q->groups[cl->index];
 750         unsigned long mask;
 751         uint64_t roundedS;
 752         int s;
 753
 754         cl->F = cl->S;  // not needed if the class goes away.
 755         qfq_slot_remove(q, grp, cl, pprev);
 756
 757         if (!grp->full_slots) {
 758                 /* nothing left in the group, remove from all sets.
 759                  * Do ER last because if we were blocking other groups
 760                  * we must unblock them.
 761                  */
 762                 __clear_bit(grp->index, &q->bitmaps[IR]);
 763                 __clear_bit(grp->index, &q->bitmaps[EB]);
 764                 __clear_bit(grp->index, &q->bitmaps[IB]);
 765
 766                 if (test_bit(grp->index, &q->bitmaps[ER]) &&
 767                     !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
 768                         mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
 769                         if (mask)
 770                                 mask = ~((1UL << __fls(mask)) - 1);
 771                         else
 772                                 mask = ~0UL;
 773                         qfq_move_groups(q, mask, EB, ER);
 774                         qfq_move_groups(q, mask, IB, IR);
 775                 }
 776                 __clear_bit(grp->index, &q->bitmaps[ER]);
 777         } else if (!grp->slots[grp->front]) {
 778                 cl = qfq_slot_scan(grp);
 779                 roundedS = qfq_round_down(cl->S, grp->slot_shift);
 780                 if (grp->S != roundedS) {
 781                         __clear_bit(grp->index, &q->bitmaps[ER]);
 782                         __clear_bit(grp->index, &q->bitmaps[IR]);
 783                         __clear_bit(grp->index, &q->bitmaps[EB]);
 784                         __clear_bit(grp->index, &q->bitmaps[IB]);
 785                         grp->S = roundedS;
 786                         grp->F = roundedS + (2ULL << grp->slot_shift);
 787                         s = qfq_calc_state(q, grp);
 788                         __set_bit(grp->index, &q->bitmaps[s]);
 789                 }
 790         }
 791         qfq_update_eligible(q, q->V);
 792 }
 793 #endif
 794
 795 static int
 796 qfq_new_fsk(struct dn_fsk *f)
 797 {
 798         ipdn_bound_var(&f->fs.par[0], 1, 1, QFQ_MAX_WEIGHT, "qfq weight");
 799         ipdn_bound_var(&f->fs.par[1], 1500, 1, 2000, "qfq maxlen");
 800         ND("weight %d len %d\n", f->fs.par[0], f->fs.par[1]);
 801         return 0;
 802 }
 803
 804 /*
 805  * initialize a new scheduler instance
 806  */
 807 static int
 808 qfq_new_sched(struct dn_sch_inst *si)
 809 {
 810         struct qfq_sched *q = (struct qfq_sched *)(si + 1);
 811         struct qfq_group *grp;
 812         int i;
 813
 814         for (i = 0; i <= QFQ_MAX_INDEX; i++) {
 815                 grp = &q->groups[i];
 816                 grp->index = i;
 817                 grp->slot_shift = QFQ_MTU_SHIFT + FRAC_BITS -
 818                                         (QFQ_MAX_INDEX - i);
 819         }
 820         return 0;
 821 }
 822
 823 /*
 824  * QFQ scheduler descriptor
 825  */
 826 static struct dn_alg qfq_desc = {
 827         _SI( .type = ) DN_SCHED_QFQ,
 828         _SI( .name = ) "QFQ",
 829         _SI( .flags = ) DN_MULTIQUEUE,
 830
 831         _SI( .schk_datalen = ) 0,
 832         _SI( .si_datalen = ) sizeof(struct qfq_sched),
 833         _SI( .q_datalen = ) sizeof(struct qfq_class) - sizeof(struct dn_queue),
 834
 835         _SI( .enqueue = ) qfq_enqueue,
 836         _SI( .dequeue = ) qfq_dequeue,
 837
 838         _SI( .config = )  NULL,
 839         _SI( .destroy = )  NULL,
 840         _SI( .new_sched = ) qfq_new_sched,
 841         _SI( .free_sched = )  NULL,
 842         _SI( .new_fsk = ) qfq_new_fsk,
 843         _SI( .free_fsk = )  NULL,
 844         _SI( .new_queue = ) qfq_new_queue,
 845         _SI( .free_queue = ) qfq_free_queue,
 846 #ifdef NEW_AQM
 847         _SI( .getconfig = )  NULL,
 848 #endif
 849 };
 850
 851 DECLARE_DNSCHED_MODULE(dn_qfq, &qfq_desc);
 852
 853 #ifdef QFQ_DEBUG
 854 static void
 855 dump_groups(struct qfq_sched *q, uint32_t mask)
 856 {
 857         int i, j;
 858
 859         for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
 860                 struct qfq_group *g = &q->groups[i];
 861
 862                 if (0 == (mask & (1<<i)))
 863                         continue;
 864                 for (j = 0; j < QFQ_MAX_SLOTS; j++) {
 865                         if (g->slots[j])
 866                                 D("    bucket %d %p", j, g->slots[j]);
 867                 }
 868                 D("full_slots 0x%llx", (_P64)g->full_slots);
 869                 D("        %2d S 0x%20llx F 0x%llx %c", i,
 870                         (_P64)g->S, (_P64)g->F,
 871                         mask & (1<<i) ? '1' : '0');
 872         }
 873 }
 874
 875 static void
 876 dump_sched(struct qfq_sched *q, const char *msg)
 877 {
 878         D("--- in %s: ---", msg);
 879         D("loops %d queued %d V 0x%llx", q->loops, q->queued, (_P64)q->V);
 880         D("    ER 0x%08x", (unsigned)q->bitmaps[ER]);
 881         D("    EB 0x%08x", (unsigned)q->bitmaps[EB]);
 882         D("    IR 0x%08x", (unsigned)q->bitmaps[IR]);
 883         D("    IB 0x%08x", (unsigned)q->bitmaps[IB]);
 884         dump_groups(q, 0xffffffff);
 885 };
 886 #endif /* QFQ_DEBUG */