2 * Copyright (c) 2010-2011 Juniper Networks, Inc.
5 * This software was developed by Robert N. M. Watson under contract
6 * to Juniper Networks, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
#include <sys/cdefs.h>

__FBSDID("$FreeBSD$");

#include "opt_inet6.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/smp.h>
#include <sys/socketvar.h>

#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif /* INET6 */
/*
 * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
 * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
 * Strategies in Modern Operating Systems".  This implementation differs
 * significantly from that described in the paper, in that it attempts to
 * introduce not just notions of affinity for connections and distribute work
 * so as to reduce lock contention, but also align those notions with
 * hardware work distribution strategies such as RSS.  In this construction,
 * connection groups supplement, rather than replace, existing reservation
 * tables for protocol 4-tuples, offering CPU-affine lookup tables with
 * minimal cache line migration and lock contention during steady state
 * operation.
 *
 * Hardware-offloaded checksums are often inefficient in software -- for
 * example, Toeplitz, specified by RSS, introduces a significant overhead if
 * performed during per-packet processing.  It is therefore desirable to fall
 * back on traditional reservation table lookups without affinity where
 * hardware-offloaded checksums aren't available, such as for traffic over
 * non-RSS interfaces.
 *
 * Internet protocols, such as UDP and TCP, register to use connection groups
 * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
 * indicates to the connection group code whether a 2-tuple or 4-tuple is
 * used as an argument to hashes that assign a connection to a particular
 * group.  This must be aligned with any hardware offloaded distribution
 * model, such as RSS or similar approaches taken in embedded network boards.
 * Wildcard sockets require special handling, as in Willman 2006, and are
 * shared between connection groups -- while being protected by group-local
 * locks.  This means that connection establishment and teardown can be
 * significantly more expensive than without connection groups, but that
 * steady-state processing can be significantly faster.
 *
 * When RSS is used, certain connection group parameters, such as the number
 * of groups, are provided by the RSS implementation, found in in_rss.c.
 * Otherwise, in_pcbgroup.c selects possible sensible parameters
 * corresponding to the degree of parallelism exposed by netisr.
 *
 * Most of the implementation of connection groups is in this file; however,
 * connection group lookup is implemented in in_pcb.c alongside reservation
 * table lookups -- see in_pcblookup_group().
 *
 * TODO:
 *
 * Implement dynamic rebalancing of buckets with connection groups; when
 * load is unevenly distributed, search for more optimal balancing on
 * demand.  This might require scaling up the number of connection groups
 * by <<1.
 *
 * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
 * groups for ip_input and ip6_input, allowing non-offloaded work
 * distribution.
 *
 * Expose effective CPU affinity of connections to userspace using socket
 * options.
 *
 * Investigate per-connection affinity overrides based on socket options; an
 * option could be set, certainly resulting in work being distributed
 * differently in software, and possibly propagated to supporting hardware
 * with TCAMs or hardware hash tables.  This might require connections to
 * exist in more than one connection group at a time.
 *
 * Hook netisr thread reconfiguration events, and propagate those to RSS so
 * that rebalancing can occur when the thread pool grows or shrinks.
 *
 * Expose per-pcbgroup statistics to userspace monitoring tools such as
 * netstat, in order to allow better debugging and profiling.
 */
121 in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields,
124 struct inpcbgroup *pcbgroup;
125 u_int numpcbgroups, pgn;
128 * Only enable connection groups for a protocol if it has been
129 * specifically requested.
131 if (hashfields == IPI_HASHFIELDS_NONE)
135 * Connection groups are about multi-processor load distribution,
136 * lock contention, and connection CPU affinity. As such, no point
137 * in turning them on for a uniprocessor machine, it only wastes
145 * If we're using RSS, then RSS determines the number of connection
146 * groups to use: one connection group per RSS bucket. If for some
147 * reason RSS isn't able to provide a number of buckets, disable
148 * connection groups entirely.
150 * XXXRW: Can this ever happen?
152 numpcbgroups = rss_getnumbuckets();
153 if (numpcbgroups == 0)
157 * Otherwise, we'll just use one per CPU for now. If we decide to
158 * do dynamic rebalancing a la RSS, we'll need similar logic here.
160 numpcbgroups = mp_ncpus;
163 pcbinfo->ipi_hashfields = hashfields;
164 pcbinfo->ipi_pcbgroups = malloc(numpcbgroups *
165 sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO);
166 pcbinfo->ipi_npcbgroups = numpcbgroups;
167 pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB,
168 &pcbinfo->ipi_wildmask);
169 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
170 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
171 pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB,
172 &pcbgroup->ipg_hashmask);
173 INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup");
176 * Initialise notional affinity of the pcbgroup -- for RSS,
177 * we want the same notion of affinity as NICs to be used. In
178 * the non-RSS case, just round robin for the time being.
180 * XXXRW: The notion of a bucket to CPU mapping is common at
181 * both pcbgroup and RSS layers -- does that mean that we
182 * should migrate it all from RSS to here, and just leave RSS
183 * responsible only for providing hashing and mapping funtions?
186 pcbgroup->ipg_cpu = rss_getcpu(pgn);
188 pcbgroup->ipg_cpu = (pgn % mp_ncpus);
194 in_pcbgroup_destroy(struct inpcbinfo *pcbinfo)
196 struct inpcbgroup *pcbgroup;
199 if (pcbinfo->ipi_npcbgroups == 0)
202 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
203 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
204 KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead),
205 ("in_pcbinfo_destroy: listhead not empty"));
206 INP_GROUP_LOCK_DESTROY(pcbgroup);
207 hashdestroy(pcbgroup->ipg_hashbase, M_PCB,
208 pcbgroup->ipg_hashmask);
210 hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask);
211 free(pcbinfo->ipi_pcbgroups, M_PCB);
212 pcbinfo->ipi_pcbgroups = NULL;
213 pcbinfo->ipi_npcbgroups = 0;
214 pcbinfo->ipi_hashfields = 0;
218 * Given a hash of whatever the covered tuple might be, return a pcbgroup
219 * index. Where RSS is supported, try to align bucket selection with RSS CPU
222 static __inline u_int
223 in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
227 return (rss_getbucket(hash));
229 return (hash % pcbinfo->ipi_npcbgroups);
234 * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
235 * information is insufficient to identify the pcbgroup. This might occur if
236 * a TCP packet turns up with a 2-tuple hash, or if an RSS hash is present but
237 * RSS is not compiled into the kernel.
240 in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
244 if ((pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
245 hashtype == M_HASHTYPE_RSS_TCP_IPV4) ||
246 (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_4TUPLE &&
247 hashtype == M_HASHTYPE_RSS_UDP_IPV4) ||
248 (pcbinfo->ipi_hashfields == IPI_HASHFIELDS_2TUPLE &&
249 hashtype == M_HASHTYPE_RSS_IPV4))
250 return (&pcbinfo->ipi_pcbgroups[
251 in_pcbgroup_getbucket(pcbinfo, hash)]);
256 static struct inpcbgroup *
257 in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
260 return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
261 m->m_pkthdr.flowid));
265 in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
266 u_short lport, struct in_addr faddr, u_short fport)
271 * RSS note: we pass foreign addr/port as source, and local addr/port
272 * as destination, as we want to align with what the hardware is
275 switch (pcbinfo->ipi_hashfields) {
276 case IPI_HASHFIELDS_4TUPLE:
278 hash = rss_hash_ip4_4tuple(faddr, fport, laddr, lport);
280 hash = faddr.s_addr ^ fport;
284 case IPI_HASHFIELDS_2TUPLE:
286 hash = rss_hash_ip4_2tuple(faddr, laddr);
288 hash = faddr.s_addr ^ laddr.s_addr;
295 return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
300 in_pcbgroup_byinpcb(struct inpcb *inp)
304 * Listen sockets with INP_RSS_BUCKET_SET set have a pre-determined
305 * RSS bucket and thus we should use this pcbgroup, rather than
306 * using a tuple or hash.
308 * XXX should verify that there's actually pcbgroups and inp_rss_listen_bucket
311 if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
312 return (&inp->inp_pcbinfo->ipi_pcbgroups[inp->inp_rss_listen_bucket]);
315 return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
316 inp->inp_lport, inp->inp_faddr, inp->inp_fport));
320 in_pcbwild_add(struct inpcb *inp)
322 struct inpcbinfo *pcbinfo;
323 struct inpcbhead *head;
326 INP_WLOCK_ASSERT(inp);
327 KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
328 ("%s: is wild",__func__));
330 pcbinfo = inp->inp_pcbinfo;
331 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
332 INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
333 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
334 0, pcbinfo->ipi_wildmask)];
335 LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
336 inp->inp_flags2 |= INP_PCBGROUPWILD;
337 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
338 INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
342 in_pcbwild_remove(struct inpcb *inp)
344 struct inpcbinfo *pcbinfo;
347 INP_WLOCK_ASSERT(inp);
348 KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
349 ("%s: not wild", __func__));
351 pcbinfo = inp->inp_pcbinfo;
352 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
353 INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
354 LIST_REMOVE(inp, inp_pcbgroup_wild);
355 for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
356 INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
357 inp->inp_flags2 &= ~INP_PCBGROUPWILD;
361 in_pcbwild_needed(struct inpcb *inp)
365 * If it's a listen socket and INP_RSS_BUCKET_SET is set,
366 * it's a wildcard socket _but_ it's in a specific pcbgroup.
367 * Thus we don't treat it as a pcbwild inp.
369 if (inp->inp_flags2 & INP_RSS_BUCKET_SET)
374 if (inp->inp_vflag & INP_IPV6)
375 return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
378 return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
382 in_pcbwild_update_internal(struct inpcb *inp)
386 wildcard_needed = in_pcbwild_needed(inp);
387 if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD))
389 else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD))
390 in_pcbwild_remove(inp);
394 * Update the pcbgroup of an inpcb, which might include removing an old
395 * pcbgroup reference and/or adding a new one. Wildcard processing is not
396 * performed here, although ideally we'll never install a pcbgroup for a
397 * wildcard inpcb (asserted below).
400 in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
401 struct inpcbgroup *newpcbgroup, struct inpcb *inp)
403 struct inpcbgroup *oldpcbgroup;
404 struct inpcbhead *pcbhash;
405 uint32_t hashkey_faddr;
407 INP_WLOCK_ASSERT(inp);
409 oldpcbgroup = inp->inp_pcbgroup;
410 if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
411 INP_GROUP_LOCK(oldpcbgroup);
412 LIST_REMOVE(inp, inp_pcbgrouphash);
413 inp->inp_pcbgroup = NULL;
414 INP_GROUP_UNLOCK(oldpcbgroup);
416 if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
418 if (inp->inp_vflag & INP_IPV6)
419 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
422 hashkey_faddr = inp->inp_faddr.s_addr;
423 INP_GROUP_LOCK(newpcbgroup);
425 * If the inp is an RSS bucket wildcard entry, ensure
426 * that the PCB hash is calculated correctly.
428 * The wildcard hash calculation differs from the
429 * non-wildcard definition. The source address is
430 * INADDR_ANY and the far port is 0.
432 if (inp->inp_flags2 & INP_RSS_BUCKET_SET) {
433 pcbhash = &newpcbgroup->ipg_hashbase[
434 INP_PCBHASH(INADDR_ANY, inp->inp_lport, 0,
435 newpcbgroup->ipg_hashmask)];
437 pcbhash = &newpcbgroup->ipg_hashbase[
438 INP_PCBHASH(hashkey_faddr, inp->inp_lport,
440 newpcbgroup->ipg_hashmask)];
442 LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
443 inp->inp_pcbgroup = newpcbgroup;
444 INP_GROUP_UNLOCK(newpcbgroup);
447 KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)),
448 ("%s: pcbgroup and wildcard!", __func__));
452 * Two update paths: one in which the 4-tuple on an inpcb has been updated
453 * and therefore connection groups may need to change (or a wildcard entry
454 * may needed to be installed), and another in which the 4-tuple has been
455 * set as a result of a packet received, in which case we may be able to use
456 * the hash on the mbuf to avoid doing a software hash calculation for RSS.
458 * In each case: first, let the wildcard code have a go at placing it as a
459 * wildcard socket. If it was a wildcard, or if the connection has been
460 * dropped, then no pcbgroup is required (so potentially clear it);
461 * otherwise, calculate and update the pcbgroup for the inpcb.
464 in_pcbgroup_update(struct inpcb *inp)
466 struct inpcbinfo *pcbinfo;
467 struct inpcbgroup *newpcbgroup;
469 INP_WLOCK_ASSERT(inp);
471 pcbinfo = inp->inp_pcbinfo;
472 if (!in_pcbgroup_enabled(pcbinfo))
475 in_pcbwild_update_internal(inp);
476 if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
477 !(inp->inp_flags & INP_DROPPED)) {
479 if (inp->inp_vflag & INP_IPV6)
480 newpcbgroup = in6_pcbgroup_byinpcb(inp);
483 newpcbgroup = in_pcbgroup_byinpcb(inp);
486 in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
490 in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
492 struct inpcbinfo *pcbinfo;
493 struct inpcbgroup *newpcbgroup;
495 INP_WLOCK_ASSERT(inp);
497 pcbinfo = inp->inp_pcbinfo;
498 if (!in_pcbgroup_enabled(pcbinfo))
502 * Possibly should assert !INP_PCBGROUPWILD rather than testing for
503 * it; presumably this function should never be called for anything
504 * other than non-wildcard socket?
506 in_pcbwild_update_internal(inp);
507 if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
508 !(inp->inp_flags & INP_DROPPED)) {
509 newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m);
511 if (inp->inp_vflag & INP_IPV6) {
512 if (newpcbgroup == NULL)
513 newpcbgroup = in6_pcbgroup_byinpcb(inp);
516 if (newpcbgroup == NULL)
517 newpcbgroup = in_pcbgroup_byinpcb(inp);
523 in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
527 * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb.
530 in_pcbgroup_remove(struct inpcb *inp)
532 struct inpcbgroup *pcbgroup;
534 INP_WLOCK_ASSERT(inp);
536 if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
539 if (inp->inp_flags2 & INP_PCBGROUPWILD)
540 in_pcbwild_remove(inp);
542 pcbgroup = inp->inp_pcbgroup;
543 if (pcbgroup != NULL) {
544 INP_GROUP_LOCK(pcbgroup);
545 LIST_REMOVE(inp, inp_pcbgrouphash);
546 inp->inp_pcbgroup = NULL;
547 INP_GROUP_UNLOCK(pcbgroup);
552 * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
556 in_pcbgroup_enabled(struct inpcbinfo *pcbinfo)
559 return (pcbinfo->ipi_npcbgroups > 0);