2 * Copyright (c) 2010-2011 Juniper Networks, Inc.
5 * This software was developed by Robert N. M. Watson under contract
6 * to Juniper Networks, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
34 #include "opt_inet6.h"
35 #include "opt_pcbgroup.h"
38 #error "options RSS depends on options PCBGROUP"
41 #include <sys/param.h>
43 #include <sys/socket.h>
45 #include <sys/kernel.h>
47 #include <sys/sysctl.h>
51 #include <net/if_var.h>
52 #include <net/netisr.h>
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_rss.h>
57 #include <netinet/in_var.h>
58 #include <netinet/toeplitz.h>
61 * Operating system parts of receiver-side scaling (RSS), which allows
62 * network cards to direct flows to particular receive queues based on hashes
63 * of header tuples. This implementation aligns RSS buckets with connection
64 * groups at the TCP/IP layer, so each bucket is associated with exactly one
65 * group. As a result, the group lookup structures (and lock) should have an
66 * effective affinity with exactly one CPU.
68 * Network device drivers needing to configure RSS will query this framework
69 * for parameters, such as the current RSS key, hashing policies, number of
70 * bits, and indirection table mapping hashes to buckets and CPUs. They may
71 * provide their own supplementary information, such as queue<->CPU bindings.
72 * It is the responsibility of the network device driver to inject packets
73 * into the stack on as close to the right CPU as possible, if playing by RSS
78 * - Synchronization for rss_key and other future-configurable parameters.
79 * - Event handler drivers can register to pick up RSS configuration changes.
80 * - Should we allow rss_basecpu to be configured?
81 * - Randomize key on boot.
83 * - Statistics on how often there's a misalignment between hardware
84 * placement and pcbgroup expectations.
87 SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering");
90 * Toeplitz is the only required hash function in the RSS spec, so use it by
93 static u_int rss_hashalgo = RSS_HASH_TOEPLITZ;
94 SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RD, &rss_hashalgo, 0,
95 "RSS hash algorithm");
96 TUNABLE_INT("net.inet.rss.hashalgo", &rss_hashalgo);
99 * Size of the indirection table; at most 128 entries per the RSS spec. We
100 * size it to at least 2 times the number of CPUs by default to allow useful
101 * rebalancing. If not set explicitly with a loader tunable, we tune based
102 * on the number of CPUs present.
104 * XXXRW: buckets might be better to use for the tunable than bits.
106 static u_int rss_bits;
107 SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RD, &rss_bits, 0,
109 TUNABLE_INT("net.inet.rss.bits", &rss_bits);
111 static u_int rss_mask;
112 SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
115 static const u_int rss_maxbits = RSS_MAXBITS;
116 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
117 __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
120 * RSS's own count of the number of CPUs it could be using for processing.
121 * Bounded to 64 by RSS constants.
123 static u_int rss_ncpus;
124 SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
125 "Number of CPUs available to RSS");
127 #define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1))
128 static const u_int rss_maxcpus = RSS_MAXCPUS;
129 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
130 __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
133 * Variable exists just for reporting rss_bits in a user-friendly way.
135 static u_int rss_buckets;
136 SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
140 * Base CPU number; devices will add this to all CPU numbers returned by the
141 * RSS indirection table. Currently unmodifiable in FreeBSD.
143 static const u_int rss_basecpu;
144 SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
145 __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
148 * RSS secret key, intended to prevent attacks on load-balancing. Its
149 * effectiveness may be limited by algorithm choice and available entropy
152 * XXXRW: And that we don't randomize it yet!
154 * XXXRW: This default is actually the default key from Chelsio T3 cards, as
155 * it offers reasonable distribution, unlike all-0 keys which always
156 * generate a hash of 0 (upsettingly).
158 static uint8_t rss_key[RSS_KEYSIZE] = {
159 0x43, 0xa3, 0x8f, 0xb0, 0x41, 0x67, 0x25, 0x3d,
160 0x25, 0x5b, 0x0e, 0xc2, 0x6d, 0x5a, 0x56, 0xda,
161 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
162 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
163 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
167 * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
168 * Drivers may supplement this table with a separate CPU<->queue table when
169 * programming devices.
171 struct rss_table_entry {
172 uint8_t rte_cpu; /* CPU affinity of bucket. */
174 static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN];
177 rss_init(__unused void *arg)
183 * Validate tunables, coerce to sensible values.
185 switch (rss_hashalgo) {
186 case RSS_HASH_TOEPLITZ:
191 printf("%s: invalid RSS hashalgo %u, coercing to %u",
192 __func__, rss_hashalgo, RSS_HASH_TOEPLITZ);
193 rss_hashalgo = RSS_HASH_TOEPLITZ;
197 * Count available CPUs.
199 * XXXRW: Note incorrect assumptions regarding contiguity of this set
203 for (i = 0; i <= mp_maxid; i++) {
208 if (rss_ncpus > RSS_MAXCPUS)
209 rss_ncpus = RSS_MAXCPUS;
212 * Tune RSS table entries to be no less than 2x the number of CPUs
213 * -- unless we're running uniprocessor, in which case there's not
214 * much point in having buckets to rearrange for load-balancing!
218 rss_bits = fls(rss_ncpus - 1) + 1;
221 * Microsoft limits RSS table entries to 128, so apply that
222 * limit to both auto-detected CPU counts and user-configured
225 if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
226 printf("%s: RSS bits %u not valid, coercing to %u",
227 __func__, rss_bits, RSS_MAXBITS);
228 rss_bits = RSS_MAXBITS;
232 * Figure out how many buckets to use; warn if less than the
233 * number of configured CPUs, although this is not a fatal
236 rss_buckets = (1 << rss_bits);
237 if (rss_buckets < rss_ncpus)
238 printf("%s: WARNING: rss_buckets (%u) less than "
239 "rss_ncpus (%u)\n", __func__, rss_buckets,
241 rss_mask = rss_buckets - 1;
249 * Set up initial CPU assignments: round-robin by default.
252 for (i = 0; i < rss_buckets; i++) {
253 rss_table[i].rte_cpu = cpuid;
254 cpuid = CPU_NEXT(cpuid);
260 * XXXRW: Not yet. If nothing else, will require an rss_isbadkey()
261 * loop to check for "bad" RSS keys.
264 SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
267 rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
274 for (i = 0; i < keylen; i++)
276 for (i = 0; i < datalen; i++)
282 rss_hash(u_int datalen, const uint8_t *data)
285 switch (rss_hashalgo) {
286 case RSS_HASH_TOEPLITZ:
287 return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
291 return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
295 panic("%s: unsupported/unknown hashalgo %d", __func__,
301 * Hash an IPv4 2-tuple.
304 rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst)
306 uint8_t data[sizeof(src) + sizeof(dst)];
310 bcopy(&src, &data[datalen], sizeof(src));
311 datalen += sizeof(src);
312 bcopy(&dst, &data[datalen], sizeof(dst));
313 datalen += sizeof(dst);
314 return (rss_hash(datalen, data));
318 * Hash an IPv4 4-tuple.
321 rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst,
324 uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) +
329 bcopy(&src, &data[datalen], sizeof(src));
330 datalen += sizeof(src);
331 bcopy(&dst, &data[datalen], sizeof(dst));
332 datalen += sizeof(dst);
333 bcopy(&srcport, &data[datalen], sizeof(srcport));
334 datalen += sizeof(srcport);
335 bcopy(&dstport, &data[datalen], sizeof(dstport));
336 datalen += sizeof(dstport);
337 return (rss_hash(datalen, data));
342 * Hash an IPv6 2-tuple.
345 rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst)
347 uint8_t data[sizeof(src) + sizeof(dst)];
351 bcopy(&src, &data[datalen], sizeof(src));
352 datalen += sizeof(src);
353 bcopy(&dst, &data[datalen], sizeof(dst));
354 datalen += sizeof(dst);
355 return (rss_hash(datalen, data));
359 * Hash an IPv6 4-tuple.
362 rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport,
363 struct in6_addr dst, u_short dstport)
365 uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) +
370 bcopy(&src, &data[datalen], sizeof(src));
371 datalen += sizeof(src);
372 bcopy(&dst, &data[datalen], sizeof(dst));
373 datalen += sizeof(dst);
374 bcopy(&srcport, &data[datalen], sizeof(srcport));
375 datalen += sizeof(srcport);
376 bcopy(&dstport, &data[datalen], sizeof(dstport));
377 datalen += sizeof(dstport);
378 return (rss_hash(datalen, data));
383 * Query the number of RSS bits in use.
393 * Query the RSS bucket associated with an RSS hash.
396 rss_getbucket(u_int hash)
399 return (hash & rss_mask);
403 * Query the RSS layer bucket associated with the given
404 * entry in the RSS hash space.
406 * The RSS indirection table is 0 .. rss_buckets-1,
407 * covering the low 'rss_bits' of the total 128 slot
408 * RSS indirection table. So just mask off rss_bits and
411 * NIC drivers can then iterate over the 128 slot RSS
412 * indirection table and fetch which RSS bucket to
413 * map it to. This will typically be a CPU queue
416 rss_get_indirection_to_bucket(u_int index)
419 return (index & rss_mask);
423 * Query the RSS CPU associated with an RSS bucket.
426 rss_getcpu(u_int bucket)
429 return (rss_table[bucket].rte_cpu);
433 * netisr CPU affinity lookup given just the hash and hashtype.
436 rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
440 case M_HASHTYPE_RSS_IPV4:
441 case M_HASHTYPE_RSS_TCP_IPV4:
442 return (rss_getcpu(rss_getbucket(hash_val)));
444 return (NETISR_CPUID_NONE);
449 * Query the RSS bucket associated with the given hash value and
453 rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
457 case M_HASHTYPE_RSS_IPV4:
458 case M_HASHTYPE_RSS_TCP_IPV4:
459 *bucket_id = rss_getbucket(hash_val);
467 * netisr CPU affinity lookup routine for use by protocols.
470 rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
474 *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
479 rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
484 return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
489 * Query the RSS hash algorithm.
492 rss_gethashalgo(void)
495 return (rss_hashalgo);
499 * Query the current RSS key; likely to be used by device drivers when
500 * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE.
502 * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
505 rss_getkey(uint8_t *key)
508 bcopy(rss_key, key, sizeof(rss_key));
512 * Query the number of buckets; this may be used by both network device
513 * drivers, which will need to populate hardware shadows of the software
514 * indirection table, and the network stack itself (such as when deciding how
515 * many connection groups to allocate).
518 rss_getnumbuckets(void)
521 return (rss_buckets);
525 * Query the number of CPUs in use by RSS; may be useful to device drivers
526 * trying to figure out how to map a larger number of CPUs into a smaller
527 * number of receive queues.
537 * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
538 * it appearing in debugging output unnecessarily.
541 sysctl_rss_key(SYSCTL_HANDLER_ARGS)
543 uint8_t temp_rss_key[RSS_KEYSIZE];
546 error = priv_check(req->td, PRIV_NETINET_HASHKEY);
550 bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
551 error = sysctl_handle_opaque(oidp, temp_rss_key,
552 sizeof(temp_rss_key), req);
555 if (req->newptr != NULL) {
556 /* XXXRW: Not yet. */
561 SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
562 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
563 "", "RSS keying material");
566 sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
573 error = sysctl_wire_old_buffer(req, 0);
576 sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
579 for (i = 0; i < rss_buckets; i++) {
580 sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
584 error = sbuf_finish(sb);
589 SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
590 CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
591 sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");