]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - sys/kern/uipc_ktls.c
ktls: Don't mark existing received mbufs notready for TOE TLS.
[FreeBSD/FreeBSD.git] / sys / kern / uipc_ktls.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2014-2019 Netflix Inc.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30
31 #include "opt_inet.h"
32 #include "opt_inet6.h"
33 #include "opt_ratelimit.h"
34 #include "opt_rss.h"
35
36 #include <sys/param.h>
37 #include <sys/kernel.h>
38 #include <sys/domainset.h>
39 #include <sys/ktls.h>
40 #include <sys/lock.h>
41 #include <sys/mbuf.h>
42 #include <sys/mutex.h>
43 #include <sys/rmlock.h>
44 #include <sys/proc.h>
45 #include <sys/protosw.h>
46 #include <sys/refcount.h>
47 #include <sys/smp.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/sysctl.h>
51 #include <sys/taskqueue.h>
52 #include <sys/kthread.h>
53 #include <sys/uio.h>
54 #include <sys/vmmeter.h>
55 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
56 #include <machine/pcb.h>
57 #endif
58 #include <machine/vmparam.h>
59 #include <net/if.h>
60 #include <net/if_var.h>
61 #ifdef RSS
62 #include <net/netisr.h>
63 #include <net/rss_config.h>
64 #endif
65 #include <net/route.h>
66 #include <net/route/nhop.h>
67 #if defined(INET) || defined(INET6)
68 #include <netinet/in.h>
69 #include <netinet/in_pcb.h>
70 #endif
71 #include <netinet/tcp_var.h>
72 #ifdef TCP_OFFLOAD
73 #include <netinet/tcp_offload.h>
74 #endif
75 #include <opencrypto/xform.h>
76 #include <vm/uma_dbg.h>
77 #include <vm/vm.h>
78 #include <vm/vm_pageout.h>
79 #include <vm/vm_page.h>
80
/*
 * Per-CPU work queue feeding one KTLS worker thread.  Aligned to a
 * cache line to avoid false sharing between adjacent queues.
 */
struct ktls_wq {
	struct mtx	mtx;	/* protects both queues and 'running' */
	STAILQ_HEAD(, mbuf) m_head;	/* TX: mbufs queued for SW encryption */
	STAILQ_HEAD(, socket) so_head;	/* RX: sockets queued for SW decryption */
	bool		running;	/* worker is awake and draining the queues */
	int		lastallocfail;	/* NOTE(review): appears to track the last
					 * buffer allocation failure; set/read
					 * outside this chunk — confirm. */
} __aligned(CACHE_LINE_SIZE);
88
/*
 * Per-NUMA-domain table of the CPUs hosting KTLS worker threads,
 * used when threads are bound to domains (ktls_bind_threads > 1).
 */
struct ktls_domain_info {
	int count;		/* number of valid entries in cpu[] */
	int cpu[MAXCPU];	/* CPU ids of the KTLS threads in this domain */
};
93
struct ktls_domain_info ktls_domains[MAXMEMDOM];	/* domain -> worker CPUs */
static struct ktls_wq *ktls_wq;		/* array of mp_maxid + 1 work queues */
static struct proc *ktls_proc;		/* process hosting the KTLS kthreads */
static uma_zone_t ktls_session_zone;	/* allocator for struct ktls_session */
static uma_zone_t ktls_buffer_zone;	/* cache of SW-encryption output buffers */
static uint16_t ktls_cpuid_lookup[MAXCPU];	/* thread index -> CPU id */
100
SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Kernel TLS offload");
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Kernel TLS offload stats");

/* With RSS, flows already hash to CPUs, so bind worker threads by default. */
#ifdef RSS
static int ktls_bind_threads = 1;
#else
static int ktls_bind_threads;
#endif
SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
    &ktls_bind_threads, 0,
    "Bind crypto threads to cores (1) or cores and domains (2) at boot");

static u_int ktls_maxlen = 16384;
SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN,
    &ktls_maxlen, 0, "Maximum TLS record size");

static int ktls_number_threads;
SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
    &ktls_number_threads, 0,
    "Number of TLS threads in thread-pool");

static bool ktls_offload_enable;
SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
    &ktls_offload_enable, 0,
    "Enable support for kernel TLS offload");

static bool ktls_cbc_enable = true;
SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN,
    &ktls_cbc_enable, 1,
    "Enable Support of AES-CBC crypto for kernel TLS");

static bool ktls_sw_buffer_cache = true;
SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN,
    &ktls_sw_buffer_cache, 1,
    "Enable caching of output buffers for SW encryption");

/* Global KTLS statistics, exported under kern.ipc.tls / kern.ipc.tls.stats. */
static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
    &ktls_tasks_active, "Number of active tasks");

static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
    &ktls_cnt_tx_queued,
    "Number of TLS records in queue to tasks for SW encryption");

static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
    &ktls_cnt_rx_queued,
    "Number of TLS sockets in queue to tasks for SW decryption");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_total);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
    CTLFLAG_RD, &ktls_offload_total,
    "Total successful TLS setups (parameters set)");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
    CTLFLAG_RD, &ktls_offload_enable_calls,
    "Total number of TLS enable calls made");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_active);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
    &ktls_offload_active, "Total Active TLS sessions");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
    &ktls_offload_corrupted_records, "Total corrupted TLS records received");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
    &ktls_offload_failed_crypto, "Total TLS crypto failures");

static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
    &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");

static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
    &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");

static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
    &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");

/* Per-offload-mode sysctl subtrees for the session counters below. */
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Software TLS session stats");
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Hardware (ifnet) TLS session stats");
#ifdef TCP_OFFLOAD
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "TOE TLS session stats");
#endif
195
/* Active-session counters broken out by offload mode and cipher. */
static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc);
SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
    "Active number of software TLS sessions using AES-CBC");

static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm);
SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
    "Active number of software TLS sessions using AES-GCM");

static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20);
SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD,
    &ktls_sw_chacha20,
    "Active number of software TLS sessions using Chacha20-Poly1305");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
    &ktls_ifnet_cbc,
    "Active number of ifnet TLS sessions using AES-CBC");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
    &ktls_ifnet_gcm,
    "Active number of ifnet TLS sessions using AES-GCM");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD,
    &ktls_ifnet_chacha20,
    "Active number of ifnet TLS sessions using Chacha20-Poly1305");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
    &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
    &ktls_ifnet_reset_dropped,
    "TLS sessions dropped after failing to update ifnet send tag");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
    &ktls_ifnet_reset_failed,
    "TLS sessions that failed to allocate a new ifnet send tag");

/*
 * NOTE(review): declared as int but exported with SYSCTL_UINT; default is
 * 0 (ifnet TLS denied unless tuned on) — confirm both are intentional.
 */
static int ktls_ifnet_permitted;
SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
    &ktls_ifnet_permitted, 1,
    "Whether to permit hardware (ifnet) TLS sessions");

#ifdef TCP_OFFLOAD
static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc);
SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
    &ktls_toe_cbc,
    "Active number of TOE TLS sessions using AES-CBC");

static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm);
SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
    &ktls_toe_gcm,
    "Active number of TOE TLS sessions using AES-GCM");

static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20);
SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD,
    &ktls_toe_chacha20,
    "Active number of TOE TLS sessions using Chacha20-Poly1305");
#endif
259
static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");

/* Forward declarations for routines defined later in this file. */
static void ktls_cleanup(struct ktls_session *tls);
#if defined(INET) || defined(INET6)
static void ktls_reset_send_tag(void *context, int pending);
#endif
static void ktls_work_thread(void *ctx);
267
268 #if defined(INET) || defined(INET6)
/*
 * Pick the CPU (and hence the KTLS worker queue) that will service a
 * socket's TLS work.  The choice is stable for a given connection so
 * that records are processed in order.
 */
static u_int
ktls_get_cpu(struct socket *so)
{
	struct inpcb *inp;
#ifdef NUMA
	struct ktls_domain_info *di;
#endif
	u_int cpuid;

	inp = sotoinpcb(so);
#ifdef RSS
	/* Prefer the RSS mapping when the flow has one. */
	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
	if (cpuid != NETISR_CPUID_NONE)
		return (cpuid);
#endif
	/*
	 * Just use the flowid to shard connections in a repeatable
	 * fashion.  Note that TLS 1.0 sessions rely on the
	 * serialization provided by having the same connection use
	 * the same queue.
	 */
#ifdef NUMA
	/* Keep the work on the connection's NUMA domain when binding. */
	if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) {
		di = &ktls_domains[inp->inp_numa_domain];
		cpuid = di->cpu[inp->inp_flowid % di->count];
	} else
#endif
		cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads];
	return (cpuid);
}
299 #endif
300
/*
 * UMA import routine for the ktls buffer zone: allocate up to 'count'
 * wired, physically contiguous, page-aligned buffers of ktls_maxlen
 * bytes each from the given domain.  Returns the number allocated;
 * a partial result is legal and stops at the first failure.
 */
static int
ktls_buffer_import(void *arg, void **store, int count, int domain, int flags)
{
	vm_page_t m;
	int i;

	KASSERT((ktls_maxlen & PAGE_MASK) == 0,
	    ("%s: ktls max length %d is not page size-aligned",
	    __func__, ktls_maxlen));

	for (i = 0; i < count; i++) {
		m = vm_page_alloc_contig_domain(NULL, 0, domain,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_NODUMP | malloc2vm_flags(flags),
		    atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0,
		    VM_MEMATTR_DEFAULT);
		if (m == NULL)
			break;
		/* The zone caches the buffer's direct-map (DMAP) address. */
		store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
	}
	return (i);
}
323
324 static void
325 ktls_buffer_release(void *arg __unused, void **store, int count)
326 {
327         vm_page_t m;
328         int i, j;
329
330         for (i = 0; i < count; i++) {
331                 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
332                 for (j = 0; j < atop(ktls_maxlen); j++) {
333                         (void)vm_page_unwire_noq(m + j);
334                         vm_page_free(m + j);
335                 }
336         }
337 }
338
339 static void
340 ktls_free_mext_contig(struct mbuf *m)
341 {
342         M_ASSERTEXTPG(m);
343         uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0]));
344 }
345
/*
 * Boot-time initialization: create the session and buffer zones and
 * start one KTLS worker thread per CPU, optionally binding each to
 * its CPU (ktls_bind_threads == 1) or NUMA domain (> 1).
 */
static void
ktls_init(void *dummy __unused)
{
	struct thread *td;
	struct pcpu *pc;
	cpuset_t mask;
	int count, domain, error, i;

	ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
	    M_WAITOK | M_ZERO);

	ktls_session_zone = uma_zcreate("ktls_session",
	    sizeof(struct ktls_session),
	    NULL, NULL, NULL, NULL,
	    UMA_ALIGN_CACHE, 0);

	if (ktls_sw_buffer_cache) {
		ktls_buffer_zone = uma_zcache_create("ktls_buffers",
		    roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL,
		    ktls_buffer_import, ktls_buffer_release, NULL,
		    UMA_ZONE_FIRSTTOUCH);
	}

	/*
	 * Initialize the workqueues to run the TLS work.  We create a
	 * work queue for each CPU.
	 */
	CPU_FOREACH(i) {
		STAILQ_INIT(&ktls_wq[i].m_head);
		STAILQ_INIT(&ktls_wq[i].so_head);
		mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
		if (error)
			panic("Can't add KTLS thread %d error %d", i, error);

		/*
		 * Bind threads to cores.  If ktls_bind_threads is >
		 * 1, then we bind to the NUMA domain.
		 */
		if (ktls_bind_threads) {
			if (ktls_bind_threads > 1) {
				/* Record this CPU in its domain's table. */
				pc = pcpu_find(i);
				domain = pc->pc_domain;
				CPU_COPY(&cpuset_domain[domain], &mask);
				count = ktls_domains[domain].count;
				ktls_domains[domain].cpu[count] = i;
				ktls_domains[domain].count++;
			} else {
				CPU_SETOF(i, &mask);
			}
			error = cpuset_setthread(td->td_tid, &mask);
			if (error)
				panic(
			    "Unable to bind KTLS thread for CPU %d error %d",
				     i, error);
		}
		ktls_cpuid_lookup[ktls_number_threads] = i;
		ktls_number_threads++;
	}

	/*
	 * If we somehow have an empty domain, fall back to choosing
	 * among all KTLS threads.
	 */
	if (ktls_bind_threads > 1) {
		for (i = 0; i < vm_ndomains; i++) {
			if (ktls_domains[i].count == 0) {
				ktls_bind_threads = 1;
				break;
			}
		}
	}

	if (bootverbose)
		printf("KTLS: Initialized %d threads\n", ktls_number_threads);
}
/* Runs after SMP is up so per-CPU threads and domain info are valid. */
SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL);
424
425 #if defined(INET) || defined(INET6)
/*
 * Validate the user-supplied tls_enable parameters and construct a
 * new ktls_session: version/cipher checks first, then header/trailer
 * geometry, then copyin of key material and the implicit IV.
 *
 * Returns 0 with *tlsp set on success, or EINVAL/copyin error; on
 * failure any partially built session is torn down via ktls_cleanup().
 */
static int
ktls_create_session(struct socket *so, struct tls_enable *en,
    struct ktls_session **tlsp)
{
	struct ktls_session *tls;
	int error;

	/* Only TLS 1.0 - 1.3 are supported. */
	if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
		return (EINVAL);
	if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
	    en->tls_vminor > TLS_MINOR_VER_THREE)
		return (EINVAL);

	/* Bound all user-supplied lengths before any allocation. */
	if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
		return (EINVAL);
	if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
		return (EINVAL);
	if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
		return (EINVAL);

	/* All supported algorithms require a cipher key. */
	if (en->cipher_key_len == 0)
		return (EINVAL);

	/* No flags are currently supported. */
	if (en->flags != 0)
		return (EINVAL);

	/* Common checks for supported algorithms. */
	switch (en->cipher_algorithm) {
	case CRYPTO_AES_NIST_GCM_16:
		/*
		 * auth_algorithm isn't used, but permit GMAC values
		 * for compatibility.
		 */
		switch (en->auth_algorithm) {
		case 0:
#ifdef COMPAT_FREEBSD12
		/* XXX: Really 13.0-current COMPAT. */
		case CRYPTO_AES_128_NIST_GMAC:
		case CRYPTO_AES_192_NIST_GMAC:
		case CRYPTO_AES_256_NIST_GMAC:
#endif
			break;
		default:
			return (EINVAL);
		}
		if (en->auth_key_len != 0)
			return (EINVAL);
		/* GCM IV length is fixed per TLS version. */
		if ((en->tls_vminor == TLS_MINOR_VER_TWO &&
			en->iv_len != TLS_AEAD_GCM_LEN) ||
		    (en->tls_vminor == TLS_MINOR_VER_THREE &&
			en->iv_len != TLS_1_3_GCM_IV_LEN))
			return (EINVAL);
		break;
	case CRYPTO_AES_CBC:
		switch (en->auth_algorithm) {
		case CRYPTO_SHA1_HMAC:
			/*
			 * TLS 1.0 requires an implicit IV.  TLS 1.1+
			 * all use explicit IVs.
			 */
			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
				if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
					return (EINVAL);
				break;
			}

			/* FALLTHROUGH */
		case CRYPTO_SHA2_256_HMAC:
		case CRYPTO_SHA2_384_HMAC:
			/* Ignore any supplied IV. */
			en->iv_len = 0;
			break;
		default:
			return (EINVAL);
		}
		if (en->auth_key_len == 0)
			return (EINVAL);
		break;
	case CRYPTO_CHACHA20_POLY1305:
		if (en->auth_algorithm != 0 || en->auth_key_len != 0)
			return (EINVAL);
		if (en->tls_vminor != TLS_MINOR_VER_TWO &&
		    en->tls_vminor != TLS_MINOR_VER_THREE)
			return (EINVAL);
		if (en->iv_len != TLS_CHACHA20_IV_LEN)
			return (EINVAL);
		break;
	default:
		return (EINVAL);
	}

	tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);

	counter_u64_add(ktls_offload_active, 1);

	/* Caller gets the initial reference. */
	refcount_init(&tls->refcount, 1);
	TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);

	tls->wq_index = ktls_get_cpu(so);

	tls->params.cipher_algorithm = en->cipher_algorithm;
	tls->params.auth_algorithm = en->auth_algorithm;
	tls->params.tls_vmajor = en->tls_vmajor;
	tls->params.tls_vminor = en->tls_vminor;
	tls->params.flags = en->flags;
	tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);

	/* Set the header and trailer lengths. */
	tls->params.tls_hlen = sizeof(struct tls_record_layer);
	switch (en->cipher_algorithm) {
	case CRYPTO_AES_NIST_GCM_16:
		/*
		 * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
		 * nonce.  TLS 1.3 uses a 12 byte implicit IV.
		 */
		if (en->tls_vminor < TLS_MINOR_VER_THREE)
			tls->params.tls_hlen += sizeof(uint64_t);
		tls->params.tls_tlen = AES_GMAC_HASH_LEN;
		tls->params.tls_bs = 1;
		break;
	case CRYPTO_AES_CBC:
		switch (en->auth_algorithm) {
		case CRYPTO_SHA1_HMAC:
			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
				/* Implicit IV, no nonce. */
			} else {
				tls->params.tls_hlen += AES_BLOCK_LEN;
			}
			tls->params.tls_tlen = AES_BLOCK_LEN +
			    SHA1_HASH_LEN;
			break;
		case CRYPTO_SHA2_256_HMAC:
			tls->params.tls_hlen += AES_BLOCK_LEN;
			tls->params.tls_tlen = AES_BLOCK_LEN +
			    SHA2_256_HASH_LEN;
			break;
		case CRYPTO_SHA2_384_HMAC:
			tls->params.tls_hlen += AES_BLOCK_LEN;
			tls->params.tls_tlen = AES_BLOCK_LEN +
			    SHA2_384_HASH_LEN;
			break;
		default:
			/* Unreachable: validated above. */
			panic("invalid hmac");
		}
		tls->params.tls_bs = AES_BLOCK_LEN;
		break;
	case CRYPTO_CHACHA20_POLY1305:
		/*
		 * Chacha20 uses a 12 byte implicit IV.
		 */
		tls->params.tls_tlen = POLY1305_HASH_LEN;
		tls->params.tls_bs = 1;
		break;
	default:
		/* Unreachable: validated above. */
		panic("invalid cipher");
	}

	/*
	 * TLS 1.3 includes optional padding which we do not support,
	 * and also puts the "real" record type at the end of the
	 * encrypted data.
	 */
	if (en->tls_vminor == TLS_MINOR_VER_THREE)
		tls->params.tls_tlen += sizeof(uint8_t);

	KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
	    ("TLS header length too long: %d", tls->params.tls_hlen));
	KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
	    ("TLS trailer length too long: %d", tls->params.tls_tlen));

	/* Copy in key material from userland. */
	if (en->auth_key_len != 0) {
		tls->params.auth_key_len = en->auth_key_len;
		tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
		    M_WAITOK);
		error = copyin(en->auth_key, tls->params.auth_key,
		    en->auth_key_len);
		if (error)
			goto out;
	}

	tls->params.cipher_key_len = en->cipher_key_len;
	tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
	error = copyin(en->cipher_key, tls->params.cipher_key,
	    en->cipher_key_len);
	if (error)
		goto out;

	/*
	 * This holds the implicit portion of the nonce for AEAD
	 * ciphers and the initial implicit IV for TLS 1.0.  The
	 * explicit portions of the IV are generated in ktls_frame().
	 */
	if (en->iv_len != 0) {
		tls->params.iv_len = en->iv_len;
		error = copyin(en->iv, tls->params.iv, en->iv_len);
		if (error)
			goto out;

		/*
		 * For TLS 1.2 with GCM, generate an 8-byte nonce as a
		 * counter to generate unique explicit IVs.
		 *
		 * Store this counter in the last 8 bytes of the IV
		 * array so that it is 8-byte aligned.
		 */
		if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
		    en->tls_vminor == TLS_MINOR_VER_TWO)
			arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
	}

	*tlsp = tls;
	return (0);

out:
	/* Frees key material and drops the active-session counter. */
	ktls_cleanup(tls);
	return (error);
}
646
647 static struct ktls_session *
648 ktls_clone_session(struct ktls_session *tls)
649 {
650         struct ktls_session *tls_new;
651
652         tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
653
654         counter_u64_add(ktls_offload_active, 1);
655
656         refcount_init(&tls_new->refcount, 1);
657
658         /* Copy fields from existing session. */
659         tls_new->params = tls->params;
660         tls_new->wq_index = tls->wq_index;
661
662         /* Deep copy keys. */
663         if (tls_new->params.auth_key != NULL) {
664                 tls_new->params.auth_key = malloc(tls->params.auth_key_len,
665                     M_KTLS, M_WAITOK);
666                 memcpy(tls_new->params.auth_key, tls->params.auth_key,
667                     tls->params.auth_key_len);
668         }
669
670         tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS,
671             M_WAITOK);
672         memcpy(tls_new->params.cipher_key, tls->params.cipher_key,
673             tls->params.cipher_key_len);
674
675         return (tls_new);
676 }
677 #endif
678
/*
 * Tear down a session's mode-specific state and zeroize/free its key
 * material.  The per-mode counter decrements mirror the increments
 * performed when the mode was established.
 */
static void
ktls_cleanup(struct ktls_session *tls)
{

	counter_u64_add(ktls_offload_active, -1);
	switch (tls->mode) {
	case TCP_TLS_MODE_SW:
		switch (tls->params.cipher_algorithm) {
		case CRYPTO_AES_CBC:
			counter_u64_add(ktls_sw_cbc, -1);
			break;
		case CRYPTO_AES_NIST_GCM_16:
			counter_u64_add(ktls_sw_gcm, -1);
			break;
		case CRYPTO_CHACHA20_POLY1305:
			counter_u64_add(ktls_sw_chacha20, -1);
			break;
		}
		/* Release the OCF crypto session. */
		ktls_ocf_free(tls);
		break;
	case TCP_TLS_MODE_IFNET:
		switch (tls->params.cipher_algorithm) {
		case CRYPTO_AES_CBC:
			counter_u64_add(ktls_ifnet_cbc, -1);
			break;
		case CRYPTO_AES_NIST_GCM_16:
			counter_u64_add(ktls_ifnet_gcm, -1);
			break;
		case CRYPTO_CHACHA20_POLY1305:
			counter_u64_add(ktls_ifnet_chacha20, -1);
			break;
		}
		/* Drop the NIC send tag reference, if one was allocated. */
		if (tls->snd_tag != NULL)
			m_snd_tag_rele(tls->snd_tag);
		break;
#ifdef TCP_OFFLOAD
	case TCP_TLS_MODE_TOE:
		switch (tls->params.cipher_algorithm) {
		case CRYPTO_AES_CBC:
			counter_u64_add(ktls_toe_cbc, -1);
			break;
		case CRYPTO_AES_NIST_GCM_16:
			counter_u64_add(ktls_toe_gcm, -1);
			break;
		case CRYPTO_CHACHA20_POLY1305:
			counter_u64_add(ktls_toe_chacha20, -1);
			break;
		}
		break;
#endif
	}
	/* zfree() zeroizes key material before freeing it. */
	if (tls->params.auth_key != NULL) {
		zfree(tls->params.auth_key, M_KTLS);
		tls->params.auth_key = NULL;
		tls->params.auth_key_len = 0;
	}
	if (tls->params.cipher_key != NULL) {
		zfree(tls->params.cipher_key, M_KTLS);
		tls->params.cipher_key = NULL;
		tls->params.cipher_key_len = 0;
	}
	/* Scrub the implicit IV; explicit_bzero resists optimization. */
	explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
}
742
743 #if defined(INET) || defined(INET6)
744
745 #ifdef TCP_OFFLOAD
/*
 * Try to hand the session to a TOE (TCP offload engine) device.
 * Validates the inpcb is still live and the connection is actually
 * offloaded before asking the driver for a TLS session.
 *
 * Returns 0 on success (tls->mode set to TCP_TLS_MODE_TOE),
 * ECONNRESET if the connection is gone, or EOPNOTSUPP/driver error.
 */
static int
ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	inp = so->so_pcb;
	INP_WLOCK(inp);
	if (inp->inp_flags2 & INP_FREED) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	if (inp->inp_socket == NULL) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);
	/* TOE TLS only makes sense on a TOE-offloaded connection. */
	if (!(tp->t_flags & TF_TOE)) {
		INP_WUNLOCK(inp);
		return (EOPNOTSUPP);
	}

	error = tcp_offload_alloc_tls_session(tp, tls, direction);
	INP_WUNLOCK(inp);
	if (error == 0) {
		tls->mode = TCP_TLS_MODE_TOE;
		switch (tls->params.cipher_algorithm) {
		case CRYPTO_AES_CBC:
			counter_u64_add(ktls_toe_cbc, 1);
			break;
		case CRYPTO_AES_NIST_GCM_16:
			counter_u64_add(ktls_toe_gcm, 1);
			break;
		case CRYPTO_CHACHA20_POLY1305:
			counter_u64_add(ktls_toe_chacha20, 1);
			break;
		}
	}
	return (error);
}
791 #endif
792
793 /*
794  * Common code used when first enabling ifnet TLS on a connection or
795  * when allocating a new ifnet TLS session due to a routing change.
796  * This function allocates a new TLS send tag on whatever interface
797  * the connection is currently routed over.
798  */
static int
ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
    struct m_snd_tag **mstp)
{
	union if_snd_tag_alloc_params params;
	struct ifnet *ifp;
	struct nhop_object *nh;
	struct tcpcb *tp;
	int error;

	INP_RLOCK(inp);
	/* Fail if the connection is already going away. */
	if (inp->inp_flags2 & INP_FREED) {
		INP_RUNLOCK(inp);
		return (ECONNRESET);
	}
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_RUNLOCK(inp);
		return (ECONNRESET);
	}
	if (inp->inp_socket == NULL) {
		INP_RUNLOCK(inp);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);

	/*
	 * Check administrative controls on ifnet TLS to determine if
	 * ifnet TLS should be denied.
	 *
	 * - Always permit 'force' requests.
	 * - ktls_ifnet_permitted == 0: always deny.
	 */
	if (!force && ktls_ifnet_permitted == 0) {
		INP_RUNLOCK(inp);
		return (ENXIO);
	}

	/*
	 * XXX: Use the cached route in the inpcb to find the
	 * interface.  This should perhaps instead use
	 * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
	 * enabled after a connection has completed key negotiation in
	 * userland, the cached route will be present in practice.
	 */
	nh = inp->inp_route.ro_nh;
	if (nh == NULL) {
		INP_RUNLOCK(inp);
		return (ENXIO);
	}
	ifp = nh->nh_ifp;
	/* Hold the interface across the unlocked section below. */
	if_ref(ifp);

	/*
	 * Allocate a TLS + ratelimit tag if the connection has an
	 * existing pacing rate.
	 */
	if (tp->t_pacing_rate != -1 &&
	    (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) {
		params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT;
		params.tls_rate_limit.inp = inp;
		params.tls_rate_limit.tls = tls;
		params.tls_rate_limit.max_rate = tp->t_pacing_rate;
	} else {
		params.hdr.type = IF_SND_TAG_TYPE_TLS;
		params.tls.inp = inp;
		params.tls.tls = tls;
	}
	params.hdr.flowid = inp->inp_flowid;
	params.hdr.flowtype = inp->inp_flowtype;
	params.hdr.numa_domain = inp->inp_numa_domain;
	INP_RUNLOCK(inp);

	/* Unmapped (M_EXTPG) mbuf support is required for ifnet TLS. */
	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}
	/*
	 * NOTE(review): inp_vflag is read here after INP_RUNLOCK;
	 * presumably stable once the connection is established --
	 * confirm.
	 */
	if (inp->inp_vflag & INP_IPV6) {
		if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) {
			error = EOPNOTSUPP;
			goto out;
		}
	} else {
		if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) {
			error = EOPNOTSUPP;
			goto out;
		}
	}
	error = m_snd_tag_alloc(ifp, &params, mstp);
out:
	if_rele(ifp);
	return (error);
}
891
892 static int
893 ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force)
894 {
895         struct m_snd_tag *mst;
896         int error;
897
898         error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst);
899         if (error == 0) {
900                 tls->mode = TCP_TLS_MODE_IFNET;
901                 tls->snd_tag = mst;
902                 switch (tls->params.cipher_algorithm) {
903                 case CRYPTO_AES_CBC:
904                         counter_u64_add(ktls_ifnet_cbc, 1);
905                         break;
906                 case CRYPTO_AES_NIST_GCM_16:
907                         counter_u64_add(ktls_ifnet_gcm, 1);
908                         break;
909                 case CRYPTO_CHACHA20_POLY1305:
910                         counter_u64_add(ktls_ifnet_chacha20, 1);
911                         break;
912                 }
913         }
914         return (error);
915 }
916
917 static int
918 ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
919 {
920         int error;
921
922         error = ktls_ocf_try(so, tls, direction);
923         if (error)
924                 return (error);
925         tls->mode = TCP_TLS_MODE_SW;
926         switch (tls->params.cipher_algorithm) {
927         case CRYPTO_AES_CBC:
928                 counter_u64_add(ktls_sw_cbc, 1);
929                 break;
930         case CRYPTO_AES_NIST_GCM_16:
931                 counter_u64_add(ktls_sw_gcm, 1);
932                 break;
933         case CRYPTO_CHACHA20_POLY1305:
934                 counter_u64_add(ktls_sw_chacha20, 1);
935                 break;
936         }
937         return (0);
938 }
939
940 /*
941  * KTLS RX stores data in the socket buffer as a list of TLS records,
 * where each record is stored as a control message containing the TLS
943  * header followed by data mbufs containing the decrypted data.  This
944  * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
945  * both encrypted and decrypted data.  TLS records decrypted by a NIC
946  * should be queued to the socket buffer as records, but encrypted
947  * data which needs to be decrypted by software arrives as a stream of
948  * regular mbufs which need to be converted.  In addition, there may
949  * already be pending encrypted data in the socket buffer when KTLS RX
950  * is enabled.
951  *
952  * To manage not-yet-decrypted data for KTLS RX, the following scheme
953  * is used:
954  *
955  * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
956  *
957  * - ktls_check_rx checks this chain of mbufs reading the TLS header
958  *   from the first mbuf.  Once all of the data for that TLS record is
959  *   queued, the socket is queued to a worker thread.
960  *
961  * - The worker thread calls ktls_decrypt to decrypt TLS records in
962  *   the TLS chain.  Each TLS record is detached from the TLS chain,
 *   decrypted, and inserted into the regular socket buffer chain as a
 *   record starting with a control message holding the TLS header and
 *   a chain of mbufs holding the decrypted data.
966  */
967
968 static void
969 sb_mark_notready(struct sockbuf *sb)
970 {
971         struct mbuf *m;
972
973         m = sb->sb_mb;
974         sb->sb_mtls = m;
975         sb->sb_mb = NULL;
976         sb->sb_mbtail = NULL;
977         sb->sb_lastrecord = NULL;
978         for (; m != NULL; m = m->m_next) {
979                 KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
980                     __func__));
981                 KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
982                     __func__));
983                 KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
984                     __func__));
985                 m->m_flags |= M_NOTREADY;
986                 sb->sb_acc -= m->m_len;
987                 sb->sb_tlscc += m->m_len;
988                 sb->sb_mtlstail = m;
989         }
990         KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
991             ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
992             sb->sb_ccc));
993 }
994
/*
 * Enable KTLS RX offload on 'so' using the parameters in 'en'.
 * Prefers TOE offload, falling back to software decryption.  Any
 * data already queued in the receive buffer is marked not-ready
 * until it can be decrypted (software mode only).
 */
int
ktls_enable_rx(struct socket *so, struct tls_enable *en)
{
	struct ktls_session *tls;
	int error;

	if (!ktls_offload_enable)
		return (ENOTSUP);
	if (SOLISTENING(so))
		return (EINVAL);

	counter_u64_add(ktls_offload_enable_calls, 1);

	/*
	 * This should always be true since only the TCP socket option
	 * invokes this function.
	 */
	if (so->so_proto->pr_protocol != IPPROTO_TCP)
		return (EINVAL);

	/*
	 * XXX: Don't overwrite existing sessions.  We should permit
	 * this to support rekeying in the future.
	 */
	if (so->so_rcv.sb_tls_info != NULL)
		return (EALREADY);

	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
		return (ENOTSUP);

	/* TLS 1.3 is not yet supported. */
	if (en->tls_vmajor == TLS_MAJOR_VER_ONE &&
	    en->tls_vminor == TLS_MINOR_VER_THREE)
		return (ENOTSUP);

	error = ktls_create_session(so, en, &tls);
	if (error)
		return (error);

	/* Prefer TOE; fall back to software decryption. */
#ifdef TCP_OFFLOAD
	error = ktls_try_toe(so, tls, KTLS_RX);
	if (error)
#endif
		error = ktls_try_sw(so, tls, KTLS_RX);

	if (error) {
		ktls_cleanup(tls);
		return (error);
	}

	/* Mark the socket as using TLS offload. */
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
	so->so_rcv.sb_tls_info = tls;
	so->so_rcv.sb_flags |= SB_TLS_RX;

	/*
	 * Mark existing data as not ready until it can be decrypted.
	 * For TOE TLS, existing received mbufs are left untouched.
	 */
	if (tls->mode != TCP_TLS_MODE_TOE) {
		sb_mark_notready(&so->so_rcv);
		ktls_check_rx(&so->so_rcv);
	}
	SOCKBUF_UNLOCK(&so->so_rcv);

	counter_u64_add(ktls_offload_total, 1);

	return (0);
}
1062
/*
 * Enable KTLS TX offload on 'so' using the parameters in 'en'.
 * Offload preference is TOE, then ifnet (NIC) TLS, then software
 * encryption.
 */
int
ktls_enable_tx(struct socket *so, struct tls_enable *en)
{
	struct ktls_session *tls;
	struct inpcb *inp;
	int error;

	if (!ktls_offload_enable)
		return (ENOTSUP);
	if (SOLISTENING(so))
		return (EINVAL);

	counter_u64_add(ktls_offload_enable_calls, 1);

	/*
	 * This should always be true since only the TCP socket option
	 * invokes this function.
	 */
	if (so->so_proto->pr_protocol != IPPROTO_TCP)
		return (EINVAL);

	/*
	 * XXX: Don't overwrite existing sessions.  We should permit
	 * this to support rekeying in the future.
	 */
	if (so->so_snd.sb_tls_info != NULL)
		return (EALREADY);

	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
		return (ENOTSUP);

	/* TLS requires ext pgs */
	if (mb_use_ext_pgs == 0)
		return (ENXIO);

	error = ktls_create_session(so, en, &tls);
	if (error)
		return (error);

	/* Prefer TOE -> ifnet TLS -> software TLS. */
#ifdef TCP_OFFLOAD
	error = ktls_try_toe(so, tls, KTLS_TX);
	if (error)
#endif
		error = ktls_try_ifnet(so, tls, false);
	if (error)
		error = ktls_try_sw(so, tls, KTLS_TX);

	if (error) {
		ktls_cleanup(tls);
		return (error);
	}

	/* Serialize with concurrent writers on the send buffer. */
	error = sblock(&so->so_snd, SBL_WAIT);
	if (error) {
		ktls_cleanup(tls);
		return (error);
	}

	/*
	 * Write lock the INP when setting sb_tls_info so that
	 * routines in tcp_ratelimit.c can read sb_tls_info while
	 * holding the INP lock.
	 */
	inp = so->so_pcb;
	INP_WLOCK(inp);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
	so->so_snd.sb_tls_info = tls;
	if (tls->mode != TCP_TLS_MODE_SW)
		so->so_snd.sb_flags |= SB_TLS_IFNET;
	SOCKBUF_UNLOCK(&so->so_snd);
	INP_WUNLOCK(inp);
	sbunlock(&so->so_snd);

	counter_u64_add(ktls_offload_total, 1);

	return (0);
}
1142
1143 int
1144 ktls_get_rx_mode(struct socket *so)
1145 {
1146         struct ktls_session *tls;
1147         struct inpcb *inp;
1148         int mode;
1149
1150         if (SOLISTENING(so))
1151                 return (EINVAL);
1152         inp = so->so_pcb;
1153         INP_WLOCK_ASSERT(inp);
1154         SOCKBUF_LOCK(&so->so_rcv);
1155         tls = so->so_rcv.sb_tls_info;
1156         if (tls == NULL)
1157                 mode = TCP_TLS_MODE_NONE;
1158         else
1159                 mode = tls->mode;
1160         SOCKBUF_UNLOCK(&so->so_rcv);
1161         return (mode);
1162 }
1163
1164 int
1165 ktls_get_tx_mode(struct socket *so)
1166 {
1167         struct ktls_session *tls;
1168         struct inpcb *inp;
1169         int mode;
1170
1171         if (SOLISTENING(so))
1172                 return (EINVAL);
1173         inp = so->so_pcb;
1174         INP_WLOCK_ASSERT(inp);
1175         SOCKBUF_LOCK(&so->so_snd);
1176         tls = so->so_snd.sb_tls_info;
1177         if (tls == NULL)
1178                 mode = TCP_TLS_MODE_NONE;
1179         else
1180                 mode = tls->mode;
1181         SOCKBUF_UNLOCK(&so->so_snd);
1182         return (mode);
1183 }
1184
1185 /*
1186  * Switch between SW and ifnet TLS sessions as requested.
1187  */
int
ktls_set_tx_mode(struct socket *so, int mode)
{
	struct ktls_session *tls, *tls_new;
	struct inpcb *inp;
	int error;

	if (SOLISTENING(so))
		return (EINVAL);
	/* Only SW <-> ifnet switches are supported. */
	switch (mode) {
	case TCP_TLS_MODE_SW:
	case TCP_TLS_MODE_IFNET:
		break;
	default:
		return (EINVAL);
	}

	inp = so->so_pcb;
	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK(&so->so_snd);
	tls = so->so_snd.sb_tls_info;
	/* No TX session installed: nothing to switch. */
	if (tls == NULL) {
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	/* Already in the requested mode. */
	if (tls->mode == mode) {
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	/*
	 * Hold a reference on the current session and drop both
	 * locks; allocating the replacement session may sleep.  The
	 * caller's INP write lock is reacquired on every return path
	 * below.
	 */
	tls = ktls_hold(tls);
	SOCKBUF_UNLOCK(&so->so_snd);
	INP_WUNLOCK(inp);

	tls_new = ktls_clone_session(tls);

	if (mode == TCP_TLS_MODE_IFNET)
		error = ktls_try_ifnet(so, tls_new, true);
	else
		error = ktls_try_sw(so, tls_new, KTLS_TX);
	if (error) {
		counter_u64_add(ktls_switch_failed, 1);
		ktls_free(tls_new);
		ktls_free(tls);
		INP_WLOCK(inp);
		return (error);
	}

	/* Serialize with concurrent writers on the send buffer. */
	error = sblock(&so->so_snd, SBL_WAIT);
	if (error) {
		counter_u64_add(ktls_switch_failed, 1);
		ktls_free(tls_new);
		ktls_free(tls);
		INP_WLOCK(inp);
		return (error);
	}

	/*
	 * If we raced with another session change, keep the existing
	 * session.
	 */
	if (tls != so->so_snd.sb_tls_info) {
		counter_u64_add(ktls_switch_failed, 1);
		sbunlock(&so->so_snd);
		ktls_free(tls_new);
		ktls_free(tls);
		INP_WLOCK(inp);
		return (EBUSY);
	}

	/* Install the replacement session. */
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_tls_info = tls_new;
	if (tls_new->mode != TCP_TLS_MODE_SW)
		so->so_snd.sb_flags |= SB_TLS_IFNET;
	SOCKBUF_UNLOCK(&so->so_snd);
	sbunlock(&so->so_snd);

	/*
	 * Drop two references on 'tls'.  The first is for the
	 * ktls_hold() above.  The second drops the reference from the
	 * socket buffer.
	 */
	KASSERT(tls->refcount >= 2, ("too few references on old session"));
	ktls_free(tls);
	ktls_free(tls);

	if (mode == TCP_TLS_MODE_IFNET)
		counter_u64_add(ktls_switch_to_ifnet, 1);
	else
		counter_u64_add(ktls_switch_to_sw, 1);

	INP_WLOCK(inp);
	return (0);
}
1283
1284 /*
1285  * Try to allocate a new TLS send tag.  This task is scheduled when
1286  * ip_output detects a route change while trying to transmit a packet
1287  * holding a TLS record.  If a new tag is allocated, replace the tag
1288  * in the TLS session.  Subsequent packets on the connection will use
1289  * the new tag.  If a new tag cannot be allocated, drop the
1290  * connection.
1291  */
static void
ktls_reset_send_tag(void *context, int pending)
{
	struct epoch_tracker et;
	struct ktls_session *tls;
	struct m_snd_tag *old, *new;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	MPASS(pending == 1);

	tls = context;
	inp = tls->inp;

	/*
	 * Free the old tag first before allocating a new one.
	 * ip[6]_output_send() will treat a NULL send tag the same as
	 * an ifp mismatch and drop packets until a new tag is
	 * allocated.
	 *
	 * Write-lock the INP when changing tls->snd_tag since
	 * ip[6]_output_send() holds a read-lock when reading the
	 * pointer.
	 */
	INP_WLOCK(inp);
	old = tls->snd_tag;
	tls->snd_tag = NULL;
	INP_WUNLOCK(inp);
	if (old != NULL)
		m_snd_tag_rele(old);

	error = ktls_alloc_snd_tag(inp, tls, true, &new);

	if (error == 0) {
		/* Publish the new tag and clear the pending flag. */
		INP_WLOCK(inp);
		tls->snd_tag = new;
		mtx_pool_lock(mtxpool_sleep, tls);
		tls->reset_pending = false;
		mtx_pool_unlock(mtxpool_sleep, tls);
		/* Drop the PCB reference taken when the task was queued. */
		if (!in_pcbrele_wlocked(inp))
			INP_WUNLOCK(inp);

		counter_u64_add(ktls_ifnet_reset, 1);

		/*
		 * XXX: Should we kick tcp_output explicitly now that
		 * the send tag is fixed or just rely on timers?
		 */
	} else {
		/* No new tag could be allocated: drop the connection. */
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		if (!in_pcbrele_wlocked(inp)) {
			if (!(inp->inp_flags & INP_TIMEWAIT) &&
			    !(inp->inp_flags & INP_DROPPED)) {
				tp = intotcpcb(inp);
				CURVNET_SET(tp->t_vnet);
				tp = tcp_drop(tp, ECONNABORTED);
				CURVNET_RESTORE();
				/*
				 * A NULL return from tcp_drop() means
				 * the inp was already released, so no
				 * unlock is needed in that case.
				 */
				if (tp != NULL)
					INP_WUNLOCK(inp);
				counter_u64_add(ktls_ifnet_reset_dropped, 1);
			} else
				INP_WUNLOCK(inp);
		}
		NET_EPOCH_EXIT(et);

		counter_u64_add(ktls_ifnet_reset_failed, 1);

		/*
		 * Leave reset_pending true to avoid future tasks while
		 * the socket goes away.
		 */
	}

	/* Drop the session reference taken by ktls_output_eagain(). */
	ktls_free(tls);
}
1369
1370 int
1371 ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
1372 {
1373
1374         if (inp == NULL)
1375                 return (ENOBUFS);
1376
1377         INP_LOCK_ASSERT(inp);
1378
1379         /*
1380          * See if we should schedule a task to update the send tag for
1381          * this session.
1382          */
1383         mtx_pool_lock(mtxpool_sleep, tls);
1384         if (!tls->reset_pending) {
1385                 (void) ktls_hold(tls);
1386                 in_pcbref(inp);
1387                 tls->inp = inp;
1388                 tls->reset_pending = true;
1389                 taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
1390         }
1391         mtx_pool_unlock(mtxpool_sleep, tls);
1392         return (ENOBUFS);
1393 }
1394
1395 #ifdef RATELIMIT
/*
 * Update the maximum pacing rate on an existing ifnet TLS rate-limit
 * send tag.  Returns 0 when a send tag reset is in progress (the
 * change is simply dropped) or the driver's modify result otherwise.
 */
int
ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate)
{
	union if_snd_tag_modify_params params = {
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	struct m_snd_tag *mst;
	struct ifnet *ifp;

	/* Can't get to the inp, but it should be locked. */
	/* INP_LOCK_ASSERT(inp); */

	MPASS(tls->mode == TCP_TLS_MODE_IFNET);

	if (tls->snd_tag == NULL) {
		/*
		 * Resetting send tag, ignore this change.  The
		 * pending reset may or may not see this updated rate
		 * in the tcpcb.  If it doesn't, we will just lose
		 * this rate change.
		 */
		return (0);
	}

	MPASS(tls->snd_tag != NULL);
	MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT);

	/* Hand the new rate to the NIC driver. */
	mst = tls->snd_tag;
	ifp = mst->ifp;
	return (ifp->if_snd_tag_modify(mst, &params));
}
1428 #endif
1429 #endif
1430
/*
 * Tear down a TLS session: scrub and release its key material via
 * ktls_cleanup() and return the session structure to its UMA zone.
 */
void
ktls_destroy(struct ktls_session *tls)
{

	ktls_cleanup(tls);
	uma_zfree(ktls_session_zone, tls);
}
1438
1439 void
1440 ktls_seq(struct sockbuf *sb, struct mbuf *m)
1441 {
1442
1443         for (; m != NULL; m = m->m_next) {
1444                 KASSERT((m->m_flags & M_EXTPG) != 0,
1445                     ("ktls_seq: mapped mbuf %p", m));
1446
1447                 m->m_epg_seqno = sb->sb_tls_seqno;
1448                 sb->sb_tls_seqno++;
1449         }
1450 }
1451
1452 /*
1453  * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
1454  * mbuf in the chain must be an unmapped mbuf.  The payload of the
1455  * mbuf must be populated with the payload of each TLS record.
1456  *
1457  * The record_type argument specifies the TLS record type used when
1458  * populating the TLS header.
1459  *
1460  * The enq_count argument on return is set to the number of pages of
1461  * payload data for this entire chain that need to be encrypted via SW
1462  * encryption.  The returned value should be passed to ktls_enqueue
1463  * when scheduling encryption of this chain of mbufs.  To handle the
1464  * special case of empty fragments for TLS 1.0 sessions, an empty
1465  * fragment counts as one page.
1466  */
void
ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
    uint8_t record_type)
{
	struct tls_record_layer *tlshdr;
	struct mbuf *m;
	uint64_t *noncep;
	uint16_t tls_len;
	int maxlen;

	maxlen = tls->params.max_frame_len;
	*enq_cnt = 0;
	for (m = top; m != NULL; m = m->m_next) {
		/*
		 * All mbufs in the chain should be TLS records whose
		 * payload does not exceed the maximum frame length.
		 *
		 * Empty TLS records are permitted when using CBC.
		 */
		KASSERT(m->m_len <= maxlen &&
		    (tls->params.cipher_algorithm == CRYPTO_AES_CBC ?
		    m->m_len >= 0 : m->m_len > 0),
		    ("ktls_frame: m %p len %d\n", m, m->m_len));

		/*
		 * TLS frames require unmapped mbufs to store session
		 * info.
		 */
		KASSERT((m->m_flags & M_EXTPG) != 0,
		    ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top));

		/* Payload length before header/trailer are added. */
		tls_len = m->m_len;

		/* Save a reference to the session. */
		m->m_epg_tls = ktls_hold(tls);

		m->m_epg_hdrlen = tls->params.tls_hlen;
		m->m_epg_trllen = tls->params.tls_tlen;
		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
			int bs, delta;

			/*
			 * AES-CBC pads messages to a multiple of the
			 * block size.  Note that the padding is
			 * applied after the digest and the encryption
			 * is done on the "plaintext || mac || padding".
			 * At least one byte of padding is always
			 * present.
			 *
			 * Compute the final trailer length assuming
			 * at most one block of padding.
			 * tls->params.tls_tlen is the maximum
			 * possible trailer length (padding + digest).
			 * delta holds the number of excess padding
			 * bytes if the maximum were used.  Those
			 * extra bytes are removed.
			 */
			bs = tls->params.tls_bs;
			delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
			m->m_epg_trllen -= delta;
		}
		/* m_len now covers header + payload + trailer. */
		m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;

		/* Populate the TLS header. */
		tlshdr = (void *)m->m_epg_hdr;
		tlshdr->tls_vmajor = tls->params.tls_vmajor;

		/*
		 * TLS 1.3 masquerades as TLS 1.2 with a record type
		 * of TLS_RLTYPE_APP.
		 */
		if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
		    tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
			tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
			tlshdr->tls_type = TLS_RLTYPE_APP;
			/* save the real record type for later */
			m->m_epg_record_type = record_type;
			m->m_epg_trail[0] = record_type;
		} else {
			tlshdr->tls_vminor = tls->params.tls_vminor;
			tlshdr->tls_type = record_type;
		}
		tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));

		/*
		 * Store nonces / explicit IVs after the end of the
		 * TLS header.
		 *
		 * For GCM with TLS 1.2, an 8 byte nonce is copied
		 * from the end of the IV.  The nonce is then
		 * incremented for use by the next record.
		 *
		 * For CBC, a random nonce is inserted for TLS 1.1+.
		 */
		if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
		    tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
			noncep = (uint64_t *)(tls->params.iv + 8);
			be64enc(tlshdr + 1, *noncep);
			(*noncep)++;
		} else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
		    tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
			arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);

		/*
		 * When using SW encryption, mark the mbuf not ready.
		 * It will be marked ready via sbready() after the
		 * record has been encrypted.
		 *
		 * When using ifnet TLS, unencrypted TLS records are
		 * sent down the stack to the NIC.
		 */
		if (tls->mode == TCP_TLS_MODE_SW) {
			m->m_flags |= M_NOTREADY;
			m->m_epg_nrdy = m->m_epg_npgs;
			if (__predict_false(tls_len == 0)) {
				/* TLS 1.0 empty fragment. */
				*enq_cnt += 1;
			} else
				*enq_cnt += m->m_epg_npgs;
		}
	}
}
1589
/*
 * Examine the chain of not-yet-decrypted mbufs on 'sb' and, once a
 * complete TLS record is queued, hand the socket to a KTLS worker
 * thread for decryption.  Called with the receive buffer locked.
 */
void
ktls_check_rx(struct sockbuf *sb)
{
	struct tls_record_layer hdr;
	struct ktls_wq *wq;
	struct socket *so;
	bool running;

	SOCKBUF_LOCK_ASSERT(sb);
	KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
	    __func__, sb));
	so = __containerof(sb, struct socket, so_rcv);

	/* A worker thread is already processing this socket. */
	if (sb->sb_flags & SB_TLS_RX_RUNNING)
		return;

	/* Is there enough queued for a TLS header? */
	if (sb->sb_tlscc < sizeof(hdr)) {
		/*
		 * A partial record can never complete once the peer
		 * has shut down the connection.
		 */
		if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0)
			so->so_error = EMSGSIZE;
		return;
	}

	m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);

	/* Is the entire record queued? */
	if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
		if ((sb->sb_state & SBS_CANTRCVMORE) != 0)
			so->so_error = EMSGSIZE;
		return;
	}

	sb->sb_flags |= SB_TLS_RX_RUNNING;

	/* Hold a socket reference for the worker thread. */
	soref(so);
	wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index];
	mtx_lock(&wq->mtx);
	STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list);
	running = wq->running;
	mtx_unlock(&wq->mtx);
	if (!running)
		wakeup(wq);
	counter_u64_add(ktls_cnt_rx_queued, 1);
}
1634
/*
 * Detach the leading 'len' bytes (one complete TLS record) from the
 * socket buffer's TLS mbuf chain (sb_mtls) and return them as a
 * standalone chain.  On success the record's bytes move from the
 * "encrypted, queued" count (sb_tlscc) to the "being decrypted" count
 * (sb_tlsdcc).  May drop and reacquire the socket buffer lock if an
 * mbuf allocation must sleep; returns NULL in that case if sb_mtls
 * changed while unlocked, forcing the caller to re-evaluate.
 */
static struct mbuf *
ktls_detach_record(struct sockbuf *sb, int len)
{
        struct mbuf *m, *n, *top;
        int remain;

        SOCKBUF_LOCK_ASSERT(sb);
        MPASS(len <= sb->sb_tlscc);

        /*
         * If TLS chain is the exact size of the record,
         * just grab the whole record.
         */
        top = sb->sb_mtls;
        if (sb->sb_tlscc == len) {
                sb->sb_mtls = NULL;
                sb->sb_mtlstail = NULL;
                goto out;
        }

        /*
         * While it would be nice to use m_split() here, we need
         * to know exactly what m_split() allocates to update the
         * accounting, so do it inline instead.
         */
        remain = len;
        /* Walk to the mbuf 'm' holding the final byte of the record. */
        for (m = top; remain > m->m_len; m = m->m_next)
                remain -= m->m_len;

        /* Easy case: don't have to split 'm'. */
        if (remain == m->m_len) {
                sb->sb_mtls = m->m_next;
                if (sb->sb_mtls == NULL)
                        sb->sb_mtlstail = NULL;
                m->m_next = NULL;
                goto out;
        }

        /*
         * Need to allocate an mbuf to hold the remainder of 'm'.  Try
         * with M_NOWAIT first.
         */
        n = m_get(M_NOWAIT, MT_DATA);
        if (n == NULL) {
                /*
                 * Use M_WAITOK with socket buffer unlocked.  If
                 * 'sb_mtls' changes while the lock is dropped, return
                 * NULL to force the caller to retry.
                 */
                SOCKBUF_UNLOCK(sb);

                n = m_get(M_WAITOK, MT_DATA);

                SOCKBUF_LOCK(sb);
                if (sb->sb_mtls != top) {
                        m_free(n);
                        return (NULL);
                }
        }
        /* Data on the TLS chain is ciphertext, not yet readable. */
        n->m_flags |= M_NOTREADY;

        /* Store remainder in 'n'. */
        n->m_len = m->m_len - remain;
        if (m->m_flags & M_EXT) {
                /* Share 'm's external storage instead of copying. */
                n->m_data = m->m_data + remain;
                mb_dupcl(n, m);
        } else {
                bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
        }

        /* Trim 'm' and update accounting. */
        m->m_len -= n->m_len;
        sb->sb_tlscc -= n->m_len;
        sb->sb_ccc -= n->m_len;

        /* Account for 'n'. */
        sballoc_ktls_rx(sb, n);

        /* Insert 'n' into the TLS chain. */
        sb->sb_mtls = n;
        n->m_next = m->m_next;
        if (sb->sb_mtlstail == m)
                sb->sb_mtlstail = n;

        /* Detach the record from the TLS chain. */
        m->m_next = NULL;

out:
        /*
         * Re-account the detached bytes: remove them from sb_tlscc via
         * sbfree_ktls_rx() and track them in sb_tlsdcc/sb_ccc so that
         * sbcut/drop/flush still see them while decryption runs.
         */
        MPASS(m_length(top, NULL) == len);
        for (m = top; m != NULL; m = m->m_next)
                sbfree_ktls_rx(sb, m);
        sb->sb_tlsdcc = len;
        sb->sb_ccc += len;
        SBCHECK(sb);
        return (top);
}
1731
/*
 * Worker-thread entry to decrypt all complete TLS records queued on a
 * socket's receive buffer.  For each record: validate the header,
 * detach the record from the TLS chain, decrypt it with the socket
 * buffer lock dropped, then append the plaintext with a TLS_GET_RECORD
 * control message.  Because the lock is dropped around the crypto and
 * control-mbuf allocation, sb_tlsdcc is rechecked after every
 * reacquisition to detect a concurrent sbcut/drop/flush.  Drops the
 * caller's socket reference (taken in ktls_check_rx) before returning.
 */
static void
ktls_decrypt(struct socket *so)
{
        char tls_header[MBUF_PEXT_HDR_LEN];
        struct ktls_session *tls;
        struct sockbuf *sb;
        struct tls_record_layer *hdr;
        struct tls_get_record tgr;
        struct mbuf *control, *data, *m;
        uint64_t seqno;
        int error, remain, tls_len, trail_len;

        /* 'hdr' aliases tls_header; filled by m_copydata() below. */
        hdr = (struct tls_record_layer *)tls_header;
        sb = &so->so_rcv;
        SOCKBUF_LOCK(sb);
        KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
            ("%s: socket %p not running", __func__, so));

        tls = sb->sb_tls_info;
        MPASS(tls != NULL);

        for (;;) {
                /* Is there enough queued for a TLS header? */
                if (sb->sb_tlscc < tls->params.tls_hlen)
                        break;

                m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
                tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);

                /* Sanity-check the record's version and length. */
                if (hdr->tls_vmajor != tls->params.tls_vmajor ||
                    hdr->tls_vminor != tls->params.tls_vminor)
                        error = EINVAL;
                else if (tls_len < tls->params.tls_hlen || tls_len >
                    tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
                    tls->params.tls_tlen)
                        error = EMSGSIZE;
                else
                        error = 0;
                if (__predict_false(error != 0)) {
                        /*
                         * We have a corrupted record and are likely
                         * out of sync.  The connection isn't
                         * recoverable at this point, so abort it.
                         */
                        SOCKBUF_UNLOCK(sb);
                        counter_u64_add(ktls_offload_corrupted_records, 1);

                        CURVNET_SET(so->so_vnet);
                        so->so_proto->pr_usrreqs->pru_abort(so);
                        so->so_error = error;
                        CURVNET_RESTORE();
                        goto deref;
                }

                /* Is the entire record queued? */
                if (sb->sb_tlscc < tls_len)
                        break;

                /*
                 * Split out the portion of the mbuf chain containing
                 * this TLS record.
                 */
                data = ktls_detach_record(sb, tls_len);
                if (data == NULL)
                        continue;       /* sb_mtls changed; re-evaluate. */
                MPASS(sb->sb_tlsdcc == tls_len);

                seqno = sb->sb_tls_seqno;
                sb->sb_tls_seqno++;
                SBCHECK(sb);
                SOCKBUF_UNLOCK(sb);

                /* Decrypt in place with the socket buffer unlocked. */
                error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len);
                if (error) {
                        counter_u64_add(ktls_offload_failed_crypto, 1);

                        SOCKBUF_LOCK(sb);
                        if (sb->sb_tlsdcc == 0) {
                                /*
                                 * sbcut/drop/flush discarded these
                                 * mbufs.
                                 */
                                m_freem(data);
                                break;
                        }

                        /*
                         * Drop this TLS record's data, but keep
                         * decrypting subsequent records.
                         */
                        sb->sb_ccc -= tls_len;
                        sb->sb_tlsdcc = 0;

                        CURVNET_SET(so->so_vnet);
                        so->so_error = EBADMSG;
                        sorwakeup_locked(so);
                        CURVNET_RESTORE();

                        m_freem(data);

                        SOCKBUF_LOCK(sb);
                        continue;
                }

                /* Allocate the control mbuf. */
                tgr.tls_type = hdr->tls_type;
                tgr.tls_vmajor = hdr->tls_vmajor;
                tgr.tls_vminor = hdr->tls_vminor;
                tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
                    trail_len);
                control = sbcreatecontrol_how(&tgr, sizeof(tgr),
                    TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);

                SOCKBUF_LOCK(sb);
                if (sb->sb_tlsdcc == 0) {
                        /* sbcut/drop/flush discarded these mbufs. */
                        MPASS(sb->sb_tlscc == 0);
                        m_freem(data);
                        m_freem(control);
                        break;
                }

                /*
                 * Clear the 'dcc' accounting in preparation for
                 * adding the decrypted record.
                 */
                sb->sb_ccc -= tls_len;
                sb->sb_tlsdcc = 0;
                SBCHECK(sb);

                /* If there is no payload, drop all of the data. */
                if (tgr.tls_length == htobe16(0)) {
                        m_freem(data);
                        data = NULL;
                } else {
                        /* Trim header. */
                        remain = tls->params.tls_hlen;
                        while (remain > 0) {
                                if (data->m_len > remain) {
                                        data->m_data += remain;
                                        data->m_len -= remain;
                                        break;
                                }
                                remain -= data->m_len;
                                data = m_free(data);
                        }

                        /* Trim trailer and clear M_NOTREADY. */
                        remain = be16toh(tgr.tls_length);
                        /* NOTE(review): redundant; the for-init below repeats it. */
                        m = data;
                        for (m = data; remain > m->m_len; m = m->m_next) {
                                m->m_flags &= ~M_NOTREADY;
                                remain -= m->m_len;
                        }
                        m->m_len = remain;
                        m_freem(m->m_next);
                        m->m_next = NULL;
                        m->m_flags &= ~M_NOTREADY;

                        /* Set EOR on the final mbuf. */
                        m->m_flags |= M_EOR;
                }

                sbappendcontrol_locked(sb, data, control, 0);
        }

        sb->sb_flags &= ~SB_TLS_RX_RUNNING;

        /* A partial record with no more data coming can never complete. */
        if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
                so->so_error = EMSGSIZE;

        sorwakeup_locked(so);

deref:
        SOCKBUF_UNLOCK_ASSERT(sb);

        /* Release the reference taken when this socket was queued. */
        CURVNET_SET(so->so_vnet);
        SOCK_LOCK(so);
        sorele(so);
        CURVNET_RESTORE();
}
1913
1914 void
1915 ktls_enqueue_to_free(struct mbuf *m)
1916 {
1917         struct ktls_wq *wq;
1918         bool running;
1919
1920         /* Mark it for freeing. */
1921         m->m_epg_flags |= EPG_FLAG_2FREE;
1922         wq = &ktls_wq[m->m_epg_tls->wq_index];
1923         mtx_lock(&wq->mtx);
1924         STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
1925         running = wq->running;
1926         mtx_unlock(&wq->mtx);
1927         if (!running)
1928                 wakeup(wq);
1929 }
1930
1931 static void *
1932 ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
1933 {
1934         void *buf;
1935
1936         if (m->m_epg_npgs <= 2)
1937                 return (NULL);
1938         if (ktls_buffer_zone == NULL)
1939                 return (NULL);
1940         if ((u_int)(ticks - wq->lastallocfail) < hz) {
1941                 /*
1942                  * Rate-limit allocation attempts after a failure.
1943                  * ktls_buffer_import() will acquire a per-domain mutex to check
1944                  * the free page queues and may fail consistently if memory is
1945                  * fragmented.
1946                  */
1947                 return (NULL);
1948         }
1949         buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
1950         if (buf == NULL)
1951                 wq->lastallocfail = ticks;
1952         return (buf);
1953 }
1954
1955 void
1956 ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
1957 {
1958         struct ktls_wq *wq;
1959         bool running;
1960
1961         KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
1962             (M_EXTPG | M_NOTREADY)),
1963             ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
1964         KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));
1965
1966         KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
1967
1968         m->m_epg_enc_cnt = page_count;
1969
1970         /*
1971          * Save a pointer to the socket.  The caller is responsible
1972          * for taking an additional reference via soref().
1973          */
1974         m->m_epg_so = so;
1975
1976         wq = &ktls_wq[m->m_epg_tls->wq_index];
1977         mtx_lock(&wq->mtx);
1978         STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
1979         running = wq->running;
1980         mtx_unlock(&wq->mtx);
1981         if (!running)
1982                 wakeup(wq);
1983         counter_u64_add(ktls_cnt_tx_queued, 1);
1984 }
1985
1986 #define MAX_TLS_PAGES   (1 + btoc(TLS_MAX_MSG_SIZE_V10_2))
1987
/*
 * Worker-thread entry to encrypt the chain of TLS-record mbufs
 * starting at 'top'.  Anonymous (EPG_FLAG_ANON) mbufs are encrypted in
 * place; file-backed mbufs (e.g. from sendfile) get freshly allocated
 * destination pages — either one contiguous buffer from the ktls
 * buffer zone or individual wired pages — which then replace the
 * originals.  On success the records are marked ready via pru_ready();
 * on crypto failure the connection is aborted.  Drops the socket
 * reference the enqueuer took.
 */
static __noinline void
ktls_encrypt(struct ktls_wq *wq, struct mbuf *top)
{
        struct ktls_session *tls;
        struct socket *so;
        struct mbuf *m;
        vm_paddr_t parray[MAX_TLS_PAGES + 1];
        struct iovec dst_iov[MAX_TLS_PAGES + 2];
        vm_page_t pg;
        void *cbuf;
        int error, i, len, npages, off, total_pages;

        so = top->m_epg_so;
        tls = top->m_epg_tls;
        KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
        KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
#ifdef INVARIANTS
        top->m_epg_so = NULL;
#endif
        total_pages = top->m_epg_enc_cnt;
        npages = 0;

        /*
         * Encrypt the TLS records in the chain of mbufs starting with
         * 'top'.  'total_pages' gives us a total count of pages and is
         * used to know when we have finished encrypting the TLS
         * records originally queued with 'top'.
         *
         * NB: These mbufs are queued in the socket buffer and
         * 'm_next' is traversing the mbufs in the socket buffer.  The
         * socket buffer lock is not held while traversing this chain.
         * Since the mbufs are all marked M_NOTREADY their 'm_next'
         * pointers should be stable.  However, the 'm_next' of the
         * last mbuf encrypted is not necessarily NULL.  It can point
         * to other mbufs appended while 'top' was on the TLS work
         * queue.
         *
         * Each mbuf holds an entire TLS record.
         */
        error = 0;
        for (m = top; npages != total_pages; m = m->m_next) {
                KASSERT(m->m_epg_tls == tls,
                    ("different TLS sessions in a single mbuf chain: %p vs %p",
                    tls, m->m_epg_tls));
                KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
                    (M_EXTPG | M_NOTREADY),
                    ("%p not unready & nomap mbuf (top = %p)\n", m, top));
                KASSERT(npages + m->m_epg_npgs <= total_pages,
                    ("page count mismatch: top %p, total_pages %d, m %p", top,
                    total_pages, m));
                KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen,
                    ("page count %d larger than maximum frame length %d",
                    m->m_epg_npgs, ktls_maxlen));

                /*
                 * For anonymous mbufs, encryption is done in place.
                 * For file-backed mbufs (from sendfile), anonymous
                 * wired pages are allocated and used as the
                 * encryption destination.
                 */
                if ((m->m_epg_flags & EPG_FLAG_ANON) != 0) {
                        error = (*tls->sw_encrypt)(tls, m, NULL, 0);
                } else {
                        /* Prefer one contiguous buffer for the whole record. */
                        if ((cbuf = ktls_buffer_alloc(wq, m)) != NULL) {
                                len = ptoa(m->m_epg_npgs - 1) +
                                    m->m_epg_last_len - m->m_epg_1st_off;
                                dst_iov[0].iov_base = (char *)cbuf +
                                    m->m_epg_1st_off;
                                dst_iov[0].iov_len = len;
                                parray[0] = DMAP_TO_PHYS((vm_offset_t)cbuf);
                                i = 1;
                        } else {
                                /* Fall back to one wired page per source page. */
                                off = m->m_epg_1st_off;
                                for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
                                        /* Loop until a page is available. */
                                        do {
                                                pg = vm_page_alloc(NULL, 0,
                                                    VM_ALLOC_NORMAL |
                                                    VM_ALLOC_NOOBJ |
                                                    VM_ALLOC_NODUMP |
                                                    VM_ALLOC_WIRED |
                                                    VM_ALLOC_WAITFAIL);
                                        } while (pg == NULL);

                                        len = m_epg_pagelen(m, i, off);
                                        parray[i] = VM_PAGE_TO_PHYS(pg);
                                        dst_iov[i].iov_base =
                                            (char *)(void *)PHYS_TO_DMAP(
                                            parray[i]) + off;
                                        dst_iov[i].iov_len = len;
                                }
                        }
                        KASSERT(i + 1 <= nitems(dst_iov),
                            ("dst_iov is too small"));
                        /* The trailer (MAC/tag) goes in its own iovec. */
                        dst_iov[i].iov_base = m->m_epg_trail;
                        dst_iov[i].iov_len = m->m_epg_trllen;

                        error = (*tls->sw_encrypt)(tls, m, dst_iov, i + 1);

                        /* Free the old pages. */
                        m->m_ext.ext_free(m);

                        /* Replace them with the new pages. */
                        if (cbuf != NULL) {
                                for (i = 0; i < m->m_epg_npgs; i++)
                                        m->m_epg_pa[i] = parray[0] + ptoa(i);

                                /* Contig pages should go back to the cache. */
                                m->m_ext.ext_free = ktls_free_mext_contig;
                        } else {
                                for (i = 0; i < m->m_epg_npgs; i++)
                                        m->m_epg_pa[i] = parray[i];

                                /* Use the basic free routine. */
                                m->m_ext.ext_free = mb_free_mext_pgs;
                        }

                        /* Pages are now writable. */
                        m->m_epg_flags |= EPG_FLAG_ANON;
                }
                if (error) {
                        counter_u64_add(ktls_offload_failed_crypto, 1);
                        break;
                }

                if (__predict_false(m->m_epg_npgs == 0)) {
                        /* TLS 1.0 empty fragment. */
                        npages++;
                } else
                        npages += m->m_epg_npgs;

                /*
                 * Drop a reference to the session now that it is no
                 * longer needed.  Existing code depends on encrypted
                 * records having no associated session vs
                 * yet-to-be-encrypted records having an associated
                 * session.
                 */
                m->m_epg_tls = NULL;
                ktls_free(tls);
        }

        CURVNET_SET(so->so_vnet);
        if (error == 0) {
                /* Mark all encrypted pages ready for transmission. */
                (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages);
        } else {
                /* Crypto failed: abort the connection, free all pages. */
                so->so_proto->pr_usrreqs->pru_abort(so);
                so->so_error = EIO;
                mb_free_notready(top, total_pages);
        }

        /* Release the reference taken by the enqueuer. */
        SOCK_LOCK(so);
        sorele(so);
        CURVNET_RESTORE();
}
2142
/*
 * Main loop for a kTLS worker thread.  Sleeps until work is queued,
 * then drains both queues under the lock into thread-local lists and
 * processes them unlocked: mbufs are either freed (EPG_FLAG_2FREE) or
 * encrypted; sockets have their queued receive records decrypted.
 * Never returns.
 */
static void
ktls_work_thread(void *ctx)
{
        struct ktls_wq *wq = ctx;
        struct mbuf *m, *n;
        struct socket *so, *son;
        STAILQ_HEAD(, mbuf) local_m_head;
        STAILQ_HEAD(, socket) local_so_head;

        /* Prefer memory from this thread's NUMA domain when binding. */
        if (ktls_bind_threads > 1) {
                curthread->td_domain.dr_policy =
                        DOMAINSET_PREF(PCPU_GET(domain));
        }
#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
        /* Allow FPU/SIMD use in this kernel thread for crypto routines. */
        fpu_kern_thread(0);
#endif
        for (;;) {
                /*
                 * Sleep until work arrives.  'running' is cleared
                 * before sleeping so enqueuers know to wakeup().
                 */
                mtx_lock(&wq->mtx);
                while (STAILQ_EMPTY(&wq->m_head) &&
                    STAILQ_EMPTY(&wq->so_head)) {
                        wq->running = false;
                        mtx_sleep(wq, &wq->mtx, 0, "-", 0);
                        wq->running = true;
                }

                /* Grab all pending work in one shot, then drop the lock. */
                STAILQ_INIT(&local_m_head);
                STAILQ_CONCAT(&local_m_head, &wq->m_head);
                STAILQ_INIT(&local_so_head);
                STAILQ_CONCAT(&local_so_head, &wq->so_head);
                mtx_unlock(&wq->mtx);

                STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) {
                        if (m->m_epg_flags & EPG_FLAG_2FREE) {
                                ktls_free(m->m_epg_tls);
                                uma_zfree(zone_mbuf, m);
                        } else {
                                ktls_encrypt(wq, m);
                                counter_u64_add(ktls_cnt_tx_queued, -1);
                        }
                }

                STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
                        ktls_decrypt(so);
                        counter_u64_add(ktls_cnt_rx_queued, -1);
                }
        }
}