/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2014-2019 Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/domainset.h>
#include <sys/ktls.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/rmlock.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/refcount.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/kthread.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
#include <machine/pcb.h>
#endif
#include <machine/vmparam.h>
#include <net/if.h>
#include <net/if_var.h>
#ifdef RSS
#include <net/netisr.h>
#include <net/rss_config.h>
#endif
#include <net/route.h>
#include <net/route/nhop.h>
#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#endif
#include <netinet/tcp_var.h>
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
#include <opencrypto/xform.h>
#include <vm/uma_dbg.h>
#include <vm/vm.h>
#include <vm/vm_pageout.h>
#include <vm/vm_page.h>

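/*
 * Per-CPU work queues feeding the software crypto worker threads.
 * The cache-line alignment presumably keeps each CPU's queue state
 * from false sharing with its neighbors.
 */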
struct ktls_wq {
	struct mtx	mtx;
	STAILQ_HEAD(, mbuf) m_head;
	STAILQ_HEAD(, socket) so_head;
	bool		running;
	int		lastallocfail;
} __aligned(CACHE_LINE_SIZE);

struct ktls_domain_info {
	int count;
	int cpu[MAXCPU];
};

struct ktls_domain_info ktls_domains[MAXMEMDOM];
static struct ktls_wq *ktls_wq;
static struct proc *ktls_proc;
static uma_zone_t ktls_session_zone;
static uma_zone_t ktls_buffer_zone;
static uint16_t ktls_cpuid_lookup[MAXCPU];

SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Kernel TLS offload");
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Kernel TLS offload stats");

#ifdef RSS
static int ktls_bind_threads = 1;
#else
static int ktls_bind_threads;
#endif
SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
    &ktls_bind_threads, 0,
    "Bind crypto threads to cores (1) or cores and domains (2) at boot");

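/*
 * ktls_maxlen must remain a multiple of PAGE_SIZE;
 * ktls_buffer_import() below asserts this when it carves out
 * physically contiguous buffers for software encryption.
 */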
static u_int ktls_maxlen = 16384;
SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN,
    &ktls_maxlen, 0, "Maximum TLS record size");

static int ktls_number_threads;
SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
    &ktls_number_threads, 0,
    "Number of TLS threads in thread-pool");

static bool ktls_offload_enable;
SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
    &ktls_offload_enable, 0,
    "Enable support for kernel TLS offload");

static bool ktls_cbc_enable = true;
SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN,
    &ktls_cbc_enable, 1,
    "Enable Support of AES-CBC crypto for kernel TLS");

static bool ktls_sw_buffer_cache = true;
SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN,
    &ktls_sw_buffer_cache, 1,
    "Enable caching of output buffers for SW encryption");

static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
    &ktls_tasks_active, "Number of active tasks");

static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
    &ktls_cnt_tx_queued,
    "Number of TLS records in queue to tasks for SW encryption");

static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
    &ktls_cnt_rx_queued,
    "Number of TLS sockets in queue to tasks for SW decryption");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_total);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
    CTLFLAG_RD, &ktls_offload_total,
    "Total successful TLS setups (parameters set)");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
    CTLFLAG_RD, &ktls_offload_enable_calls,
    "Total number of TLS enable calls made");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_active);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
    &ktls_offload_active, "Total Active TLS sessions");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
    &ktls_offload_corrupted_records, "Total corrupted TLS records received");

static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
    &ktls_offload_failed_crypto, "Total TLS crypto failures");

static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
    &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");

static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
    &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");

static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
    &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");

SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Software TLS session stats");
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "Hardware (ifnet) TLS session stats");
#ifdef TCP_OFFLOAD
SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "TOE TLS session stats");
#endif

static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc);
SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
    "Active number of software TLS sessions using AES-CBC");

static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm);
SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
    "Active number of software TLS sessions using AES-GCM");

static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20);
SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD,
    &ktls_sw_chacha20,
    "Active number of software TLS sessions using Chacha20-Poly1305");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
    &ktls_ifnet_cbc,
    "Active number of ifnet TLS sessions using AES-CBC");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
    &ktls_ifnet_gcm,
    "Active number of ifnet TLS sessions using AES-GCM");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD,
    &ktls_ifnet_chacha20,
    "Active number of ifnet TLS sessions using Chacha20-Poly1305");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
    &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
    &ktls_ifnet_reset_dropped,
    "TLS sessions dropped after failing to update ifnet send tag");

static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed);
SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
    &ktls_ifnet_reset_failed,
    "TLS sessions that failed to allocate a new ifnet send tag");

static int ktls_ifnet_permitted;
SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
    &ktls_ifnet_permitted, 1,
    "Whether to permit hardware (ifnet) TLS sessions");

#ifdef TCP_OFFLOAD
static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc);
SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
    &ktls_toe_cbc,
    "Active number of TOE TLS sessions using AES-CBC");

static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm);
SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
    &ktls_toe_gcm,
    "Active number of TOE TLS sessions using AES-GCM");

static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20);
SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD,
    &ktls_toe_chacha20,
    "Active number of TOE TLS sessions using Chacha20-Poly1305");
#endif

static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");

static void ktls_cleanup(struct ktls_session *tls);
#if defined(INET) || defined(INET6)
static void ktls_reset_send_tag(void *context, int pending);
#endif
static void ktls_work_thread(void *ctx);

#if defined(INET) || defined(INET6)
static u_int
ktls_get_cpu(struct socket *so)
{
	struct inpcb *inp;
#ifdef NUMA
	struct ktls_domain_info *di;
#endif
	u_int cpuid;

	inp = sotoinpcb(so);
#ifdef RSS
	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
	if (cpuid != NETISR_CPUID_NONE)
		return (cpuid);
#endif
	/*
	 * Just use the flowid to shard connections in a repeatable
	 * fashion.  Note that TLS 1.0 sessions rely on the
	 * serialization provided by having the same connection use
	 * the same queue.
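	 *
	 * For example (numbers illustrative): with 8 worker threads,
	 * a connection whose flowid is 74565 always maps to
	 * ktls_cpuid_lookup[74565 % 8] = ktls_cpuid_lookup[5].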
	 */
#ifdef NUMA
	if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) {
		di = &ktls_domains[inp->inp_numa_domain];
		cpuid = di->cpu[inp->inp_flowid % di->count];
	} else
#endif
		cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads];
	return (cpuid);
}
#endif

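/*
 * Import routine for the UMA cache zone backing software-encryption
 * output buffers: each buffer is ktls_maxlen bytes of wired,
 * physically contiguous memory, handed out by its direct-map address
 * so no per-buffer kernel VA allocation is needed.
 */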
static int
ktls_buffer_import(void *arg, void **store, int count, int domain, int flags)
{
	vm_page_t m;
	int i;

	KASSERT((ktls_maxlen & PAGE_MASK) == 0,
	    ("%s: ktls max length %d is not page size-aligned",
	    __func__, ktls_maxlen));

	for (i = 0; i < count; i++) {
		m = vm_page_alloc_contig_domain(NULL, 0, domain,
		    VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
		    VM_ALLOC_NODUMP | malloc2vm_flags(flags),
		    atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0,
		    VM_MEMATTR_DEFAULT);
		if (m == NULL)
			break;
		store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
	}
	return (i);
}

static void
ktls_buffer_release(void *arg __unused, void **store, int count)
{
	vm_page_t m;
	int i, j;

	for (i = 0; i < count; i++) {
		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
		for (j = 0; j < atop(ktls_maxlen); j++) {
			(void)vm_page_unwire_noq(m + j);
			vm_page_free(m + j);
		}
	}
}

static void
ktls_free_mext_contig(struct mbuf *m)
{
	M_ASSERTEXTPG(m);
	uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0]));
}

static void
ktls_init(void *dummy __unused)
{
	struct thread *td;
	struct pcpu *pc;
	cpuset_t mask;
	int count, domain, error, i;

	ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
	    M_WAITOK | M_ZERO);

	ktls_session_zone = uma_zcreate("ktls_session",
	    sizeof(struct ktls_session),
	    NULL, NULL, NULL, NULL,
	    UMA_ALIGN_CACHE, 0);

	if (ktls_sw_buffer_cache) {
		ktls_buffer_zone = uma_zcache_create("ktls_buffers",
		    roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL,
		    ktls_buffer_import, ktls_buffer_release, NULL,
		    UMA_ZONE_FIRSTTOUCH);
	}

	/*
	 * Initialize the workqueues to run the TLS work.  We create a
	 * work queue for each CPU.
	 */
	CPU_FOREACH(i) {
		STAILQ_INIT(&ktls_wq[i].m_head);
		STAILQ_INIT(&ktls_wq[i].so_head);
		mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
		if (error)
			panic("Can't add KTLS thread %d error %d", i, error);

		/*
		 * Bind threads to cores.  If ktls_bind_threads is >
		 * 1, then we bind to the NUMA domain.
		 */
		if (ktls_bind_threads) {
			if (ktls_bind_threads > 1) {
				pc = pcpu_find(i);
				domain = pc->pc_domain;
				CPU_COPY(&cpuset_domain[domain], &mask);
				count = ktls_domains[domain].count;
				ktls_domains[domain].cpu[count] = i;
				ktls_domains[domain].count++;
			} else {
				CPU_SETOF(i, &mask);
			}
			error = cpuset_setthread(td->td_tid, &mask);
			if (error)
				panic(
			    "Unable to bind KTLS thread for CPU %d error %d",
				     i, error);
		}
		ktls_cpuid_lookup[ktls_number_threads] = i;
		ktls_number_threads++;
	}

	/*
	 * If we somehow have an empty domain, fall back to choosing
	 * among all KTLS threads.
	 */
	if (ktls_bind_threads > 1) {
		for (i = 0; i < vm_ndomains; i++) {
			if (ktls_domains[i].count == 0) {
				ktls_bind_threads = 1;
				break;
			}
		}
	}

	if (bootverbose)
		printf("KTLS: Initialized %d threads\n", ktls_number_threads);
}
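/*
 * Run at SI_SUB_SMP + 1 so that all CPUs are online by the time the
 * worker threads are created and bound to cores and domains.
 */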
SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL);

#if defined(INET) || defined(INET6)
static int
ktls_create_session(struct socket *so, struct tls_enable *en,
    struct ktls_session **tlsp)
{
	struct ktls_session *tls;
	int error;

	/* Only TLS 1.0 - 1.3 are supported. */
	if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
		return (EINVAL);
	if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
	    en->tls_vminor > TLS_MINOR_VER_THREE)
		return (EINVAL);

	if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
		return (EINVAL);
	if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
		return (EINVAL);
	if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
		return (EINVAL);

	/* All supported algorithms require a cipher key. */
	if (en->cipher_key_len == 0)
		return (EINVAL);

	/* No flags are currently supported. */
	if (en->flags != 0)
		return (EINVAL);

	/* Common checks for supported algorithms. */
	switch (en->cipher_algorithm) {
	case CRYPTO_AES_NIST_GCM_16:
		/*
		 * auth_algorithm isn't used, but permit GMAC values
		 * for compatibility.
		 */
		switch (en->auth_algorithm) {
		case 0:
#ifdef COMPAT_FREEBSD12
		/* XXX: Really 13.0-current COMPAT. */
		case CRYPTO_AES_128_NIST_GMAC:
		case CRYPTO_AES_192_NIST_GMAC:
		case CRYPTO_AES_256_NIST_GMAC:
#endif
			break;
		default:
			return (EINVAL);
		}
		if (en->auth_key_len != 0)
			return (EINVAL);
		if ((en->tls_vminor == TLS_MINOR_VER_TWO &&
			en->iv_len != TLS_AEAD_GCM_LEN) ||
		    (en->tls_vminor == TLS_MINOR_VER_THREE &&
			en->iv_len != TLS_1_3_GCM_IV_LEN))
			return (EINVAL);
		break;
	case CRYPTO_AES_CBC:
		switch (en->auth_algorithm) {
		case CRYPTO_SHA1_HMAC:
			/*
			 * TLS 1.0 requires an implicit IV.  TLS 1.1+
			 * all use explicit IVs.
			 */
			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
				if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
					return (EINVAL);
				break;
			}

			/* FALLTHROUGH */
		case CRYPTO_SHA2_256_HMAC:
		case CRYPTO_SHA2_384_HMAC:
			/* Ignore any supplied IV. */
			en->iv_len = 0;
			break;
		default:
			return (EINVAL);
		}
		if (en->auth_key_len == 0)
			return (EINVAL);
		break;
	case CRYPTO_CHACHA20_POLY1305:
		if (en->auth_algorithm != 0 || en->auth_key_len != 0)
			return (EINVAL);
		if (en->tls_vminor != TLS_MINOR_VER_TWO &&
		    en->tls_vminor != TLS_MINOR_VER_THREE)
			return (EINVAL);
		if (en->iv_len != TLS_CHACHA20_IV_LEN)
			return (EINVAL);
		break;
	default:
		return (EINVAL);
	}

	tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);

	counter_u64_add(ktls_offload_active, 1);

	refcount_init(&tls->refcount, 1);
	TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);

	tls->wq_index = ktls_get_cpu(so);

	tls->params.cipher_algorithm = en->cipher_algorithm;
	tls->params.auth_algorithm = en->auth_algorithm;
	tls->params.tls_vmajor = en->tls_vmajor;
	tls->params.tls_vminor = en->tls_vminor;
	tls->params.flags = en->flags;
	tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);

	/* Set the header and trailer lengths. */
	tls->params.tls_hlen = sizeof(struct tls_record_layer);
	switch (en->cipher_algorithm) {
	case CRYPTO_AES_NIST_GCM_16:
		/*
		 * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
		 * nonce.  TLS 1.3 uses a 12 byte implicit IV.
		 */
		if (en->tls_vminor < TLS_MINOR_VER_THREE)
			tls->params.tls_hlen += sizeof(uint64_t);
		tls->params.tls_tlen = AES_GMAC_HASH_LEN;
		tls->params.tls_bs = 1;
		break;
	case CRYPTO_AES_CBC:
		switch (en->auth_algorithm) {
		case CRYPTO_SHA1_HMAC:
			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
				/* Implicit IV, no nonce. */
			} else {
				tls->params.tls_hlen += AES_BLOCK_LEN;
			}
			tls->params.tls_tlen = AES_BLOCK_LEN +
			    SHA1_HASH_LEN;
			break;
		case CRYPTO_SHA2_256_HMAC:
			tls->params.tls_hlen += AES_BLOCK_LEN;
			tls->params.tls_tlen = AES_BLOCK_LEN +
			    SHA2_256_HASH_LEN;
			break;
		case CRYPTO_SHA2_384_HMAC:
			tls->params.tls_hlen += AES_BLOCK_LEN;
			tls->params.tls_tlen = AES_BLOCK_LEN +
			    SHA2_384_HASH_LEN;
			break;
		default:
			panic("invalid hmac");
		}
		tls->params.tls_bs = AES_BLOCK_LEN;
		break;
	case CRYPTO_CHACHA20_POLY1305:
		/*
		 * Chacha20 uses a 12 byte implicit IV.
		 */
		tls->params.tls_tlen = POLY1305_HASH_LEN;
		tls->params.tls_bs = 1;
		break;
	default:
		panic("invalid cipher");
	}

	/*
	 * TLS 1.3 includes optional padding which we do not support,
	 * and also puts the "real" record type at the end of the
	 * encrypted data.
	 */
	if (en->tls_vminor == TLS_MINOR_VER_THREE)
		tls->params.tls_tlen += sizeof(uint8_t);

	KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
	    ("TLS header length too long: %d", tls->params.tls_hlen));
	KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
	    ("TLS trailer length too long: %d", tls->params.tls_tlen));

	if (en->auth_key_len != 0) {
		tls->params.auth_key_len = en->auth_key_len;
		tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
		    M_WAITOK);
		error = copyin(en->auth_key, tls->params.auth_key,
		    en->auth_key_len);
		if (error)
			goto out;
	}

	tls->params.cipher_key_len = en->cipher_key_len;
	tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
	error = copyin(en->cipher_key, tls->params.cipher_key,
	    en->cipher_key_len);
	if (error)
		goto out;

	/*
	 * This holds the implicit portion of the nonce for AEAD
	 * ciphers and the initial implicit IV for TLS 1.0.  The
	 * explicit portions of the IV are generated in ktls_frame().
	 */
	if (en->iv_len != 0) {
		tls->params.iv_len = en->iv_len;
		error = copyin(en->iv, tls->params.iv, en->iv_len);
		if (error)
			goto out;

		/*
		 * For TLS 1.2 with GCM, generate an 8-byte nonce as a
		 * counter to generate unique explicit IVs.
		 *
		 * Store this counter in the last 8 bytes of the IV
		 * array so that it is 8-byte aligned.
		 */
		if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
		    en->tls_vminor == TLS_MINOR_VER_TWO)
			arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
	}

	*tlsp = tls;
	return (0);

out:
	ktls_cleanup(tls);
	return (error);
}

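/*
 * Duplicate a session for a mode switch: the parameters are copied
 * and the key material is deep-copied so that the new session owns
 * its own allocations.
 */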
static struct ktls_session *
ktls_clone_session(struct ktls_session *tls)
{
	struct ktls_session *tls_new;

	tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);

	counter_u64_add(ktls_offload_active, 1);

	refcount_init(&tls_new->refcount, 1);

	/* Copy fields from existing session. */
	tls_new->params = tls->params;
	tls_new->wq_index = tls->wq_index;

	/* Deep copy keys. */
	if (tls_new->params.auth_key != NULL) {
		tls_new->params.auth_key = malloc(tls->params.auth_key_len,
		    M_KTLS, M_WAITOK);
		memcpy(tls_new->params.auth_key, tls->params.auth_key,
		    tls->params.auth_key_len);
	}

	tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS,
	    M_WAITOK);
	memcpy(tls_new->params.cipher_key, tls->params.cipher_key,
	    tls->params.cipher_key_len);

	return (tls_new);
}
#endif

static void
ktls_cleanup(struct ktls_session *tls)
{

	counter_u64_add(ktls_offload_active, -1);
	switch (tls->mode) {
	case TCP_TLS_MODE_SW:
		switch (tls->params.cipher_algorithm) {
		case CRYPTO_AES_CBC:
			counter_u64_add(ktls_sw_cbc, -1);
			break;
		case CRYPTO_AES_NIST_GCM_16:
			counter_u64_add(ktls_sw_gcm, -1);
			break;
		case CRYPTO_CHACHA20_POLY1305:
			counter_u64_add(ktls_sw_chacha20, -1);
			break;
		}
		ktls_ocf_free(tls);
		break;
	case TCP_TLS_MODE_IFNET:
		switch (tls->params.cipher_algorithm) {
		case CRYPTO_AES_CBC:
			counter_u64_add(ktls_ifnet_cbc, -1);
			break;
		case CRYPTO_AES_NIST_GCM_16:
			counter_u64_add(ktls_ifnet_gcm, -1);
			break;
		case CRYPTO_CHACHA20_POLY1305:
			counter_u64_add(ktls_ifnet_chacha20, -1);
			break;
		}
		if (tls->snd_tag != NULL)
			m_snd_tag_rele(tls->snd_tag);
		break;
#ifdef TCP_OFFLOAD
	case TCP_TLS_MODE_TOE:
		switch (tls->params.cipher_algorithm) {
		case CRYPTO_AES_CBC:
			counter_u64_add(ktls_toe_cbc, -1);
			break;
		case CRYPTO_AES_NIST_GCM_16:
			counter_u64_add(ktls_toe_gcm, -1);
			break;
		case CRYPTO_CHACHA20_POLY1305:
			counter_u64_add(ktls_toe_chacha20, -1);
			break;
		}
		break;
#endif
	}
	if (tls->params.auth_key != NULL) {
		zfree(tls->params.auth_key, M_KTLS);
		tls->params.auth_key = NULL;
		tls->params.auth_key_len = 0;
	}
	if (tls->params.cipher_key != NULL) {
		zfree(tls->params.cipher_key, M_KTLS);
		tls->params.cipher_key = NULL;
		tls->params.cipher_key_len = 0;
	}
	explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
}

#if defined(INET) || defined(INET6)

#ifdef TCP_OFFLOAD
static int
ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
{
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	inp = so->so_pcb;
	INP_WLOCK(inp);
	if (inp->inp_flags2 & INP_FREED) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	if (inp->inp_socket == NULL) {
		INP_WUNLOCK(inp);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);
	if (!(tp->t_flags & TF_TOE)) {
		INP_WUNLOCK(inp);
		return (EOPNOTSUPP);
	}

	error = tcp_offload_alloc_tls_session(tp, tls, direction);
	INP_WUNLOCK(inp);
	if (error == 0) {
		tls->mode = TCP_TLS_MODE_TOE;
		switch (tls->params.cipher_algorithm) {
		case CRYPTO_AES_CBC:
			counter_u64_add(ktls_toe_cbc, 1);
			break;
		case CRYPTO_AES_NIST_GCM_16:
			counter_u64_add(ktls_toe_gcm, 1);
			break;
		case CRYPTO_CHACHA20_POLY1305:
			counter_u64_add(ktls_toe_chacha20, 1);
			break;
		}
	}
	return (error);
}
#endif

/*
 * Common code used when first enabling ifnet TLS on a connection or
 * when allocating a new ifnet TLS session due to a routing change.
 * This function allocates a new TLS send tag on whatever interface
 * the connection is currently routed over.
 */
static int
ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
    struct m_snd_tag **mstp)
{
	union if_snd_tag_alloc_params params;
	struct ifnet *ifp;
	struct nhop_object *nh;
	struct tcpcb *tp;
	int error;

	INP_RLOCK(inp);
	if (inp->inp_flags2 & INP_FREED) {
		INP_RUNLOCK(inp);
		return (ECONNRESET);
	}
	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
		INP_RUNLOCK(inp);
		return (ECONNRESET);
	}
	if (inp->inp_socket == NULL) {
		INP_RUNLOCK(inp);
		return (ECONNRESET);
	}
	tp = intotcpcb(inp);

	/*
	 * Check administrative controls on ifnet TLS to determine if
	 * ifnet TLS should be denied.
	 *
	 * - Always permit 'force' requests.
	 * - ktls_ifnet_permitted == 0: always deny.
	 */
	if (!force && ktls_ifnet_permitted == 0) {
		INP_RUNLOCK(inp);
		return (ENXIO);
	}

	/*
	 * XXX: Use the cached route in the inpcb to find the
	 * interface.  This should perhaps instead use
	 * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
	 * enabled after a connection has completed key negotiation in
	 * userland, the cached route will be present in practice.
	 */
	nh = inp->inp_route.ro_nh;
	if (nh == NULL) {
		INP_RUNLOCK(inp);
		return (ENXIO);
	}
	ifp = nh->nh_ifp;
	if_ref(ifp);

	/*
	 * Allocate a TLS + ratelimit tag if the connection has an
	 * existing pacing rate.
	 */
	if (tp->t_pacing_rate != -1 &&
	    (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) {
		params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT;
		params.tls_rate_limit.inp = inp;
		params.tls_rate_limit.tls = tls;
		params.tls_rate_limit.max_rate = tp->t_pacing_rate;
	} else {
		params.hdr.type = IF_SND_TAG_TYPE_TLS;
		params.tls.inp = inp;
		params.tls.tls = tls;
	}
	params.hdr.flowid = inp->inp_flowid;
	params.hdr.flowtype = inp->inp_flowtype;
	params.hdr.numa_domain = inp->inp_numa_domain;
	INP_RUNLOCK(inp);

	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}
	if (inp->inp_vflag & INP_IPV6) {
		if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) {
			error = EOPNOTSUPP;
			goto out;
		}
	} else {
		if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) {
			error = EOPNOTSUPP;
			goto out;
		}
	}
	error = m_snd_tag_alloc(ifp, &params, mstp);
out:
	if_rele(ifp);
	return (error);
}

static int
ktls_try_ifnet(struct socket *so, struct ktls_session *tls, bool force)
{
	struct m_snd_tag *mst;
	int error;

	error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst);
	if (error == 0) {
		tls->mode = TCP_TLS_MODE_IFNET;
		tls->snd_tag = mst;
		switch (tls->params.cipher_algorithm) {
		case CRYPTO_AES_CBC:
			counter_u64_add(ktls_ifnet_cbc, 1);
			break;
		case CRYPTO_AES_NIST_GCM_16:
			counter_u64_add(ktls_ifnet_gcm, 1);
			break;
		case CRYPTO_CHACHA20_POLY1305:
			counter_u64_add(ktls_ifnet_chacha20, 1);
			break;
		}
	}
	return (error);
}

static int
ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
{
	int error;

	error = ktls_ocf_try(so, tls, direction);
	if (error)
		return (error);
	tls->mode = TCP_TLS_MODE_SW;
	switch (tls->params.cipher_algorithm) {
	case CRYPTO_AES_CBC:
		counter_u64_add(ktls_sw_cbc, 1);
		break;
	case CRYPTO_AES_NIST_GCM_16:
		counter_u64_add(ktls_sw_gcm, 1);
		break;
	case CRYPTO_CHACHA20_POLY1305:
		counter_u64_add(ktls_sw_chacha20, 1);
		break;
	}
	return (0);
}

/*
 * KTLS RX stores data in the socket buffer as a list of TLS records,
 * where each record is stored as a control message containing the TLS
 * header followed by data mbufs containing the decrypted data.  This
 * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
 * both encrypted and decrypted data.  TLS records decrypted by a NIC
 * should be queued to the socket buffer as records, but encrypted
 * data which needs to be decrypted by software arrives as a stream of
 * regular mbufs which need to be converted.  In addition, there may
 * already be pending encrypted data in the socket buffer when KTLS RX
 * is enabled.
 *
 * To manage not-yet-decrypted data for KTLS RX, the following scheme
 * is used:
 *
 * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
 *
 * - ktls_check_rx checks this chain of mbufs reading the TLS header
 *   from the first mbuf.  Once all of the data for that TLS record is
 *   queued, the socket is queued to a worker thread.
 *
 * - The worker thread calls ktls_decrypt to decrypt TLS records in
 *   the TLS chain.  Each TLS record is detached from the TLS chain,
 *   decrypted, and inserted into the regular socket buffer chain as a
 *   record starting with a control message holding the TLS header and
 *   a chain of mbufs holding the decrypted data.
 */
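
/*
 * Illustratively (mbuf layout simplified), a receive buffer with one
 * decrypted record and trailing not-yet-decrypted data looks like:
 *
 *	sb_mb   -> [ctrl: TLS header] -> [decrypted data mbufs]
 *	sb_mtls -> [M_NOTREADY ciphertext mbufs] -> ...
 *
 * sb_acc accounts for the ready chain and sb_tlscc for the TLS
 * chain; both are included in sb_ccc, as sb_mark_notready() below
 * asserts.
 */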

static void
sb_mark_notready(struct sockbuf *sb)
{
	struct mbuf *m;

	m = sb->sb_mb;
	sb->sb_mtls = m;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	for (; m != NULL; m = m->m_next) {
		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
		    __func__));
		KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
		    __func__));
		KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
		    __func__));
		m->m_flags |= M_NOTREADY;
		sb->sb_acc -= m->m_len;
		sb->sb_tlscc += m->m_len;
		sb->sb_mtlstail = m;
	}
	KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
	    ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
	    sb->sb_ccc));
}

int
ktls_enable_rx(struct socket *so, struct tls_enable *en)
{
	struct ktls_session *tls;
	int error;

	if (!ktls_offload_enable)
		return (ENOTSUP);
	if (SOLISTENING(so))
		return (EINVAL);

	counter_u64_add(ktls_offload_enable_calls, 1);

	/*
	 * This should always be true since only the TCP socket option
	 * invokes this function.
	 */
	if (so->so_proto->pr_protocol != IPPROTO_TCP)
		return (EINVAL);

	/*
	 * XXX: Don't overwrite existing sessions.  We should permit
	 * this to support rekeying in the future.
	 */
	if (so->so_rcv.sb_tls_info != NULL)
		return (EALREADY);

	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
		return (ENOTSUP);

	/* TLS 1.3 is not yet supported. */
	if (en->tls_vmajor == TLS_MAJOR_VER_ONE &&
	    en->tls_vminor == TLS_MINOR_VER_THREE)
		return (ENOTSUP);

	error = ktls_create_session(so, en, &tls);
	if (error)
		return (error);

#ifdef TCP_OFFLOAD
	error = ktls_try_toe(so, tls, KTLS_RX);
	if (error)
#endif
		error = ktls_try_sw(so, tls, KTLS_RX);

	if (error) {
		ktls_cleanup(tls);
		return (error);
	}

	/* Mark the socket as using TLS offload. */
	SOCKBUF_LOCK(&so->so_rcv);
	so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
	so->so_rcv.sb_tls_info = tls;
	so->so_rcv.sb_flags |= SB_TLS_RX;

	/* Mark existing data as not ready until it can be decrypted. */
	sb_mark_notready(&so->so_rcv);
	ktls_check_rx(&so->so_rcv);
	SOCKBUF_UNLOCK(&so->so_rcv);

	counter_u64_add(ktls_offload_total, 1);

	return (0);
}

int
ktls_enable_tx(struct socket *so, struct tls_enable *en)
{
	struct ktls_session *tls;
	struct inpcb *inp;
	int error;

	if (!ktls_offload_enable)
		return (ENOTSUP);
	if (SOLISTENING(so))
		return (EINVAL);

	counter_u64_add(ktls_offload_enable_calls, 1);

	/*
	 * This should always be true since only the TCP socket option
	 * invokes this function.
	 */
	if (so->so_proto->pr_protocol != IPPROTO_TCP)
		return (EINVAL);

	/*
	 * XXX: Don't overwrite existing sessions.  We should permit
	 * this to support rekeying in the future.
	 */
	if (so->so_snd.sb_tls_info != NULL)
		return (EALREADY);

	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
		return (ENOTSUP);

	/* TLS requires ext pgs */
	if (mb_use_ext_pgs == 0)
		return (ENXIO);

	error = ktls_create_session(so, en, &tls);
	if (error)
		return (error);

	/* Prefer TOE -> ifnet TLS -> software TLS. */
#ifdef TCP_OFFLOAD
	error = ktls_try_toe(so, tls, KTLS_TX);
	if (error)
#endif
		error = ktls_try_ifnet(so, tls, false);
	if (error)
		error = ktls_try_sw(so, tls, KTLS_TX);

	if (error) {
		ktls_cleanup(tls);
		return (error);
	}

	error = sblock(&so->so_snd, SBL_WAIT);
	if (error) {
		ktls_cleanup(tls);
		return (error);
	}

	/*
	 * Write lock the INP when setting sb_tls_info so that
	 * routines in tcp_ratelimit.c can read sb_tls_info while
	 * holding the INP lock.
	 */
	inp = so->so_pcb;
	INP_WLOCK(inp);
	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
	so->so_snd.sb_tls_info = tls;
	if (tls->mode != TCP_TLS_MODE_SW)
		so->so_snd.sb_flags |= SB_TLS_IFNET;
	SOCKBUF_UNLOCK(&so->so_snd);
	INP_WUNLOCK(inp);
	sbunlock(&so->so_snd);

	counter_u64_add(ktls_offload_total, 1);

	return (0);
}
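
/*
 * Illustrative userland usage (not part of this file; field values
 * are assumptions for the sake of the example): after the TLS
 * handshake completes, an application hands the negotiated
 * parameters to the kernel with the TCP_TXTLS_ENABLE socket option,
 * which lands in ktls_enable_tx() above.
 *
 *	struct tls_enable en = { 0 };
 *
 *	en.cipher_algorithm = CRYPTO_AES_NIST_GCM_16;
 *	en.cipher_key = key;			// 16 byte AES-128 key
 *	en.cipher_key_len = 16;
 *	en.iv = implicit_iv;			// 4 byte implicit IV
 *	en.iv_len = TLS_AEAD_GCM_LEN;
 *	en.tls_vmajor = TLS_MAJOR_VER_ONE;
 *	en.tls_vminor = TLS_MINOR_VER_TWO;
 *	memcpy(en.rec_seq, seq, sizeof(en.rec_seq));
 *	if (setsockopt(s, IPPROTO_TCP, TCP_TXTLS_ENABLE, &en,
 *	    sizeof(en)) == -1)
 *		err(1, "TCP_TXTLS_ENABLE");
 */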

int
ktls_get_rx_mode(struct socket *so)
{
	struct ktls_session *tls;
	struct inpcb *inp;
	int mode;

	if (SOLISTENING(so))
		return (EINVAL);
	inp = so->so_pcb;
	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK(&so->so_rcv);
	tls = so->so_rcv.sb_tls_info;
	if (tls == NULL)
		mode = TCP_TLS_MODE_NONE;
	else
		mode = tls->mode;
	SOCKBUF_UNLOCK(&so->so_rcv);
	return (mode);
}

int
ktls_get_tx_mode(struct socket *so)
{
	struct ktls_session *tls;
	struct inpcb *inp;
	int mode;

	if (SOLISTENING(so))
		return (EINVAL);
	inp = so->so_pcb;
	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK(&so->so_snd);
	tls = so->so_snd.sb_tls_info;
	if (tls == NULL)
		mode = TCP_TLS_MODE_NONE;
	else
		mode = tls->mode;
	SOCKBUF_UNLOCK(&so->so_snd);
	return (mode);
}

/*
 * Switch between SW and ifnet TLS sessions as requested.
 */
int
ktls_set_tx_mode(struct socket *so, int mode)
{
	struct ktls_session *tls, *tls_new;
	struct inpcb *inp;
	int error;

	if (SOLISTENING(so))
		return (EINVAL);
	switch (mode) {
	case TCP_TLS_MODE_SW:
	case TCP_TLS_MODE_IFNET:
		break;
	default:
		return (EINVAL);
	}

	inp = so->so_pcb;
	INP_WLOCK_ASSERT(inp);
	SOCKBUF_LOCK(&so->so_snd);
	tls = so->so_snd.sb_tls_info;
	if (tls == NULL) {
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	if (tls->mode == mode) {
		SOCKBUF_UNLOCK(&so->so_snd);
		return (0);
	}

	tls = ktls_hold(tls);
	SOCKBUF_UNLOCK(&so->so_snd);
	INP_WUNLOCK(inp);

	tls_new = ktls_clone_session(tls);

	if (mode == TCP_TLS_MODE_IFNET)
		error = ktls_try_ifnet(so, tls_new, true);
	else
		error = ktls_try_sw(so, tls_new, KTLS_TX);
	if (error) {
		counter_u64_add(ktls_switch_failed, 1);
		ktls_free(tls_new);
		ktls_free(tls);
		INP_WLOCK(inp);
		return (error);
	}

	error = sblock(&so->so_snd, SBL_WAIT);
	if (error) {
		counter_u64_add(ktls_switch_failed, 1);
		ktls_free(tls_new);
		ktls_free(tls);
		INP_WLOCK(inp);
		return (error);
	}

	/*
	 * If we raced with another session change, keep the existing
	 * session.
	 */
	if (tls != so->so_snd.sb_tls_info) {
		counter_u64_add(ktls_switch_failed, 1);
		sbunlock(&so->so_snd);
		ktls_free(tls_new);
		ktls_free(tls);
		INP_WLOCK(inp);
		return (EBUSY);
	}

	SOCKBUF_LOCK(&so->so_snd);
	so->so_snd.sb_tls_info = tls_new;
	if (tls_new->mode != TCP_TLS_MODE_SW)
		so->so_snd.sb_flags |= SB_TLS_IFNET;
	SOCKBUF_UNLOCK(&so->so_snd);
	sbunlock(&so->so_snd);

	/*
	 * Drop two references on 'tls'.  The first is for the
	 * ktls_hold() above.  The second drops the reference from the
	 * socket buffer.
	 */
	KASSERT(tls->refcount >= 2, ("too few references on old session"));
	ktls_free(tls);
	ktls_free(tls);

	if (mode == TCP_TLS_MODE_IFNET)
		counter_u64_add(ktls_switch_to_ifnet, 1);
	else
		counter_u64_add(ktls_switch_to_sw, 1);

	INP_WLOCK(inp);
	return (0);
}

/*
 * Try to allocate a new TLS send tag.  This task is scheduled when
 * ip_output detects a route change while trying to transmit a packet
 * holding a TLS record.  If a new tag is allocated, replace the tag
 * in the TLS session.  Subsequent packets on the connection will use
 * the new tag.  If a new tag cannot be allocated, drop the
 * connection.
 */
static void
ktls_reset_send_tag(void *context, int pending)
{
	struct epoch_tracker et;
	struct ktls_session *tls;
	struct m_snd_tag *old, *new;
	struct inpcb *inp;
	struct tcpcb *tp;
	int error;

	MPASS(pending == 1);

	tls = context;
	inp = tls->inp;

	/*
	 * Free the old tag first before allocating a new one.
	 * ip[6]_output_send() will treat a NULL send tag the same as
	 * an ifp mismatch and drop packets until a new tag is
	 * allocated.
	 *
	 * Write-lock the INP when changing tls->snd_tag since
	 * ip[6]_output_send() holds a read-lock when reading the
	 * pointer.
	 */
	INP_WLOCK(inp);
	old = tls->snd_tag;
	tls->snd_tag = NULL;
	INP_WUNLOCK(inp);
	if (old != NULL)
		m_snd_tag_rele(old);

	error = ktls_alloc_snd_tag(inp, tls, true, &new);

	if (error == 0) {
		INP_WLOCK(inp);
		tls->snd_tag = new;
		mtx_pool_lock(mtxpool_sleep, tls);
		tls->reset_pending = false;
		mtx_pool_unlock(mtxpool_sleep, tls);
		if (!in_pcbrele_wlocked(inp))
			INP_WUNLOCK(inp);

		counter_u64_add(ktls_ifnet_reset, 1);

		/*
		 * XXX: Should we kick tcp_output explicitly now that
		 * the send tag is fixed or just rely on timers?
		 */
	} else {
		NET_EPOCH_ENTER(et);
		INP_WLOCK(inp);
		if (!in_pcbrele_wlocked(inp)) {
			if (!(inp->inp_flags & INP_TIMEWAIT) &&
			    !(inp->inp_flags & INP_DROPPED)) {
				tp = intotcpcb(inp);
				CURVNET_SET(tp->t_vnet);
				tp = tcp_drop(tp, ECONNABORTED);
				CURVNET_RESTORE();
				if (tp != NULL)
					INP_WUNLOCK(inp);
				counter_u64_add(ktls_ifnet_reset_dropped, 1);
			} else
				INP_WUNLOCK(inp);
		}
		NET_EPOCH_EXIT(et);

		counter_u64_add(ktls_ifnet_reset_failed, 1);

		/*
		 * Leave reset_pending true to avoid future tasks while
		 * the socket goes away.
		 */
	}

	ktls_free(tls);
}

int
ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
{

	if (inp == NULL)
		return (ENOBUFS);

	INP_LOCK_ASSERT(inp);

	/*
	 * See if we should schedule a task to update the send tag for
	 * this session.
	 */
	mtx_pool_lock(mtxpool_sleep, tls);
	if (!tls->reset_pending) {
		(void) ktls_hold(tls);
		in_pcbref(inp);
		tls->inp = inp;
		tls->reset_pending = true;
		taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
	}
	mtx_pool_unlock(mtxpool_sleep, tls);
	return (ENOBUFS);
}

#ifdef RATELIMIT
int
ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate)
{
	union if_snd_tag_modify_params params = {
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	struct m_snd_tag *mst;
	struct ifnet *ifp;
	int error;

	/* Can't get to the inp, but it should be locked. */
	/* INP_LOCK_ASSERT(inp); */

	MPASS(tls->mode == TCP_TLS_MODE_IFNET);

	if (tls->snd_tag == NULL) {
		/*
		 * Resetting send tag, ignore this change.  The
		 * pending reset may or may not see this updated rate
		 * in the tcpcb.  If it doesn't, we will just lose
		 * this rate change.
		 */
		return (0);
	}

	MPASS(tls->snd_tag != NULL);
	MPASS(tls->snd_tag->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT);

	mst = tls->snd_tag;
	ifp = mst->ifp;
	return (ifp->if_snd_tag_modify(mst, &params));
}
#endif
#endif

void
ktls_destroy(struct ktls_session *tls)
{

	ktls_cleanup(tls);
	uma_zfree(ktls_session_zone, tls);
}

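/*
 * Stamp each record mbuf in the chain with the next TLS sequence
 * number for this send buffer; the sequence number is consumed later
 * by the encryption code (e.g. as nonce/MAC input).
 */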
void
ktls_seq(struct sockbuf *sb, struct mbuf *m)
{

	for (; m != NULL; m = m->m_next) {
		KASSERT((m->m_flags & M_EXTPG) != 0,
		    ("ktls_seq: mapped mbuf %p", m));

		m->m_epg_seqno = sb->sb_tls_seqno;
		sb->sb_tls_seqno++;
	}
}

/*
 * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
 * mbuf in the chain must be an unmapped mbuf.  The payload of the
 * mbuf must be populated with the payload of each TLS record.
 *
 * The record_type argument specifies the TLS record type used when
 * populating the TLS header.
 *
 * The enq_cnt argument on return is set to the number of pages of
 * payload data for this entire chain that need to be encrypted via SW
 * encryption.  The returned value should be passed to ktls_enqueue
 * when scheduling encryption of this chain of mbufs.  To handle the
 * special case of empty fragments for TLS 1.0 sessions, an empty
 * fragment counts as one page.
 */
void
ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
    uint8_t record_type)
{
	struct tls_record_layer *tlshdr;
	struct mbuf *m;
	uint64_t *noncep;
	uint16_t tls_len;
	int maxlen;

	maxlen = tls->params.max_frame_len;
	*enq_cnt = 0;
	for (m = top; m != NULL; m = m->m_next) {
		/*
		 * All mbufs in the chain should be TLS records whose
		 * payload does not exceed the maximum frame length.
		 *
		 * Empty TLS records are permitted when using CBC.
		 */
		KASSERT(m->m_len <= maxlen &&
		    (tls->params.cipher_algorithm == CRYPTO_AES_CBC ?
		    m->m_len >= 0 : m->m_len > 0),
		    ("ktls_frame: m %p len %d\n", m, m->m_len));

		/*
		 * TLS frames require unmapped mbufs to store session
		 * info.
		 */
		KASSERT((m->m_flags & M_EXTPG) != 0,
		    ("ktls_frame: mapped mbuf %p (top = %p)\n", m, top));

		tls_len = m->m_len;

		/* Save a reference to the session. */
		m->m_epg_tls = ktls_hold(tls);

		m->m_epg_hdrlen = tls->params.tls_hlen;
		m->m_epg_trllen = tls->params.tls_tlen;
		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
			int bs, delta;

			/*
			 * AES-CBC pads messages to a multiple of the
			 * block size.  Note that the padding is
			 * applied after the digest and the encryption
			 * is done on the "plaintext || mac || padding".
			 * At least one byte of padding is always
			 * present.
			 *
			 * Compute the final trailer length assuming
			 * at most one block of padding.
			 * tls->params.tls_tlen is the maximum
			 * possible trailer length (padding + digest).
			 * delta holds the number of excess padding
			 * bytes if the maximum were used.  Those
			 * extra bytes are removed.
			 */
			bs = tls->params.tls_bs;
			delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
			m->m_epg_trllen -= delta;
1525                 }
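                /*
                 * Worked example (illustrative numbers, not taken
                 * from a real session): with bs = 16 and a maximum
                 * trailer of tls_tlen = 36 (a 20-byte HMAC-SHA1
                 * digest plus 16 bytes of padding), a 100-byte record
                 * yields delta = (100 + 36) & 15 = 8, so the trailer
                 * shrinks to 28 bytes and the encrypted portion is
                 * 100 + 28 = 128 bytes, a multiple of the block size.
                 */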
                m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;

                /* Populate the TLS header. */
                tlshdr = (void *)m->m_epg_hdr;
                tlshdr->tls_vmajor = tls->params.tls_vmajor;

                /*
                 * TLS 1.3 masquerades as TLS 1.2 with a record type
                 * of TLS_RLTYPE_APP.
                 */
                if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
                    tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
                        tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
                        tlshdr->tls_type = TLS_RLTYPE_APP;
                        /* Save the real record type for later. */
                        m->m_epg_record_type = record_type;
                        m->m_epg_trail[0] = record_type;
                } else {
                        tlshdr->tls_vminor = tls->params.tls_vminor;
                        tlshdr->tls_type = record_type;
                }
                tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));

                /*
                 * Store nonces / explicit IVs after the end of the
                 * TLS header.
                 *
                 * For GCM with TLS 1.2, an 8-byte nonce is copied
                 * from the end of the IV.  The nonce is then
                 * incremented for use by the next record.
                 *
                 * For CBC, a random nonce is inserted for TLS 1.1+.
                 */
                if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
                    tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
                        noncep = (uint64_t *)(tls->params.iv + 8);
                        be64enc(tlshdr + 1, *noncep);
                        (*noncep)++;
                } else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
                    tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
                        arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);

                /*
                 * When using SW encryption, mark the mbuf not ready.
                 * It will be marked ready via sbready() after the
                 * record has been encrypted.
                 *
                 * When using ifnet TLS, unencrypted TLS records are
                 * sent down the stack to the NIC.
                 */
                if (tls->mode == TCP_TLS_MODE_SW) {
                        m->m_flags |= M_NOTREADY;
                        m->m_epg_nrdy = m->m_epg_npgs;
                        if (__predict_false(tls_len == 0)) {
                                /* TLS 1.0 empty fragment. */
                                *enq_cnt += 1;
                        } else
                                *enq_cnt += m->m_epg_npgs;
                }
        }
}
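
/*
 * A condensed, hypothetical sketch of how a transmit path might use
 * the helpers above for a SW TLS session (the real callers live in
 * the socket and TCP layers; locking and error handling omitted):
 *
 *      int enq_cnt;
 *
 *      // Frame each unmapped mbuf in 'm' as one TLS record.
 *      ktls_frame(m, tls, &enq_cnt, TLS_RLTYPE_APP);
 *
 *      // Stamp per-record sequence numbers while the chain is
 *      // appended to the send buffer (see ktls_seq() above).
 *      ktls_seq(&so->so_snd, m);
 *
 *      // Hand the not-ready chain to a worker for encryption.
 *      ktls_enqueue(m, so, enq_cnt);
 */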

void
ktls_check_rx(struct sockbuf *sb)
{
        struct tls_record_layer hdr;
        struct ktls_wq *wq;
        struct socket *so;
        bool running;

        SOCKBUF_LOCK_ASSERT(sb);
        KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
            __func__, sb));
        so = __containerof(sb, struct socket, so_rcv);

        if (sb->sb_flags & SB_TLS_RX_RUNNING)
                return;

        /* Is there enough queued for a TLS header? */
        if (sb->sb_tlscc < sizeof(hdr)) {
                if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0)
                        so->so_error = EMSGSIZE;
                return;
        }

        m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);

        /* Is the entire record queued? */
        if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
                if ((sb->sb_state & SBS_CANTRCVMORE) != 0)
                        so->so_error = EMSGSIZE;
                return;
        }

        sb->sb_flags |= SB_TLS_RX_RUNNING;

        soref(so);
        wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index];
        mtx_lock(&wq->mtx);
        STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list);
        running = wq->running;
        mtx_unlock(&wq->mtx);
        if (!running)
                wakeup(wq);
        counter_u64_add(ktls_cnt_rx_queued, 1);
}
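
/*
 * A note on the queue/wakeup handshake above (shared with the
 * enqueue paths below): the worker clears wq->running and sleeps
 * while holding wq->mtx, and producers test wq->running under the
 * same mutex after queueing, so a wakeup is issued only when the
 * worker may be asleep and queued work can never be missed.
 */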

static struct mbuf *
ktls_detach_record(struct sockbuf *sb, int len)
{
        struct mbuf *m, *n, *top;
        int remain;

        SOCKBUF_LOCK_ASSERT(sb);
        MPASS(len <= sb->sb_tlscc);

        /*
         * If the TLS chain is the exact size of the record,
         * just grab the whole record.
         */
        top = sb->sb_mtls;
        if (sb->sb_tlscc == len) {
                sb->sb_mtls = NULL;
                sb->sb_mtlstail = NULL;
                goto out;
        }

        /*
         * While it would be nice to use m_split() here, we need
         * to know exactly what m_split() allocates to update the
         * accounting, so do it inline instead.
         */
        remain = len;
        for (m = top; remain > m->m_len; m = m->m_next)
                remain -= m->m_len;

        /* Easy case: don't have to split 'm'. */
        if (remain == m->m_len) {
                sb->sb_mtls = m->m_next;
                if (sb->sb_mtls == NULL)
                        sb->sb_mtlstail = NULL;
                m->m_next = NULL;
                goto out;
        }

        /*
         * Need to allocate an mbuf to hold the remainder of 'm'.  Try
         * with M_NOWAIT first.
         */
        n = m_get(M_NOWAIT, MT_DATA);
        if (n == NULL) {
                /*
                 * Use M_WAITOK with the socket buffer unlocked.  If
                 * 'sb_mtls' changes while the lock is dropped, return
                 * NULL to force the caller to retry.
                 */
                SOCKBUF_UNLOCK(sb);

                n = m_get(M_WAITOK, MT_DATA);

                SOCKBUF_LOCK(sb);
                if (sb->sb_mtls != top) {
                        m_free(n);
                        return (NULL);
                }
        }
        n->m_flags |= M_NOTREADY;

        /* Store the remainder in 'n'. */
        n->m_len = m->m_len - remain;
        if (m->m_flags & M_EXT) {
                n->m_data = m->m_data + remain;
                mb_dupcl(n, m);
        } else {
                bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
        }

        /* Trim 'm' and update accounting. */
        m->m_len -= n->m_len;
        sb->sb_tlscc -= n->m_len;
        sb->sb_ccc -= n->m_len;

        /* Account for 'n'. */
        sballoc_ktls_rx(sb, n);

        /* Insert 'n' into the TLS chain. */
        sb->sb_mtls = n;
        n->m_next = m->m_next;
        if (sb->sb_mtlstail == m)
                sb->sb_mtlstail = n;

        /* Detach the record from the TLS chain. */
        m->m_next = NULL;

out:
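        /*
         * Move the record's bytes from the not-yet-decrypted
         * accounting (sb_tlscc) to the decrypted-data accounting:
         * each mbuf in 'top' is released from the chain's byte
         * accounting via sbfree_ktls_rx(), and the same byte count
         * is then recorded in sb_tlsdcc and added back to sb_ccc.
         */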
        MPASS(m_length(top, NULL) == len);
        for (m = top; m != NULL; m = m->m_next)
                sbfree_ktls_rx(sb, m);
        sb->sb_tlsdcc = len;
        sb->sb_ccc += len;
        SBCHECK(sb);
        return (top);
}

static void
ktls_decrypt(struct socket *so)
{
        char tls_header[MBUF_PEXT_HDR_LEN];
        struct ktls_session *tls;
        struct sockbuf *sb;
        struct tls_record_layer *hdr;
        struct tls_get_record tgr;
        struct mbuf *control, *data, *m;
        uint64_t seqno;
        int error, remain, tls_len, trail_len;

        hdr = (struct tls_record_layer *)tls_header;
        sb = &so->so_rcv;
        SOCKBUF_LOCK(sb);
        KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
            ("%s: socket %p not running", __func__, so));

        tls = sb->sb_tls_info;
        MPASS(tls != NULL);

        for (;;) {
                /* Is there enough queued for a TLS header? */
                if (sb->sb_tlscc < tls->params.tls_hlen)
                        break;

                m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
                tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);

                if (hdr->tls_vmajor != tls->params.tls_vmajor ||
                    hdr->tls_vminor != tls->params.tls_vminor)
                        error = EINVAL;
                else if (tls_len < tls->params.tls_hlen || tls_len >
                    tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
                    tls->params.tls_tlen)
                        error = EMSGSIZE;
                else
                        error = 0;
                if (__predict_false(error != 0)) {
                        /*
                         * We have a corrupted record and are likely
                         * out of sync.  The connection isn't
                         * recoverable at this point, so abort it.
                         */
                        SOCKBUF_UNLOCK(sb);
                        counter_u64_add(ktls_offload_corrupted_records, 1);

                        CURVNET_SET(so->so_vnet);
                        so->so_proto->pr_usrreqs->pru_abort(so);
                        so->so_error = error;
                        CURVNET_RESTORE();
                        goto deref;
                }

                /* Is the entire record queued? */
                if (sb->sb_tlscc < tls_len)
                        break;

                /*
                 * Split out the portion of the mbuf chain containing
                 * this TLS record.
                 */
                data = ktls_detach_record(sb, tls_len);
                if (data == NULL)
                        continue;
                MPASS(sb->sb_tlsdcc == tls_len);

                seqno = sb->sb_tls_seqno;
                sb->sb_tls_seqno++;
                SBCHECK(sb);
                SOCKBUF_UNLOCK(sb);

                error = tls->sw_decrypt(tls, hdr, data, seqno, &trail_len);
                if (error) {
                        counter_u64_add(ktls_offload_failed_crypto, 1);

                        SOCKBUF_LOCK(sb);
                        if (sb->sb_tlsdcc == 0) {
                                /*
                                 * sbcut/drop/flush discarded these
                                 * mbufs.
                                 */
                                m_freem(data);
                                break;
                        }

                        /*
                         * Drop this TLS record's data, but keep
                         * decrypting subsequent records.
                         */
                        sb->sb_ccc -= tls_len;
                        sb->sb_tlsdcc = 0;

                        CURVNET_SET(so->so_vnet);
                        so->so_error = EBADMSG;
                        sorwakeup_locked(so);
                        CURVNET_RESTORE();

                        m_freem(data);

                        SOCKBUF_LOCK(sb);
                        continue;
                }

                /* Allocate the control mbuf. */
                tgr.tls_type = hdr->tls_type;
                tgr.tls_vmajor = hdr->tls_vmajor;
                tgr.tls_vminor = hdr->tls_vminor;
                tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
                    trail_len);
                control = sbcreatecontrol_how(&tgr, sizeof(tgr),
                    TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);

                SOCKBUF_LOCK(sb);
                if (sb->sb_tlsdcc == 0) {
                        /* sbcut/drop/flush discarded these mbufs. */
                        MPASS(sb->sb_tlscc == 0);
                        m_freem(data);
                        m_freem(control);
                        break;
                }

                /*
                 * Clear the 'dcc' accounting in preparation for
                 * adding the decrypted record.
                 */
                sb->sb_ccc -= tls_len;
                sb->sb_tlsdcc = 0;
                SBCHECK(sb);

                /* If there is no payload, drop all of the data. */
                if (tgr.tls_length == htobe16(0)) {
                        m_freem(data);
                        data = NULL;
                } else {
                        /* Trim header. */
                        remain = tls->params.tls_hlen;
                        while (remain > 0) {
                                if (data->m_len > remain) {
                                        data->m_data += remain;
                                        data->m_len -= remain;
                                        break;
                                }
                                remain -= data->m_len;
                                data = m_free(data);
                        }

                        /* Trim trailer and clear M_NOTREADY. */
                        remain = be16toh(tgr.tls_length);
                        for (m = data; remain > m->m_len; m = m->m_next) {
                                m->m_flags &= ~M_NOTREADY;
                                remain -= m->m_len;
                        }
                        m->m_len = remain;
                        m_freem(m->m_next);
                        m->m_next = NULL;
                        m->m_flags &= ~M_NOTREADY;

                        /* Set EOR on the final mbuf. */
                        m->m_flags |= M_EOR;
                }

                sbappendcontrol_locked(sb, data, control, 0);
        }

        sb->sb_flags &= ~SB_TLS_RX_RUNNING;

        if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
                so->so_error = EMSGSIZE;

        sorwakeup_locked(so);

deref:
        SOCKBUF_UNLOCK_ASSERT(sb);

        CURVNET_SET(so->so_vnet);
        SOCK_LOCK(so);
        sorele(so);
        CURVNET_RESTORE();
}
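
/*
 * Userspace consumes records decrypted here via recvmsg(2): the
 * payload arrives as data and the record's type and version arrive
 * in a TLS_GET_RECORD control message.  A minimal userspace sketch
 * (illustrative buffer sizes; no error handling):
 *
 *      struct tls_get_record tgr;
 *      struct cmsghdr *cmsg;
 *      struct iovec iov;
 *      struct msghdr msg;
 *      char payload[16384], cbuf[CMSG_SPACE(sizeof(tgr))];
 *
 *      memset(&msg, 0, sizeof(msg));
 *      iov.iov_base = payload;
 *      iov.iov_len = sizeof(payload);
 *      msg.msg_iov = &iov;
 *      msg.msg_iovlen = 1;
 *      msg.msg_control = cbuf;
 *      msg.msg_controllen = sizeof(cbuf);
 *
 *      if (recvmsg(fd, &msg, 0) > 0 &&
 *          (cmsg = CMSG_FIRSTHDR(&msg)) != NULL &&
 *          cmsg->cmsg_level == IPPROTO_TCP &&
 *          cmsg->cmsg_type == TLS_GET_RECORD)
 *              memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
 */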

void
ktls_enqueue_to_free(struct mbuf *m)
{
        struct ktls_wq *wq;
        bool running;

        /* Mark it for freeing. */
        m->m_epg_flags |= EPG_FLAG_2FREE;
        wq = &ktls_wq[m->m_epg_tls->wq_index];
        mtx_lock(&wq->mtx);
        STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
        running = wq->running;
        mtx_unlock(&wq->mtx);
        if (!running)
                wakeup(wq);
}

static void *
ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
{
        void *buf;

        if (m->m_epg_npgs <= 2)
                return (NULL);
        if (ktls_buffer_zone == NULL)
                return (NULL);
        if ((u_int)(ticks - wq->lastallocfail) < hz) {
                /*
                 * Rate-limit allocation attempts after a failure.
                 * ktls_buffer_import() will acquire a per-domain mutex to check
                 * the free page queues and may fail consistently if memory is
                 * fragmented.
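                 * The (u_int) cast keeps the elapsed-tick comparison
                 * correct even after the ticks counter wraps.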
                 */
                return (NULL);
        }
        buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
        if (buf == NULL)
                wq->lastallocfail = ticks;
        return (buf);
}

void
ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
{
        struct ktls_wq *wq;
        bool running;

        KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
            (M_EXTPG | M_NOTREADY)),
            ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
        KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));

        KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));

        m->m_epg_enc_cnt = page_count;

        /*
         * Save a pointer to the socket.  The caller is responsible
         * for taking an additional reference via soref().
         */
        m->m_epg_so = so;

        wq = &ktls_wq[m->m_epg_tls->wq_index];
        mtx_lock(&wq->mtx);
        STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
        running = wq->running;
        mtx_unlock(&wq->mtx);
        if (!running)
                wakeup(wq);
        counter_u64_add(ktls_cnt_tx_queued, 1);
}

#define MAX_TLS_PAGES   (1 + btoc(TLS_MAX_MSG_SIZE_V10_2))
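
/*
 * Worked sizing (assuming 4 KB pages): the 16 KB TLS 1.0-1.2 maximum
 * record payload spans btoc(TLS_MAX_MSG_SIZE_V10_2) = 4 pages, and
 * the extra page covers a payload that straddles page boundaries due
 * to a non-zero starting offset.  The arrays in ktls_encrypt() add
 * further slack, including an iovec slot for the record trailer.
 */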

static __noinline void
ktls_encrypt(struct ktls_wq *wq, struct mbuf *top)
{
        struct ktls_session *tls;
        struct socket *so;
        struct mbuf *m;
        vm_paddr_t parray[MAX_TLS_PAGES + 1];
        struct iovec dst_iov[MAX_TLS_PAGES + 2];
        vm_page_t pg;
        void *cbuf;
        int error, i, len, npages, off, total_pages;

        so = top->m_epg_so;
        tls = top->m_epg_tls;
        KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
        KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
#ifdef INVARIANTS
        top->m_epg_so = NULL;
#endif
        total_pages = top->m_epg_enc_cnt;
        npages = 0;

        /*
         * Encrypt the TLS records in the chain of mbufs starting with
         * 'top'.  'total_pages' gives us a total count of pages and is
         * used to know when we have finished encrypting the TLS
         * records originally queued with 'top'.
         *
         * NB: These mbufs are queued in the socket buffer and
         * 'm_next' is traversing the mbufs in the socket buffer.  The
         * socket buffer lock is not held while traversing this chain.
         * Since the mbufs are all marked M_NOTREADY their 'm_next'
         * pointers should be stable.  However, the 'm_next' of the
         * last mbuf encrypted is not necessarily NULL.  It can point
         * to other mbufs appended while 'top' was on the TLS work
         * queue.
         *
         * Each mbuf holds an entire TLS record.
         */
        error = 0;
        for (m = top; npages != total_pages; m = m->m_next) {
                KASSERT(m->m_epg_tls == tls,
                    ("different TLS sessions in a single mbuf chain: %p vs %p",
                    tls, m->m_epg_tls));
                KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
                    (M_EXTPG | M_NOTREADY),
                    ("%p not unready & nomap mbuf (top = %p)\n", m, top));
                KASSERT(npages + m->m_epg_npgs <= total_pages,
                    ("page count mismatch: top %p, total_pages %d, m %p", top,
                    total_pages, m));
                KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen,
                    ("page count %d larger than maximum frame length %d",
                    m->m_epg_npgs, ktls_maxlen));

                /*
                 * For anonymous mbufs, encryption is done in place.
                 * For file-backed mbufs (from sendfile), anonymous
                 * wired pages are allocated and used as the
                 * encryption destination.
                 */
                if ((m->m_epg_flags & EPG_FLAG_ANON) != 0) {
                        error = (*tls->sw_encrypt)(tls, m, NULL, 0);
                } else {
                        if ((cbuf = ktls_buffer_alloc(wq, m)) != NULL) {
                                len = ptoa(m->m_epg_npgs - 1) +
                                    m->m_epg_last_len - m->m_epg_1st_off;
                                dst_iov[0].iov_base = (char *)cbuf +
                                    m->m_epg_1st_off;
                                dst_iov[0].iov_len = len;
                                parray[0] = DMAP_TO_PHYS((vm_offset_t)cbuf);
                                i = 1;
                        } else {
                                off = m->m_epg_1st_off;
                                for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
                                        do {
                                                pg = vm_page_alloc(NULL, 0,
                                                    VM_ALLOC_NORMAL |
                                                    VM_ALLOC_NOOBJ |
                                                    VM_ALLOC_NODUMP |
                                                    VM_ALLOC_WIRED |
                                                    VM_ALLOC_WAITFAIL);
                                        } while (pg == NULL);

                                        len = m_epg_pagelen(m, i, off);
                                        parray[i] = VM_PAGE_TO_PHYS(pg);
                                        dst_iov[i].iov_base =
                                            (char *)(void *)PHYS_TO_DMAP(
                                            parray[i]) + off;
                                        dst_iov[i].iov_len = len;
                                }
                        }
                        KASSERT(i + 1 <= nitems(dst_iov),
                            ("dst_iov is too small"));
                        dst_iov[i].iov_base = m->m_epg_trail;
                        dst_iov[i].iov_len = m->m_epg_trllen;

                        error = (*tls->sw_encrypt)(tls, m, dst_iov, i + 1);

                        /* Free the old pages. */
                        m->m_ext.ext_free(m);

                        /* Replace them with the new pages. */
                        if (cbuf != NULL) {
                                for (i = 0; i < m->m_epg_npgs; i++)
                                        m->m_epg_pa[i] = parray[0] + ptoa(i);

                                /* Contig pages should go back to the cache. */
                                m->m_ext.ext_free = ktls_free_mext_contig;
                        } else {
                                for (i = 0; i < m->m_epg_npgs; i++)
                                        m->m_epg_pa[i] = parray[i];

                                /* Use the basic free routine. */
                                m->m_ext.ext_free = mb_free_mext_pgs;
                        }

                        /* Pages are now writable. */
                        m->m_epg_flags |= EPG_FLAG_ANON;
                }
                if (error) {
                        counter_u64_add(ktls_offload_failed_crypto, 1);
                        break;
                }

                if (__predict_false(m->m_epg_npgs == 0)) {
                        /* TLS 1.0 empty fragment. */
                        npages++;
                } else
                        npages += m->m_epg_npgs;

                /*
                 * Drop the reference to the session now that it is no
                 * longer needed.  Existing code distinguishes encrypted
                 * records (no associated session, m_epg_tls == NULL)
                 * from yet-to-be-encrypted records (session still
                 * attached), so clear the pointer as well.
                 */
                m->m_epg_tls = NULL;
                ktls_free(tls);
        }

        CURVNET_SET(so->so_vnet);
        if (error == 0) {
                (void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages);
        } else {
                so->so_proto->pr_usrreqs->pru_abort(so);
                so->so_error = EIO;
                mb_free_notready(top, total_pages);
        }

        SOCK_LOCK(so);
        sorele(so);
        CURVNET_RESTORE();
}

static void
ktls_work_thread(void *ctx)
{
        struct ktls_wq *wq = ctx;
        struct mbuf *m, *n;
        struct socket *so, *son;
        STAILQ_HEAD(, mbuf) local_m_head;
        STAILQ_HEAD(, socket) local_so_head;

        if (ktls_bind_threads > 1) {
                curthread->td_domain.dr_policy =
                        DOMAINSET_PREF(PCPU_GET(domain));
        }
#if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
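        /*
         * Register as an FPU-using kernel thread so the
         * SIMD-accelerated crypto paths can run here without an
         * explicit fpu_kern_enter()/fpu_kern_leave() per call.
         */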
        fpu_kern_thread(0);
#endif
        for (;;) {
                mtx_lock(&wq->mtx);
                while (STAILQ_EMPTY(&wq->m_head) &&
                    STAILQ_EMPTY(&wq->so_head)) {
                        wq->running = false;
                        mtx_sleep(wq, &wq->mtx, 0, "-", 0);
                        wq->running = true;
                }

                STAILQ_INIT(&local_m_head);
                STAILQ_CONCAT(&local_m_head, &wq->m_head);
                STAILQ_INIT(&local_so_head);
                STAILQ_CONCAT(&local_so_head, &wq->so_head);
                mtx_unlock(&wq->mtx);

                STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) {
                        if (m->m_epg_flags & EPG_FLAG_2FREE) {
                                ktls_free(m->m_epg_tls);
                                uma_zfree(zone_mbuf, m);
                        } else {
                                ktls_encrypt(wq, m);
                                counter_u64_add(ktls_cnt_tx_queued, -1);
                        }
                }

                STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
                        ktls_decrypt(so);
                        counter_u64_add(ktls_cnt_rx_queued, -1);
                }
        }
}