]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/unbound/services/listen_dnsport.c
MFV r367082:
[FreeBSD/FreeBSD.git] / contrib / unbound / services / listen_dnsport.c
1 /*
2  * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3  *
4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
5  *
6  * This software is open source.
7  * 
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 
12  * Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  * 
15  * Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  * 
19  * Neither the name of the NLNET LABS nor the names of its contributors may
20  * be used to endorse or promote products derived from this software without
21  * specific prior written permission.
22  * 
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35
36 /**
37  * \file
38  *
39  * This file has functions to get queries from clients.
40  */
41 #include "config.h"
42 #ifdef HAVE_SYS_TYPES_H
43 #  include <sys/types.h>
44 #endif
45 #include <sys/time.h>
46 #ifdef USE_TCP_FASTOPEN
47 #include <netinet/tcp.h>
48 #endif
49 #include "services/listen_dnsport.h"
50 #include "services/outside_network.h"
51 #include "util/netevent.h"
52 #include "util/log.h"
53 #include "util/config_file.h"
54 #include "util/net_help.h"
55 #include "sldns/sbuffer.h"
56 #include "sldns/parseutil.h"
57 #include "services/mesh.h"
58 #include "util/fptr_wlist.h"
59 #include "util/locks.h"
60
61 #ifdef HAVE_NETDB_H
62 #include <netdb.h>
63 #endif
64 #include <fcntl.h>
65
66 #ifdef HAVE_SYS_UN_H
67 #include <sys/un.h>
68 #endif
69
70 #ifdef HAVE_SYSTEMD
71 #include <systemd/sd-daemon.h>
72 #endif
73
74 #ifdef HAVE_IFADDRS_H
75 #include <ifaddrs.h>
76 #endif
77 #ifdef HAVE_NET_IF_H
78 #include <net/if.h>
79 #endif
80
81 /** number of queued TCP connections for listen() */
82 #define TCP_BACKLOG 256 
83
84 /** number of simultaneous requests a client can have */
85 #define TCP_MAX_REQ_SIMULTANEOUS 32
86
87 #ifndef THREADS_DISABLED
88 /** lock on the counter of stream buffer memory */
89 static lock_basic_type stream_wait_count_lock;
90 /** lock on the counter of HTTP2 query buffer memory */
91 static lock_basic_type http2_query_buffer_count_lock;
92 /** lock on the counter of HTTP2 response buffer memory */
93 static lock_basic_type http2_response_buffer_count_lock;
94 #endif
95 /** size (in bytes) of stream wait buffers */
96 static size_t stream_wait_count = 0;
97 /** is the lock initialised for stream wait buffers */
98 static int stream_wait_lock_inited = 0;
99 /** size (in bytes) of HTTP2 query buffers */
100 static size_t http2_query_buffer_count = 0;
101 /** is the lock initialised for HTTP2 query buffers */
102 static int http2_query_buffer_lock_inited = 0;
103 /** size (in bytes) of HTTP2 response buffers */
104 static size_t http2_response_buffer_count = 0;
105 /** is the lock initialised for HTTP2 response buffers */
106 static int http2_response_buffer_lock_inited = 0;
107
108 /**
109  * Debug print of the getaddrinfo returned address.
110  * @param addr: the address returned.
111  */
112 static void
113 verbose_print_addr(struct addrinfo *addr)
114 {
115         if(verbosity >= VERB_ALGO) {
116                 char buf[100];
117                 void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
118 #ifdef INET6
119                 if(addr->ai_family == AF_INET6)
120                         sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
121                                 sin6_addr;
122 #endif /* INET6 */
123                 if(inet_ntop(addr->ai_family, sinaddr, buf,
124                         (socklen_t)sizeof(buf)) == 0) {
125                         (void)strlcpy(buf, "(null)", sizeof(buf));
126                 }
127                 buf[sizeof(buf)-1] = 0;
128                 verbose(VERB_ALGO, "creating %s%s socket %s %d", 
129                         addr->ai_socktype==SOCK_DGRAM?"udp":
130                         addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
131                         addr->ai_family==AF_INET?"4":
132                         addr->ai_family==AF_INET6?"6":
133                         "_otherfam", buf, 
134                         ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port));
135         }
136 }
137
#ifdef HAVE_SYSTEMD
/**
 * Find a socket handed over by systemd socket activation.
 *
 * Walks the descriptors reported by sd_listen_fds() and returns the first
 * one that sd_is_socket() accepts for the given family, type and listen
 * state.  The address itself is not compared against the descriptor; addr,
 * addrlen and path are only used for the error message when nothing matches.
 *
 * @param family: address family to match (passed to sd_is_socket).
 * @param socktype: socket type to match (passed to sd_is_socket).
 * @param listen: listening state to match; use only for stream protocols,
 *	for UDP it should be -1.
 * @param addr: address used in the "no such socket" log line, may be NULL.
 * @param addrlen: length of addr.
 * @param path: text used in the log line when addr is NULL, may be NULL.
 * @return the matching file descriptor, or -1 if systemd is not running,
 *	the LISTEN_PID/LISTEN_FDS variables are missing, no descriptors were
 *	passed, or none of them matches.
 */
static int
systemd_get_activated(int family, int socktype, int listen,
		      struct sockaddr *addr, socklen_t addrlen,
		      const char *path)
{
	int i = 0;
	int r = 0;
	int s = -1;
	const char* listen_pid, *listen_fds;

	/* We should use "listen" option only for stream protocols. For UDP it should be -1 */

	if((r = sd_booted()) < 1) {
		if(r == 0)
			log_warn("systemd is not running");
		else
			log_err("systemd sd_booted(): %s", strerror(-r));
		return -1;
	}

	listen_pid = getenv("LISTEN_PID");
	listen_fds = getenv("LISTEN_FDS");

	if (!listen_pid) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
		return -1;
	}

	if (!listen_fds) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
		return -1;
	}

	if((r = sd_listen_fds(0)) < 1) {
		if(r == 0)
			log_warn("systemd: did not return socket, check unit configuration");
		else
			log_err("systemd sd_listen_fds(): %s", strerror(-r));
		return -1;
	}

	for(i = 0; i < r; i++) {
		/* sd_is_socket() yields >0 for a match, 0 for no match and a
		 * negative errno-style value on failure; only a positive
		 * result may select the descriptor. */
		if(sd_is_socket(SD_LISTEN_FDS_START + i, family, socktype,
			listen) > 0) {
			s = SD_LISTEN_FDS_START + i;
			break;
		}
	}
	if (s == -1) {
		if (addr)
			log_err_addr("systemd sd_listen_fds()",
				     "no such socket",
				     (struct sockaddr_storage *)addr, addrlen);
		else
			/* callers may pass a NULL path; never hand NULL to %s */
			log_err("systemd sd_listen_fds(): %s",
				path ? path : "(null)");
	}
	return s;
}
#endif
197
198 int
199 create_udp_sock(int family, int socktype, struct sockaddr* addr,
200         socklen_t addrlen, int v6only, int* inuse, int* noproto,
201         int rcv, int snd, int listen, int* reuseport, int transparent,
202         int freebind, int use_systemd, int dscp)
203 {
204         int s;
205         char* err;
206 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
207         int on=1;
208 #endif
209 #ifdef IPV6_MTU
210         int mtu = IPV6_MIN_MTU;
211 #endif
212 #if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
213         (void)rcv;
214 #endif
215 #if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
216         (void)snd;
217 #endif
218 #ifndef IPV6_V6ONLY
219         (void)v6only;
220 #endif
221 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
222         (void)transparent;
223 #endif
224 #if !defined(IP_FREEBIND)
225         (void)freebind;
226 #endif
227 #ifdef HAVE_SYSTEMD
228         int got_fd_from_systemd = 0;
229
230         if (!use_systemd
231             || (use_systemd
232                 && (s = systemd_get_activated(family, socktype, -1, addr,
233                                               addrlen, NULL)) == -1)) {
234 #else
235         (void)use_systemd;
236 #endif
237         if((s = socket(family, socktype, 0)) == -1) {
238                 *inuse = 0;
239 #ifndef USE_WINSOCK
240                 if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
241                         *noproto = 1;
242                         return -1;
243                 }
244 #else
245                 if(WSAGetLastError() == WSAEAFNOSUPPORT || 
246                         WSAGetLastError() == WSAEPROTONOSUPPORT) {
247                         *noproto = 1;
248                         return -1;
249                 }
250 #endif
251                 log_err("can't create socket: %s", sock_strerror(errno));
252                 *noproto = 0;
253                 return -1;
254         }
255 #ifdef HAVE_SYSTEMD
256         } else {
257                 got_fd_from_systemd = 1;
258         }
259 #endif
260         if(listen) {
261 #ifdef SO_REUSEADDR
262                 if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on, 
263                         (socklen_t)sizeof(on)) < 0) {
264                         log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
265                                 sock_strerror(errno));
266 #ifndef USE_WINSOCK
267                         if(errno != ENOSYS) {
268                                 close(s);
269                                 *noproto = 0;
270                                 *inuse = 0;
271                                 return -1;
272                         }
273 #else
274                         closesocket(s);
275                         *noproto = 0;
276                         *inuse = 0;
277                         return -1;
278 #endif
279                 }
280 #endif /* SO_REUSEADDR */
281 #ifdef SO_REUSEPORT
282 #  ifdef SO_REUSEPORT_LB
283                 /* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
284                  * like SO_REUSEPORT on Linux.  This is what the users want
285                  * with the config option in unbound.conf; if we actually
286                  * need local address and port reuse they'll also need to
287                  * have SO_REUSEPORT set for them, assume it was _LB they want.
288                  */
289                 if (reuseport && *reuseport &&
290                     setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
291                         (socklen_t)sizeof(on)) < 0) {
292 #ifdef ENOPROTOOPT
293                         if(errno != ENOPROTOOPT || verbosity >= 3)
294                                 log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
295                                         strerror(errno));
296 #endif
297                         /* this option is not essential, we can continue */
298                         *reuseport = 0;
299                 }
300 #  else /* no SO_REUSEPORT_LB */
301
302                 /* try to set SO_REUSEPORT so that incoming
303                  * queries are distributed evenly among the receiving threads.
304                  * Each thread must have its own socket bound to the same port,
305                  * with SO_REUSEPORT set on each socket.
306                  */
307                 if (reuseport && *reuseport &&
308                     setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
309                         (socklen_t)sizeof(on)) < 0) {
310 #ifdef ENOPROTOOPT
311                         if(errno != ENOPROTOOPT || verbosity >= 3)
312                                 log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
313                                         strerror(errno));
314 #endif
315                         /* this option is not essential, we can continue */
316                         *reuseport = 0;
317                 }
318 #  endif /* SO_REUSEPORT_LB */
319 #else
320                 (void)reuseport;
321 #endif /* defined(SO_REUSEPORT) */
322 #ifdef IP_TRANSPARENT
323                 if (transparent &&
324                     setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
325                     (socklen_t)sizeof(on)) < 0) {
326                         log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
327                         strerror(errno));
328                 }
329 #elif defined(IP_BINDANY)
330                 if (transparent &&
331                     setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
332                     (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
333                     (void*)&on, (socklen_t)sizeof(on)) < 0) {
334                         log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
335                         (family==AF_INET6?"V6":""), strerror(errno));
336                 }
337 #elif defined(SO_BINDANY)
338                 if (transparent &&
339                     setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
340                     (socklen_t)sizeof(on)) < 0) {
341                         log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
342                         strerror(errno));
343                 }
344 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
345         }
346 #ifdef IP_FREEBIND
347         if(freebind &&
348             setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
349             (socklen_t)sizeof(on)) < 0) {
350                 log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
351                 strerror(errno));
352         }
353 #endif /* IP_FREEBIND */
354         if(rcv) {
355 #ifdef SO_RCVBUF
356                 int got;
357                 socklen_t slen = (socklen_t)sizeof(got);
358 #  ifdef SO_RCVBUFFORCE
359                 /* Linux specific: try to use root permission to override
360                  * system limits on rcvbuf. The limit is stored in 
361                  * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
362                 if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv, 
363                         (socklen_t)sizeof(rcv)) < 0) {
364                         if(errno != EPERM) {
365                                 log_err("setsockopt(..., SO_RCVBUFFORCE, "
366                                         "...) failed: %s", sock_strerror(errno));
367                                 sock_close(s);
368                                 *noproto = 0;
369                                 *inuse = 0;
370                                 return -1;
371                         }
372 #  endif /* SO_RCVBUFFORCE */
373                         if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv, 
374                                 (socklen_t)sizeof(rcv)) < 0) {
375                                 log_err("setsockopt(..., SO_RCVBUF, "
376                                         "...) failed: %s", sock_strerror(errno));
377                                 sock_close(s);
378                                 *noproto = 0;
379                                 *inuse = 0;
380                                 return -1;
381                         }
382                         /* check if we got the right thing or if system
383                          * reduced to some system max.  Warn if so */
384                         if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got, 
385                                 &slen) >= 0 && got < rcv/2) {
386                                 log_warn("so-rcvbuf %u was not granted. "
387                                         "Got %u. To fix: start with "
388                                         "root permissions(linux) or sysctl "
389                                         "bigger net.core.rmem_max(linux) or "
390                                         "kern.ipc.maxsockbuf(bsd) values.",
391                                         (unsigned)rcv, (unsigned)got);
392                         }
393 #  ifdef SO_RCVBUFFORCE
394                 }
395 #  endif
396 #endif /* SO_RCVBUF */
397         }
398         /* first do RCVBUF as the receive buffer is more important */
399         if(snd) {
400 #ifdef SO_SNDBUF
401                 int got;
402                 socklen_t slen = (socklen_t)sizeof(got);
403 #  ifdef SO_SNDBUFFORCE
404                 /* Linux specific: try to use root permission to override
405                  * system limits on sndbuf. The limit is stored in 
406                  * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
407                 if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd, 
408                         (socklen_t)sizeof(snd)) < 0) {
409                         if(errno != EPERM) {
410                                 log_err("setsockopt(..., SO_SNDBUFFORCE, "
411                                         "...) failed: %s", sock_strerror(errno));
412                                 sock_close(s);
413                                 *noproto = 0;
414                                 *inuse = 0;
415                                 return -1;
416                         }
417 #  endif /* SO_SNDBUFFORCE */
418                         if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd, 
419                                 (socklen_t)sizeof(snd)) < 0) {
420                                 log_err("setsockopt(..., SO_SNDBUF, "
421                                         "...) failed: %s", sock_strerror(errno));
422                                 sock_close(s);
423                                 *noproto = 0;
424                                 *inuse = 0;
425                                 return -1;
426                         }
427                         /* check if we got the right thing or if system
428                          * reduced to some system max.  Warn if so */
429                         if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got, 
430                                 &slen) >= 0 && got < snd/2) {
431                                 log_warn("so-sndbuf %u was not granted. "
432                                         "Got %u. To fix: start with "
433                                         "root permissions(linux) or sysctl "
434                                         "bigger net.core.wmem_max(linux) or "
435                                         "kern.ipc.maxsockbuf(bsd) values.",
436                                         (unsigned)snd, (unsigned)got);
437                         }
438 #  ifdef SO_SNDBUFFORCE
439                 }
440 #  endif
441 #endif /* SO_SNDBUF */
442         }
443         err = set_ip_dscp(s, family, dscp);
444         if(err != NULL)
445                 log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
446         if(family == AF_INET6) {
447 # if defined(IPV6_V6ONLY)
448                 if(v6only) {
449                         int val=(v6only==2)?0:1;
450                         if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, 
451                                 (void*)&val, (socklen_t)sizeof(val)) < 0) {
452                                 log_err("setsockopt(..., IPV6_V6ONLY"
453                                         ", ...) failed: %s", sock_strerror(errno));
454                                 sock_close(s);
455                                 *noproto = 0;
456                                 *inuse = 0;
457                                 return -1;
458                         }
459                 }
460 # endif
461 # if defined(IPV6_USE_MIN_MTU)
462                 /*
463                  * There is no fragmentation of IPv6 datagrams
464                  * during forwarding in the network. Therefore
465                  * we do not send UDP datagrams larger than
466                  * the minimum IPv6 MTU of 1280 octets. The
467                  * EDNS0 message length can be larger if the
468                  * network stack supports IPV6_USE_MIN_MTU.
469                  */
470                 if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
471                         (void*)&on, (socklen_t)sizeof(on)) < 0) {
472                         log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
473                                 "...) failed: %s", sock_strerror(errno));
474                         sock_close(s);
475                         *noproto = 0;
476                         *inuse = 0;
477                         return -1;
478                 }
479 # elif defined(IPV6_MTU)
480                 /*
481                  * On Linux, to send no larger than 1280, the PMTUD is
482                  * disabled by default for datagrams anyway, so we set
483                  * the MTU to use.
484                  */
485                 if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
486                         (void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
487                         log_err("setsockopt(..., IPV6_MTU, ...) failed: %s", 
488                                 sock_strerror(errno));
489                         sock_close(s);
490                         *noproto = 0;
491                         *inuse = 0;
492                         return -1;
493                 }
494 # endif /* IPv6 MTU */
495         } else if(family == AF_INET) {
496 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
497 /* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
498  * PMTU information is not accepted, but fragmentation is allowed
499  * if and only if the packet size exceeds the outgoing interface MTU
500  * (and also uses the interface mtu to determine the size of the packets).
501  * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
502  * FreeBSD already has same semantics without setting the option. */
503                 int omit_set = 0;
504                 int action;
505 #   if defined(IP_PMTUDISC_OMIT)
506                 action = IP_PMTUDISC_OMIT;
507                 if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER, 
508                         &action, (socklen_t)sizeof(action)) < 0) {
509
510                         if (errno != EINVAL) {
511                                 log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
512                                         strerror(errno));
513                                 sock_close(s);
514                                 *noproto = 0;
515                                 *inuse = 0;
516                                 return -1;
517                         }
518                 }
519                 else
520                 {
521                     omit_set = 1;
522                 }
523 #   endif
524                 if (omit_set == 0) {
525                         action = IP_PMTUDISC_DONT;
526                         if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
527                                 &action, (socklen_t)sizeof(action)) < 0) {
528                                 log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
529                                         strerror(errno));
530                                 sock_close(s);
531                                 *noproto = 0;
532                                 *inuse = 0;
533                                 return -1;
534                         }
535                 }
536 #  elif defined(IP_DONTFRAG)
537                 int off = 0;
538                 if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG, 
539                         &off, (socklen_t)sizeof(off)) < 0) {
540                         log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
541                                 strerror(errno));
542                         sock_close(s);
543                         *noproto = 0;
544                         *inuse = 0;
545                         return -1;
546                 }
547 #  endif /* IPv4 MTU */
548         }
549         if(
550 #ifdef HAVE_SYSTEMD
551                 !got_fd_from_systemd &&
552 #endif
553                 bind(s, (struct sockaddr*)addr, addrlen) != 0) {
554                 *noproto = 0;
555                 *inuse = 0;
556 #ifndef USE_WINSOCK
557 #ifdef EADDRINUSE
558                 *inuse = (errno == EADDRINUSE);
559                 /* detect freebsd jail with no ipv6 permission */
560                 if(family==AF_INET6 && errno==EINVAL)
561                         *noproto = 1;
562                 else if(errno != EADDRINUSE &&
563                         !(errno == EACCES && verbosity < 4 && !listen)
564 #ifdef EADDRNOTAVAIL
565                         && !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
566 #endif
567                         ) {
568                         log_err_addr("can't bind socket", strerror(errno),
569                                 (struct sockaddr_storage*)addr, addrlen);
570                 }
571 #endif /* EADDRINUSE */
572 #else /* USE_WINSOCK */
573                 if(WSAGetLastError() != WSAEADDRINUSE &&
574                         WSAGetLastError() != WSAEADDRNOTAVAIL &&
575                         !(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
576                         log_err_addr("can't bind socket", 
577                                 wsa_strerror(WSAGetLastError()),
578                                 (struct sockaddr_storage*)addr, addrlen);
579                 }
580 #endif /* USE_WINSOCK */
581                 sock_close(s);
582                 return -1;
583         }
584         if(!fd_set_nonblock(s)) {
585                 *noproto = 0;
586                 *inuse = 0;
587                 sock_close(s);
588                 return -1;
589         }
590         return s;
591 }
592
593 int
594 create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
595         int* reuseport, int transparent, int mss, int nodelay, int freebind,
596         int use_systemd, int dscp)
597 {
598         int s;
599         char* err;
600 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined(SO_BINDANY)
601         int on = 1;
602 #endif
603 #ifdef HAVE_SYSTEMD
604         int got_fd_from_systemd = 0;
605 #endif
606 #ifdef USE_TCP_FASTOPEN
607         int qlen;
608 #endif
609 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
610         (void)transparent;
611 #endif
612 #if !defined(IP_FREEBIND)
613         (void)freebind;
614 #endif
615         verbose_print_addr(addr);
616         *noproto = 0;
617 #ifdef HAVE_SYSTEMD
618         if (!use_systemd ||
619             (use_systemd
620              && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
621                                            addr->ai_addr, addr->ai_addrlen,
622                                            NULL)) == -1)) {
623 #else
624         (void)use_systemd;
625 #endif
626         if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
627 #ifndef USE_WINSOCK
628                 if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
629                         *noproto = 1;
630                         return -1;
631                 }
632 #else
633                 if(WSAGetLastError() == WSAEAFNOSUPPORT ||
634                         WSAGetLastError() == WSAEPROTONOSUPPORT) {
635                         *noproto = 1;
636                         return -1;
637                 }
638 #endif
639                 log_err("can't create socket: %s", sock_strerror(errno));
640                 return -1;
641         }
642         if(nodelay) {
643 #if defined(IPPROTO_TCP) && defined(TCP_NODELAY)
644                 if(setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void*)&on,
645                         (socklen_t)sizeof(on)) < 0) {
646                         #ifndef USE_WINSOCK
647                         log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
648                                 strerror(errno));
649                         #else
650                         log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
651                                 wsa_strerror(WSAGetLastError()));
652                         #endif
653                 }
654 #else
655                 log_warn(" setsockopt(TCP_NODELAY) unsupported");
656 #endif /* defined(IPPROTO_TCP) && defined(TCP_NODELAY) */
657         }
658         if (mss > 0) {
659 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
660                 if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
661                         (socklen_t)sizeof(mss)) < 0) {
662                         log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
663                                 sock_strerror(errno));
664                 } else {
665                         verbose(VERB_ALGO,
666                                 " tcp socket mss set to %d", mss);
667                 }
668 #else
669                 log_warn(" setsockopt(TCP_MAXSEG) unsupported");
670 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
671         }
672 #ifdef HAVE_SYSTEMD
673         } else {
674                 got_fd_from_systemd = 1;
675     }
676 #endif
677 #ifdef SO_REUSEADDR
678         if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on, 
679                 (socklen_t)sizeof(on)) < 0) {
680                 log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
681                         sock_strerror(errno));
682                 sock_close(s);
683                 return -1;
684         }
685 #endif /* SO_REUSEADDR */
686 #ifdef IP_FREEBIND
687         if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
688             (socklen_t)sizeof(on)) < 0) {
689                 log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
690                 strerror(errno));
691         }
692 #endif /* IP_FREEBIND */
693 #ifdef SO_REUSEPORT
694         /* try to set SO_REUSEPORT so that incoming
695          * connections are distributed evenly among the receiving threads.
696          * Each thread must have its own socket bound to the same port,
697          * with SO_REUSEPORT set on each socket.
698          */
699         if (reuseport && *reuseport &&
700                 setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
701                 (socklen_t)sizeof(on)) < 0) {
702 #ifdef ENOPROTOOPT
703                 if(errno != ENOPROTOOPT || verbosity >= 3)
704                         log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
705                                 strerror(errno));
706 #endif
707                 /* this option is not essential, we can continue */
708                 *reuseport = 0;
709         }
710 #else
711         (void)reuseport;
712 #endif /* defined(SO_REUSEPORT) */
713 #if defined(IPV6_V6ONLY)
714         if(addr->ai_family == AF_INET6 && v6only) {
715                 if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, 
716                         (void*)&on, (socklen_t)sizeof(on)) < 0) {
717                         log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
718                                 sock_strerror(errno));
719                         sock_close(s);
720                         return -1;
721                 }
722         }
723 #else
724         (void)v6only;
725 #endif /* IPV6_V6ONLY */
726 #ifdef IP_TRANSPARENT
727         if (transparent &&
728             setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
729             (socklen_t)sizeof(on)) < 0) {
730                 log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
731                         strerror(errno));
732         }
733 #elif defined(IP_BINDANY)
734         if (transparent &&
735             setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
736             (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
737             (void*)&on, (socklen_t)sizeof(on)) < 0) {
738                 log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
739                 (addr->ai_family==AF_INET6?"V6":""), strerror(errno));
740         }
741 #elif defined(SO_BINDANY)
742         if (transparent &&
743             setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
744             sizeof(on)) < 0) {
745                 log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
746                 strerror(errno));
747         }
748 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
749         err = set_ip_dscp(s, addr->ai_family, dscp);
750         if(err != NULL)
751                 log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
752         if(
753 #ifdef HAVE_SYSTEMD
754                 !got_fd_from_systemd &&
755 #endif
756         bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
757 #ifndef USE_WINSOCK
758                 /* detect freebsd jail with no ipv6 permission */
759                 if(addr->ai_family==AF_INET6 && errno==EINVAL)
760                         *noproto = 1;
761                 else {
762                         log_err_addr("can't bind socket", strerror(errno),
763                                 (struct sockaddr_storage*)addr->ai_addr,
764                                 addr->ai_addrlen);
765                 }
766 #else
767                 log_err_addr("can't bind socket", 
768                         wsa_strerror(WSAGetLastError()),
769                         (struct sockaddr_storage*)addr->ai_addr,
770                         addr->ai_addrlen);
771 #endif
772                 sock_close(s);
773                 return -1;
774         }
775         if(!fd_set_nonblock(s)) {
776                 sock_close(s);
777                 return -1;
778         }
779         if(listen(s, TCP_BACKLOG) == -1) {
780                 log_err("can't listen: %s", sock_strerror(errno));
781                 sock_close(s);
782                 return -1;
783         }
784 #ifdef USE_TCP_FASTOPEN
785         /* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
786            against IP spoofing attacks as suggested in RFC7413 */
787 #ifdef __APPLE__
788         /* OS X implementation only supports qlen of 1 via this call. Actual
789            value is configured by the net.inet.tcp.fastopen_backlog kernel parm. */
790         qlen = 1;
791 #else
792         /* 5 is recommended on linux */
793         qlen = 5;
794 #endif
795         if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, 
796                   sizeof(qlen))) == -1 ) {
797 #ifdef ENOPROTOOPT
798                 /* squelch ENOPROTOOPT: freebsd server mode with kernel support
799                    disabled, except when verbosity enabled for debugging */
800                 if(errno != ENOPROTOOPT || verbosity >= 3) {
801 #endif
802                   if(errno == EPERM) {
803                         log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
804                   } else {
805                         log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
806                   }
807 #ifdef ENOPROTOOPT
808                 }
809 #endif
810         }
811 #endif
812         return s;
813 }
814
815 char*
816 set_ip_dscp(int socket, int addrfamily, int dscp)
817 {
818         int ds;
819
820         if(dscp == 0)
821                 return NULL;
822         ds = dscp << 2;
823         switch(addrfamily) {
824         case AF_INET6:
825                 if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS, (void*)&ds, sizeof(ds)) < 0)
826                         return sock_strerror(errno);
827                 break;
828         default:
829                 if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&ds, sizeof(ds)) < 0)
830                         return sock_strerror(errno);
831                 break;
832         }
833         return NULL;
834 }
835
/**
 * Create a listening, non-blocking AF_LOCAL stream socket bound to path.
 * When built with systemd support and use_systemd is set, a socket
 * obtained from systemd_get_activated is returned instead, if there is one.
 * An existing filesystem entry at path is unlinked before binding.
 * @param path: filesystem path of the socket. Must fit in sun_path,
 *      including the terminating zero byte.
 * @param noproto: set to 1 when the build has no local socket support;
 *      not written otherwise.
 * @param use_systemd: if true, try to fetch the socket from systemd first.
 * @return the socket fd, or -1 on failure (the reason is logged).
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYS_UN_H
        int s;
        struct sockaddr_un usock;
#endif
#ifdef HAVE_SYSTEMD
        int ret;

        if (use_systemd && (ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM, 1, NULL, 0, path)) != -1)
                return ret;
#else
        (void)use_systemd;
#endif

#ifdef HAVE_SYS_UN_H
        verbose(VERB_ALGO, "creating unix socket %s", path);
        /* refuse a path that would be truncated: unlink() below acts on
         * the full path, bind() on the copy, and they have to agree */
        if (strlen(path) >= sizeof(usock.sun_path)) {
                log_err("Cannot create local socket %s (path too long)",
                        path);
                return -1;
        }
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
        /* this member exists on BSDs, not Linux */
        usock.sun_len = (unsigned)sizeof(usock);
#endif
        usock.sun_family = AF_LOCAL;
        /* length is 92-108, 104 on FreeBSD */
        (void)strlcpy(usock.sun_path, path, sizeof(usock.sun_path));

        if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
                log_err("Cannot create local socket %s (%s)",
                        path, strerror(errno));
                return -1;
        }

        if (unlink(path) && errno != ENOENT) {
                /* The socket already exists and cannot be removed */
                log_err("Cannot remove old local socket %s (%s)",
                        path, strerror(errno));
                goto err;
        }

        if (bind(s, (struct sockaddr *)&usock,
                (socklen_t)sizeof(struct sockaddr_un)) == -1) {
                log_err("Cannot bind local socket %s (%s)",
                        path, strerror(errno));
                goto err;
        }

        if (!fd_set_nonblock(s)) {
                log_err("Cannot set non-blocking mode");
                goto err;
        }

        if (listen(s, TCP_BACKLOG) == -1) {
                log_err("can't listen: %s", strerror(errno));
                goto err;
        }

        (void)noproto; /*unused*/
        return s;

err:
        sock_close(s);
        return -1;
#else
        (void)path;
        log_err("Local sockets are not supported");
        *noproto = 1;
        return -1;
#endif
}
910
911
/**
 * Look up ifname and port with getaddrinfo and open a socket on the
 * first result.
 * @param stype: SOCK_DGRAM opens a UDP socket, any other value takes the
 *      TCP accept socket path.
 * @param ifname: node name for getaddrinfo; NULL is logged as "default".
 * @param port: service string for getaddrinfo.
 * @param hints: getaddrinfo hints; ai_socktype is overwritten with stype.
 * @param v6only: passed on to the socket creation routine.
 * @param noip6: cleared on entry. Set to 1 when creation failed with the
 *      no-protocol indication while hints->ai_family is AF_INET6 (on
 *      winsock also when an AF_INET6 lookup gives EAI_NONAME).
 * @param rcv: passed on for UDP sockets.
 * @param snd: passed on for UDP sockets.
 * @param reuseport: passed on to the socket creation routine.
 * @param transparent: passed on to the socket creation routine.
 * @param tcp_mss: passed on for TCP sockets.
 * @param nodelay: passed on for TCP sockets.
 * @param freebind: passed on to the socket creation routine.
 * @param use_systemd: passed on to the socket creation routine.
 * @param dscp: passed on to the socket creation routine.
 * @return the socket fd, or -1 on failure.
 */
static int
make_sock(int stype, const char* ifname, const char* port, 
        struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
        int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
        int use_systemd, int dscp)
{
        struct addrinfo *ai = NULL;
        int rc, fd, in_use, no_proto;

        hints->ai_socktype = stype;
        *noip6 = 0;
        rc = getaddrinfo(ifname, port, hints, &ai);
        if(rc != 0 || ai == NULL) {
                const char* detail = "";
#ifdef USE_WINSOCK
                if(rc == EAI_NONAME && hints->ai_family == AF_INET6){
                        *noip6 = 1; /* 'Host not found' for IP6 on winXP */
                        return -1;
                }
#endif
#ifdef EAI_SYSTEM
                /* only then does errno say something about the failure */
                if(rc == EAI_SYSTEM)
                        detail = strerror(errno);
#endif
                log_err("node %s:%s getaddrinfo: %s %s", 
                        ifname?ifname:"default", port, gai_strerror(rc),
                        detail);
                return -1;
        }

        if(stype == SOCK_DGRAM) {
                verbose_print_addr(ai);
                fd = create_udp_sock(ai->ai_family, ai->ai_socktype,
                        (struct sockaddr*)ai->ai_addr, ai->ai_addrlen,
                        v6only, &in_use, &no_proto, (int)rcv, (int)snd, 1,
                        reuseport, transparent, freebind, use_systemd, dscp);
        } else {
                /* the TCP path has no in-use indication */
                in_use = 0;
                fd = create_tcp_accept_sock(ai, v6only, &no_proto, reuseport,
                        transparent, tcp_mss, nodelay, freebind, use_systemd,
                        dscp);
        }
        if(fd == -1) {
                if(in_use)
                        log_err("bind: address already in use");
                else if(no_proto && hints->ai_family == AF_INET6)
                        *noip6 = 1;
        }
        freeaddrinfo(ai);
        return fd;
}
964
/**
 * Open a socket for an interface spec that may carry its own port.
 * A spec of the form "address@port" overrides the port argument; a spec
 * without '@' is handed to make_sock unchanged. The remaining arguments
 * and the return value are those of make_sock. When the address part or
 * the port part does not fit its local buffer, the error is logged,
 * *noip6 is cleared and -1 is returned.
 */
static int
make_sock_port(int stype, const char* ifname, const char* port, 
        struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
        int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
        int use_systemd, int dscp)
{
        char portbuf[16];
        char ifbuf[128];
        size_t iflen, portlen;
        const char* at = strchr(ifname, '@');

        if(at == NULL) {
                /* plain interface, use the port as given */
                return make_sock(stype, ifname, port, hints, v6only, noip6,
                        rcv, snd, reuseport, transparent, tcp_mss, nodelay,
                        freebind, use_systemd, dscp);
        }

        /* split ifspec@port into its two halves */
        iflen = (size_t)(at - ifname);
        if(iflen >= sizeof(ifbuf)) {
                log_err("ifname too long: %s", ifname);
                *noip6 = 0;
                return -1;
        }
        portlen = strlen(at+1);
        if(portlen >= sizeof(portbuf)) {
                log_err("portnumber too long: %s", ifname);
                *noip6 = 0;
                return -1;
        }
        (void)strlcpy(ifbuf, ifname, sizeof(ifbuf));
        ifbuf[iflen] = 0;
        (void)strlcpy(portbuf, at+1, sizeof(portbuf));
        portbuf[portlen] = 0;
        return make_sock(stype, ifbuf, portbuf, hints, v6only, noip6, rcv,
                snd, reuseport, transparent, tcp_mss, nodelay, freebind,
                use_systemd, dscp);
}
999
1000 /**
1001  * Add port to open ports list.
1002  * @param list: list head. changed.
1003  * @param s: fd.
1004  * @param ftype: if fd is UDP.
1005  * @return false on failure. list in unchanged then.
1006  */
1007 static int
1008 port_insert(struct listen_port** list, int s, enum listen_type ftype)
1009 {
1010         struct listen_port* item = (struct listen_port*)malloc(
1011                 sizeof(struct listen_port));
1012         if(!item)
1013                 return 0;
1014         item->next = *list;
1015         item->fd = s;
1016         item->ftype = ftype;
1017         *list = item;
1018         return 1;
1019 }
1020
/**
 * Ask for per-packet address information on a socket.
 * Which option is used is picked at compile time: IPV6_RECVPKTINFO, else
 * IPV6_PKTINFO for AF_INET6; IP_PKTINFO, else IP_RECVDSTADDR (when
 * IP_SENDSRCADDR exists too) for AF_INET.
 * @param s: socket fd.
 * @param family: AF_INET6 or AF_INET; for any other value nothing is set.
 * @return 0 when setsockopt fails or no usable option is compiled in for
 *      the family (an error is logged), 1 otherwise.
 */
static int
set_recvpktinfo(int s, int family) 
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
        int on = 1;
#else
        (void)s;
#endif
        switch(family) {
        case AF_INET6:
#if defined(IPV6_RECVPKTINFO)
                if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
                        (void*)&on, (socklen_t)sizeof(on)) < 0) {
                        log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
                                strerror(errno));
                        return 0;
                }
                break;
#elif defined(IPV6_PKTINFO)
                if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
                        (void*)&on, (socklen_t)sizeof(on)) < 0) {
                        log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
                                strerror(errno));
                        return 0;
                }
                break;
#else
                log_err("no IPV6_RECVPKTINFO and no IPV6_PKTINFO option, please "
                        "disable interface-automatic or do-ip6 in config");
                return 0;
#endif /* IPV6_RECVPKTINFO || IPV6_PKTINFO */

        case AF_INET:
#if defined(IP_PKTINFO)
                if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
                        (void*)&on, (socklen_t)sizeof(on)) < 0) {
                        log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
                                strerror(errno));
                        return 0;
                }
                break;
#elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
                if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
                        (void*)&on, (socklen_t)sizeof(on)) < 0) {
                        log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
                                strerror(errno));
                        return 0;
                }
                break;
#else
                log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
                        "interface-automatic or do-ip4 in config");
                return 0;
#endif /* IP_PKTINFO || IP_RECVDSTADDR */

        default:
                /* other families: nothing to set */
                break;
        }
        return 1;
}
1075
1076 /** see if interface is ssl, its port number == the ssl port number */
1077 static int
1078 if_is_ssl(const char* ifname, const char* port, int ssl_port,
1079         struct config_strlist* tls_additional_port)
1080 {
1081         struct config_strlist* s;
1082         char* p = strchr(ifname, '@');
1083         if(!p && atoi(port) == ssl_port)
1084                 return 1;
1085         if(p && atoi(p+1) == ssl_port)
1086                 return 1;
1087         for(s = tls_additional_port; s; s = s->next) {
1088                 if(p && atoi(p+1) == atoi(s->str))
1089                         return 1;
1090                 if(!p && atoi(port) == atoi(s->str))
1091                         return 1;
1092         }
1093         return 0;
1094 }
1095
/**
 * See if an interface is an https interface: the port it listens on (the
 * part after '@' in ifname when present, otherwise port) equals
 * https_port.
 * @return 1 if it is, 0 if not.
 */
static int
if_is_https(const char* ifname, const char* port, int https_port)
{
        const char* at = strchr(ifname, '@');
        /* a port in the interface spec overrides the default port */
        const char* portstr = at ? at+1 : port;

        return (atoi(portstr) == https_port) ? 1 : 0;
}
1107
1108 /**
1109  * Helper for ports_open. Creates one interface (or NULL for default).
1110  * @param ifname: The interface ip address.
1111  * @param do_auto: use automatic interface detection.
1112  *      If enabled, then ifname must be the wildcard name.
1113  * @param do_udp: if udp should be used.
1114  * @param do_tcp: if udp should be used.
1115  * @param hints: for getaddrinfo. family and flags have to be set by caller.
1116  * @param port: Port number to use (as string).
1117  * @param list: list of open ports, appended to, changed to point to list head.
1118  * @param rcv: receive buffer size for UDP
1119  * @param snd: send buffer size for UDP
1120  * @param ssl_port: ssl service port number
1121  * @param tls_additional_port: list of additional ssl service port numbers.
1122  * @param https_port: DoH service port number
1123  * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1124  *      set to false on exit if reuseport failed due to no kernel support.
1125  * @param transparent: set IP_TRANSPARENT socket option.
1126  * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1127  * @param freebind: set IP_FREEBIND socket option.
1128  * @param http2_nodelay: set TCP_NODELAY on HTTP/2 connection
1129  * @param use_systemd: if true, fetch sockets from systemd.
1130  * @param dnscrypt_port: dnscrypt service port number
1131  * @param dscp: DSCP to use.
1132  * @return: returns false on error.
1133  */
1134 static int
1135 ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp, 
1136         struct addrinfo *hints, const char* port, struct listen_port** list,
1137         size_t rcv, size_t snd, int ssl_port,
1138         struct config_strlist* tls_additional_port, int https_port,
1139         int* reuseport, int transparent, int tcp_mss, int freebind,
1140         int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp)
1141 {
1142         int s, noip6=0;
1143         int is_https = if_is_https(ifname, port, https_port);
1144         int nodelay = is_https && http2_nodelay;
1145 #ifdef USE_DNSCRYPT
1146         int is_dnscrypt = ((strchr(ifname, '@') && 
1147                         atoi(strchr(ifname, '@')+1) == dnscrypt_port) ||
1148                         (!strchr(ifname, '@') && atoi(port) == dnscrypt_port));
1149 #else
1150         int is_dnscrypt = 0;
1151         (void)dnscrypt_port;
1152 #endif
1153
1154         if(!do_udp && !do_tcp)
1155                 return 0;
1156         if(do_auto) {
1157                 if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1, 
1158                         &noip6, rcv, snd, reuseport, transparent,
1159                         tcp_mss, nodelay, freebind, use_systemd, dscp)) == -1) {
1160                         if(noip6) {
1161                                 log_warn("IPv6 protocol not available");
1162                                 return 1;
1163                         }
1164                         return 0;
1165                 }
1166                 /* getting source addr packet info is highly non-portable */
1167                 if(!set_recvpktinfo(s, hints->ai_family)) {
1168                         sock_close(s);
1169                         return 0;
1170                 }
1171                 if(!port_insert(list, s,
1172                    is_dnscrypt?listen_type_udpancil_dnscrypt:listen_type_udpancil)) {
1173                         sock_close(s);
1174                         return 0;
1175                 }
1176         } else if(do_udp) {
1177                 /* regular udp socket */
1178                 if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1, 
1179                         &noip6, rcv, snd, reuseport, transparent,
1180                         tcp_mss, nodelay, freebind, use_systemd, dscp)) == -1) {
1181                         if(noip6) {
1182                                 log_warn("IPv6 protocol not available");
1183                                 return 1;
1184                         }
1185                         return 0;
1186                 }
1187                 if(!port_insert(list, s,
1188                    is_dnscrypt?listen_type_udp_dnscrypt:listen_type_udp)) {
1189                         sock_close(s);
1190                         return 0;
1191                 }
1192         }
1193         if(do_tcp) {
1194                 int is_ssl = if_is_ssl(ifname, port, ssl_port,
1195                         tls_additional_port);
1196                 enum listen_type port_type;
1197                 if(is_ssl)
1198                         port_type = listen_type_ssl;
1199                 else if(is_https)
1200                         port_type = listen_type_http;
1201                 else if(is_dnscrypt)
1202                         port_type = listen_type_tcp_dnscrypt;
1203                 else
1204                         port_type = listen_type_tcp;
1205                 if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1, 
1206                         &noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay,
1207                         freebind, use_systemd, dscp)) == -1) {
1208                         if(noip6) {
1209                                 /*log_warn("IPv6 protocol not available");*/
1210                                 return 1;
1211                         }
1212                         return 0;
1213                 }
1214                 if(is_ssl)
1215                         verbose(VERB_ALGO, "setup TCP for SSL service");
1216                 if(!port_insert(list, s, port_type)) {
1217                         sock_close(s);
1218                         return 0;
1219                 }
1220         }
1221         return 1;
1222 }
1223
1224 /** 
1225  * Add items to commpoint list in front.
1226  * @param c: commpoint to add.
1227  * @param front: listen struct.
1228  * @return: false on failure.
1229  */
1230 static int
1231 listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1232 {
1233         struct listen_list* item = (struct listen_list*)malloc(
1234                 sizeof(struct listen_list));
1235         if(!item)
1236                 return 0;
1237         item->com = c;
1238         item->next = front->cps;
1239         front->cps = item;
1240         return 1;
1241 }
1242
1243 struct listen_dnsport* 
1244 listen_create(struct comm_base* base, struct listen_port* ports,
1245         size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
1246         int harden_large_queries, uint32_t http_max_streams,
1247         char* http_endpoint, struct tcl_list* tcp_conn_limit, void* sslctx,
1248         struct dt_env* dtenv, comm_point_callback_type* cb, void *cb_arg)
1249 {
1250         struct listen_dnsport* front = (struct listen_dnsport*)
1251                 malloc(sizeof(struct listen_dnsport));
1252         if(!front)
1253                 return NULL;
1254         front->cps = NULL;
1255         front->udp_buff = sldns_buffer_new(bufsize);
1256 #ifdef USE_DNSCRYPT
1257         front->dnscrypt_udp_buff = NULL;
1258 #endif
1259         if(!front->udp_buff) {
1260                 free(front);
1261                 return NULL;
1262         }
1263         if(!stream_wait_lock_inited) {
1264                 lock_basic_init(&stream_wait_count_lock);
1265                 stream_wait_lock_inited = 1;
1266         }
1267         if(!http2_query_buffer_lock_inited) {
1268                 lock_basic_init(&http2_query_buffer_count_lock);
1269                 http2_query_buffer_lock_inited = 1;
1270         }
1271         if(!http2_response_buffer_lock_inited) {
1272                 lock_basic_init(&http2_response_buffer_count_lock);
1273                 http2_response_buffer_lock_inited = 1;
1274         }
1275
1276         /* create comm points as needed */
1277         while(ports) {
1278                 struct comm_point* cp = NULL;
1279                 if(ports->ftype == listen_type_udp ||
1280                    ports->ftype == listen_type_udp_dnscrypt)
1281                         cp = comm_point_create_udp(base, ports->fd, 
1282                                 front->udp_buff, cb, cb_arg);
1283                 else if(ports->ftype == listen_type_tcp ||
1284                                 ports->ftype == listen_type_tcp_dnscrypt)
1285                         cp = comm_point_create_tcp(base, ports->fd, 
1286                                 tcp_accept_count, tcp_idle_timeout,
1287                                 harden_large_queries, 0, NULL,
1288                                 tcp_conn_limit, bufsize, front->udp_buff,
1289                                 ports->ftype, cb, cb_arg);
1290                 else if(ports->ftype == listen_type_ssl ||
1291                         ports->ftype == listen_type_http) {
1292                         cp = comm_point_create_tcp(base, ports->fd, 
1293                                 tcp_accept_count, tcp_idle_timeout,
1294                                 harden_large_queries,
1295                                 http_max_streams, http_endpoint,
1296                                 tcp_conn_limit, bufsize, front->udp_buff,
1297                                 ports->ftype, cb, cb_arg);
1298                         cp->ssl = sslctx;
1299                         if(ports->ftype == listen_type_http) {
1300                                 if(!sslctx) {
1301                                 log_warn("HTTPS port configured, but no TLS "
1302                                         "tls-service-key or tls-service-pem "
1303                                         "set");
1304                                 }
1305 #ifndef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
1306                                 log_warn("Unbound is not compiled with an "
1307                                         "OpenSSL version supporting ALPN "
1308                                         " (OpenSSL >= 1.0.2). This is required "
1309                                         "to use DNS-over-HTTPS");
1310 #endif
1311 #ifndef HAVE_NGHTTP2_NGHTTP2_H
1312                                 log_warn("Unbound is not compiled with "
1313                                         "nghttp2. This is required to use "
1314                                         "DNS-over-HTTPS.");
1315 #endif
1316                         }
1317                 } else if(ports->ftype == listen_type_udpancil ||
1318                                   ports->ftype == listen_type_udpancil_dnscrypt)
1319                         cp = comm_point_create_udp_ancil(base, ports->fd, 
1320                                 front->udp_buff, cb, cb_arg);
1321                 if(!cp) {
1322                         log_err("can't create commpoint");      
1323                         listen_delete(front);
1324                         return NULL;
1325                 }
1326                 cp->dtenv = dtenv;
1327                 cp->do_not_close = 1;
1328 #ifdef USE_DNSCRYPT
1329                 if (ports->ftype == listen_type_udp_dnscrypt ||
1330                         ports->ftype == listen_type_tcp_dnscrypt ||
1331                         ports->ftype == listen_type_udpancil_dnscrypt) {
1332                         cp->dnscrypt = 1;
1333                         cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
1334                         if(!cp->dnscrypt_buffer) {
1335                                 log_err("can't alloc dnscrypt_buffer");
1336                                 comm_point_delete(cp);
1337                                 listen_delete(front);
1338                                 return NULL;
1339                         }
1340                         front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
1341                 }
1342 #endif
1343                 if(!listen_cp_insert(cp, front)) {
1344                         log_err("malloc failed");
1345                         comm_point_delete(cp);
1346                         listen_delete(front);
1347                         return NULL;
1348                 }
1349                 ports = ports->next;
1350         }
1351         if(!front->cps) {
1352                 log_err("Could not open sockets to accept queries.");
1353                 listen_delete(front);
1354                 return NULL;
1355         }
1356
1357         return front;
1358 }
1359
1360 void
1361 listen_list_delete(struct listen_list* list)
1362 {
1363         struct listen_list *p = list, *pn;
1364         while(p) {
1365                 pn = p->next;
1366                 comm_point_delete(p->com);
1367                 free(p);
1368                 p = pn;
1369         }
1370 }
1371
1372 void 
1373 listen_delete(struct listen_dnsport* front)
1374 {
1375         if(!front) 
1376                 return;
1377         listen_list_delete(front->cps);
1378 #ifdef USE_DNSCRYPT
1379         if(front->dnscrypt_udp_buff &&
1380                 front->udp_buff != front->dnscrypt_udp_buff) {
1381                 sldns_buffer_free(front->dnscrypt_udp_buff);
1382         }
1383 #endif
1384         sldns_buffer_free(front->udp_buff);
1385         free(front);
1386         if(stream_wait_lock_inited) {
1387                 stream_wait_lock_inited = 0;
1388                 lock_basic_destroy(&stream_wait_count_lock);
1389         }
1390         if(http2_query_buffer_lock_inited) {
1391                 http2_query_buffer_lock_inited = 0;
1392                 lock_basic_destroy(&http2_query_buffer_count_lock);
1393         }
1394         if(http2_response_buffer_lock_inited) {
1395                 http2_response_buffer_lock_inited = 0;
1396                 lock_basic_destroy(&http2_response_buffer_count_lock);
1397         }
1398 }
1399
#ifdef HAVE_GETIFADDRS
/**
 * Append a copy of str to a growing array of strings.
 * @param ip_addresses: the array; replaced by the grown array on success.
 * @param ip_addresses_size: number of entries; incremented on success.
 * @param str: string to copy into the new last entry.
 * @return 0 when out of memory, 1 on success. On failure the array and
 *      its size are left as they were, so the entries collected so far
 *      can still be freed by the caller.
 */
static int
ifa_append_address(char ***ip_addresses, int *ip_addresses_size,
        const char *str)
{
        char **grown;
        char *copy = strdup(str);
        if(!copy) {
                log_err("strdup failed: out of memory");
                return 0;
        }
        /* realloc into a temporary: on failure the old block is still
         * valid and must not be lost */
        grown = realloc(*ip_addresses,
                sizeof(char *) * ((size_t)*ip_addresses_size + 1));
        if(!grown) {
                log_err("realloc failed: out of memory");
                free(copy);
                return 0;
        }
        grown[*ip_addresses_size] = copy;
        *ip_addresses = grown;
        (*ip_addresses_size)++;
        return 1;
}

/**
 * Resolve an interface name to the addresses configured on it.
 * Every AF_INET (and with INET6 also AF_INET6) address of the interface
 * called search_ifa is appended as a string; an "@port" suffix on
 * search_ifa is carried over to each address. IPv6 addresses get a
 * "%name" suffix when if_indextoname yields a name for their scope id.
 * If no address was added, search_ifa itself is appended unchanged.
 * @param ifas: list of interface addresses, as from getifaddrs.
 * @param search_ifa: interface name to look for, optionally with "@port".
 * @param ip_addresses: array of strings that is appended to.
 * @param ip_addresses_size: number of entries in the array, updated.
 * @return 0 on failure (logged), 1 on success.
 */
static int
resolve_ifa_name(struct ifaddrs *ifas, const char *search_ifa, char ***ip_addresses, int *ip_addresses_size)
{
        struct ifaddrs *ifa;
        int last_ip_addresses_size = *ip_addresses_size;

        for(ifa = ifas; ifa != NULL; ifa = ifa->ifa_next) {
                sa_family_t family;
                const char* atsign;
#ifdef INET6      /* |   address ip    | % |  ifa name  | @ |  port  | nul */
                char addr_buf[INET6_ADDRSTRLEN + 1 + IF_NAMESIZE + 1 + 16 + 1];
#else
                char addr_buf[INET_ADDRSTRLEN + 1 + 16 + 1];
#endif

                /* compare only the part before the last '@' to the name */
                if((atsign=strrchr(search_ifa, '@')) != NULL) {
                        if(strlen(ifa->ifa_name) != (size_t)(atsign-search_ifa)
                           || strncmp(ifa->ifa_name, search_ifa,
                           atsign-search_ifa) != 0)
                                continue;
                } else {
                        if(strcmp(ifa->ifa_name, search_ifa) != 0)
                                continue;
                        /* no port suffix to carry over */
                        atsign = "";
                }

                if(ifa->ifa_addr == NULL)
                        continue;

                family = ifa->ifa_addr->sa_family;
                if(family == AF_INET) {
                        char a4[INET_ADDRSTRLEN + 1];
                        struct sockaddr_in *in4 = (struct sockaddr_in *)
                                ifa->ifa_addr;
                        if(!inet_ntop(family, &in4->sin_addr, a4, sizeof(a4))) {
                                log_err("inet_ntop failed");
                                return 0;
                        }
                        snprintf(addr_buf, sizeof(addr_buf), "%s%s",
                                a4, atsign);
                }
#ifdef INET6
                else if(family == AF_INET6) {
                        struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)
                                ifa->ifa_addr;
                        char a6[INET6_ADDRSTRLEN + 1];
                        char if_index_name[IF_NAMESIZE + 1];
                        if_index_name[0] = 0;
                        if(!inet_ntop(family, &in6->sin6_addr, a6, sizeof(a6))) {
                                log_err("inet_ntop failed");
                                return 0;
                        }
                        /* result unchecked: the name is simply left out
                         * when the buffer is still empty afterwards */
                        if_indextoname(in6->sin6_scope_id,
                                (char *)if_index_name);
                        if (strlen(if_index_name) != 0) {
                                snprintf(addr_buf, sizeof(addr_buf),
                                        "%s%%%s%s", a6, if_index_name, atsign);
                        } else {
                                snprintf(addr_buf, sizeof(addr_buf), "%s%s",
                                        a6, atsign);
                        }
                }
#endif
                else {
                        continue;
                }
                verbose(4, "interface %s has address %s", search_ifa, addr_buf);

                if(!ifa_append_address(ip_addresses, ip_addresses_size,
                        addr_buf))
                        return 0;
        }

        /* nothing matched: hand the name back as it was given */
        if (*ip_addresses_size == last_ip_addresses_size) {
                if(!ifa_append_address(ip_addresses, ip_addresses_size,
                        search_ifa))
                        return 0;
        }
        return 1;
}
#endif /* HAVE_GETIFADDRS */
1498
1499 int resolve_interface_names(struct config_file* cfg, char*** resif,
1500         int* num_resif)
1501 {
1502 #ifdef HAVE_GETIFADDRS
1503         int i;
1504         struct ifaddrs *addrs = NULL;
1505         if(cfg->num_ifs == 0) {
1506                 *resif = NULL;
1507                 *num_resif = 0;
1508                 return 1;
1509         }
1510         if(getifaddrs(&addrs) == -1) {
1511                 log_err("failed to list interfaces: getifaddrs: %s",
1512                         strerror(errno));
1513                 freeifaddrs(addrs);
1514                 return 0;
1515         }
1516         for(i=0; i<cfg->num_ifs; i++) {
1517                 if(!resolve_ifa_name(addrs, cfg->ifs[i], resif, num_resif)) {
1518                         freeifaddrs(addrs);
1519                         config_del_strarray(*resif, *num_resif);
1520                         *resif = NULL;
1521                         *num_resif = 0;
1522                         return 0;
1523                 }
1524         }
1525         freeifaddrs(addrs);
1526         return 1;
1527 #else
1528         int i;
1529         if(cfg->num_ifs == 0) {
1530                 *resif = NULL;
1531                 *num_resif = 0;
1532                 return 1;
1533         }
1534         *num_resif = cfg->num_ifs;
1535         *resif = calloc(*num_resif, sizeof(**resif));
1536         if(!*resif) {
1537                 log_err("out of memory");
1538                 return 0;
1539         }
1540         for(i=0; i<*num_resif; i++) {
1541                 (*resif)[i] = strdup(cfg->ifs[i]);
1542                 if(!((*resif)[i])) {
1543                         log_err("out of memory");
1544                         config_del_strarray(*resif, *num_resif);
1545                         *resif = NULL;
1546                         *num_resif = 0;
1547                         return 0;
1548                 }
1549         }
1550         return 1;
1551 #endif /* HAVE_GETIFADDRS */
1552 }
1553
1554 struct listen_port* 
1555 listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs,
1556         int* reuseport)
1557 {
1558         struct listen_port* list = NULL;
1559         struct addrinfo hints;
1560         int i, do_ip4, do_ip6;
1561         int do_tcp, do_auto;
1562         char portbuf[32];
1563         snprintf(portbuf, sizeof(portbuf), "%d", cfg->port);
1564         do_ip4 = cfg->do_ip4;
1565         do_ip6 = cfg->do_ip6;
1566         do_tcp = cfg->do_tcp;
1567         do_auto = cfg->if_automatic && cfg->do_udp;
1568         if(cfg->incoming_num_tcp == 0)
1569                 do_tcp = 0;
1570
1571         /* getaddrinfo */
1572         memset(&hints, 0, sizeof(hints));
1573         hints.ai_flags = AI_PASSIVE;
1574         /* no name lookups on our listening ports */
1575         if(num_ifs > 0)
1576                 hints.ai_flags |= AI_NUMERICHOST;
1577         hints.ai_family = AF_UNSPEC;
1578 #ifndef INET6
1579         do_ip6 = 0;
1580 #endif
1581         if(!do_ip4 && !do_ip6) {
1582                 return NULL;
1583         }
1584         /* create ip4 and ip6 ports so that return addresses are nice. */
1585         if(do_auto || num_ifs == 0) {
1586                 if(do_ip6) {
1587                         hints.ai_family = AF_INET6;
1588                         if(!ports_create_if(do_auto?"::0":"::1", 
1589                                 do_auto, cfg->do_udp, do_tcp, 
1590                                 &hints, portbuf, &list,
1591                                 cfg->so_rcvbuf, cfg->so_sndbuf,
1592                                 cfg->ssl_port, cfg->tls_additional_port,
1593                                 cfg->https_port, reuseport, cfg->ip_transparent,
1594                                 cfg->tcp_mss, cfg->ip_freebind,
1595                                 cfg->http_nodelay, cfg->use_systemd,
1596                                 cfg->dnscrypt_port, cfg->ip_dscp)) {
1597                                 listening_ports_free(list);
1598                                 return NULL;
1599                         }
1600                 }
1601                 if(do_ip4) {
1602                         hints.ai_family = AF_INET;
1603                         if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1", 
1604                                 do_auto, cfg->do_udp, do_tcp, 
1605                                 &hints, portbuf, &list,
1606                                 cfg->so_rcvbuf, cfg->so_sndbuf,
1607                                 cfg->ssl_port, cfg->tls_additional_port,
1608                                 cfg->https_port, reuseport, cfg->ip_transparent,
1609                                 cfg->tcp_mss, cfg->ip_freebind,
1610                                 cfg->http_nodelay, cfg->use_systemd,
1611                                 cfg->dnscrypt_port, cfg->ip_dscp)) {
1612                                 listening_ports_free(list);
1613                                 return NULL;
1614                         }
1615                 }
1616         } else for(i = 0; i<num_ifs; i++) {
1617                 if(str_is_ip6(ifs[i])) {
1618                         if(!do_ip6)
1619                                 continue;
1620                         hints.ai_family = AF_INET6;
1621                         if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1622                                 do_tcp, &hints, portbuf, &list, 
1623                                 cfg->so_rcvbuf, cfg->so_sndbuf,
1624                                 cfg->ssl_port, cfg->tls_additional_port,
1625                                 cfg->https_port, reuseport, cfg->ip_transparent,
1626                                 cfg->tcp_mss, cfg->ip_freebind,
1627                                 cfg->http_nodelay, cfg->use_systemd,
1628                                 cfg->dnscrypt_port, cfg->ip_dscp)) {
1629                                 listening_ports_free(list);
1630                                 return NULL;
1631                         }
1632                 } else {
1633                         if(!do_ip4)
1634                                 continue;
1635                         hints.ai_family = AF_INET;
1636                         if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1637                                 do_tcp, &hints, portbuf, &list, 
1638                                 cfg->so_rcvbuf, cfg->so_sndbuf,
1639                                 cfg->ssl_port, cfg->tls_additional_port,
1640                                 cfg->https_port, reuseport, cfg->ip_transparent,
1641                                 cfg->tcp_mss, cfg->ip_freebind,
1642                                 cfg->http_nodelay, cfg->use_systemd,
1643                                 cfg->dnscrypt_port, cfg->ip_dscp)) {
1644                                 listening_ports_free(list);
1645                                 return NULL;
1646                         }
1647                 }
1648         }
1649         return list;
1650 }
1651
1652 void listening_ports_free(struct listen_port* list)
1653 {
1654         struct listen_port* nx;
1655         while(list) {
1656                 nx = list->next;
1657                 if(list->fd != -1) {
1658                         sock_close(list->fd);
1659                 }
1660                 free(list);
1661                 list = nx;
1662         }
1663 }
1664
1665 size_t listen_get_mem(struct listen_dnsport* listen)
1666 {
1667         struct listen_list* p;
1668         size_t s = sizeof(*listen) + sizeof(*listen->base) + 
1669                 sizeof(*listen->udp_buff) + 
1670                 sldns_buffer_capacity(listen->udp_buff);
1671 #ifdef USE_DNSCRYPT
1672         s += sizeof(*listen->dnscrypt_udp_buff);
1673         if(listen->udp_buff != listen->dnscrypt_udp_buff){
1674                 s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
1675         }
1676 #endif
1677         for(p = listen->cps; p; p = p->next) {
1678                 s += sizeof(*p);
1679                 s += comm_point_get_mem(p->com);
1680         }
1681         return s;
1682 }
1683
1684 void listen_stop_accept(struct listen_dnsport* listen)
1685 {
1686         /* do not stop the ones that have no tcp_free list
1687          * (they have already stopped listening) */
1688         struct listen_list* p;
1689         for(p=listen->cps; p; p=p->next) {
1690                 if(p->com->type == comm_tcp_accept &&
1691                         p->com->tcp_free != NULL) {
1692                         comm_point_stop_listening(p->com);
1693                 }
1694         }
1695 }
1696
1697 void listen_start_accept(struct listen_dnsport* listen)
1698 {
1699         /* do not start the ones that have no tcp_free list, it is no
1700          * use to listen to them because they have no free tcp handlers */
1701         struct listen_list* p;
1702         for(p=listen->cps; p; p=p->next) {
1703                 if(p->com->type == comm_tcp_accept &&
1704                         p->com->tcp_free != NULL) {
1705                         comm_point_start_listening(p->com, -1, -1);
1706                 }
1707         }
1708 }
1709
1710 struct tcp_req_info*
1711 tcp_req_info_create(struct sldns_buffer* spoolbuf)
1712 {
1713         struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
1714         if(!req) {
1715                 log_err("malloc failure for new stream outoforder processing structure");
1716                 return NULL;
1717         }
1718         memset(req, 0, sizeof(*req));
1719         req->spool_buffer = spoolbuf;
1720         return req;
1721 }
1722
/**
 * Clear and free a tcp_req_info.
 * @param req: structure to delete, NULL is ignored.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req) {
		tcp_req_info_clear(req);
		/* cp is pointer back to commpoint that owns this struct
		 * and called delete on us */
		/* spool_buffer is shared udp buffer, not deleted here */
		free(req);
	}
}
1733
1734 void tcp_req_info_clear(struct tcp_req_info* req)
1735 {
1736         struct tcp_req_open_item* open, *nopen;
1737         struct tcp_req_done_item* item, *nitem;
1738         if(!req) return;
1739
1740         /* free outstanding request mesh reply entries */
1741         open = req->open_req_list;
1742         while(open) {
1743                 nopen = open->next;
1744                 mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
1745                 free(open);
1746                 open = nopen;
1747         }
1748         req->open_req_list = NULL;
1749         req->num_open_req = 0;
1750         
1751         /* free pending writable result packets */
1752         item = req->done_req_list;
1753         while(item) {
1754                 nitem = item->next;
1755                 lock_basic_lock(&stream_wait_count_lock);
1756                 stream_wait_count -= (sizeof(struct tcp_req_done_item)
1757                         +item->len);
1758                 lock_basic_unlock(&stream_wait_count_lock);
1759                 free(item->buf);
1760                 free(item);
1761                 item = nitem;
1762         }
1763         req->done_req_list = NULL;
1764         req->num_done_req = 0;
1765         req->read_is_closed = 0;
1766 }
1767
1768 void
1769 tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
1770 {
1771         struct tcp_req_open_item* open, *prev = NULL;
1772         if(!req || !m) return;
1773         open = req->open_req_list;
1774         while(open) {
1775                 if(open->mesh_state == m) {
1776                         struct tcp_req_open_item* next;
1777                         if(prev) prev->next = open->next;
1778                         else req->open_req_list = open->next;
1779                         /* caller has to manage the mesh state reply entry */
1780                         next = open->next;
1781                         free(open);
1782                         req->num_open_req --;
1783
1784                         /* prev = prev; */
1785                         open = next;
1786                         continue;
1787                 }
1788                 prev = open;
1789                 open = open->next;
1790         }
1791 }
1792
1793 /** setup listening for read or write */
1794 static void
1795 tcp_req_info_setup_listen(struct tcp_req_info* req)
1796 {
1797         int wr = 0;
1798         int rd = 0;
1799
1800         if(req->cp->tcp_byte_count != 0) {
1801                 /* cannot change, halfway through */
1802                 return;
1803         }
1804
1805         if(!req->cp->tcp_is_reading)
1806                 wr = 1;
1807         if(req->num_open_req + req->num_done_req < TCP_MAX_REQ_SIMULTANEOUS &&
1808                 !req->read_is_closed)
1809                 rd = 1;
1810         
1811         if(wr) {
1812                 req->cp->tcp_is_reading = 0;
1813                 comm_point_stop_listening(req->cp);
1814                 comm_point_start_listening(req->cp, -1,
1815                         req->cp->tcp_timeout_msec);
1816         } else if(rd) {
1817                 req->cp->tcp_is_reading = 1;
1818                 comm_point_stop_listening(req->cp);
1819                 comm_point_start_listening(req->cp, -1,
1820                         req->cp->tcp_timeout_msec);
1821                 /* and also read it (from SSL stack buffers), so
1822                  * no event read event is expected since the remainder of
1823                  * the TLS frame is sitting in the buffers. */
1824                 req->read_again = 1;
1825         } else {
1826                 comm_point_stop_listening(req->cp);
1827                 comm_point_start_listening(req->cp, -1,
1828                         req->cp->tcp_timeout_msec);
1829                 comm_point_listen_for_rw(req->cp, 0, 0);
1830         }
1831 }
1832
1833 /** remove first item from list of pending results */
1834 static struct tcp_req_done_item*
1835 tcp_req_info_pop_done(struct tcp_req_info* req)
1836 {
1837         struct tcp_req_done_item* item;
1838         log_assert(req->num_done_req > 0 && req->done_req_list);
1839         item = req->done_req_list;
1840         lock_basic_lock(&stream_wait_count_lock);
1841         stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
1842         lock_basic_unlock(&stream_wait_count_lock);
1843         req->done_req_list = req->done_req_list->next;
1844         req->num_done_req --;
1845         return item;
1846 }
1847
1848 /** Send given buffer and setup to write */
1849 static void
1850 tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
1851         size_t len)
1852 {
1853         sldns_buffer_clear(req->cp->buffer);
1854         sldns_buffer_write(req->cp->buffer, buf, len);
1855         sldns_buffer_flip(req->cp->buffer);
1856
1857         req->cp->tcp_is_reading = 0; /* we are now writing */
1858 }
1859
1860 /** pick up the next result and start writing it to the channel */
1861 static void
1862 tcp_req_pickup_next_result(struct tcp_req_info* req)
1863 {
1864         if(req->num_done_req > 0) {
1865                 /* unlist the done item from the list of pending results */
1866                 struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
1867                 tcp_req_info_start_write_buf(req, item->buf, item->len);
1868                 free(item->buf);
1869                 free(item);
1870         }
1871 }
1872
1873 /** the read channel has closed */
1874 int
1875 tcp_req_info_handle_read_close(struct tcp_req_info* req)
1876 {
1877         verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
1878         /* reset byte count for (potential) partial read */
1879         req->cp->tcp_byte_count = 0;
1880         /* if we still have results to write, pick up next and write it */
1881         if(req->num_done_req != 0) {
1882                 tcp_req_pickup_next_result(req);
1883                 tcp_req_info_setup_listen(req);
1884                 return 1;
1885         }
1886         /* if nothing to do, this closes the connection */
1887         if(req->num_open_req == 0 && req->num_done_req == 0)
1888                 return 0;
1889         /* otherwise, we must be waiting for dns resolve, wait with timeout */
1890         req->read_is_closed = 1;
1891         tcp_req_info_setup_listen(req);
1892         return 1;
1893 }
1894
1895 void
1896 tcp_req_info_handle_writedone(struct tcp_req_info* req)
1897 {
1898         /* back to reading state, we finished this write event */
1899         sldns_buffer_clear(req->cp->buffer);
1900         if(req->num_done_req == 0 && req->read_is_closed) {
1901                 /* no more to write and nothing to read, close it */
1902                 comm_point_drop_reply(&req->cp->repinfo);
1903                 return;
1904         }
1905         req->cp->tcp_is_reading = 1;
1906         /* see if another result needs writing */
1907         tcp_req_pickup_next_result(req);
1908
1909         /* see if there is more to write, if not stop_listening for writing */
1910         /* see if new requests are allowed, if so, start_listening
1911          * for reading */
1912         tcp_req_info_setup_listen(req);
1913 }
1914
1915 void
1916 tcp_req_info_handle_readdone(struct tcp_req_info* req)
1917 {
1918         struct comm_point* c = req->cp;
1919
1920         /* we want to read up several requests, unless there are
1921          * pending answers */
1922
1923         req->is_drop = 0;
1924         req->is_reply = 0;
1925         req->in_worker_handle = 1;
1926         sldns_buffer_set_limit(req->spool_buffer, 0);
1927         /* handle the current request */
1928         /* this calls the worker handle request routine that could give
1929          * a cache response, or localdata response, or drop the reply,
1930          * or schedule a mesh entry for later */
1931         fptr_ok(fptr_whitelist_comm_point(c->callback));
1932         if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
1933                 req->in_worker_handle = 0;
1934                 /* there is an answer, put it up.  It is already in the
1935                  * c->buffer, just send it. */
1936                 /* since we were just reading a query, the channel is
1937                  * clear to write to */
1938         send_it:
1939                 c->tcp_is_reading = 0;
1940                 comm_point_stop_listening(c);
1941                 comm_point_start_listening(c, -1, c->tcp_timeout_msec);
1942                 return;
1943         }
1944         req->in_worker_handle = 0;
1945         /* it should be waiting in the mesh for recursion.
1946          * If mesh failed to add a new entry and called commpoint_drop_reply. 
1947          * Then the mesh state has been cleared. */
1948         if(req->is_drop) {
1949                 /* the reply has been dropped, stream has been closed. */
1950                 return;
1951         }
1952         /* If mesh failed(mallocfail) and called commpoint_send_reply with
1953          * something like servfail then we pick up that reply below. */
1954         if(req->is_reply) {
1955                 goto send_it;
1956         }
1957
1958         sldns_buffer_clear(c->buffer);
1959         /* if pending answers, pick up an answer and start sending it */
1960         tcp_req_pickup_next_result(req);
1961
1962         /* if answers pending, start sending answers */
1963         /* read more requests if we can have more requests */
1964         tcp_req_info_setup_listen(req);
1965 }
1966
1967 int
1968 tcp_req_info_add_meshstate(struct tcp_req_info* req,
1969         struct mesh_area* mesh, struct mesh_state* m)
1970 {
1971         struct tcp_req_open_item* item;
1972         log_assert(req && mesh && m);
1973         item = (struct tcp_req_open_item*)malloc(sizeof(*item));
1974         if(!item) return 0;
1975         item->next = req->open_req_list;
1976         item->mesh = mesh;
1977         item->mesh_state = m;
1978         req->open_req_list = item;
1979         req->num_open_req++;
1980         return 1;
1981 }
1982
1983 /** Add a result to the result list.  At the end. */
1984 static int
1985 tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
1986 {
1987         struct tcp_req_done_item* last = NULL;
1988         struct tcp_req_done_item* item;
1989         size_t space;
1990
1991         /* see if we have space */
1992         space = sizeof(struct tcp_req_done_item) + len;
1993         lock_basic_lock(&stream_wait_count_lock);
1994         if(stream_wait_count + space > stream_wait_max) {
1995                 lock_basic_unlock(&stream_wait_count_lock);
1996                 verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
1997                 return 0;
1998         }
1999         stream_wait_count += space;
2000         lock_basic_unlock(&stream_wait_count_lock);
2001
2002         /* find last element */
2003         last = req->done_req_list;
2004         while(last && last->next)
2005                 last = last->next;
2006         
2007         /* create new element */
2008         item = (struct tcp_req_done_item*)malloc(sizeof(*item));
2009         if(!item) {
2010                 log_err("malloc failure, for stream result list");
2011                 return 0;
2012         }
2013         item->next = NULL;
2014         item->len = len;
2015         item->buf = memdup(buf, len);
2016         if(!item->buf) {
2017                 free(item);
2018                 log_err("malloc failure, adding reply to stream result list");
2019                 return 0;
2020         }
2021
2022         /* link in */
2023         if(last) last->next = item;
2024         else req->done_req_list = item;
2025         req->num_done_req++;
2026         return 1;
2027 }
2028
2029 void
2030 tcp_req_info_send_reply(struct tcp_req_info* req)
2031 {
2032         if(req->in_worker_handle) {
2033                 /* reply from mesh is in the spool_buffer */
2034                 /* copy now, so that the spool buffer is free for other tasks
2035                  * before the callback is done */
2036                 sldns_buffer_clear(req->cp->buffer);
2037                 sldns_buffer_write(req->cp->buffer,
2038                         sldns_buffer_begin(req->spool_buffer),
2039                         sldns_buffer_limit(req->spool_buffer));
2040                 sldns_buffer_flip(req->cp->buffer);
2041                 req->is_reply = 1;
2042                 return;
2043         }
2044         /* now that the query has been handled, that mesh_reply entry
2045          * should be removed, from the tcp_req_info list,
2046          * the mesh state cleanup removes then with region_cleanup and
2047          * replies_sent true. */
2048         /* see if we can send it straight away (we are not doing
2049          * anything else).  If so, copy to buffer and start */
2050         if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
2051                 /* buffer is free, and was ready to read new query into,
2052                  * but we are now going to use it to send this answer */
2053                 tcp_req_info_start_write_buf(req,
2054                         sldns_buffer_begin(req->spool_buffer),
2055                         sldns_buffer_limit(req->spool_buffer));
2056                 /* switch to listen to write events */
2057                 comm_point_stop_listening(req->cp);
2058                 comm_point_start_listening(req->cp, -1,
2059                         req->cp->tcp_timeout_msec);
2060                 return;
2061         }
2062         /* queue up the answer behind the others already pending */
2063         if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
2064                 sldns_buffer_limit(req->spool_buffer))) {
2065                 /* drop the connection, we are out of resources */
2066                 comm_point_drop_reply(&req->cp->repinfo);
2067         }
2068 }
2069
2070 size_t tcp_req_info_get_stream_buffer_size(void)
2071 {
2072         size_t s;
2073         if(!stream_wait_lock_inited)
2074                 return stream_wait_count;
2075         lock_basic_lock(&stream_wait_count_lock);
2076         s = stream_wait_count;
2077         lock_basic_unlock(&stream_wait_count_lock);
2078         return s;
2079 }
2080
2081 size_t http2_get_query_buffer_size(void)
2082 {
2083         size_t s;
2084         if(!http2_query_buffer_lock_inited)
2085                 return http2_query_buffer_count;
2086         lock_basic_lock(&http2_query_buffer_count_lock);
2087         s = http2_query_buffer_count;
2088         lock_basic_unlock(&http2_query_buffer_count_lock);
2089         return s;
2090 }
2091
2092 size_t http2_get_response_buffer_size(void)
2093 {
2094         size_t s;
2095         if(!http2_response_buffer_lock_inited)
2096                 return http2_response_buffer_count;
2097         lock_basic_lock(&http2_response_buffer_count_lock);
2098         s = http2_response_buffer_count;
2099         lock_basic_unlock(&http2_response_buffer_count_lock);
2100         return s;
2101 }
2102
2103 #ifdef HAVE_NGHTTP2
2104 /** nghttp2 callback. Used to copy response from rbuffer to nghttp2 session */
2105 static ssize_t http2_submit_response_read_callback(
2106         nghttp2_session* ATTR_UNUSED(session),
2107         int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2108         nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2109 {
2110         struct http2_stream* h2_stream;
2111         struct http2_session* h2_session = source->ptr;
2112         size_t copylen = length;
2113         if(!(h2_stream = nghttp2_session_get_stream_user_data(
2114                 h2_session->session, stream_id))) {
2115                 verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2116                         "stream");
2117                 return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2118         }
2119         if(!h2_stream->rbuffer ||
2120                 sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2121                 verbose(VERB_QUERY, "http2: cannot submit buffer. No data "
2122                         "available in rbuffer");
2123                 /* rbuffer will be free'd in frame close cb */
2124                 return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2125         }
2126
2127         if(copylen > sldns_buffer_remaining(h2_stream->rbuffer))
2128                 copylen = sldns_buffer_remaining(h2_stream->rbuffer);
2129         if(copylen > SSIZE_MAX)
2130                 copylen = SSIZE_MAX; /* will probably never happen */
2131
2132         memcpy(buf, sldns_buffer_current(h2_stream->rbuffer), copylen);
2133         sldns_buffer_skip(h2_stream->rbuffer, copylen);
2134
2135         if(sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2136                 *data_flags |= NGHTTP2_DATA_FLAG_EOF;
2137                 lock_basic_lock(&http2_response_buffer_count_lock);
2138                 http2_response_buffer_count -=
2139                         sldns_buffer_capacity(h2_stream->rbuffer);
2140                 lock_basic_unlock(&http2_response_buffer_count_lock);
2141                 sldns_buffer_free(h2_stream->rbuffer);
2142                 h2_stream->rbuffer = NULL;
2143         }
2144
2145         return copylen;
2146 }
2147
2148 /**
2149  * Send RST_STREAM frame for stream.
2150  * @param h2_session: http2 session to submit frame to
2151  * @param h2_stream: http2 stream containing frame ID to use in RST_STREAM
2152  * @return 0 on error, 1 otherwise
2153  */
2154 static int http2_submit_rst_stream(struct http2_session* h2_session,
2155                 struct http2_stream* h2_stream)
2156 {
2157         int ret = nghttp2_submit_rst_stream(h2_session->session,
2158                 NGHTTP2_FLAG_NONE, h2_stream->stream_id,
2159                 NGHTTP2_INTERNAL_ERROR);
2160         if(ret) {
2161                 verbose(VERB_QUERY, "http2: nghttp2_submit_rst_stream failed, "
2162                         "error: %s", nghttp2_strerror(ret));
2163                 return 0;
2164         }
2165         return 1;
2166 }
2167
2168 /**
2169  * DNS response ready to be submitted to nghttp2, to be prepared for sending
2170  * out. Response is stored in c->buffer. Copy to rbuffer because the c->buffer
2171  * might be used before this will be sent out.
2172  * @param h2_session: http2 session, containing c->buffer which contains answer
2173  * @return 0 on error, 1 otherwise
2174  */
2175 int http2_submit_dns_response(struct http2_session* h2_session)
2176 {
2177         int ret;
2178         nghttp2_data_provider data_prd;
2179         char status[4];
2180         nghttp2_nv headers[2];
2181         struct http2_stream* h2_stream = h2_session->c->h2_stream;
2182         size_t rlen;
2183
2184         if(h2_stream->rbuffer) {
2185                 log_err("http2 submit response error: rbuffer already "
2186                         "exists");
2187                 return 0;
2188         }
2189         if(sldns_buffer_remaining(h2_session->c->buffer) == 0) {
2190                 log_err("http2 submit response error: c->buffer not complete");
2191                 return 0;
2192         }
2193
2194         if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2195                 verbose(VERB_QUERY, "http2: submit response error: "
2196                         "invalid status");
2197                 return 0;
2198         }
2199
2200         rlen = sldns_buffer_remaining(h2_session->c->buffer);
2201         lock_basic_lock(&http2_response_buffer_count_lock);
2202         if(http2_response_buffer_count + rlen > http2_response_buffer_max) {
2203                 lock_basic_unlock(&http2_response_buffer_count_lock);
2204                 verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2205                         "in https-response-buffer-size");
2206                 return http2_submit_rst_stream(h2_session, h2_stream);
2207         }
2208         http2_response_buffer_count += rlen;
2209         lock_basic_unlock(&http2_response_buffer_count_lock);
2210
2211         if(!(h2_stream->rbuffer = sldns_buffer_new(rlen))) {
2212                 lock_basic_lock(&http2_response_buffer_count_lock);
2213                 http2_response_buffer_count -= rlen;
2214                 lock_basic_unlock(&http2_response_buffer_count_lock);
2215                 log_err("http2 submit response error: malloc failure");
2216                 return 0;
2217         }
2218
2219         headers[0].name = (uint8_t*)":status";
2220         headers[0].namelen = 7;
2221         headers[0].value = (uint8_t*)status;
2222         headers[0].valuelen = 3;
2223         headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2224
2225         headers[1].name = (uint8_t*)"content-type";
2226         headers[1].namelen = 12;
2227         headers[1].value = (uint8_t*)"application/dns-message";
2228         headers[1].valuelen = 23;
2229         headers[1].flags = NGHTTP2_NV_FLAG_NONE;
2230
2231         /*TODO be nice and add the content-length header
2232         headers[2].name = (uint8_t*)"content-length";
2233         headers[2].namelen = 14;
2234         headers[2].value = 
2235         headers[2].valuelen = 
2236         headers[2].flags = NGHTTP2_NV_FLAG_NONE;
2237         */
2238
2239         sldns_buffer_write(h2_stream->rbuffer,
2240                 sldns_buffer_current(h2_session->c->buffer),
2241                 sldns_buffer_remaining(h2_session->c->buffer));
2242         sldns_buffer_flip(h2_stream->rbuffer);
2243
2244         data_prd.source.ptr = h2_session;
2245         data_prd.read_callback = http2_submit_response_read_callback;
2246         ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2247                 headers, 2, &data_prd);
2248         if(ret) {
2249                 verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2250                         "error: %s", nghttp2_strerror(ret));
2251                 return 0;
2252         }
2253         return 1;
2254 }
2255 #else
/** Variant used when built without nghttp2 support: there is no HTTP/2
 * session to submit to, so this always reports an error.
 * @param v: unused.
 * @return 0 (error). */
int
http2_submit_dns_response(void* ATTR_UNUSED(v))
{
        return 0;
}
2260 #endif
2261
2262 #ifdef HAVE_NGHTTP2
2263 /** HTTP status to descriptive string */
2264 static char* http_status_to_str(enum http_status s)
2265 {
2266         switch(s) {
2267                 case HTTP_STATUS_OK:
2268                         return "OK";
2269                 case HTTP_STATUS_BAD_REQUEST:
2270                         return "Bad Request";
2271                 case HTTP_STATUS_NOT_FOUND:
2272                         return "Not Found";
2273                 case HTTP_STATUS_PAYLOAD_TOO_LARGE:
2274                         return "Payload Too Large";
2275                 case HTTP_STATUS_URI_TOO_LONG:
2276                         return "URI Too Long";
2277                 case HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE:
2278                         return "Unsupported Media Type";
2279                 case HTTP_STATUS_NOT_IMPLEMENTED:
2280                         return "Not Implemented";
2281         }
2282         return "Status Unknown";
2283 }
2284
2285 /** nghttp2 callback. Used to copy error message to nghttp2 session */
2286 static ssize_t http2_submit_error_read_callback(
2287         nghttp2_session* ATTR_UNUSED(session),
2288         int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2289         nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2290 {
2291         struct http2_stream* h2_stream;
2292         struct http2_session* h2_session = source->ptr;
2293         char* msg;
2294         if(!(h2_stream = nghttp2_session_get_stream_user_data(
2295                 h2_session->session, stream_id))) {
2296                 verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2297                         "stream");
2298                 return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2299         }
2300         *data_flags |= NGHTTP2_DATA_FLAG_EOF;
2301         msg = http_status_to_str(h2_stream->status);
2302         if(length < strlen(msg))
2303                 return 0; /* not worth trying over multiple frames */
2304         memcpy(buf, msg, strlen(msg));
2305         return strlen(msg);
2306
2307 }
2308
2309 /**
2310  * HTTP error response ready to be submitted to nghttp2, to be prepared for
2311  * sending out. Message body will contain descriptive string for HTTP status.
2312  * @param h2_session: http2 session to submit to
2313  * @param h2_stream: http2 stream containing HTTP status to use for error
2314  * @return 0 on error, 1 otherwise
2315  */
2316 static int http2_submit_error(struct http2_session* h2_session,
2317         struct http2_stream* h2_stream)
2318 {
2319         int ret;
2320         char status[4];
2321         nghttp2_data_provider data_prd;
2322         nghttp2_nv headers[1]; /* will be copied by nghttp */
2323         if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2324                 verbose(VERB_QUERY, "http2: submit error failed, "
2325                         "invalid status");
2326                 return 0;
2327         }
2328         headers[0].name = (uint8_t*)":status";
2329         headers[0].namelen = 7;
2330         headers[0].value = (uint8_t*)status;
2331         headers[0].valuelen = 3;
2332         headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2333
2334         data_prd.source.ptr = h2_session;
2335         data_prd.read_callback = http2_submit_error_read_callback;
2336
2337         ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2338                 headers, 1, &data_prd);
2339         if(ret) {
2340                 verbose(VERB_QUERY, "http2: submit error failed, "
2341                         "error: %s", nghttp2_strerror(ret));
2342                 return 0;
2343         }
2344         return 1;
2345 }
2346
2347 /**
2348  * Start query handling. Query is stored in the stream, and will be free'd here.
2349  * @param h2_session: http2 session, containing comm point
2350  * @param h2_stream: stream containing buffered query
2351  * @return: -1 on error, 1 if answer is stored in c->buffer, 0 if there is no
2352  * reply available (yet).
2353  */
2354 static int http2_query_read_done(struct http2_session* h2_session,
2355         struct http2_stream* h2_stream)
2356 {
2357         log_assert(h2_stream->qbuffer);
2358
2359         if(h2_session->c->h2_stream) {
2360                 verbose(VERB_ALGO, "http2_query_read_done failure: shared "
2361                         "buffer already assigned to stream");
2362                 return -1;
2363         }
2364         if(sldns_buffer_remaining(h2_session->c->buffer) <
2365                 sldns_buffer_remaining(h2_stream->qbuffer)) {
2366                 /* qbuffer will be free'd in frame close cb */
2367                 sldns_buffer_clear(h2_session->c->buffer);
2368                 verbose(VERB_ALGO, "http2_query_read_done failure: can't fit "
2369                         "qbuffer in c->buffer");
2370                 return -1;
2371         }
2372
2373         sldns_buffer_write(h2_session->c->buffer,
2374                 sldns_buffer_current(h2_stream->qbuffer),
2375                 sldns_buffer_remaining(h2_stream->qbuffer));
2376
2377         lock_basic_lock(&http2_query_buffer_count_lock);
2378         http2_query_buffer_count -= sldns_buffer_capacity(h2_stream->qbuffer);
2379         lock_basic_unlock(&http2_query_buffer_count_lock);
2380         sldns_buffer_free(h2_stream->qbuffer);
2381         h2_stream->qbuffer = NULL;
2382
2383         sldns_buffer_flip(h2_session->c->buffer);
2384         h2_session->c->h2_stream = h2_stream;
2385         fptr_ok(fptr_whitelist_comm_point(h2_session->c->callback));
2386         if((*h2_session->c->callback)(h2_session->c, h2_session->c->cb_arg,
2387                 NETEVENT_NOERROR, &h2_session->c->repinfo)) {
2388                 return 1; /* answer in c->buffer */
2389         }
2390         sldns_buffer_clear(h2_session->c->buffer);
2391         h2_session->c->h2_stream = NULL;
2392         return 0; /* mesh state added, or dropped */
2393 }
2394
2395 /** nghttp2 callback. Used to check if the received frame indicates the end of a
2396  * stream. Gather collected request data and start query handling. */
2397 static int http2_req_frame_recv_cb(nghttp2_session* session,
2398         const nghttp2_frame* frame, void* cb_arg)
2399 {
2400         struct http2_session* h2_session = (struct http2_session*)cb_arg;
2401         struct http2_stream* h2_stream;
2402         int query_read_done;
2403
2404         if((frame->hd.type != NGHTTP2_DATA &&
2405                 frame->hd.type != NGHTTP2_HEADERS) ||
2406                 !(frame->hd.flags & NGHTTP2_FLAG_END_STREAM)) {
2407                         return 0;
2408         }
2409
2410         if(!(h2_stream = nghttp2_session_get_stream_user_data(
2411                 session, frame->hd.stream_id)))
2412                 return 0;
2413
2414         if(h2_stream->invalid_endpoint) {
2415                 h2_stream->status = HTTP_STATUS_NOT_FOUND;
2416                 goto submit_http_error;
2417         }
2418
2419         if(h2_stream->invalid_content_type) {
2420                 h2_stream->status = HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE;
2421                 goto submit_http_error;
2422         }
2423
2424         if(h2_stream->http_method != HTTP_METHOD_GET &&
2425                 h2_stream->http_method != HTTP_METHOD_POST) {
2426                 h2_stream->status = HTTP_STATUS_NOT_IMPLEMENTED;
2427                 goto submit_http_error;
2428         }
2429
2430         if(h2_stream->query_too_large) {
2431                 if(h2_stream->http_method == HTTP_METHOD_POST)
2432                         h2_stream->status = HTTP_STATUS_PAYLOAD_TOO_LARGE;
2433                 else
2434                         h2_stream->status = HTTP_STATUS_URI_TOO_LONG;
2435                 goto submit_http_error;
2436         }
2437
2438         if(!h2_stream->qbuffer) {
2439                 h2_stream->status = HTTP_STATUS_BAD_REQUEST;
2440                 goto submit_http_error;
2441         }
2442
2443         if(h2_stream->status) {
2444 submit_http_error:
2445                 verbose(VERB_QUERY, "http2 request invalid, returning :status="
2446                         "%d", h2_stream->status);
2447                 if(!http2_submit_error(h2_session, h2_stream)) {
2448                         return NGHTTP2_ERR_CALLBACK_FAILURE;
2449                 }
2450                 return 0;
2451         }
2452         h2_stream->status = HTTP_STATUS_OK;
2453
2454         sldns_buffer_flip(h2_stream->qbuffer);
2455         h2_session->postpone_drop = 1;
2456         query_read_done = http2_query_read_done(h2_session, h2_stream);
2457         if(query_read_done < 0)
2458                 return NGHTTP2_ERR_CALLBACK_FAILURE;
2459         else if(!query_read_done) {
2460                 if(h2_session->is_drop) {
2461                         /* connection needs to be closed. Return failure to make
2462                          * sure no other action are taken anymore on comm point.
2463                          * failure will result in reclaiming (and closing)
2464                          * of comm point. */
2465                         verbose(VERB_QUERY, "http2 query dropped in worker cb");
2466                         h2_session->postpone_drop = 0;
2467                         return NGHTTP2_ERR_CALLBACK_FAILURE;
2468                 }
2469                 /* nothing to submit right now, query added to mesh. */
2470                 h2_session->postpone_drop = 0;
2471                 return 0;
2472         }
2473         if(!http2_submit_dns_response(h2_session)) {
2474                 sldns_buffer_clear(h2_session->c->buffer);
2475                 h2_session->c->h2_stream = NULL;
2476                 return NGHTTP2_ERR_CALLBACK_FAILURE;
2477         }
2478         verbose(VERB_QUERY, "http2 query submitted to session");
2479         sldns_buffer_clear(h2_session->c->buffer);
2480         h2_session->c->h2_stream = NULL;
2481         return 0;
2482 }
2483
2484 /** nghttp2 callback. Used to detect start of new streams. */
2485 static int http2_req_begin_headers_cb(nghttp2_session* session,
2486         const nghttp2_frame* frame, void* cb_arg)
2487 {
2488         struct http2_session* h2_session = (struct http2_session*)cb_arg;
2489         struct http2_stream* h2_stream;
2490         int ret;
2491         if(frame->hd.type != NGHTTP2_HEADERS ||
2492                 frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2493                 /* only interrested in request headers */
2494                 return 0;
2495         }
2496         if(!(h2_stream = http2_stream_create(frame->hd.stream_id))) {
2497                 log_err("malloc failure while creating http2 stream");
2498                 return NGHTTP2_ERR_CALLBACK_FAILURE;
2499         }
2500         http2_session_add_stream(h2_session, h2_stream);
2501         ret = nghttp2_session_set_stream_user_data(session,
2502                 frame->hd.stream_id, h2_stream);
2503         if(ret) {
2504                 /* stream does not exist */
2505                 verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2506                         "error: %s", nghttp2_strerror(ret));
2507                 return NGHTTP2_ERR_CALLBACK_FAILURE;
2508         }
2509
2510         return 0;
2511 }
2512
2513 /**
2514  * base64url decode, store in qbuffer
2515  * @param h2_session: http2 session
2516  * @param h2_stream: http2 stream
2517  * @param start: start of the base64 string
2518  * @param length: length of the base64 string
2519  * @return: 0 on error, 1 otherwise. query will be stored in h2_stream->qbuffer,
2520  * buffer will be NULL is unparseble.
2521  */
2522 static int http2_buffer_uri_query(struct http2_session* h2_session,
2523         struct http2_stream* h2_stream, const uint8_t* start, size_t length)
2524 {
2525         size_t expectb64len;
2526         int b64len;
2527         if(h2_stream->http_method == HTTP_METHOD_POST)
2528                 return 1;
2529         if(length == 0)
2530                 return 1;
2531         if(h2_stream->qbuffer) {
2532                 verbose(VERB_ALGO, "http2_req_header fail, "
2533                         "qbuffer already set");
2534                 return 0;
2535         }
2536
2537         /* calculate size, might be a bit bigger than the real
2538          * decoded buffer size */
2539         expectb64len = sldns_b64_pton_calculate_size(length);
2540         log_assert(expectb64len > 0);
2541         if(expectb64len >
2542                 h2_session->c->http2_stream_max_qbuffer_size) {
2543                 h2_stream->query_too_large = 1;
2544                 return 1;
2545         }
2546
2547         lock_basic_lock(&http2_query_buffer_count_lock);
2548         if(http2_query_buffer_count + expectb64len > http2_query_buffer_max) {
2549                 lock_basic_unlock(&http2_query_buffer_count_lock);
2550                 verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2551                         "in http2-query-buffer-size");
2552                 return http2_submit_rst_stream(h2_session, h2_stream);
2553         }
2554         http2_query_buffer_count += expectb64len;
2555         lock_basic_unlock(&http2_query_buffer_count_lock);
2556         if(!(h2_stream->qbuffer = sldns_buffer_new(expectb64len))) {
2557                 lock_basic_lock(&http2_query_buffer_count_lock);
2558                 http2_query_buffer_count -= expectb64len;
2559                 lock_basic_unlock(&http2_query_buffer_count_lock);
2560                 log_err("http2_req_header fail, qbuffer "
2561                         "malloc failure");
2562                 return 0;
2563         }
2564
2565         if(!(b64len = sldns_b64url_pton(
2566                 (char const *)start, length,
2567                 sldns_buffer_current(h2_stream->qbuffer),
2568                 expectb64len)) || b64len < 0) {
2569                 lock_basic_lock(&http2_query_buffer_count_lock);
2570                 http2_query_buffer_count -= expectb64len;
2571                 lock_basic_unlock(&http2_query_buffer_count_lock);
2572                 sldns_buffer_free(h2_stream->qbuffer);
2573                 h2_stream->qbuffer = NULL;
2574                 /* return without error, method can be an
2575                  * unknown POST */
2576                 return 1;
2577         }
2578         sldns_buffer_skip(h2_stream->qbuffer, (size_t)b64len);
2579         return 1;
2580 }
2581
2582 /** nghttp2 callback. Used to parse headers from HEADER frames. */
2583 static int http2_req_header_cb(nghttp2_session* session,
2584         const nghttp2_frame* frame, const uint8_t* name, size_t namelen,
2585         const uint8_t* value, size_t valuelen, uint8_t ATTR_UNUSED(flags),
2586         void* cb_arg)
2587 {
2588         struct http2_stream* h2_stream = NULL;
2589         struct http2_session* h2_session = (struct http2_session*)cb_arg;
2590         /* nghttp2 deals with CONTINUATION frames and provides them as part of
2591          * the HEADER */
2592         if(frame->hd.type != NGHTTP2_HEADERS ||
2593                 frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2594                 /* only interrested in request headers */
2595                 return 0;
2596         }
2597         if(!(h2_stream = nghttp2_session_get_stream_user_data(session,
2598                 frame->hd.stream_id)))
2599                 return 0;
2600
2601         /* earlier checks already indicate we can stop handling this query */
2602         if(h2_stream->http_method == HTTP_METHOD_UNSUPPORTED ||
2603                 h2_stream->invalid_content_type ||
2604                 h2_stream->invalid_endpoint)
2605                 return 0;
2606
2607
2608         /* nghttp2 performs some sanity checks in the headers, including:
2609          * name and value are guaranteed to be null terminated
2610          * name is guaranteed to be lowercase
2611          * content-length value is guaranteed to contain digits
2612          */
2613
2614         if(!h2_stream->http_method && namelen == 7 &&
2615                 memcmp(":method", name, namelen) == 0) {
2616                 /* Case insensitive check on :method value to be on the safe
2617                  * side. I failed to find text about case sensitivity in specs.
2618                  */
2619                 if(valuelen == 3 && strcasecmp("GET", (const char*)value) == 0)
2620                         h2_stream->http_method = HTTP_METHOD_GET;
2621                 else if(valuelen == 4 &&
2622                         strcasecmp("POST", (const char*)value) == 0) {
2623                         h2_stream->http_method = HTTP_METHOD_POST;
2624                         if(h2_stream->qbuffer) {
2625                                 /* POST method uses query from DATA frames */
2626                                 lock_basic_lock(&http2_query_buffer_count_lock);
2627                                 http2_query_buffer_count -=
2628                                         sldns_buffer_capacity(h2_stream->qbuffer);
2629                                 lock_basic_unlock(&http2_query_buffer_count_lock);
2630                                 sldns_buffer_free(h2_stream->qbuffer);
2631                                 h2_stream->qbuffer = NULL;
2632                         }
2633                 } else
2634                         h2_stream->http_method = HTTP_METHOD_UNSUPPORTED;
2635                 return 0;
2636         }
2637         if(namelen == 5 && memcmp(":path", name, namelen) == 0) {
2638                 /* :path may contain DNS query, depending on method. Method might
2639                  * not be known yet here, so check after finishing receiving
2640                  * stream. */
2641 #define HTTP_QUERY_PARAM "?dns="
2642                 size_t el = strlen(h2_session->c->http_endpoint);
2643                 size_t qpl = strlen(HTTP_QUERY_PARAM);
2644
2645                 if(valuelen < el || memcmp(h2_session->c->http_endpoint,
2646                         value, el) != 0) {
2647                         h2_stream->invalid_endpoint = 1;
2648                         return 0;
2649                 }
2650                 /* larger than endpoint only allowed if it is for the query
2651                  * parameter */
2652                 if(valuelen <= el+qpl ||
2653                         memcmp(HTTP_QUERY_PARAM, value+el, qpl) != 0) {
2654                         if(valuelen != el)
2655                                 h2_stream->invalid_endpoint = 1;
2656                         return 0;
2657                 }
2658
2659                 if(!http2_buffer_uri_query(h2_session, h2_stream,
2660                         value+(el+qpl), valuelen-(el+qpl))) {
2661                         return NGHTTP2_ERR_CALLBACK_FAILURE;
2662                 }
2663                 return 0;
2664         }
2665         /* Content type is a SHOULD (rfc7231#section-3.1.1.5) when using POST,
2666          * and not needed when using GET. Don't enfore.
2667          * If set only allow lowercase "application/dns-message".
2668          *
2669          * Clients SHOULD (rfc8484#section-4.1) set an accept header, but MUST
2670          * be able to handle "application/dns-message". Since that is the only
2671          * content-type supported we can ignore the accept header.
2672          */
2673         if((namelen == 12 && memcmp("content-type", name, namelen) == 0)) {
2674                 if(valuelen != 23 || memcmp("application/dns-message", value,
2675                         valuelen) != 0) {
2676                         h2_stream->invalid_content_type = 1;
2677                 }
2678         }
2679
2680         /* Only interested in content-lentg for POST (on not yet known) method.
2681          */
2682         if((!h2_stream->http_method ||
2683                 h2_stream->http_method == HTTP_METHOD_POST) &&
2684                 !h2_stream->content_length && namelen  == 14 &&
2685                 memcmp("content-length", name, namelen) == 0) {
2686                 if(valuelen > 5) {
2687                         h2_stream->query_too_large = 1;
2688                         return 0;
2689                 }
2690                 /* guaranteed to only contian digits and be null terminated */
2691                 h2_stream->content_length = atoi((const char*)value);
2692                 if(h2_stream->content_length >
2693                         h2_session->c->http2_stream_max_qbuffer_size) {
2694                         h2_stream->query_too_large = 1;
2695                         return 0;
2696                 }
2697         }
2698         return 0;
2699 }
2700
2701 /** nghttp2 callback. Used to get data from DATA frames, which can contain
2702  * queries in POST requests. */
2703 static int http2_req_data_chunk_recv_cb(nghttp2_session* ATTR_UNUSED(session),
2704         uint8_t ATTR_UNUSED(flags), int32_t stream_id, const uint8_t* data,
2705         size_t len, void* cb_arg)
2706 {
2707         struct http2_session* h2_session = (struct http2_session*)cb_arg;
2708         struct http2_stream* h2_stream;
2709         size_t qlen = 0;
2710
2711         if(!(h2_stream = nghttp2_session_get_stream_user_data(
2712                 h2_session->session, stream_id))) {
2713                 return 0;
2714         }
2715
2716         if(h2_stream->query_too_large)
2717                 return 0;
2718
2719         if(!h2_stream->qbuffer) {
2720                 if(h2_stream->content_length) {
2721                         if(h2_stream->content_length < len)
2722                                 /* getting more data in DATA frame than
2723                                  * advertised in content-length header. */
2724                                 return NGHTTP2_ERR_CALLBACK_FAILURE;
2725                         qlen = h2_stream->content_length;
2726                 } else if(len <= h2_session->c->http2_stream_max_qbuffer_size) {
2727                         /* setting this to msg-buffer-size can result in a lot
2728                          * of memory consuption. Most queries should fit in a
2729                          * single DATA frame, and most POST queries will
2730                          * containt content-length which does not impose this
2731                          * limit. */
2732                         qlen = len;
2733                 }
2734         }
2735         if(!h2_stream->qbuffer && qlen) {
2736                 lock_basic_lock(&http2_query_buffer_count_lock);
2737                 if(http2_query_buffer_count + qlen > http2_query_buffer_max) {
2738                         lock_basic_unlock(&http2_query_buffer_count_lock);
2739                         verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2740                                 "in http2-query-buffer-size");
2741                         return http2_submit_rst_stream(h2_session, h2_stream);
2742                 }
2743                 http2_query_buffer_count += qlen;
2744                 lock_basic_unlock(&http2_query_buffer_count_lock);
2745                 if(!(h2_stream->qbuffer = sldns_buffer_new(qlen))) {
2746                         lock_basic_lock(&http2_query_buffer_count_lock);
2747                         http2_query_buffer_count -= qlen;
2748                         lock_basic_unlock(&http2_query_buffer_count_lock);
2749                 }
2750         }
2751
2752         if(!h2_stream->qbuffer ||
2753                 sldns_buffer_remaining(h2_stream->qbuffer) < len) {
2754                 verbose(VERB_ALGO, "http2 data_chunck_recv failed. Not enough "
2755                         "buffer space for POST query. Can happen on multi "
2756                         "frame requests without content-length header");
2757                 h2_stream->query_too_large = 1;
2758                 return 0;
2759         }
2760
2761         sldns_buffer_write(h2_stream->qbuffer, data, len);
2762
2763         return 0;
2764 }
2765
2766 void http2_req_stream_clear(struct http2_stream* h2_stream)
2767 {
2768         if(h2_stream->qbuffer) {
2769                 lock_basic_lock(&http2_query_buffer_count_lock);
2770                 http2_query_buffer_count -=
2771                         sldns_buffer_capacity(h2_stream->qbuffer);
2772                 lock_basic_unlock(&http2_query_buffer_count_lock);
2773                 sldns_buffer_free(h2_stream->qbuffer);
2774                 h2_stream->qbuffer = NULL;
2775         }
2776         if(h2_stream->rbuffer) {
2777                 lock_basic_lock(&http2_response_buffer_count_lock);
2778                 http2_response_buffer_count -=
2779                         sldns_buffer_capacity(h2_stream->rbuffer);
2780                 lock_basic_unlock(&http2_response_buffer_count_lock);
2781                 sldns_buffer_free(h2_stream->rbuffer);
2782                 h2_stream->rbuffer = NULL;
2783         }
2784 }
2785
2786 nghttp2_session_callbacks* http2_req_callbacks_create()
2787 {
2788         nghttp2_session_callbacks *callbacks;
2789         if(nghttp2_session_callbacks_new(&callbacks) == NGHTTP2_ERR_NOMEM) {
2790                 log_err("failed to initialize nghttp2 callback");
2791                 return NULL;
2792         }
2793         /* reception of header block started, used to create h2_stream */
2794         nghttp2_session_callbacks_set_on_begin_headers_callback(callbacks,
2795                 http2_req_begin_headers_cb);
2796         /* complete frame received, used to get data from stream if frame
2797          * has end stream flag, and start processing query */
2798         nghttp2_session_callbacks_set_on_frame_recv_callback(callbacks,
2799                 http2_req_frame_recv_cb);
2800         /* get request info from headers */
2801         nghttp2_session_callbacks_set_on_header_callback(callbacks,
2802                 http2_req_header_cb);
2803         /* get data from DATA frames, containing POST query */
2804         nghttp2_session_callbacks_set_on_data_chunk_recv_callback(callbacks,
2805                 http2_req_data_chunk_recv_cb);
2806
2807         /* generic HTTP2 callbacks */
2808         nghttp2_session_callbacks_set_recv_callback(callbacks, http2_recv_cb);
2809         nghttp2_session_callbacks_set_send_callback(callbacks, http2_send_cb);
2810         nghttp2_session_callbacks_set_on_stream_close_callback(callbacks,
2811                 http2_stream_close_cb);
2812
2813         return callbacks;
2814 }
2815 #endif /* HAVE_NGHTTP2 */