]> CyberLeo.Net >> Repos - FreeBSD/FreeBSD.git/blob - contrib/unbound/services/listen_dnsport.c
MFV 364468:
[FreeBSD/FreeBSD.git] / contrib / unbound / services / listen_dnsport.c
1 /*
2  * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3  *
4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
5  *
6  * This software is open source.
7  * 
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 
12  * Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  * 
15  * Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  * 
19  * Neither the name of the NLNET LABS nor the names of its contributors may
20  * be used to endorse or promote products derived from this software without
21  * specific prior written permission.
22  * 
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35
36 /**
37  * \file
38  *
39  * This file has functions to get queries from clients.
40  */
41 #include "config.h"
42 #ifdef HAVE_SYS_TYPES_H
43 #  include <sys/types.h>
44 #endif
45 #include <sys/time.h>
46 #ifdef USE_TCP_FASTOPEN
47 #include <netinet/tcp.h>
48 #endif
49 #include "services/listen_dnsport.h"
50 #include "services/outside_network.h"
51 #include "util/netevent.h"
52 #include "util/log.h"
53 #include "util/config_file.h"
54 #include "util/net_help.h"
55 #include "sldns/sbuffer.h"
56 #include "services/mesh.h"
57 #include "util/fptr_wlist.h"
58 #include "util/locks.h"
59
60 #ifdef HAVE_NETDB_H
61 #include <netdb.h>
62 #endif
63 #include <fcntl.h>
64
65 #ifdef HAVE_SYS_UN_H
66 #include <sys/un.h>
67 #endif
68
69 #ifdef HAVE_SYSTEMD
70 #include <systemd/sd-daemon.h>
71 #endif
72
73 /** number of queued TCP connections for listen() */
74 #define TCP_BACKLOG 256 
75
76 /** number of simultaneous requests a client can have */
77 #define TCP_MAX_REQ_SIMULTANEOUS 32
78
79 #ifndef THREADS_DISABLED
80 /** lock on the counter of stream buffer memory */
81 static lock_basic_type stream_wait_count_lock;
82 #endif
83 /** size (in bytes) of stream wait buffers */
84 static size_t stream_wait_count = 0;
85 /** is the lock initialised for stream wait buffers */
86 static int stream_wait_lock_inited = 0;
87
88 /**
89  * Debug print of the getaddrinfo returned address.
90  * @param addr: the address returned.
91  */
92 static void
93 verbose_print_addr(struct addrinfo *addr)
94 {
95         if(verbosity >= VERB_ALGO) {
96                 char buf[100];
97                 void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
98 #ifdef INET6
99                 if(addr->ai_family == AF_INET6)
100                         sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
101                                 sin6_addr;
102 #endif /* INET6 */
103                 if(inet_ntop(addr->ai_family, sinaddr, buf,
104                         (socklen_t)sizeof(buf)) == 0) {
105                         (void)strlcpy(buf, "(null)", sizeof(buf));
106                 }
107                 buf[sizeof(buf)-1] = 0;
108                 verbose(VERB_ALGO, "creating %s%s socket %s %d", 
109                         addr->ai_socktype==SOCK_DGRAM?"udp":
110                         addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
111                         addr->ai_family==AF_INET?"4":
112                         addr->ai_family==AF_INET6?"6":
113                         "_otherfam", buf, 
114                         ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port));
115         }
116 }
117
#ifdef HAVE_SYSTEMD
/**
 * Look for a socket that systemd handed to this process via socket
 * activation and that has the requested family, type and listen state.
 * The descriptors reported by sd_listen_fds() are checked in order with
 * sd_is_socket() and the first one that matches is returned.
 * Note that addr is not used to select the socket; addr, addrlen and path
 * are only used to describe the request in the error message.
 * @param family: address family passed to sd_is_socket().
 * @param socktype: socket type passed to sd_is_socket().
 * @param listen: listening state passed to sd_is_socket(); use it only
 *	for stream protocols, for UDP it should be -1.
 * @param addr: address that was wanted, for the error message. May be NULL,
 *	then path is printed instead.
 * @param addrlen: length of addr.
 * @param path: text printed in the error message when addr is NULL.
 *	May be NULL.
 * @return the file descriptor, or -1 if systemd is not running, the
 *	LISTEN_PID or LISTEN_FDS environment variables are missing, systemd
 *	passed no descriptors, or none of them matched.
 */
static int
systemd_get_activated(int family, int socktype, int listen,
                      struct sockaddr *addr, socklen_t addrlen,
                      const char *path)
{
        int i = 0;
        int r = 0;
        int s = -1;
        const char* listen_pid, *listen_fds;

        /* We should use "listen" option only for stream protocols. For UDP it should be -1 */

        if((r = sd_booted()) < 1) {
                if(r == 0)
                        log_warn("systemd is not running");
                else
                        log_err("systemd sd_booted(): %s", strerror(-r));
                return -1;
        }

        /* the values are not parsed here, only their presence is checked */
        listen_pid = getenv("LISTEN_PID");
        listen_fds = getenv("LISTEN_FDS");

        if (!listen_pid) {
                log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
                return -1;
        }

        if (!listen_fds) {
                log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
                return -1;
        }

        /* r becomes the number of descriptors passed by systemd */
        if((r = sd_listen_fds(0)) < 1) {
                if(r == 0)
                        log_warn("systemd: did not return socket, check unit configuration");
                else
                        log_err("systemd sd_listen_fds(): %s", strerror(-r));
                return -1;
        }

        for(i = 0; i < r; i++) {
                int fd = SD_LISTEN_FDS_START + i;
                int match = sd_is_socket(fd, family, socktype, listen);
                if(match < 0) {
                        /* a negative value is an errno-style failure of
                         * the check itself; it must not be taken as a
                         * match, so report it and try the next one */
                        log_err("systemd sd_is_socket(): %s",
                                strerror(-match));
                        continue;
                }
                if(match > 0) {
                        s = fd;
                        break;
                }
        }
        if (s == -1) {
                if (addr)
                        log_err_addr("systemd sd_listen_fds()",
                                     "no such socket",
                                     (struct sockaddr_storage *)addr, addrlen);
                else
                        /* never hand a NULL pointer to %s */
                        log_err("systemd sd_listen_fds(): %s",
                                path ? path : "(null)");
        }
        return s;
}
#endif
177
178 int
179 create_udp_sock(int family, int socktype, struct sockaddr* addr,
180         socklen_t addrlen, int v6only, int* inuse, int* noproto,
181         int rcv, int snd, int listen, int* reuseport, int transparent,
182         int freebind, int use_systemd, int dscp)
183 {
184         int s;
185         char* err;
186 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
187         int on=1;
188 #endif
189 #ifdef IPV6_MTU
190         int mtu = IPV6_MIN_MTU;
191 #endif
192 #if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
193         (void)rcv;
194 #endif
195 #if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
196         (void)snd;
197 #endif
198 #ifndef IPV6_V6ONLY
199         (void)v6only;
200 #endif
201 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
202         (void)transparent;
203 #endif
204 #if !defined(IP_FREEBIND)
205         (void)freebind;
206 #endif
207 #ifdef HAVE_SYSTEMD
208         int got_fd_from_systemd = 0;
209
210         if (!use_systemd
211             || (use_systemd
212                 && (s = systemd_get_activated(family, socktype, -1, addr,
213                                               addrlen, NULL)) == -1)) {
214 #else
215         (void)use_systemd;
216 #endif
217         if((s = socket(family, socktype, 0)) == -1) {
218                 *inuse = 0;
219 #ifndef USE_WINSOCK
220                 if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
221                         *noproto = 1;
222                         return -1;
223                 }
224                 log_err("can't create socket: %s", strerror(errno));
225 #else
226                 if(WSAGetLastError() == WSAEAFNOSUPPORT || 
227                         WSAGetLastError() == WSAEPROTONOSUPPORT) {
228                         *noproto = 1;
229                         return -1;
230                 }
231                 log_err("can't create socket: %s", 
232                         wsa_strerror(WSAGetLastError()));
233 #endif
234                 *noproto = 0;
235                 return -1;
236         }
237 #ifdef HAVE_SYSTEMD
238         } else {
239                 got_fd_from_systemd = 1;
240         }
241 #endif
242         if(listen) {
243 #ifdef SO_REUSEADDR
244                 if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on, 
245                         (socklen_t)sizeof(on)) < 0) {
246 #ifndef USE_WINSOCK
247                         log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
248                                 strerror(errno));
249                         if(errno != ENOSYS) {
250                                 close(s);
251                                 *noproto = 0;
252                                 *inuse = 0;
253                                 return -1;
254                         }
255 #else
256                         log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
257                                 wsa_strerror(WSAGetLastError()));
258                         closesocket(s);
259                         *noproto = 0;
260                         *inuse = 0;
261                         return -1;
262 #endif
263                 }
264 #endif /* SO_REUSEADDR */
265 #ifdef SO_REUSEPORT
266 #  ifdef SO_REUSEPORT_LB
267                 /* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
268                  * like SO_REUSEPORT on Linux.  This is what the users want
269                  * with the config option in unbound.conf; if we actually
270                  * need local address and port reuse they'll also need to
271                  * have SO_REUSEPORT set for them, assume it was _LB they want.
272                  */
273                 if (reuseport && *reuseport &&
274                     setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
275                         (socklen_t)sizeof(on)) < 0) {
276 #ifdef ENOPROTOOPT
277                         if(errno != ENOPROTOOPT || verbosity >= 3)
278                                 log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
279                                         strerror(errno));
280 #endif
281                         /* this option is not essential, we can continue */
282                         *reuseport = 0;
283                 }
284 #  else /* no SO_REUSEPORT_LB */
285
286                 /* try to set SO_REUSEPORT so that incoming
287                  * queries are distributed evenly among the receiving threads.
288                  * Each thread must have its own socket bound to the same port,
289                  * with SO_REUSEPORT set on each socket.
290                  */
291                 if (reuseport && *reuseport &&
292                     setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
293                         (socklen_t)sizeof(on)) < 0) {
294 #ifdef ENOPROTOOPT
295                         if(errno != ENOPROTOOPT || verbosity >= 3)
296                                 log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
297                                         strerror(errno));
298 #endif
299                         /* this option is not essential, we can continue */
300                         *reuseport = 0;
301                 }
302 #  endif /* SO_REUSEPORT_LB */
303 #else
304                 (void)reuseport;
305 #endif /* defined(SO_REUSEPORT) */
306 #ifdef IP_TRANSPARENT
307                 if (transparent &&
308                     setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
309                     (socklen_t)sizeof(on)) < 0) {
310                         log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
311                         strerror(errno));
312                 }
313 #elif defined(IP_BINDANY)
314                 if (transparent &&
315                     setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
316                     (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
317                     (void*)&on, (socklen_t)sizeof(on)) < 0) {
318                         log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
319                         (family==AF_INET6?"V6":""), strerror(errno));
320                 }
321 #elif defined(SO_BINDANY)
322                 if (transparent &&
323                     setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
324                     (socklen_t)sizeof(on)) < 0) {
325                         log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
326                         strerror(errno));
327                 }
328 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
329         }
330 #ifdef IP_FREEBIND
331         if(freebind &&
332             setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
333             (socklen_t)sizeof(on)) < 0) {
334                 log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
335                 strerror(errno));
336         }
337 #endif /* IP_FREEBIND */
338         if(rcv) {
339 #ifdef SO_RCVBUF
340                 int got;
341                 socklen_t slen = (socklen_t)sizeof(got);
342 #  ifdef SO_RCVBUFFORCE
343                 /* Linux specific: try to use root permission to override
344                  * system limits on rcvbuf. The limit is stored in 
345                  * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
346                 if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv, 
347                         (socklen_t)sizeof(rcv)) < 0) {
348                         if(errno != EPERM) {
349 #    ifndef USE_WINSOCK
350                                 log_err("setsockopt(..., SO_RCVBUFFORCE, "
351                                         "...) failed: %s", strerror(errno));
352                                 close(s);
353 #    else
354                                 log_err("setsockopt(..., SO_RCVBUFFORCE, "
355                                         "...) failed: %s", 
356                                         wsa_strerror(WSAGetLastError()));
357                                 closesocket(s);
358 #    endif
359                                 *noproto = 0;
360                                 *inuse = 0;
361                                 return -1;
362                         }
363 #  endif /* SO_RCVBUFFORCE */
364                         if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv, 
365                                 (socklen_t)sizeof(rcv)) < 0) {
366 #  ifndef USE_WINSOCK
367                                 log_err("setsockopt(..., SO_RCVBUF, "
368                                         "...) failed: %s", strerror(errno));
369                                 close(s);
370 #  else
371                                 log_err("setsockopt(..., SO_RCVBUF, "
372                                         "...) failed: %s", 
373                                         wsa_strerror(WSAGetLastError()));
374                                 closesocket(s);
375 #  endif
376                                 *noproto = 0;
377                                 *inuse = 0;
378                                 return -1;
379                         }
380                         /* check if we got the right thing or if system
381                          * reduced to some system max.  Warn if so */
382                         if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got, 
383                                 &slen) >= 0 && got < rcv/2) {
384                                 log_warn("so-rcvbuf %u was not granted. "
385                                         "Got %u. To fix: start with "
386                                         "root permissions(linux) or sysctl "
387                                         "bigger net.core.rmem_max(linux) or "
388                                         "kern.ipc.maxsockbuf(bsd) values.",
389                                         (unsigned)rcv, (unsigned)got);
390                         }
391 #  ifdef SO_RCVBUFFORCE
392                 }
393 #  endif
394 #endif /* SO_RCVBUF */
395         }
396         /* first do RCVBUF as the receive buffer is more important */
397         if(snd) {
398 #ifdef SO_SNDBUF
399                 int got;
400                 socklen_t slen = (socklen_t)sizeof(got);
401 #  ifdef SO_SNDBUFFORCE
402                 /* Linux specific: try to use root permission to override
403                  * system limits on sndbuf. The limit is stored in 
404                  * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
405                 if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd, 
406                         (socklen_t)sizeof(snd)) < 0) {
407                         if(errno != EPERM) {
408 #    ifndef USE_WINSOCK
409                                 log_err("setsockopt(..., SO_SNDBUFFORCE, "
410                                         "...) failed: %s", strerror(errno));
411                                 close(s);
412 #    else
413                                 log_err("setsockopt(..., SO_SNDBUFFORCE, "
414                                         "...) failed: %s", 
415                                         wsa_strerror(WSAGetLastError()));
416                                 closesocket(s);
417 #    endif
418                                 *noproto = 0;
419                                 *inuse = 0;
420                                 return -1;
421                         }
422 #  endif /* SO_SNDBUFFORCE */
423                         if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd, 
424                                 (socklen_t)sizeof(snd)) < 0) {
425 #  ifndef USE_WINSOCK
426                                 log_err("setsockopt(..., SO_SNDBUF, "
427                                         "...) failed: %s", strerror(errno));
428                                 close(s);
429 #  else
430                                 log_err("setsockopt(..., SO_SNDBUF, "
431                                         "...) failed: %s", 
432                                         wsa_strerror(WSAGetLastError()));
433                                 closesocket(s);
434 #  endif
435                                 *noproto = 0;
436                                 *inuse = 0;
437                                 return -1;
438                         }
439                         /* check if we got the right thing or if system
440                          * reduced to some system max.  Warn if so */
441                         if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got, 
442                                 &slen) >= 0 && got < snd/2) {
443                                 log_warn("so-sndbuf %u was not granted. "
444                                         "Got %u. To fix: start with "
445                                         "root permissions(linux) or sysctl "
446                                         "bigger net.core.wmem_max(linux) or "
447                                         "kern.ipc.maxsockbuf(bsd) values.",
448                                         (unsigned)snd, (unsigned)got);
449                         }
450 #  ifdef SO_SNDBUFFORCE
451                 }
452 #  endif
453 #endif /* SO_SNDBUF */
454         }
455         err = set_ip_dscp(s, family, dscp);
456         if(err != NULL)
457                 log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
458         if(family == AF_INET6) {
459 # if defined(IPV6_V6ONLY)
460                 if(v6only) {
461                         int val=(v6only==2)?0:1;
462                         if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, 
463                                 (void*)&val, (socklen_t)sizeof(val)) < 0) {
464 #ifndef USE_WINSOCK
465                                 log_err("setsockopt(..., IPV6_V6ONLY"
466                                         ", ...) failed: %s", strerror(errno));
467                                 close(s);
468 #else
469                                 log_err("setsockopt(..., IPV6_V6ONLY"
470                                         ", ...) failed: %s", 
471                                         wsa_strerror(WSAGetLastError()));
472                                 closesocket(s);
473 #endif
474                                 *noproto = 0;
475                                 *inuse = 0;
476                                 return -1;
477                         }
478                 }
479 # endif
480 # if defined(IPV6_USE_MIN_MTU)
481                 /*
482                  * There is no fragmentation of IPv6 datagrams
483                  * during forwarding in the network. Therefore
484                  * we do not send UDP datagrams larger than
485                  * the minimum IPv6 MTU of 1280 octets. The
486                  * EDNS0 message length can be larger if the
487                  * network stack supports IPV6_USE_MIN_MTU.
488                  */
489                 if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
490                         (void*)&on, (socklen_t)sizeof(on)) < 0) {
491 #  ifndef USE_WINSOCK
492                         log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
493                                 "...) failed: %s", strerror(errno));
494                         close(s);
495 #  else
496                         log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
497                                 "...) failed: %s", 
498                                 wsa_strerror(WSAGetLastError()));
499                         closesocket(s);
500 #  endif
501                         *noproto = 0;
502                         *inuse = 0;
503                         return -1;
504                 }
505 # elif defined(IPV6_MTU)
506                 /*
507                  * On Linux, to send no larger than 1280, the PMTUD is
508                  * disabled by default for datagrams anyway, so we set
509                  * the MTU to use.
510                  */
511                 if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
512                         (void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
513 #  ifndef USE_WINSOCK
514                         log_err("setsockopt(..., IPV6_MTU, ...) failed: %s", 
515                                 strerror(errno));
516                         close(s);
517 #  else
518                         log_err("setsockopt(..., IPV6_MTU, ...) failed: %s", 
519                                 wsa_strerror(WSAGetLastError()));
520                         closesocket(s);
521 #  endif
522                         *noproto = 0;
523                         *inuse = 0;
524                         return -1;
525                 }
526 # endif /* IPv6 MTU */
527         } else if(family == AF_INET) {
528 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
529 /* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
530  * PMTU information is not accepted, but fragmentation is allowed
531  * if and only if the packet size exceeds the outgoing interface MTU
532  * (and also uses the interface mtu to determine the size of the packets).
533  * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
534  * FreeBSD already has same semantics without setting the option. */
535                 int omit_set = 0;
536                 int action;
537 #   if defined(IP_PMTUDISC_OMIT)
538                 action = IP_PMTUDISC_OMIT;
539                 if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER, 
540                         &action, (socklen_t)sizeof(action)) < 0) {
541
542                         if (errno != EINVAL) {
543                                 log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
544                                         strerror(errno));
545
546 #    ifndef USE_WINSOCK
547                                 close(s);
548 #    else
549                                 closesocket(s);
550 #    endif
551                                 *noproto = 0;
552                                 *inuse = 0;
553                                 return -1;
554                         }
555                 }
556                 else
557                 {
558                     omit_set = 1;
559                 }
560 #   endif
561                 if (omit_set == 0) {
562                         action = IP_PMTUDISC_DONT;
563                         if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
564                                 &action, (socklen_t)sizeof(action)) < 0) {
565                                 log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
566                                         strerror(errno));
567 #    ifndef USE_WINSOCK
568                                 close(s);
569 #    else
570                                 closesocket(s);
571 #    endif
572                                 *noproto = 0;
573                                 *inuse = 0;
574                                 return -1;
575                         }
576                 }
577 #  elif defined(IP_DONTFRAG)
578                 int off = 0;
579                 if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG, 
580                         &off, (socklen_t)sizeof(off)) < 0) {
581                         log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
582                                 strerror(errno));
583 #    ifndef USE_WINSOCK
584                         close(s);
585 #    else
586                         closesocket(s);
587 #    endif
588                         *noproto = 0;
589                         *inuse = 0;
590                         return -1;
591                 }
592 #  endif /* IPv4 MTU */
593         }
594         if(
595 #ifdef HAVE_SYSTEMD
596                 !got_fd_from_systemd &&
597 #endif
598                 bind(s, (struct sockaddr*)addr, addrlen) != 0) {
599                 *noproto = 0;
600                 *inuse = 0;
601 #ifndef USE_WINSOCK
602 #ifdef EADDRINUSE
603                 *inuse = (errno == EADDRINUSE);
604                 /* detect freebsd jail with no ipv6 permission */
605                 if(family==AF_INET6 && errno==EINVAL)
606                         *noproto = 1;
607                 else if(errno != EADDRINUSE &&
608                         !(errno == EACCES && verbosity < 4 && !listen)
609 #ifdef EADDRNOTAVAIL
610                         && !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
611 #endif
612                         ) {
613                         log_err_addr("can't bind socket", strerror(errno),
614                                 (struct sockaddr_storage*)addr, addrlen);
615                 }
616 #endif /* EADDRINUSE */
617                 close(s);
618 #else /* USE_WINSOCK */
619                 if(WSAGetLastError() != WSAEADDRINUSE &&
620                         WSAGetLastError() != WSAEADDRNOTAVAIL &&
621                         !(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
622                         log_err_addr("can't bind socket", 
623                                 wsa_strerror(WSAGetLastError()),
624                                 (struct sockaddr_storage*)addr, addrlen);
625                 }
626                 closesocket(s);
627 #endif /* USE_WINSOCK */
628                 return -1;
629         }
630         if(!fd_set_nonblock(s)) {
631                 *noproto = 0;
632                 *inuse = 0;
633 #ifndef USE_WINSOCK
634                 close(s);
635 #else
636                 closesocket(s);
637 #endif
638                 return -1;
639         }
640         return s;
641 }
642
643 int
644 create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
645         int* reuseport, int transparent, int mss, int freebind, int use_systemd, int dscp)
646 {
647         int s;
648         char* err;
649 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined(SO_BINDANY)
650         int on = 1;
651 #endif
652 #ifdef HAVE_SYSTEMD
653         int got_fd_from_systemd = 0;
654 #endif
655 #ifdef USE_TCP_FASTOPEN
656         int qlen;
657 #endif
658 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
659         (void)transparent;
660 #endif
661 #if !defined(IP_FREEBIND)
662         (void)freebind;
663 #endif
664         verbose_print_addr(addr);
665         *noproto = 0;
666 #ifdef HAVE_SYSTEMD
667         if (!use_systemd ||
668             (use_systemd
669              && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
670                                            addr->ai_addr, addr->ai_addrlen,
671                                            NULL)) == -1)) {
672 #else
673         (void)use_systemd;
674 #endif
675         if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
676 #ifndef USE_WINSOCK
677                 if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
678                         *noproto = 1;
679                         return -1;
680                 }
681                 log_err("can't create socket: %s", strerror(errno));
682 #else
683                 if(WSAGetLastError() == WSAEAFNOSUPPORT ||
684                         WSAGetLastError() == WSAEPROTONOSUPPORT) {
685                         *noproto = 1;
686                         return -1;
687                 }
688                 log_err("can't create socket: %s", 
689                         wsa_strerror(WSAGetLastError()));
690 #endif
691                 return -1;
692         }
693         if (mss > 0) {
694 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
695                 if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
696                         (socklen_t)sizeof(mss)) < 0) {
697                         #ifndef USE_WINSOCK
698                         log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
699                                 strerror(errno));
700                         #else
701                         log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
702                                 wsa_strerror(WSAGetLastError()));
703                         #endif
704                 } else {
705                         verbose(VERB_ALGO,
706                                 " tcp socket mss set to %d", mss);
707                 }
708 #else
709                 log_warn(" setsockopt(TCP_MAXSEG) unsupported");
710 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
711         }
712 #ifdef HAVE_SYSTEMD
713         } else {
714                 got_fd_from_systemd = 1;
715     }
716 #endif
717 #ifdef SO_REUSEADDR
718         if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on, 
719                 (socklen_t)sizeof(on)) < 0) {
720 #ifndef USE_WINSOCK
721                 log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
722                         strerror(errno));
723                 close(s);
724 #else
725                 log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
726                         wsa_strerror(WSAGetLastError()));
727                 closesocket(s);
728 #endif
729                 return -1;
730         }
731 #endif /* SO_REUSEADDR */
732 #ifdef IP_FREEBIND
733         if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
734             (socklen_t)sizeof(on)) < 0) {
735                 log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
736                 strerror(errno));
737         }
738 #endif /* IP_FREEBIND */
739 #ifdef SO_REUSEPORT
740         /* try to set SO_REUSEPORT so that incoming
741          * connections are distributed evenly among the receiving threads.
742          * Each thread must have its own socket bound to the same port,
743          * with SO_REUSEPORT set on each socket.
744          */
745         if (reuseport && *reuseport &&
746                 setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
747                 (socklen_t)sizeof(on)) < 0) {
748 #ifdef ENOPROTOOPT
749                 if(errno != ENOPROTOOPT || verbosity >= 3)
750                         log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
751                                 strerror(errno));
752 #endif
753                 /* this option is not essential, we can continue */
754                 *reuseport = 0;
755         }
756 #else
757         (void)reuseport;
758 #endif /* defined(SO_REUSEPORT) */
759 #if defined(IPV6_V6ONLY)
760         if(addr->ai_family == AF_INET6 && v6only) {
761                 if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, 
762                         (void*)&on, (socklen_t)sizeof(on)) < 0) {
763 #ifndef USE_WINSOCK
764                         log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
765                                 strerror(errno));
766                         close(s);
767 #else
768                         log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
769                                 wsa_strerror(WSAGetLastError()));
770                         closesocket(s);
771 #endif
772                         return -1;
773                 }
774         }
775 #else
776         (void)v6only;
777 #endif /* IPV6_V6ONLY */
778 #ifdef IP_TRANSPARENT
779         if (transparent &&
780             setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
781             (socklen_t)sizeof(on)) < 0) {
782                 log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
783                         strerror(errno));
784         }
785 #elif defined(IP_BINDANY)
786         if (transparent &&
787             setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
788             (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
789             (void*)&on, (socklen_t)sizeof(on)) < 0) {
790                 log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
791                 (addr->ai_family==AF_INET6?"V6":""), strerror(errno));
792         }
793 #elif defined(SO_BINDANY)
794         if (transparent &&
795             setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
796             sizeof(on)) < 0) {
797                 log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
798                 strerror(errno));
799         }
800 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
801         err = set_ip_dscp(s, addr->ai_family, dscp);
802         if(err != NULL)
803                 log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
804         if(
805 #ifdef HAVE_SYSTEMD
806                 !got_fd_from_systemd &&
807 #endif
808         bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
809 #ifndef USE_WINSOCK
810                 /* detect freebsd jail with no ipv6 permission */
811                 if(addr->ai_family==AF_INET6 && errno==EINVAL)
812                         *noproto = 1;
813                 else {
814                         log_err_addr("can't bind socket", strerror(errno),
815                                 (struct sockaddr_storage*)addr->ai_addr,
816                                 addr->ai_addrlen);
817                 }
818                 close(s);
819 #else
820                 log_err_addr("can't bind socket", 
821                         wsa_strerror(WSAGetLastError()),
822                         (struct sockaddr_storage*)addr->ai_addr,
823                         addr->ai_addrlen);
824                 closesocket(s);
825 #endif
826                 return -1;
827         }
828         if(!fd_set_nonblock(s)) {
829 #ifndef USE_WINSOCK
830                 close(s);
831 #else
832                 closesocket(s);
833 #endif
834                 return -1;
835         }
836         if(listen(s, TCP_BACKLOG) == -1) {
837 #ifndef USE_WINSOCK
838                 log_err("can't listen: %s", strerror(errno));
839                 close(s);
840 #else
841                 log_err("can't listen: %s", wsa_strerror(WSAGetLastError()));
842                 closesocket(s);
843 #endif
844                 return -1;
845         }
846 #ifdef USE_TCP_FASTOPEN
847         /* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
848            against IP spoofing attacks as suggested in RFC7413 */
849 #ifdef __APPLE__
850         /* OS X implementation only supports qlen of 1 via this call. Actual
851            value is configured by the net.inet.tcp.fastopen_backlog kernel parm. */
852         qlen = 1;
853 #else
854         /* 5 is recommended on linux */
855         qlen = 5;
856 #endif
857         if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen, 
858                   sizeof(qlen))) == -1 ) {
859 #ifdef ENOPROTOOPT
860                 /* squelch ENOPROTOOPT: freebsd server mode with kernel support
861                    disabled, except when verbosity enabled for debugging */
862                 if(errno != ENOPROTOOPT || verbosity >= 3) {
863 #endif
864                   if(errno == EPERM) {
865                         log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
866                   } else {
867                         log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
868                   }
869 #ifdef ENOPROTOOPT
870                 }
871 #endif
872         }
873 #endif
874         return s;
875 }
876
877 char*
878 set_ip_dscp(int socket, int addrfamily, int dscp)
879 {
880         int ds;
881
882         if(dscp == 0)
883                 return NULL;
884         ds = dscp << 2;
885         switch(addrfamily) {
886         case AF_INET6:
887                 if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS, (void*)&ds, sizeof(ds)) < 0)
888                         return sock_strerror(errno);
889                 break;
890         default:
891                 if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&ds, sizeof(ds)) < 0)
892                         return sock_strerror(errno);
893                 break;
894         }
895         return NULL;
896 }
897
898 #  ifndef USE_WINSOCK
/** describe a socket error; without winsock this is strerror of errn */
char*
sock_strerror(int errn)
{
	char* msg = strerror(errn);
	return msg;
}
904
/** close a socket fd; without winsock this is a plain close() */
void
sock_close(int socket)
{
	(void)close(socket);
}
910
911 #  else
/** describe the last winsock error; the errn argument is not used */
char*
sock_strerror(int ATTR_UNUSED(errn))
{
	char* msg = wsa_strerror(WSAGetLastError());
	return msg;
}
917
/** close a socket fd; with winsock this is closesocket() */
void
sock_close(int socket)
{
	(void)closesocket(socket);
}
923
924 #  endif /* USE_WINSOCK */
925
/**
 * Create a listening local (unix domain) stream socket.
 * With systemd support compiled in and use_systemd set, a socket handed
 * over by systemd is tried first. Otherwise an existing file at path is
 * unlinked and a new nonblocking socket is bound there and put in listen
 * mode.
 * @param path: filesystem path of the socket. It has to fit in sun_path,
 *	a longer path is an error.
 * @param noproto: set to 1 if local sockets are not supported by this
 *	build. Not touched otherwise.
 * @param use_systemd: if true, try to fetch the socket from systemd.
 * @return the fd, or -1 on failure (the error is logged).
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#endif
#ifdef HAVE_SYSTEMD
	int ret;

	if (use_systemd && (ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM, 1, NULL, 0, path)) != -1)
		return ret;
#else
	(void)use_systemd;
#endif

#ifdef HAVE_SYS_UN_H
	(void)noproto; /*unused*/

	verbose(VERB_ALGO, "creating unix socket %s", path);
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD.
	 * A truncated name would make bind() use another path than the one
	 * that is unlinked and logged below, so refuse it. */
	if (strlcpy(usock.sun_path, path, sizeof(usock.sun_path)) >=
		sizeof(usock.sun_path)) {
		log_err("Local socket path too long: %s", path);
		return -1;
	}

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	return s;

err:
	/* single exit for every failure after the socket was created */
#ifndef USE_WINSOCK
	close(s);
#else
	closesocket(s);
#endif
	return -1;

#else /* HAVE_SYS_UN_H */
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif /* HAVE_SYS_UN_H */
}
1004
1005
/**
 * Look up ifname and port with getaddrinfo and create a socket for the
 * first result: a UDP socket for SOCK_DGRAM, a TCP accept socket otherwise.
 * Sets hints->ai_socktype to stype. *noip6 is cleared on entry and set to 1
 * if the failure looks like missing IPv6 support.
 * @return the fd, or -1 on failure.
 */
static int
make_sock(int stype, const char* ifname, const char* port, 
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int freebind, int use_systemd, int dscp)
{
	struct addrinfo *ai = NULL;
	int gai, fd, inuse, noproto;

	hints->ai_socktype = stype;
	*noip6 = 0;
	gai = getaddrinfo(ifname, port, hints, &ai);
	if(gai != 0 || ai == NULL) {
#ifdef USE_WINSOCK
		if(gai == EAI_NONAME && hints->ai_family == AF_INET6){
			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
			return -1;
		}
#endif
		log_err("node %s:%s getaddrinfo: %s %s", 
			ifname?ifname:"default", port, gai_strerror(gai),
#ifdef EAI_SYSTEM
			gai==EAI_SYSTEM?(char*)strerror(errno):""
#else
			""
#endif
		);
		return -1;
	}

	if(stype == SOCK_DGRAM) {
		verbose_print_addr(ai);
		fd = create_udp_sock(ai->ai_family, ai->ai_socktype,
			(struct sockaddr*)ai->ai_addr, ai->ai_addrlen,
			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
			reuseport, transparent, freebind, use_systemd, dscp);
		if(fd == -1) {
			/* address in use is reported, and takes precedence
			 * over the no-IPv6 detection */
			if(inuse)
				log_err("bind: address already in use");
			else if(noproto && hints->ai_family == AF_INET6)
				*noip6 = 1;
		}
	} else {
		fd = create_tcp_accept_sock(ai, v6only, &noproto, reuseport,
			transparent, tcp_mss, freebind, use_systemd, dscp);
		if(fd == -1 && noproto && hints->ai_family == AF_INET6)
			*noip6 = 1;
	}
	freeaddrinfo(ai);
	return fd;
}
1056
/**
 * Make a socket, honouring a port override in the interface name.
 * An ifname of the form "address@port" is split at the '@'; the part after
 * it replaces the port argument. On a too long address or port the error is
 * logged, *noip6 is cleared and -1 is returned.
 */
static int
make_sock_port(int stype, const char* ifname, const char* port, 
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int freebind, int use_systemd, int dscp)
{
	const char* at = strchr(ifname, '@');
	char portstr[16];
	char addrstr[128];
	size_t addrlen, portlen;

	if(at == NULL) {
		/* plain interface name, use the port as given */
		return make_sock(stype, ifname, port, hints, v6only, noip6,
			rcv, snd, reuseport, transparent, tcp_mss, freebind,
			use_systemd, dscp);
	}

	/* override port with ifspec@port */
	addrlen = (size_t)(at - ifname);
	if(addrlen >= sizeof(addrstr)) {
		log_err("ifname too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	portlen = strlen(at+1);
	if(portlen >= sizeof(portstr)) {
		log_err("portnumber too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	/* both lengths were checked above, so the copies fit with the
	 * terminating zero byte */
	memcpy(addrstr, ifname, addrlen);
	addrstr[addrlen] = 0;
	memcpy(portstr, at+1, portlen);
	portstr[portlen] = 0;
	return make_sock(stype, addrstr, portstr, hints, v6only, noip6,
		rcv, snd, reuseport, transparent, tcp_mss, freebind,
		use_systemd, dscp);
}
1088
1089 /**
1090  * Add port to open ports list.
1091  * @param list: list head. changed.
1092  * @param s: fd.
1093  * @param ftype: if fd is UDP.
1094  * @return false on failure. list in unchanged then.
1095  */
1096 static int
1097 port_insert(struct listen_port** list, int s, enum listen_type ftype)
1098 {
1099         struct listen_port* item = (struct listen_port*)malloc(
1100                 sizeof(struct listen_port));
1101         if(!item)
1102                 return 0;
1103         item->next = *list;
1104         item->fd = s;
1105         item->ftype = ftype;
1106         *list = item;
1107         return 1;
1108 }
1109
/**
 * Set fd to receive source address packet info.
 * Turns on the packet-info socket option that this platform offers for the
 * address family. Families other than AF_INET6 and AF_INET are left alone.
 * @param s: the socket fd.
 * @param family: address family of the socket.
 * @return false if the option could not be set or does not exist for the
 *	family (the error is logged), true otherwise.
 */
static int
set_recvpktinfo(int s, int family) 
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int on = 1;
#else
	(void)s;
#endif
	switch(family) {
	case AF_INET6:
#if defined(IPV6_RECVPKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#else
		log_err("no IPV6_RECVPKTINFO and no IPV6_PKTINFO option, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#endif /* IPV6_RECVPKTINFO || IPV6_PKTINFO */
		break;
	case AF_INET:
#if defined(IP_PKTINFO)
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#endif /* IP_PKTINFO || IP_RECVDSTADDR */
		break;
	default:
		/* nothing to set for other families */
		break;
	}
	return 1;
}
1164
1165 /** see if interface is ssl, its port number == the ssl port number */
1166 static int
1167 if_is_ssl(const char* ifname, const char* port, int ssl_port,
1168         struct config_strlist* tls_additional_port)
1169 {
1170         struct config_strlist* s;
1171         char* p = strchr(ifname, '@');
1172         if(!p && atoi(port) == ssl_port)
1173                 return 1;
1174         if(p && atoi(p+1) == ssl_port)
1175                 return 1;
1176         for(s = tls_additional_port; s; s = s->next) {
1177                 if(p && atoi(p+1) == atoi(s->str))
1178                         return 1;
1179                 if(!p && atoi(port) == atoi(s->str))
1180                         return 1;
1181         }
1182         return 0;
1183 }
1184
1185 /**
1186  * Helper for ports_open. Creates one interface (or NULL for default).
1187  * @param ifname: The interface ip address.
1188  * @param do_auto: use automatic interface detection.
1189  *      If enabled, then ifname must be the wildcard name.
1190  * @param do_udp: if udp should be used.
1191  * @param do_tcp: if udp should be used.
1192  * @param hints: for getaddrinfo. family and flags have to be set by caller.
1193  * @param port: Port number to use (as string).
1194  * @param list: list of open ports, appended to, changed to point to list head.
1195  * @param rcv: receive buffer size for UDP
1196  * @param snd: send buffer size for UDP
1197  * @param ssl_port: ssl service port number
1198  * @param tls_additional_port: list of additional ssl service port numbers.
1199  * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1200  *      set to false on exit if reuseport failed due to no kernel support.
1201  * @param transparent: set IP_TRANSPARENT socket option.
1202  * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1203  * @param freebind: set IP_FREEBIND socket option.
1204  * @param use_systemd: if true, fetch sockets from systemd.
1205  * @param dnscrypt_port: dnscrypt service port number
1206  * @param dscp: DSCP to use.
1207  * @return: returns false on error.
1208  */
1209 static int
1210 ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp, 
1211         struct addrinfo *hints, const char* port, struct listen_port** list,
1212         size_t rcv, size_t snd, int ssl_port,
1213         struct config_strlist* tls_additional_port, int* reuseport,
1214         int transparent, int tcp_mss, int freebind, int use_systemd,
1215         int dnscrypt_port, int dscp)
1216 {
1217         int s, noip6=0;
1218 #ifdef USE_DNSCRYPT
1219         int is_dnscrypt = ((strchr(ifname, '@') && 
1220                         atoi(strchr(ifname, '@')+1) == dnscrypt_port) ||
1221                         (!strchr(ifname, '@') && atoi(port) == dnscrypt_port));
1222 #else
1223         int is_dnscrypt = 0;
1224         (void)dnscrypt_port;
1225 #endif
1226
1227         if(!do_udp && !do_tcp)
1228                 return 0;
1229         if(do_auto) {
1230                 if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1, 
1231                         &noip6, rcv, snd, reuseport, transparent,
1232                         tcp_mss, freebind, use_systemd, dscp)) == -1) {
1233                         if(noip6) {
1234                                 log_warn("IPv6 protocol not available");
1235                                 return 1;
1236                         }
1237                         return 0;
1238                 }
1239                 /* getting source addr packet info is highly non-portable */
1240                 if(!set_recvpktinfo(s, hints->ai_family)) {
1241 #ifndef USE_WINSOCK
1242                         close(s);
1243 #else
1244                         closesocket(s);
1245 #endif
1246                         return 0;
1247                 }
1248                 if(!port_insert(list, s,
1249                    is_dnscrypt?listen_type_udpancil_dnscrypt:listen_type_udpancil)) {
1250 #ifndef USE_WINSOCK
1251                         close(s);
1252 #else
1253                         closesocket(s);
1254 #endif
1255                         return 0;
1256                 }
1257         } else if(do_udp) {
1258                 /* regular udp socket */
1259                 if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1, 
1260                         &noip6, rcv, snd, reuseport, transparent,
1261                         tcp_mss, freebind, use_systemd, dscp)) == -1) {
1262                         if(noip6) {
1263                                 log_warn("IPv6 protocol not available");
1264                                 return 1;
1265                         }
1266                         return 0;
1267                 }
1268                 if(!port_insert(list, s,
1269                    is_dnscrypt?listen_type_udp_dnscrypt:listen_type_udp)) {
1270 #ifndef USE_WINSOCK
1271                         close(s);
1272 #else
1273                         closesocket(s);
1274 #endif
1275                         return 0;
1276                 }
1277         }
1278         if(do_tcp) {
1279                 int is_ssl = if_is_ssl(ifname, port, ssl_port,
1280                         tls_additional_port);
1281                 if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1, 
1282                         &noip6, 0, 0, reuseport, transparent, tcp_mss,
1283                         freebind, use_systemd, dscp)) == -1) {
1284                         if(noip6) {
1285                                 /*log_warn("IPv6 protocol not available");*/
1286                                 return 1;
1287                         }
1288                         return 0;
1289                 }
1290                 if(is_ssl)
1291                         verbose(VERB_ALGO, "setup TCP for SSL service");
1292                 if(!port_insert(list, s, is_ssl?listen_type_ssl:
1293                         (is_dnscrypt?listen_type_tcp_dnscrypt:listen_type_tcp))) {
1294 #ifndef USE_WINSOCK
1295                         close(s);
1296 #else
1297                         closesocket(s);
1298 #endif
1299                         return 0;
1300                 }
1301         }
1302         return 1;
1303 }
1304
1305 /** 
1306  * Add items to commpoint list in front.
1307  * @param c: commpoint to add.
1308  * @param front: listen struct.
1309  * @return: false on failure.
1310  */
1311 static int
1312 listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1313 {
1314         struct listen_list* item = (struct listen_list*)malloc(
1315                 sizeof(struct listen_list));
1316         if(!item)
1317                 return 0;
1318         item->com = c;
1319         item->next = front->cps;
1320         front->cps = item;
1321         return 1;
1322 }
1323
1324 struct listen_dnsport* 
1325 listen_create(struct comm_base* base, struct listen_port* ports,
1326         size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
1327         struct tcl_list* tcp_conn_limit, void* sslctx,
1328         struct dt_env* dtenv, comm_point_callback_type* cb, void *cb_arg)
1329 {
1330         struct listen_dnsport* front = (struct listen_dnsport*)
1331                 malloc(sizeof(struct listen_dnsport));
1332         if(!front)
1333                 return NULL;
1334         front->cps = NULL;
1335         front->udp_buff = sldns_buffer_new(bufsize);
1336 #ifdef USE_DNSCRYPT
1337         front->dnscrypt_udp_buff = NULL;
1338 #endif
1339         if(!front->udp_buff) {
1340                 free(front);
1341                 return NULL;
1342         }
1343         if(!stream_wait_lock_inited) {
1344                 lock_basic_init(&stream_wait_count_lock);
1345                 stream_wait_lock_inited = 1;
1346         }
1347
1348         /* create comm points as needed */
1349         while(ports) {
1350                 struct comm_point* cp = NULL;
1351                 if(ports->ftype == listen_type_udp ||
1352                    ports->ftype == listen_type_udp_dnscrypt)
1353                         cp = comm_point_create_udp(base, ports->fd, 
1354                                 front->udp_buff, cb, cb_arg);
1355                 else if(ports->ftype == listen_type_tcp ||
1356                                 ports->ftype == listen_type_tcp_dnscrypt)
1357                         cp = comm_point_create_tcp(base, ports->fd, 
1358                                 tcp_accept_count, tcp_idle_timeout,
1359                                 tcp_conn_limit, bufsize, front->udp_buff,
1360                                 cb, cb_arg);
1361                 else if(ports->ftype == listen_type_ssl) {
1362                         cp = comm_point_create_tcp(base, ports->fd, 
1363                                 tcp_accept_count, tcp_idle_timeout,
1364                                 tcp_conn_limit, bufsize, front->udp_buff,
1365                                 cb, cb_arg);
1366                         cp->ssl = sslctx;
1367                 } else if(ports->ftype == listen_type_udpancil ||
1368                                   ports->ftype == listen_type_udpancil_dnscrypt)
1369                         cp = comm_point_create_udp_ancil(base, ports->fd, 
1370                                 front->udp_buff, cb, cb_arg);
1371                 if(!cp) {
1372                         log_err("can't create commpoint");      
1373                         listen_delete(front);
1374                         return NULL;
1375                 }
1376                 cp->dtenv = dtenv;
1377                 cp->do_not_close = 1;
1378 #ifdef USE_DNSCRYPT
1379                 if (ports->ftype == listen_type_udp_dnscrypt ||
1380                         ports->ftype == listen_type_tcp_dnscrypt ||
1381                         ports->ftype == listen_type_udpancil_dnscrypt) {
1382                         cp->dnscrypt = 1;
1383                         cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
1384                         if(!cp->dnscrypt_buffer) {
1385                                 log_err("can't alloc dnscrypt_buffer");
1386                                 comm_point_delete(cp);
1387                                 listen_delete(front);
1388                                 return NULL;
1389                         }
1390                         front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
1391                 }
1392 #endif
1393                 if(!listen_cp_insert(cp, front)) {
1394                         log_err("malloc failed");
1395                         comm_point_delete(cp);
1396                         listen_delete(front);
1397                         return NULL;
1398                 }
1399                 ports = ports->next;
1400         }
1401         if(!front->cps) {
1402                 log_err("Could not open sockets to accept queries.");
1403                 listen_delete(front);
1404                 return NULL;
1405         }
1406
1407         return front;
1408 }
1409
1410 void
1411 listen_list_delete(struct listen_list* list)
1412 {
1413         struct listen_list *p = list, *pn;
1414         while(p) {
1415                 pn = p->next;
1416                 comm_point_delete(p->com);
1417                 free(p);
1418                 p = pn;
1419         }
1420 }
1421
1422 void 
1423 listen_delete(struct listen_dnsport* front)
1424 {
1425         if(!front) 
1426                 return;
1427         listen_list_delete(front->cps);
1428 #ifdef USE_DNSCRYPT
1429         if(front->dnscrypt_udp_buff &&
1430                 front->udp_buff != front->dnscrypt_udp_buff) {
1431                 sldns_buffer_free(front->dnscrypt_udp_buff);
1432         }
1433 #endif
1434         sldns_buffer_free(front->udp_buff);
1435         free(front);
1436         if(stream_wait_lock_inited) {
1437                 stream_wait_lock_inited = 0;
1438                 lock_basic_destroy(&stream_wait_count_lock);
1439         }
1440 }
1441
1442 struct listen_port* 
1443 listening_ports_open(struct config_file* cfg, int* reuseport)
1444 {
1445         struct listen_port* list = NULL;
1446         struct addrinfo hints;
1447         int i, do_ip4, do_ip6;
1448         int do_tcp, do_auto;
1449         char portbuf[32];
1450         snprintf(portbuf, sizeof(portbuf), "%d", cfg->port);
1451         do_ip4 = cfg->do_ip4;
1452         do_ip6 = cfg->do_ip6;
1453         do_tcp = cfg->do_tcp;
1454         do_auto = cfg->if_automatic && cfg->do_udp;
1455         if(cfg->incoming_num_tcp == 0)
1456                 do_tcp = 0;
1457
1458         /* getaddrinfo */
1459         memset(&hints, 0, sizeof(hints));
1460         hints.ai_flags = AI_PASSIVE;
1461         /* no name lookups on our listening ports */
1462         if(cfg->num_ifs > 0)
1463                 hints.ai_flags |= AI_NUMERICHOST;
1464         hints.ai_family = AF_UNSPEC;
1465 #ifndef INET6
1466         do_ip6 = 0;
1467 #endif
1468         if(!do_ip4 && !do_ip6) {
1469                 return NULL;
1470         }
1471         /* create ip4 and ip6 ports so that return addresses are nice. */
1472         if(do_auto || cfg->num_ifs == 0) {
1473                 if(do_ip6) {
1474                         hints.ai_family = AF_INET6;
1475                         if(!ports_create_if(do_auto?"::0":"::1", 
1476                                 do_auto, cfg->do_udp, do_tcp, 
1477                                 &hints, portbuf, &list,
1478                                 cfg->so_rcvbuf, cfg->so_sndbuf,
1479                                 cfg->ssl_port, cfg->tls_additional_port,
1480                                 reuseport, cfg->ip_transparent,
1481                                 cfg->tcp_mss, cfg->ip_freebind, cfg->use_systemd,
1482                                 cfg->dnscrypt_port, cfg->ip_dscp)) {
1483                                 listening_ports_free(list);
1484                                 return NULL;
1485                         }
1486                 }
1487                 if(do_ip4) {
1488                         hints.ai_family = AF_INET;
1489                         if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1", 
1490                                 do_auto, cfg->do_udp, do_tcp, 
1491                                 &hints, portbuf, &list,
1492                                 cfg->so_rcvbuf, cfg->so_sndbuf,
1493                                 cfg->ssl_port, cfg->tls_additional_port,
1494                                 reuseport, cfg->ip_transparent,
1495                                 cfg->tcp_mss, cfg->ip_freebind, cfg->use_systemd,
1496                                 cfg->dnscrypt_port, cfg->ip_dscp)) {
1497                                 listening_ports_free(list);
1498                                 return NULL;
1499                         }
1500                 }
1501         } else for(i = 0; i<cfg->num_ifs; i++) {
1502                 if(str_is_ip6(cfg->ifs[i])) {
1503                         if(!do_ip6)
1504                                 continue;
1505                         hints.ai_family = AF_INET6;
1506                         if(!ports_create_if(cfg->ifs[i], 0, cfg->do_udp, 
1507                                 do_tcp, &hints, portbuf, &list, 
1508                                 cfg->so_rcvbuf, cfg->so_sndbuf,
1509                                 cfg->ssl_port, cfg->tls_additional_port,
1510                                 reuseport, cfg->ip_transparent,
1511                                 cfg->tcp_mss, cfg->ip_freebind, cfg->use_systemd,
1512                                 cfg->dnscrypt_port, cfg->ip_dscp)) {
1513                                 listening_ports_free(list);
1514                                 return NULL;
1515                         }
1516                 } else {
1517                         if(!do_ip4)
1518                                 continue;
1519                         hints.ai_family = AF_INET;
1520                         if(!ports_create_if(cfg->ifs[i], 0, cfg->do_udp, 
1521                                 do_tcp, &hints, portbuf, &list, 
1522                                 cfg->so_rcvbuf, cfg->so_sndbuf,
1523                                 cfg->ssl_port, cfg->tls_additional_port,
1524                                 reuseport, cfg->ip_transparent,
1525                                 cfg->tcp_mss, cfg->ip_freebind, cfg->use_systemd,
1526                                 cfg->dnscrypt_port, cfg->ip_dscp)) {
1527                                 listening_ports_free(list);
1528                                 return NULL;
1529                         }
1530                 }
1531         }
1532         return list;
1533 }
1534
1535 void listening_ports_free(struct listen_port* list)
1536 {
1537         struct listen_port* nx;
1538         while(list) {
1539                 nx = list->next;
1540                 if(list->fd != -1) {
1541 #ifndef USE_WINSOCK
1542                         close(list->fd);
1543 #else
1544                         closesocket(list->fd);
1545 #endif
1546                 }
1547                 free(list);
1548                 list = nx;
1549         }
1550 }
1551
1552 size_t listen_get_mem(struct listen_dnsport* listen)
1553 {
1554         struct listen_list* p;
1555         size_t s = sizeof(*listen) + sizeof(*listen->base) + 
1556                 sizeof(*listen->udp_buff) + 
1557                 sldns_buffer_capacity(listen->udp_buff);
1558 #ifdef USE_DNSCRYPT
1559         s += sizeof(*listen->dnscrypt_udp_buff);
1560         if(listen->udp_buff != listen->dnscrypt_udp_buff){
1561                 s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
1562         }
1563 #endif
1564         for(p = listen->cps; p; p = p->next) {
1565                 s += sizeof(*p);
1566                 s += comm_point_get_mem(p->com);
1567         }
1568         return s;
1569 }
1570
1571 void listen_stop_accept(struct listen_dnsport* listen)
1572 {
1573         /* do not stop the ones that have no tcp_free list
1574          * (they have already stopped listening) */
1575         struct listen_list* p;
1576         for(p=listen->cps; p; p=p->next) {
1577                 if(p->com->type == comm_tcp_accept &&
1578                         p->com->tcp_free != NULL) {
1579                         comm_point_stop_listening(p->com);
1580                 }
1581         }
1582 }
1583
1584 void listen_start_accept(struct listen_dnsport* listen)
1585 {
1586         /* do not start the ones that have no tcp_free list, it is no
1587          * use to listen to them because they have no free tcp handlers */
1588         struct listen_list* p;
1589         for(p=listen->cps; p; p=p->next) {
1590                 if(p->com->type == comm_tcp_accept &&
1591                         p->com->tcp_free != NULL) {
1592                         comm_point_start_listening(p->com, -1, -1);
1593                 }
1594         }
1595 }
1596
1597 struct tcp_req_info*
1598 tcp_req_info_create(struct sldns_buffer* spoolbuf)
1599 {
1600         struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
1601         if(!req) {
1602                 log_err("malloc failure for new stream outoforder processing structure");
1603                 return NULL;
1604         }
1605         memset(req, 0, sizeof(*req));
1606         req->spool_buffer = spoolbuf;
1607         return req;
1608 }
1609
/**
 * Clear and free a tcp_req_info.
 * The cp member points back to the commpoint that owns this structure
 * and called delete on us; the spool_buffer is the shared udp buffer.
 * Neither is deleted here.
 * @param req: structure to delete, may be NULL.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
        if(req != NULL) {
                tcp_req_info_clear(req);
                free(req);
        }
}
1620
1621 void tcp_req_info_clear(struct tcp_req_info* req)
1622 {
1623         struct tcp_req_open_item* open, *nopen;
1624         struct tcp_req_done_item* item, *nitem;
1625         if(!req) return;
1626
1627         /* free outstanding request mesh reply entries */
1628         open = req->open_req_list;
1629         while(open) {
1630                 nopen = open->next;
1631                 mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
1632                 free(open);
1633                 open = nopen;
1634         }
1635         req->open_req_list = NULL;
1636         req->num_open_req = 0;
1637         
1638         /* free pending writable result packets */
1639         item = req->done_req_list;
1640         while(item) {
1641                 nitem = item->next;
1642                 lock_basic_lock(&stream_wait_count_lock);
1643                 stream_wait_count -= (sizeof(struct tcp_req_done_item)
1644                         +item->len);
1645                 lock_basic_unlock(&stream_wait_count_lock);
1646                 free(item->buf);
1647                 free(item);
1648                 item = nitem;
1649         }
1650         req->done_req_list = NULL;
1651         req->num_done_req = 0;
1652         req->read_is_closed = 0;
1653 }
1654
1655 void
1656 tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
1657 {
1658         struct tcp_req_open_item* open, *prev = NULL;
1659         if(!req || !m) return;
1660         open = req->open_req_list;
1661         while(open) {
1662                 if(open->mesh_state == m) {
1663                         struct tcp_req_open_item* next;
1664                         if(prev) prev->next = open->next;
1665                         else req->open_req_list = open->next;
1666                         /* caller has to manage the mesh state reply entry */
1667                         next = open->next;
1668                         free(open);
1669                         req->num_open_req --;
1670
1671                         /* prev = prev; */
1672                         open = next;
1673                         continue;
1674                 }
1675                 prev = open;
1676                 open = open->next;
1677         }
1678 }
1679
1680 /** setup listening for read or write */
1681 static void
1682 tcp_req_info_setup_listen(struct tcp_req_info* req)
1683 {
1684         int wr = 0;
1685         int rd = 0;
1686
1687         if(req->cp->tcp_byte_count != 0) {
1688                 /* cannot change, halfway through */
1689                 return;
1690         }
1691
1692         if(!req->cp->tcp_is_reading)
1693                 wr = 1;
1694         if(req->num_open_req + req->num_done_req < TCP_MAX_REQ_SIMULTANEOUS &&
1695                 !req->read_is_closed)
1696                 rd = 1;
1697         
1698         if(wr) {
1699                 req->cp->tcp_is_reading = 0;
1700                 comm_point_stop_listening(req->cp);
1701                 comm_point_start_listening(req->cp, -1,
1702                         req->cp->tcp_timeout_msec);
1703         } else if(rd) {
1704                 req->cp->tcp_is_reading = 1;
1705                 comm_point_stop_listening(req->cp);
1706                 comm_point_start_listening(req->cp, -1,
1707                         req->cp->tcp_timeout_msec);
1708                 /* and also read it (from SSL stack buffers), so
1709                  * no event read event is expected since the remainder of
1710                  * the TLS frame is sitting in the buffers. */
1711                 req->read_again = 1;
1712         } else {
1713                 comm_point_stop_listening(req->cp);
1714                 comm_point_start_listening(req->cp, -1,
1715                         req->cp->tcp_timeout_msec);
1716                 comm_point_listen_for_rw(req->cp, 0, 0);
1717         }
1718 }
1719
1720 /** remove first item from list of pending results */
1721 static struct tcp_req_done_item*
1722 tcp_req_info_pop_done(struct tcp_req_info* req)
1723 {
1724         struct tcp_req_done_item* item;
1725         log_assert(req->num_done_req > 0 && req->done_req_list);
1726         item = req->done_req_list;
1727         lock_basic_lock(&stream_wait_count_lock);
1728         stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
1729         lock_basic_unlock(&stream_wait_count_lock);
1730         req->done_req_list = req->done_req_list->next;
1731         req->num_done_req --;
1732         return item;
1733 }
1734
1735 /** Send given buffer and setup to write */
1736 static void
1737 tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
1738         size_t len)
1739 {
1740         sldns_buffer_clear(req->cp->buffer);
1741         sldns_buffer_write(req->cp->buffer, buf, len);
1742         sldns_buffer_flip(req->cp->buffer);
1743
1744         req->cp->tcp_is_reading = 0; /* we are now writing */
1745 }
1746
1747 /** pick up the next result and start writing it to the channel */
1748 static void
1749 tcp_req_pickup_next_result(struct tcp_req_info* req)
1750 {
1751         if(req->num_done_req > 0) {
1752                 /* unlist the done item from the list of pending results */
1753                 struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
1754                 tcp_req_info_start_write_buf(req, item->buf, item->len);
1755                 free(item->buf);
1756                 free(item);
1757         }
1758 }
1759
1760 /** the read channel has closed */
1761 int
1762 tcp_req_info_handle_read_close(struct tcp_req_info* req)
1763 {
1764         verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
1765         /* reset byte count for (potential) partial read */
1766         req->cp->tcp_byte_count = 0;
1767         /* if we still have results to write, pick up next and write it */
1768         if(req->num_done_req != 0) {
1769                 tcp_req_pickup_next_result(req);
1770                 tcp_req_info_setup_listen(req);
1771                 return 1;
1772         }
1773         /* if nothing to do, this closes the connection */
1774         if(req->num_open_req == 0 && req->num_done_req == 0)
1775                 return 0;
1776         /* otherwise, we must be waiting for dns resolve, wait with timeout */
1777         req->read_is_closed = 1;
1778         tcp_req_info_setup_listen(req);
1779         return 1;
1780 }
1781
1782 void
1783 tcp_req_info_handle_writedone(struct tcp_req_info* req)
1784 {
1785         /* back to reading state, we finished this write event */
1786         sldns_buffer_clear(req->cp->buffer);
1787         if(req->num_done_req == 0 && req->read_is_closed) {
1788                 /* no more to write and nothing to read, close it */
1789                 comm_point_drop_reply(&req->cp->repinfo);
1790                 return;
1791         }
1792         req->cp->tcp_is_reading = 1;
1793         /* see if another result needs writing */
1794         tcp_req_pickup_next_result(req);
1795
1796         /* see if there is more to write, if not stop_listening for writing */
1797         /* see if new requests are allowed, if so, start_listening
1798          * for reading */
1799         tcp_req_info_setup_listen(req);
1800 }
1801
1802 void
1803 tcp_req_info_handle_readdone(struct tcp_req_info* req)
1804 {
1805         struct comm_point* c = req->cp;
1806
1807         /* we want to read up several requests, unless there are
1808          * pending answers */
1809
1810         req->is_drop = 0;
1811         req->is_reply = 0;
1812         req->in_worker_handle = 1;
1813         sldns_buffer_set_limit(req->spool_buffer, 0);
1814         /* handle the current request */
1815         /* this calls the worker handle request routine that could give
1816          * a cache response, or localdata response, or drop the reply,
1817          * or schedule a mesh entry for later */
1818         fptr_ok(fptr_whitelist_comm_point(c->callback));
1819         if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
1820                 req->in_worker_handle = 0;
1821                 /* there is an answer, put it up.  It is already in the
1822                  * c->buffer, just send it. */
1823                 /* since we were just reading a query, the channel is
1824                  * clear to write to */
1825         send_it:
1826                 c->tcp_is_reading = 0;
1827                 comm_point_stop_listening(c);
1828                 comm_point_start_listening(c, -1, c->tcp_timeout_msec);
1829                 return;
1830         }
1831         req->in_worker_handle = 0;
1832         /* it should be waiting in the mesh for recursion.
1833          * If mesh failed to add a new entry and called commpoint_drop_reply. 
1834          * Then the mesh state has been cleared. */
1835         if(req->is_drop) {
1836                 /* the reply has been dropped, stream has been closed. */
1837                 return;
1838         }
1839         /* If mesh failed(mallocfail) and called commpoint_send_reply with
1840          * something like servfail then we pick up that reply below. */
1841         if(req->is_reply) {
1842                 goto send_it;
1843         }
1844
1845         sldns_buffer_clear(c->buffer);
1846         /* if pending answers, pick up an answer and start sending it */
1847         tcp_req_pickup_next_result(req);
1848
1849         /* if answers pending, start sending answers */
1850         /* read more requests if we can have more requests */
1851         tcp_req_info_setup_listen(req);
1852 }
1853
1854 int
1855 tcp_req_info_add_meshstate(struct tcp_req_info* req,
1856         struct mesh_area* mesh, struct mesh_state* m)
1857 {
1858         struct tcp_req_open_item* item;
1859         log_assert(req && mesh && m);
1860         item = (struct tcp_req_open_item*)malloc(sizeof(*item));
1861         if(!item) return 0;
1862         item->next = req->open_req_list;
1863         item->mesh = mesh;
1864         item->mesh_state = m;
1865         req->open_req_list = item;
1866         req->num_open_req++;
1867         return 1;
1868 }
1869
1870 /** Add a result to the result list.  At the end. */
1871 static int
1872 tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
1873 {
1874         struct tcp_req_done_item* last = NULL;
1875         struct tcp_req_done_item* item;
1876         size_t space;
1877
1878         /* see if we have space */
1879         space = sizeof(struct tcp_req_done_item) + len;
1880         lock_basic_lock(&stream_wait_count_lock);
1881         if(stream_wait_count + space > stream_wait_max) {
1882                 lock_basic_unlock(&stream_wait_count_lock);
1883                 verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
1884                 return 0;
1885         }
1886         stream_wait_count += space;
1887         lock_basic_unlock(&stream_wait_count_lock);
1888
1889         /* find last element */
1890         last = req->done_req_list;
1891         while(last && last->next)
1892                 last = last->next;
1893         
1894         /* create new element */
1895         item = (struct tcp_req_done_item*)malloc(sizeof(*item));
1896         if(!item) {
1897                 log_err("malloc failure, for stream result list");
1898                 return 0;
1899         }
1900         item->next = NULL;
1901         item->len = len;
1902         item->buf = memdup(buf, len);
1903         if(!item->buf) {
1904                 free(item);
1905                 log_err("malloc failure, adding reply to stream result list");
1906                 return 0;
1907         }
1908
1909         /* link in */
1910         if(last) last->next = item;
1911         else req->done_req_list = item;
1912         req->num_done_req++;
1913         return 1;
1914 }
1915
1916 void
1917 tcp_req_info_send_reply(struct tcp_req_info* req)
1918 {
1919         if(req->in_worker_handle) {
1920                 /* reply from mesh is in the spool_buffer */
1921                 /* copy now, so that the spool buffer is free for other tasks
1922                  * before the callback is done */
1923                 sldns_buffer_clear(req->cp->buffer);
1924                 sldns_buffer_write(req->cp->buffer,
1925                         sldns_buffer_begin(req->spool_buffer),
1926                         sldns_buffer_limit(req->spool_buffer));
1927                 sldns_buffer_flip(req->cp->buffer);
1928                 req->is_reply = 1;
1929                 return;
1930         }
1931         /* now that the query has been handled, that mesh_reply entry
1932          * should be removed, from the tcp_req_info list,
1933          * the mesh state cleanup removes then with region_cleanup and
1934          * replies_sent true. */
1935         /* see if we can send it straight away (we are not doing
1936          * anything else).  If so, copy to buffer and start */
1937         if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
1938                 /* buffer is free, and was ready to read new query into,
1939                  * but we are now going to use it to send this answer */
1940                 tcp_req_info_start_write_buf(req,
1941                         sldns_buffer_begin(req->spool_buffer),
1942                         sldns_buffer_limit(req->spool_buffer));
1943                 /* switch to listen to write events */
1944                 comm_point_stop_listening(req->cp);
1945                 comm_point_start_listening(req->cp, -1,
1946                         req->cp->tcp_timeout_msec);
1947                 return;
1948         }
1949         /* queue up the answer behind the others already pending */
1950         if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
1951                 sldns_buffer_limit(req->spool_buffer))) {
1952                 /* drop the connection, we are out of resources */
1953                 comm_point_drop_reply(&req->cp->repinfo);
1954         }
1955 }
1956
1957 size_t tcp_req_info_get_stream_buffer_size(void)
1958 {
1959         size_t s;
1960         if(!stream_wait_lock_inited)
1961                 return stream_wait_count;
1962         lock_basic_lock(&stream_wait_count_lock);
1963         s = stream_wait_count;
1964         lock_basic_unlock(&stream_wait_count_lock);
1965         return s;
1966 }